From 0459bbfed09b3ecc4e3e40d00a0a217de2d5e8d0 Mon Sep 17 00:00:00 2001
From: Daniel Gildea
Date: Sun, 3 Jan 2021 11:23:56 -0500
Subject: [PATCH 01/18] author urls in style name/id

Issue #623

Now generates author pages with urls in form name/id.
For most people this looks like:

  people/d/david-chiang/david-chiang/

Matt Post has an ORCID in name_variants.yaml, so his page is:

  people/m/matt-post/0000-0002-1297-6794/

and then there is:

  people/y/yang-liu/yang-liu-edinburgh/
  people/y/yang-liu/yang-liu-ict/
  people/y/yang-liu/yang-liu-icsi/
  people/y/yang-liu/yang-liu-umich/

I don't know how to make the old URLs people/m/matt-post/ resolve.
---
 bin/create_hugo_pages.py               | 2 +-
 bin/create_hugo_yaml.py                | 3 ++-
 data/yaml/name_variants.yaml           | 2 ++
 hugo/layouts/partials/author_link.html | 7 ++++---
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/bin/create_hugo_pages.py b/bin/create_hugo_pages.py
index d807ea52d1..a2ca3d17f0 100755
--- a/bin/create_hugo_pages.py
+++ b/bin/create_hugo_pages.py
@@ -137,7 +137,7 @@ def create_people(srcdir, clean=False):
         data = yaml.load(f, Loader=Loader)
     # Create a page stub for each person
     for name, entry in data.items():
-        person_dir = "{}/content/people/{}".format(srcdir, name[0])
+        person_dir = "{}/content/people/{}/{}".format(srcdir, entry["slug"][0], entry["slug"])
         if not os.path.exists(person_dir):
             os.makedirs(person_dir)
         yaml_data = {"name": name, "title": entry["full"], "lastname": entry["last"]}

diff --git a/bin/create_hugo_yaml.py b/bin/create_hugo_yaml.py
index 0b93ef26c2..f5eb8f22dc 100755
--- a/bin/create_hugo_yaml.py
+++ b/bin/create_hugo_yaml.py
@@ -30,6 +30,7 @@
 from docopt import docopt
 from collections import defaultdict
+from slugify import slugify
 from tqdm import tqdm
 import logging as log
 import os
@@ -76,7 +77,7 @@ def export_anthology(anthology, outdir, clean=False, dryrun=False):
         name = anthology.people.get_canonical_name(id_)
         log.debug("export_anthology: processing person '{}'".format(repr(name)))
         data = name.as_dict()
-        data["slug"] = id_
+        data["slug"] = slugify(repr(name)) or "NONE"
         if id_ in anthology.people.comments:
             data["comment"] = anthology.people.comments[id_]
         if id_ in anthology.people.similar:

diff --git a/data/yaml/name_variants.yaml b/data/yaml/name_variants.yaml
index 6c73c6149e..ad962cc9af 100644
--- a/data/yaml/name_variants.yaml
+++ b/data/yaml/name_variants.yaml
@@ -7469,6 +7469,8 @@
 - canonical: {first: Bruce, last: Porter}
   variants:
   - {first: Bruce W., last: Porter}
+- canonical: {first: Matt, last: Post}
+  id: 0000-0002-1297-6794
 - canonical: {first: Oana, last: Postolache}
   variants:
   - {first: Oana-Diana, last: Postolache}

diff --git a/hugo/layouts/partials/author_link.html b/hugo/layouts/partials/author_link.html
index 9c8c2669d0..4615fb490f 100644
--- a/hugo/layouts/partials/author_link.html
+++ b/hugo/layouts/partials/author_link.html
@@ -8,7 +8,8 @@
   - person: A dict with key "id" (the ID of the person to link to, e.g.,
     "hector-martinez-alonso"), and optional key "full" (a variant spelling)
   - class (optional): CSS classes for the link
 */}}
-{{ $first_letter := slicestr .person.id 0 1 }}
-{{ $entry := index .ctx.Site.Data.people $first_letter .person.id }}
-{{ $link_to := printf "/people/%s/%s.md" $first_letter .person.id }}
+{{ $id_first_letter := slicestr .person.id 0 1 }}
+{{ $entry := index .ctx.Site.Data.people $id_first_letter .person.id }}
+{{ $name_first_letter := slicestr $entry.slug 0 1 }}
+{{ $link_to := printf "/people/%s/%s/%s.md" $name_first_letter $entry.slug .person.id }}
 {{ if isset .person "full" }}{{ .person.full }}{{ else }}{{ $entry.full }}{{ end }}
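
One possible answer to the open question at the end of the commit message above: Hugo writes an HTML redirect page for every URL listed under "aliases" in a page's front matter, so the page stubs built by create_people() could advertise their pre-#623 address. A minimal sketch, assuming the yaml_data stub from the hunk above; the "aliases" key and the exact old-URL shape are this sketch's assumptions, not part of the patch:

    # Sketch for bin/create_hugo_pages.py, inside create_people():
    # list the old URL as a Hugo alias so it keeps resolving.
    yaml_data = {
        "name": name,
        "title": entry["full"],
        "lastname": entry["last"],
        # Hypothetical: the old pages lived at /people/<first letter of id>/<id>/.
        "aliases": ["/people/{}/{}/".format(name[0], name)],  # e.g. /people/m/matt-post/
    }

Hugo would then emit a stub at each alias path containing a meta-refresh redirect to the new name/id URL.
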
"full" }}{{ .person.full }}{{ else }}{{ $entry.full }}{{ end }} From d94ef2027eb07070ffd6f3d37ee9efb0ea6d9910 Mon Sep 17 00:00:00 2001 From: Daniel Gildea Date: Sun, 3 Jan 2021 11:36:11 -0500 Subject: [PATCH 02/18] autofix --- bin/create_hugo_pages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/create_hugo_pages.py b/bin/create_hugo_pages.py index a2ca3d17f0..8909928c81 100755 --- a/bin/create_hugo_pages.py +++ b/bin/create_hugo_pages.py @@ -137,7 +137,9 @@ def create_people(srcdir, clean=False): data = yaml.load(f, Loader=Loader) # Create a page stub for each person for name, entry in data.items(): - person_dir = "{}/content/people/{}/{}".format(srcdir, entry["slug"][0], entry["slug"]) + person_dir = "{}/content/people/{}/{}".format( + srcdir, entry["slug"][0], entry["slug"] + ) if not os.path.exists(person_dir): os.makedirs(person_dir) yaml_data = {"name": name, "title": entry["full"], "lastname": entry["last"]} From e541c453dd6f57fce2d943f6a0272202375920c3 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 29 May 2025 22:30:32 -0400 Subject: [PATCH 03/18] Add ORCID script and apply to 2025.naacl main (#623) --- bin/ingest_orcids.py | 154 +++ data/xml/2025.naacl.xml | 2304 +++++++++++++++++++-------------------- 2 files changed, 1306 insertions(+), 1152 deletions(-) create mode 100755 bin/ingest_orcids.py diff --git a/bin/ingest_orcids.py b/bin/ingest_orcids.py new file mode 100755 index 0000000000..8f3f0338f5 --- /dev/null +++ b/bin/ingest_orcids.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Copyright 2025 Matt Post + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +--- +Reads an aclpub2 directory that has been already ingested and adds +ORCIDS to authors. + + python bin/ingest_orcids.py /path/to/papers.yml volume_id + +e.g., + + python bin/ingest_orcids.py 2025-naacl-long.yml 2025.naacl-long +""" + +import click +import yaml +import sys +import os +from pathlib import Path +import lxml.etree as etree +from typing import Dict, List + +from anthology.utils import indent + + +def parse_paper_yaml(paper_path: str) -> List[Dict[str, str]]: + """ + Reads papers.yml to get metadata. Skips non-archival papers. + """ + # load the YAML file + papers = [] + if not os.path.exists(paper_path): + print(f"No such file: {paper_path}", file=sys.stderr) + sys.exit(1) + with open(paper_path, 'r', encoding='utf-8') as f: + papers = yaml.safe_load(f) + + for paper in papers: + if "archival" not in paper: + paper["archival"] = True + + # print(f"Loaded {len(papers)} papers from {paper_path}", file=sys.stderr) + return papers + + +@click.command() +# add a positional argument for the paper YAML file +@click.argument( + 'paper_yaml', + type=click.Path(exists=True, dir_okay=False, readable=True), + required=True, +) +@click.argument( + 'full_volume_id', + type=str, + required=True, +) +def main( + paper_yaml: str, + full_volume_id: str, +): + anthology_datadir = Path(sys.argv[0]).parent / ".." 
/ "data" + # anthology = Anthology( + # importdir=anthology_datadir, require_bibkeys=False + # ) + + # venue_index = VenueIndex(srcdir=anthology_datadir) + # venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()] + + # people = AnthologyIndex(srcdir=anthology_datadir) + # people.bibkeys = load_bibkeys(anthology_datadir) + + # Load the papers.yaml file, skipping non-archival papers + papers = [p for p in parse_paper_yaml(paper_yaml) if p["archival"]] + print(f"Found {len(papers)} archival papers", file=sys.stderr) + + for paper in papers: + print("PAPER:", paper['id'], file=sys.stderr) + for author in paper['authors']: + print( + f" {author['first_name']} {author['last_name']} ({author.get('institution', '')})", + file=sys.stderr, + ) + + collection_id, volume_name = full_volume_id.split('-') + + # open the paper XML file + collection_file = anthology_datadir / 'xml' / f'{collection_id}.xml' + if not os.path.exists(collection_file): + print(f"No such collection file {collection_file}", file=sys.stderr) + sys.exit(1) + + root_node = etree.parse(collection_file).getroot() + volume_node = root_node.find(f"./volume[@id='{volume_name}']") + if volume_node is None: + print( + f"No volume node with id '{volume_name}' found in {collection_file}", + file=sys.stderr, + ) + sys.exit(1) + + assert len(papers) == len(volume_node.findall('./paper')), ( + f"Number of papers in YAML ({len(papers)}) does not match number in XML ({len(volume_node.findall('./paper'))})" + ) + + for paper, paper_node in zip(papers, volume_node.findall('./paper')): + # paper_num = int(paper["id"]) + paper_num = int(paper_node.attrib['id']) + print(f"PAPER: YAML={paper_num}", file=sys.stderr) + + # assert paper_num == paper_id_xml, ( + # f"Paper ID mismatch: YAML={paper_num}, XML={paper_id_xml}" + # ) + + def get_author_xml(author_xml): + name = "" + if (first := author_xml.find('first')) is not None: + name += first.text or "" + if (last := author_xml.find('last')) is not None: + if name: + name += " " + name += last.text or "" + return name + + for author_yaml, author_node in zip(paper['authors'], paper_node.findall('./author')): + print(f"- Author YAML={author_yaml['first_name']} {author_yaml['last_name']} XML={get_author_xml(author_node)}", file=sys.stderr) + if orcid := author_yaml.get('orcid'): + orcid = orcid.split('/')[-1] # Extract the ORCID from the URL if it's a full URL + # Check if the ORCID is already set in the XML + # Set the ORCID attribute + author_node.attrib['orcid'] = orcid + + indent(root_node) + tree = etree.ElementTree(root_node) + tree.write(collection_file, encoding='UTF-8', xml_declaration=True, with_tail=True) + + +if __name__ == '__main__': + main() diff --git a/data/xml/2025.naacl.xml b/data/xml/2025.naacl.xml index 68785d24b8..b3afbe524c 100644 --- a/data/xml/2025.naacl.xml +++ b/data/xml/2025.naacl.xml @@ -44,9 +44,9 @@ ZeyuanLiu ZiyuHuanUniversity of Pennsylvania, University of Pennsylvania XiyaoWang - JiafeiLyu - JianTao - XiuLiTsinghua University + JiafeiLyu + JianTao + XiuLiTsinghua University FurongHuangUniversity of Maryland HuazheXuTsinghua University, Tsinghua University 50-72 @@ -56,13 +56,13 @@ <fixed-case>C</fixed-case>og<fixed-case>LM</fixed-case>: Tracking Cognitive Development of Large Language Models - XinglinWang - PeiwenYuan + XinglinWang + PeiwenYuan ShaoxiongFengRedNote YiweiLi BoyuanPan HedaWang - YaoHu + YaoHu KanLi 73-87 Piaget’s Theory of Cognitive Development (PTC) posits that the development of cognitive levels forms the foundation for human learning 
across various abilities. As Large Language Models (LLMs) have recently shown remarkable abilities across a wide variety of tasks, we are curious about the cognitive levels of current LLMs: to what extent they have developed and how this development has been achieved. To this end, we construct a benchmark CogLM (Cognitive Ability Evaluation for Language Model) based on PTC to assess the cognitive levels of LLMs. CogLM comprises 1,220 questions spanning 10 cognitive abilities crafted by more than 20 human experts, providing a comprehensive testbed for the cognitive levels of LLMs. Through extensive experiments across multiple mainstream LLMs with CogLM, we find that: (1) In our testing framework, advanced LLMs (such as GPT-4) have demonstrated human-like cognitive abilities, comparable to those of a 20-year-old human. (2) The parameter size and optimization objective are two key factors affecting the cognitive levels of LLMs. (3) The performance on downstream tasks is positively correlated with the level of cognitive abilities. These findings fill the gap in research on the cognitive abilities of LLMs, tracing the development of LLMs from a cognitive perspective and guiding the future direction of their evolution. @@ -72,9 +72,9 @@ Improving and Assessing the Fidelity of Large Language Models Alignment to Online Communities Minh DucChu - ZihaoHe + ZihaoHe RebeccaDorn - KristinaLermanUniversity of Southern California and USC Information Sciences Institute + KristinaLermanUniversity of Southern California and USC Information Sciences Institute 88-111 Large language models (LLMs) have shown promise in representing individuals and communities, offering new ways to study complex social dynamics. However, effectively aligning LLMs with specific human groups and systematically assessing the fidelity of the alignment remains a challenge. This paper presents a robust framework for aligning LLMs with online communities via instruction-tuning and comprehensively evaluating alignment across various aspects of language, including authenticity, emotional tone, toxicity, and harm. We demonstrate the utility of our approach by applying it to online communities centered on dieting and body image. We administer an eating disorder psychometric test to the aligned LLMs to reveal unhealthy beliefs and successfully differentiate communities with varying levels of eating disorder risk. Our results highlight the potential of LLMs in automated moderation and broader applications in public health and social science research. 2025.naacl-long.5 @@ -83,13 +83,13 @@ Improving Retrospective Language Agents via Joint Policy Gradient Optimization XueyangFeng - BoLan + BoLan QuanyuDaiHuawei Technologies Ltd. - LeiWang + LeiWang JiakaiTang - XuChenRenmin University of China + XuChenRenmin University of China ZhenhuaDong - Ji-RongWenRenmin University of China + Ji-RongWenRenmin University of China 112-141 In recent research advancements within the community, large language models (LLMs) have sparked great interest in creating autonomous agents. However, current prompt-based agents often heavily rely on large-scale LLMs. Meanwhile, although fine-tuning methods significantly enhance the capabilities of smaller LLMs, the fine-tuned agents often lack the potential for self-reflection and self-improvement. To address these challenges, we introduce a novel agent framework named RetroAct, which is a framework that jointly optimizes both task-planning and self-reflective evolution capabilities in language agents. 
Specifically, we develop a two-stage joint optimization process that integrates imitation learning and reinforcement learning, and design an off-policy joint policy gradient optimization algorithm with imitation learning regularization to enhance the data efficiency and training stability in agent tasks. RetroAct significantly improves the performance of open-source models, reduces dependency on closed-source LLMs, and enables fine-tuned agents to learn and evolve continuously. We conduct extensive experiments across various testing environments, demonstrating RetroAct has substantial improvements in task performance and decision-making processes. 2025.naacl-long.6 @@ -102,9 +102,9 @@ ZhiyuanHu YangLiu ZhichengZhangAlibaba Group - FeiWangXi’an Jiaotong University + FeiWangXi’an Jiaotong University Michael QizheShiehNational University of Singapore - WenmengZhouAlibaba Group + WenmengZhouAlibaba Group 142-160 Large Language Models (LLMs) excel in stand-alone code tasks like HumanEval and MBPP, but struggle with handling entire code repositories. This challenge has prompted research on enhancing LLM-codebase interaction at a repository scale. Current solutions rely on similarity-based retrieval or manual tools and APIs, each with notable drawbacks. Similarity-based retrieval often has low recall in complex tasks, while manual tools and APIs are typically task-specific and require expert knowledge, reducing their generalizability across diverse code tasks and real-world applications. To mitigate these limitations, we introduce CodexGraph, a system that integrates LLM agents with graph database interfaces extracted from code repositories. By leveraging the structural properties of graph databases and the flexibility of the graph query language, CodexGraph enables the LLM agent to construct and execute queries, allowing for precise, code structure-aware context retrieval and code navigation. We assess CodexGraph using three benchmarks: CrossCodeEval, SWE-bench, and EvoCodeBench. Additionally, we develop five real-world coding applications. With a unified graph database schema, CodexGraph demonstrates competitive performance and potential in both academic and real-world environments, showcasing its versatility and efficacy in software engineering. Our code and demo will be released soon. 2025.naacl-long.7 @@ -116,7 +116,7 @@ YuxuanFan XinZhangMicrosoft PeiyiWang - HoufengWang + HoufengWang 161-178 Human Preference Alignment (HPA) can assist large language models (LLMs) to generate safe content. Due to the heavy cost of fine-tuning, tuning-free methods have emerged, typically modifying LLM decoding via post-processing. In this paper, we propose a novel and effective approach for HPA in a tuning-free way, named In-Context Direct Preference Optimization (ICDPO). We first rethink the derivation procedures of DPO, based on which we conversely build an instant scorer using the states of the LLM before and after ICL. It enables LLMs to both generate and select the well-aligned response, which is precisely estimated by the aforementioned instant scorer, thereby enhancing the final performance. ICDPO can be further enhanced with a two-stage retriever and an upgraded scorer. Extensive experiments show its effectiveness, particularly in outperforming multiple tuning-free baselines, even competitiveness with SFT and DPO. We also conduct detailed analyses to offer comprehensive insights into ICDPO. 
2025.naacl-long.8 @@ -133,9 +133,9 @@ What the #?*!: Disentangling Hate Across Target Identities - YipingJinPompeu Fabra University + YipingJinPompeu Fabra University LeoWannerCatalan Institute for Research and Advanced Studies and Universitat Pompeu Fabra - Aneesh MoideenKoyaKnorex + Aneesh MoideenKoyaKnorex 199-221 Hate speech (HS) classifiers do not perform equally well in detecting hateful expressions towards different target identities. They also demonstrate systematic biases in predicted hatefulness scores. Tapping on two recently proposed functionality test datasets for HS detection, we quantitatively analyze the impact of different factors on HS prediction. Experiments on popular industrial and academic models demonstrate that HS detectors assign a higher hatefulness score merely based on the mention of specific target identities. Besides, models often confuse hatefulness and the polarity of emotions. This result is worrisome as the effort to build HS detectors might harm the vulnerable identity groups we wish to protect: posts expressing anger or disapproval of hate expressions might be flagged as hateful themselves. We also carry out a study inspired by social psychology theory, which reveals that the accuracy of hatefulness prediction correlates strongly with the intensity of the stereotype. 2025.naacl-long.10 @@ -156,7 +156,7 @@ The <fixed-case>R</fixed-case>ussian-focused embedders’ exploration: ru<fixed-case>MTEB</fixed-case> benchmark and <fixed-case>R</fixed-case>ussian embedding model design ArtemSnegirevSaluteDevices - MariaTikhonovaHigher School of Economics + MariaTikhonovaHigher School of Economics MaksimovaAnna AlenaFenogenova AleksandrAbramov @@ -167,7 +167,7 @@ <fixed-case>PRACTIQ</fixed-case>: A Practical Conversational Text-to-<fixed-case>SQL</fixed-case> dataset with Ambiguous and Unanswerable Queries - MingwenDong + MingwenDong NischalAshok Kumar YiqunHuAWS AI Labs AnujChauhanAmazon @@ -186,9 +186,9 @@ <fixed-case>MIRAGE</fixed-case>-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems - NandanThakurUniversity of Waterloo + NandanThakurUniversity of Waterloo SulemanKaziVectara - GeLuoVectara Inc. + GeLuoVectara Inc. JimmyLinUniversity of Waterloo AminAhmadVectara 274-298 @@ -204,7 +204,7 @@ HieuDaoNational University of Singapore ShafiqJotySalesForce.com and Nanyang Technological University KenjiKawaguchiNational University of Singapore - Nancy F.Chen + Nancy F.Chen Min-YenKanNational University of Singapore 299-330 We present the first systematic evaluation examining format bias in performance of large language models (LLMs). Our approach distinguishes between two categories of an evaluation metric under format constraints to reliably and accurately assess performance: one measures performance when format constraints are adhered to, while the other evaluates performance regardless of constraint adherence. We then define a metric for measuring the format bias of LLMs and establish effective strategies to reduce it. Subsequently, we present our empirical format bias evaluation spanning four commonly used categories—multiple-choice question-answer, wrapping, list, and mapping—covering 15 widely-used formats. Our evaluation on eight generation tasks uncovers significant format bias across state-of-the-art LLMs. We further discover that improving the format-instruction following capabilities of LLMs across formats potentially reduces format bias. 
Based on our evaluation findings, we study prompting and fine-tuning with synthesized format data techniques to mitigate format bias. Our methods successfully reduce the variance in ChatGPT’s performance among wrapping formats from 235.33 to 0.71 (%^2) @@ -226,7 +226,7 @@ Soumya SuvraGhosalUniversity of Maryland, College Park SoumyabrataPalAdobe Systems KoyelMukherjeeAdobe Research - DineshManochaUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 351-365 Large Language Models (LLMs) have recently demonstrated impressive few-shot learning capabilities through in-context learning (ICL). However, ICL performance is highly dependent on the choice of few-shot demonstrations, making the selection of the most optimal examples a persistent research challenge. This issue is further amplified in low-resource Indic languages, where the scarcity of ground-truth data complicates the selection process. In this work, we propose PromptRefine, a novel Alternating Minimization approach for example selection that improves ICL performance on low-resource Indic languages. PromptRefine leverages auxiliary example banks from related high-resource Indic languages and employs multi-task learning techniques to align language-specific retrievers, enabling effective cross-language retrieval. Additionally, we incorporate diversity in the selected examples to enhance generalization and reduce bias. Through comprehensive evaluations on four text generation tasks—Cross-Lingual Question Answering, Multilingual Question Answering, Machine Translation, and Cross-Lingual Summarization using state-of-the-art LLMs such as LLAMA-3.1-8B, LLAMA-2-7B, Qwen-2-7B, and Qwen-2.5-7B, we demonstrate that PromptRefine significantly outperforms existing frameworks for retrieving examples. 2025.naacl-long.17 @@ -235,9 +235,9 @@ Unlocking Decoding-time Controllability: Gradient-Free Multi-Objective Alignment with Contrastive Prompts TingchenFu - YupengHouUniversity of California, San Diego - JulianMcAuleyUniversity of California, San Diego, University of California, San Diego - RuiYanRenmin University of China + YupengHouUniversity of California, San Diego + JulianMcAuleyUniversity of California, San Diego, University of California, San Diego + RuiYanRenmin University of China 366-384 The task of multi-objective alignment aims at balancing and controlling the different alignment objectives, e.g., helpfulness, harmlessness and honesty) of large language models to meet the personalized requirements of different users. However, previous methods tend to train multiple models to deal with various user preferences, with the number of trained models growing linearly with the number of alignment objectives and the number of different preferences. Meanwhile, existing methods are generally poor in extensibility and require significant re-training for each new alignment objective considered. Considering the limitation of previous approaches, we propose MCA, which constructs an expert prompt and an adversarial prompt for each objective to contrast at the decoding time and balances the objectives through combining the contrast. Our approach is verified to be superior to previous methods in obtaining a well-distributed Pareto front among different alignment objectives. 
2025.naacl-long.18 @@ -255,10 +255,10 @@ <fixed-case>M</fixed-case>o<fixed-case>DS</fixed-case>: Moderating a Mixture of Document Speakers to Summarize Debatable Queries in Document Collections NishantBalepur AlexaSiuAdobe - NedimLipkaAdobe Systems - FranckDernoncourt + NedimLipkaAdobe Systems + FranckDernoncourt TongSunAdobe Systems - Jordan LeeBoyd-GraberUniversity of Maryland, College Park + Jordan LeeBoyd-GraberUniversity of Maryland, College Park PuneetMathurAdobe Systems 465-491 Query-focused summarization (QFS) gives a summary of documents to answer a query.Past QFS work assumes queries have one answer, ignoring debatable ones (*Is law school worth it?*).We introduce **Debatable QFS (DQFS)**, a task to create summaries that answer debatable queries via documents with opposing perspectives; summaries must *comprehensively cover* all sources and *balance perspectives*, favoring no side.These goals elude LLM QFS systems, which: 1) lack structured content plans, failing to guide LLMs to write balanced summaries, and 2) employ the same query to retrieve contexts across documents, failing to cover all perspectives specific to each document’s content.To overcome this, we design MoDS, a multi-LLM framework mirroring human panel discussions.MoDS treats documents as individual Speaker LLMs and has a Moderator LLM that picks speakers to respond to tailored queries for planned topics.Speakers use tailored queries to retrieve relevant contexts from their documents and supply perspectives, which are tracked in a rich outline, yielding a content plan to guide the final summary.Experiments on ConflictingQA with controversial web queries and DebateQFS, our new dataset of debate queries from Debatepedia, show MoDS beats SOTA by 38-59% in topic paragraph coverage and balance, based on new citation metrics. Users also find MoDS’s summaries to be readable and more balanced. @@ -267,7 +267,7 @@ Aligning Sentence Simplification with <fixed-case>ESL</fixed-case> Learner’s Proficiency for Language Acquisition - GuanlinLi + GuanlinLi YukiAraseTokyo Institute of Technology, Tokyo Institute of Technology and AIST, National Institute of Advanced Industrial Science and Technology NoelCrespiTelecom SudParis 492-507 @@ -290,8 +290,8 @@ YilongXu JinhuaGaoInstitute of Computing Technology, Chinese Academy of Sciences XiaomingYu, Chinese Academy of Sciences - BaolongBi - HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences + BaolongBi + HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences XueqiChengInstitute of Computing Technology, Chinese Academy 545-561 Large Language Model (LLM) can enhance its credibility and verifiability by generating text with citations. However, existing research on citation generation is predominantly limited to sentence-level statements, neglecting the significance of positional fine-grained citations that can appear anywhere within sentences. To facilitate further exploration of the positional fine-grained citation generation, we propose ALiiCE, the first automatic evaluation framework for this task. Our method employs a dependency tree based approach to parse the sentence-level claim into atomic claims. Then ALiiCE evaluates citation quality using three metrics, including positional fine-grained citation recall, precision, and coefficient of variation of citation positions. We evaluate the positional fine-grained citation generation performance of several LLMs on long-form QA datasets. 
Our experiments and analyses demonstrate the effectiveness and reasonableness of ALiiCE. We offer our insights into the current advancements and future directions for the positional fine-grained citation generation task. @@ -334,10 +334,10 @@ SiQinMicrosoft MinghuaMa YuKangMicrosoft - QingweiLinMicrosoft Research + QingweiLinMicrosoft Research SaravanRajmohanMicrosoft - DongmeiZhangMicrosoft - QiZhangMicrosoft + DongmeiZhangMicrosoft + QiZhangMicrosoft 597-622 We introduce UFO, a UI-Fcused agent designed to fulfill user requests tailored to Windows OS applications by observing and analyzing the GUI and control information of these applications. UFO utilizes a hierarchical dual-agent framework that decomposes user requests using a divide-and-conquer approach, enabling seamless navigation and addressing sub-tasks across multiple applications. It also incorporates a control interaction module tailored for Windows OS, which detects control elements effectively and allows for fully automated execution. As a result, UFO simplifies complex and time-consuming processes into tasks that can be completed with natural language commands.We conducted testing of UFO across 9 popular Windows applications, encompassing a variety of scenarios. The results derived from both quantitative metrics and real-case studies, underscore the superior effectiveness of UFOin fulfilling user requests. To the best of our knowledge, UFO stands as the first UI agent specifically tailored for task completion within the Windows OS. 2025.naacl-long.26 @@ -349,7 +349,7 @@ MaharshiGorUniversity of Maryland, College Park EveFleisig IshaniMondalMicrosoft - Jordan LeeBoyd-GraberUniversity of Maryland, College Park + Jordan LeeBoyd-GraberUniversity of Maryland, College Park 623-642 Adversarial datasets should validate AI robustness by providing samples on which humans perform well, but models do not. However, as models evolve, datasets can become obsolete. Measuring whether a dataset remains adversarial is hindered by the lack of a standardized metric for measuring adversarialness. We propose ADVSCORE, a human-grounded evaluation metric that assesses a dataset’s adversarialness by capturing models’ and humans’ varying abilities, while also identifying poor examples. We then use ADVSCORE to motivate a new dataset creation pipeline for realistic and high-quality adversarial samples, enabling us to collect an adversarial question answering (QA) dataset, ADVQA. We apply ADVSCORE using 9,347 human responses and ten language models’ predictions to track model improvement over five years (2020–2024). ADVSCORE thus provides guidance for achieving robustness comparable with human capabilities. Furthermore, it helps determine to what extent adversarial datasets continue to pose challenges, ensuring that, rather than reflecting outdated or overly artificial difficulties, they effectively test model capabilities. 2025.naacl-long.27 @@ -386,8 +386,8 @@ DiFu ChunyuanLiMicrosoft Research Alexander GHauptmannSchool of Computer Science, Carnegie Mellon University - YonatanBiskMeta and Carnegie Mellon University - YimingYangSchool of Computer Science, Carnegie Mellon University + YonatanBiskMeta and Carnegie Mellon University + YimingYangSchool of Computer Science, Carnegie Mellon University 694-717 Preference modeling techniques, such as direct preference optimization (DPO), has shown effective in enhancing the generalization abilities of large language model (LLM). 
However, in tasks involving video instruction-following, providing informative feedback, especially for open-ended conversations, remains a significant challenge. While previous studies have explored using large multimodal models (LMMs) as reward models for guiding preference modeling, their ability to accurately assess the quality of generated responses and their alignment with video content has not been conclusively demonstrated. This paper introduces a novel framework that utilizes detailed video captions as a proxy of video content, enabling language models to incorporate this information as supporting evidence for scoring video Question Answering (QA) predictions. Our approach demonstrates robust alignment with OpenAI GPT-4V model’s reward mechanism, which directly takes video frames as input. Furthermore, we show that applying our reward mechanism to DPO algorithm significantly improves model performance on open-ended video QA tasks. 2025.naacl-long.30 @@ -395,7 +395,7 @@ <fixed-case>F</fixed-case>lexi<fixed-case>GPT</fixed-case>: Pruning and Extending Large Language Models with Low-Rank Weight Sharing - James SealeSmithSamsung + James SealeSmithSamsung Chi-HengLinSamsung Research America ShikharTuliSamsung Research HarisJeelani @@ -415,8 +415,8 @@ JiarongPanEindhoven University of Technology BoXiongStanford University YunjieHeUniversität Stuttgart; Bosch Center for Artificial Intelligence - EvgenyKharlamovRobert Bosch GmbH, Bosch and University of Oslo - SteffenStaabUniversity of Stuttgart and University of Southampton + EvgenyKharlamovRobert Bosch GmbH, Bosch and University of Oslo + SteffenStaabUniversity of Stuttgart and University of Southampton 731-750 Knowledge graph embeddings (KGE) apply machine learning methods on knowledge graphs (KGs) to provide non-classical reasoning capabilities based on similarities and analogies. The learned KG embeddings are typically used to answer queries by ranking all potential answers, but rankings often lack a meaningful probabilistic interpretation - lower-ranked answers do not necessarily have a lower probability of being true. This limitation makes it difficult to quantify uncertainty of model’s predictions, posing challenges for the application of KGE methods in high-stakes domains like medicine. We address this issue by applying the theory of conformal prediction that allows generating answer sets, which contain the correct answer with probabilistic guarantees. We explain how conformal prediction can be used to generate such answer sets for link prediction tasks. Our empirical evaluation on four benchmark datasets using six representative KGE methods validates that the generated answer sets satisfy the probabilistic guarantees given by the theory of conformal prediction. We also demonstrate that the generated answer sets often have a sensible size and that the size adapts well with respect to the difficulty of the query. 
2025.naacl-long.32 @@ -425,8 +425,8 @@ Parameter-free and Accessible Prompt Learning to Enhance Adversarial Robustness for Pre-trained Vision-Language Models XingranZhouAnt Group - KunYang - ChangtaoMiao + KunYang + ChangtaoMiao BingyuHu ZhuoerXuAnt Group ShiwenCuiant group @@ -453,7 +453,7 @@ TerraBlevinsUniversität Vienna AlisaLiuUniversity of Washington LukeZettlemoyerUniversity of Washington, Facebook and Meta - Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence + Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence 785-798 Despite their wide adoption, the biases and unintended behaviors of language models remain poorly understood. In this paper, we identify and characterize a phenomenon never discussed before, which we call semantic leakage, where models leak irrelevant information from the prompt into the generation in unexpected ways. We propose an evaluation setting to detect semantic leakage both by humans and automatically, curate a diverse test suite for diagnosing this behavior, and measure significant semantic leakage in 13 flagship models. We also show that models exhibit semantic leakage in languages besides English and across different settings and generation scenarios. This discovery highlights yet another type of bias in language models that affects their generation patterns and behavior. 2025.naacl-long.35 @@ -464,11 +464,11 @@ RuihanYang JiangjieChenByteDance Inc. YikaiZhang - SiyuYuan + SiyuYuan AiliChen KyleRichardsonAllen Institute for Artificial Intelligence - YanghuaXiaoFudan University - DeqingYangFudan University + YanghuaXiaoFudan University + DeqingYangFudan University 799-819 Language agents powered by large language models (LLMs) are increasingly valuable as decision-making tools in domains such as gaming and programming. However, these agents often face challenges in achieving high-level goals without detailed instructions and in adapting to environments where feedback is delayed. In this paper, we present SELFGOAL, a novel automatic approach designed to enhance agents’ capabilities to achieve high-level goals with limited human prior and environmental feedback. The core concept of SELFGOAL involves adaptively breaking down a high-level goal into a tree structure of more practical subgoals during the interaction with environments while identifying the most useful subgoals and progressively updating this structure. Experimental results demonstrate that SELFGOAL significantly enhances the performance of language agents across various tasks, including competitive, cooperative, and deferred feedback environments. 2025.naacl-long.36 @@ -478,7 +478,7 @@ Familiarity: Better Evaluation of Zero-Shot Named Entity Recognition by Quantifying Label Shifts in Synthetic Training Data JonasGolde PatrickHaller - MaxPloner + MaxPloner FabioBarth NicolaasJedema AlanAkbik @@ -489,7 +489,7 @@ Learning to Summarize from <fixed-case>LLM</fixed-case>-generated Feedback - HwanjunSongKorea Advanced Institute of Science & Technology + HwanjunSongKorea Advanced Institute of Science & Technology TaewonYun YuhoLee JihwanOhKorea Advanced Institute of Science & Technology @@ -519,12 +519,12 @@ HaoLiu HaoyuWangHuawei Technologies Ltd. WeiHeHuawei Noah’s Ark Lab - BinfanZheng + BinfanZheng WeihaoWangHuawei Technologies Ltd. - QiangLiHuawei Technologies Ltd. + QiangLiHuawei Technologies Ltd. 
WeijianSun - YunheWangHuawei Noah’s Ark Lab - DachengTaoNanyang Technological University + YunheWangHuawei Noah’s Ark Lab + DachengTaoNanyang Technological University 876-891 Large language models (LLMs) have achieved remarkable performance on various NLP tasks, yet their potential in more challenging task like finance, has not been fully explored. In this paper, we present CFinBench: a meticulously crafted, the most comprehensive evaluation benchmark to date, for assessing the financial knowledge of LLMs under Chinese context. In practice, to better align with the career trajectory of Chinese financial practitioners, we build a systematic evaluation from 4 first-level categories: (1) Financial Subject: whether LLMs can memorize the necessary basic knowledge of financial subjects, such as economics, statistics and auditing. (2) Financial Qualification: whether LLMs can obtain the needed financial qualified certifications, such as certified public accountant, securities qualification and banking qualification. (3) Financial Practice: whether LLMs can fulfill the practical financial jobs, such as tax consultant, junior accountant and securities analyst. (4) Financial Law: whether LLMs can meet the requirement of financial laws and regulations, such as tax law, insurance law and economic law. CFinBench comprises 99,100 questions spanning 43 second-level categories with 3 question types: single-choice, multiple-choice and judgment. We conduct extensive experiments on a wide spectrum of representative LLMs with various model size on CFinBench. The results show that GPT4 and some Chinese-oriented models lead the benchmark, with the highest average accuracy being 66.02%, highlighting the challenge presented by CFinBench. All the data and evaluation code are open sourced at https://cfinbench.github.io/ 2025.naacl-long.40 @@ -533,7 +533,7 @@ <fixed-case>LLM</fixed-case>-Based Explicit Models of Opponents for Multi-Agent Games XiaoPengYu - WanpengZhangPeking University + WanpengZhangPeking University ZongqingLuPeking University 892-911 In multi-agent scenarios, the ability to anticipate and respond to opponents is essential, particularly in environments involving adversarial and collaborative interactions. In this paper, we introduce Explicit Models of Opponents (EMO) based on Large Language Models (LLMs), enabling agents to better predict and adapt to diverse, dynamic multi-agent interactions. Unlike traditional methods that often simplify multi-agent interactions using a single opponent model, EMO constructs an individual model for each opponent and aligns these models working in synergy through a bi-level feedback-refinement framework. We test EMO alongside several reasoning methods in multi-player deduction games, where agents must infer hidden information about their opponents. The results show that EMO significantly enhances agents’ decision-making, outperforming traditional single-model approaches. Our findings demonstrate that EMO can be a powerful tool for enhancing LLM-based agents in complex multi-agent systems. 
@@ -543,13 +543,13 @@ <fixed-case>S</fixed-case>eq<fixed-case>AR</fixed-case>: Jailbreak <fixed-case>LLM</fixed-case>s with Sequential Auto-Generated Characters YanYang - ZeguanXiao + ZeguanXiao XinLu - HongruWangThe Chinese University of Hong Kong - XuetaoWeiSouthern University of Science and Technology - HailiangHuangShanghai University of Finance and Economics - GuanhuaChenSouthern University of Science and Technology - YunChenShanghai University of Finance and Economics + HongruWangThe Chinese University of Hong Kong + XuetaoWeiSouthern University of Science and Technology + HailiangHuangShanghai University of Finance and Economics + GuanhuaChenSouthern University of Science and Technology + YunChenShanghai University of Finance and Economics 912-931 The widespread applications of large language models (LLMs) have brought about concerns regarding their potential misuse. Although aligned with human preference data before release, LLMs remain vulnerable to various malicious attacks. In this paper, we adopt a red-teaming strategy to enhance LLM safety and introduce SeqAR, a simple yet effective framework to design jailbreak prompts automatically. The SeqAR framework generates and optimizes multiple jailbreak characters and then applies sequential jailbreak characters in a single query to bypass the guardrails of the target LLM. Different from previous work which relies on proprietary LLMs or seed jailbreak templates crafted by human expertise, SeqAR can generate and optimize the jailbreak prompt in a cold-start scenario using open-sourced LLMs without any seed jailbreak templates. Experimental results show that SeqAR achieves attack success rates of 88% and 60% in bypassing the safety alignment of GPT-3.5-1106 and GPT-4, respectively. Furthermore, we extensively evaluate the transferability of the generated templates across different LLMs and held-out malicious requests, while also exploring defense strategies against the jailbreak attack designed by SeqAR. 2025.naacl-long.42 @@ -559,26 +559,26 @@ <fixed-case>JMMMU</fixed-case>: A <fixed-case>J</fixed-case>apanese Massive Multi-discipline Multimodal Understanding Benchmark for Culture-aware Evaluation ShotaOnoharaThe University of Tokyo, Tokyo Institute of Technology AtsuyukiMiyaiThe University of Tokyo - YukiImajuku + YukiImajuku KazukiEgashira - JeonghunBaekThe University of Tokyo + JeonghunBaekThe University of Tokyo XiangYueCarnegie Mellon University GrahamNeubigCarnegie Mellon University - KiyoharuAizawaThe University of Tokyo + KiyoharuAizawaThe University of Tokyo 932-950 2025.naacl-long.43 onohara-etal-2025-jmmmu <fixed-case>EASYTOOL</fixed-case>: Enhancing <fixed-case>LLM</fixed-case>-based Agents with Concise Tool Instruction - SiyuYuan + SiyuYuan KaitaoSongMicrosoft JiangjieChenByteDance Inc. - XuTan + XuTan YongliangShenZhejiang University KanRenShanghaiTech University - DongshengLiMicrosoft Research Asia - DeqingYangFudan University + DongshengLiMicrosoft Research Asia + DeqingYangFudan University 951-972 There has been a rising interest in utilizing tools in applications of autonomous agents based on large language models (LLMs) to address intricate real-world tasks. To develop LLMbased agents, it usually requires LLMs to understand many tool functions from different tool documentations. However, these documentations could be diverse, redundant, or incomplete, which immensely affects the capability of LLMs in using tools. 
Current LLMs exhibit satisfactory instruction-following capabilities based on instruction-following fine-tuning process. Motivated by this, in this paper, we introduce EASYTOOL, a framework transforming diverse and lengthy tool documentation into a unified and concise tool instruction to fully leverage instruction-following capabilities of LLMs for easier tool usage. EASYTOOL purifies essential information from extensive tool documentation of different sources, and elaborates a unified interface (i.e., tool instruction) to offer standardized tool descriptions and functionalities for LLM-based agents. Extensive experiments on multiple different tasks demonstrate that EASYTOOL can significantly reduce token consumption and improve the performance of LLM-based agents on tool utilization in real-world scenarios. Our code is available in supplemental materials. Our code is available at https://github.com/microsoft/JARVIS/tree/main/easytool. 2025.naacl-long.44 @@ -586,8 +586,8 @@ Decoding Hate: Exploring Language Models’ Reactions to Hate Speech - PalomaPiotUniversidade da Coruña - JavierParaparUniversidad de La Coruña + PalomaPiotUniversidade da Coruña + JavierParaparUniversidad de La Coruña 973-990 Hate speech is a harmful form of online expression, often manifesting as derogatory posts. It is a significant risk in digital environments. With the rise of Large Language Models (LLMs), there is concern about their potential to replicate hate speech patterns, given their training on vast amounts of unmoderated internet data. Understanding how LLMs respond to hate speech is crucial for their responsible deployment. However, the behaviour of LLMs towards hate speech has been limited compared. This paper investigates the reactions of seven state-of-the-art LLMs (LLaMA 2, Vicuna, LLaMA 3, Mistral, GPT-3.5, GPT-4, and Gemini Pro) to hate speech. Through qualitative analysis, we aim to reveal the spectrum of responses these models produce, highlighting their capacity to handle hate speech inputs. We also discuss strategies to mitigate hate speech generation by LLMs, particularly through fine-tuning and guideline guardrailing. Finally, we explore the models’ responses to hate speech framed in politically correct language. 2025.naacl-long.45 @@ -595,9 +595,9 @@ Babysit A Language Model From Scratch: Interactive Language Learning by Trials and Demonstrations - ZiqiaoMa + ZiqiaoMa ZekunWangGeorgia Institute of Technology - JoyceChaiUniversity of Michigan + JoyceChaiUniversity of Michigan 991-1010 Humans are efficient language learners and inherently social creatures. Our language development is largely shaped by our social interactions, for example, the demonstration and feedback from caregivers. Contrary to human language learning, recent advancements in large language models have primarily adopted a non-interactive training paradigm, and refined pre-trained models through feedback afterward. In this work, we explore how corrective feedback from interactions influences neural language acquisition from scratch through systematically controlled experiments, assessing whether it contributes to word learning efficiency in language models. We introduce a trial-and-demonstration (TnD) learning framework that incorporates three distinct components: student trials, teacher demonstrations, and a reward conditioned on language competence at various developmental stages. 
Our experiments reveal that the TnD approach accelerates word acquisition for student models of equal and smaller numbers of parameters, and we highlight the significance of both trials and demonstrations. We further show that the teacher’s choices of words influence students’ word-specific learning efficiency, and a practice-makes-perfect effect is evident by a strong correlation between the frequency of words in trials and their respective learning curves. Our findings suggest that interactive language learning, with teacher demonstrations and active trials, can facilitate efficient word learning in language models. 2025.naacl-long.46 @@ -605,7 +605,7 @@ <fixed-case>M</fixed-case>o<fixed-case>CE</fixed-case>: Adaptive Mixture of Contextualization Experts for Byte-based Neural Machine Translation - LanglinHuangWashington University, Saint Louis + LanglinHuangWashington University, Saint Louis MengyuBu YangFengInstitute of Computing Technology, Chinese Academy of Sciences 1011-1028 @@ -615,7 +615,7 @@ <fixed-case>LLM</fixed-case>-Human Pipeline for Cultural Grounding of Conversations - RajkumarPujariPurdue University + RajkumarPujariPurdue University DanGoldwasserPurdue University and Purdue University 1029-1048 Conversations often adhere to well-understood social norms that vary across cultures. For example, while addressing parents by name is commonplace in the West, it is rare in most Asian cultures. Adherence or violation of such norms often dictates the tenor of conversations. Humans are able to navigate social situations requiring cultural awareness quite adeptly. However, it is a hard task for NLP models.In this paper, we tackle this problem by introducing a Cultural Context Schema for conversations. It comprises (1) conversational information such as emotions, dialogue acts, etc., and (2) cultural information such as social norms, violations, etc. We generate ~110k social norm and violation descriptions for ~23k conversations from Chinese culture using LLMs. We refine them using automated verification strategies which are evaluated against culturally aware human judgements. We organize these descriptions into meaningful structures we call Norm Concepts, using an interactive human-in-loop framework. We ground the norm concepts and the descriptions in conversations using symbolic annotation. Finally, we use the obtained dataset for downstream tasks such as emotion, sentiment, and dialogue act detection. We show that it significantly improves the empirical performance. @@ -625,13 +625,13 @@ <fixed-case>ACCESS</fixed-case> : A Benchmark for Abstract Causal Event Discovery and Reasoning VyVoMonash University - LizhenQuMonash University - TaoFeng - YunchengHua - XiaoxiKang + LizhenQuMonash University + TaoFeng + YunchengHua + XiaoxiKang SonghaiFan - TimDwyerMonash University - Lay-KiSoonMonash University + TimDwyerMonash University + Lay-KiSoonMonash University GholamrezaHaffariMonash University, Monash University and Monash University 1049-1074 2025.naacl-long.49 @@ -639,8 +639,8 @@ Unmasking Implicit Bias: Evaluating Persona-Prompted <fixed-case>LLM</fixed-case> Responses in Power-Disparate Social Scenarios - Bryan Chen ZhengyuTan - Roy Ka-WeiLeeSingapore University of Technology and Design + Bryan Chen ZhengyuTan + Roy Ka-WeiLeeSingapore University of Technology and Design 1075-1108 Large language models (LLMs) have demonstrated remarkable capabilities in simulating human behaviour and social intelligence. 
However, they risk perpetuating societal biases, especially when demographic information is involved. We introduce a novel framework using cosine distance to measure semantic shifts in responses and an LLM-judged Preference Win Rate (WR) to assess how demographic prompts affect response quality across power-disparate social scenarios. Evaluating five LLMs over 100 diverse social scenarios and nine demographic axes, our findings suggest a “default persona” bias toward middle-aged, able-bodied, native-born, Caucasian, atheistic males with centrist views. Moreover, interactions involving specific demographics are associated with lower-quality responses. Lastly, the presence of power disparities increases variability in response semantics and quality across demographic groups, suggesting that implicit biases may be heightened under power-imbalanced conditions. These insights expose the demographic biases inherent in LLMs and offer potential paths toward future bias mitigation efforts in LLMs. 2025.naacl-long.50 @@ -648,7 +648,7 @@ <fixed-case>G</fixed-case>lo<fixed-case>COM</fixed-case>: A Short Text Neural Topic Model via Global Clustering Context - Quang DucNguyenNanyang Technological University + Quang DucNguyenNanyang Technological University TungNguyen Duc AnhNguyenHanoi University of Science and Technology Linh NgoVanHanoi University of Science and Technology @@ -662,7 +662,7 @@ Reversed Attention: On The Gradient Descent Of Attention Layers In <fixed-case>GPT</fixed-case> ShaharKatzComputer Science Departmen, Technion-Israel Institute of Technology - LiorWolfTel Aviv University, Tel Aviv University and Tel Aviv University + LiorWolfTel Aviv University, Tel Aviv University and Tel Aviv University 1125-1152 The success of Transformer-based Language Models (LMs) stems from their attention mechanism. While this mechanism has been extensively studied in explainability research, particularly through the attention values obtained during the forward pass of LMs, the backward pass of attention has been largely overlooked.In this work, we study the mathematics of the backward pass of attention, revealing that it implicitly calculates an attention matrix we refer to as “Reversed Attention”.We visualized Reversed Attention and examine its properties, demonstrating its ability to elucidate the models’ behavior and edit dynamics.In an experimental setup, we showcase the ability of Reversed Attention to directly alter the forward pass of attention, without modifying the model’s weights, using a novel method called “attention patching”.In addition to enhancing the comprehension of how LMs configure attention layers during backpropagation, Reversed Attention maps contribute to a more interpretable backward pass. 2025.naacl-long.52 @@ -671,7 +671,7 @@ Self-Harmonized Chain of Thought ZiqiJin - WeiLuSingapore University of Technology and Design + WeiLuSingapore University of Technology and Design 1153-1174 Chain-of-thought (CoT) prompting has demonstrated the capacity of large language models to perform complex reasoning through intermediate steps. While effective, current CoT methods face challenges: Zero-shot-CoT can lead to reasoning errors, and Few-shot-CoT requires labor-intensive manual demonstrations. Auto-CoT attempts to address these issues by automatically generating diverse demonstrations, but this diversity can lead to inconsistent reasoning patterns. 
We propose ECHO (Self-Harmonized Chain of Thought), a novel method that unifies diverse solution paths into a consistent and effective reasoning pattern. ECHO employs an iterative process to refine and harmonize automatically generated demonstrations, mitigating the limitations of existing approaches. Our comprehensive experiments across arithmetic, commonsense, and symbolic reasoning tasks demonstrate that ECHO outperforms Auto-CoT by an average of 2.8%. These findings suggest that ECHO represents a significant step towards more robust and generalizable automated reasoning in large language models. 2025.naacl-long.53 @@ -679,9 +679,9 @@ <fixed-case>A</fixed-case>na<fixed-case>S</fixed-case>core: Understanding Semantic Parallelism in Proportional Analogies - LiyanWangWaseda University - HaotongWang - YvesLepageWaseda University + LiyanWangWaseda University + HaotongWang + YvesLepageWaseda University 1175-1188 Formulaic criteria for proportional analogies, which capture relational mappings between two ratios of terms, are mainly confined to the formal level. As analogy datasets grow more complex, especially in evaluating the cognitive abilities of Large Language Models (LLMs), assessing parallelism in them becomes increasingly challenging and often requires human annotation. In this work, we propose AnaScore, an automatic metric for evaluating the strength of semantic parallelism in sentence analogies. AnaScore systematically provides formalized explanations for shared relational patterns at the level of conceptual knowledge. We apply AnaScore to annotate several existing datasets, considering different directions of the relations, and uncover artifacts in data construction. Our experiments with various LLMs demonstrate the efficacy of the AnaScore metric in capturing the inherent quality of analogical relationships, showing a positive correlation between analogy quality and model performance. Thanks to this metric, we clearly demonstrate that formally explainable examples are more beneficial for analogical reasoning, while ambiguous analogies with no clear criterion tend to hinder inference. 2025.naacl-long.54 @@ -689,8 +689,8 @@ Generating Complex Question Decompositions in the Face of Distribution Shifts - KelvinHanLORIA/CNRS, Université of Lorraine - ClaireGardentCNRS + KelvinHanLORIA/CNRS, Université of Lorraine + ClaireGardentCNRS 1189-1211 Question decomposition has been found to help large language models’ (LLMs) performance on complex question answering (QA) by breaking these questions into simpler sub-questions for answering. Nonetheless, performance on the task remains dominated by supervised approaches, suggesting room for making LLMs better decomposers. One way of improving LLM training and fine-tuning is to leverage synthetic training data, but the superior performance of supervised approaches collapses in the face of distribution shifts, making them unsuitable for generating synthetic data across new domains and at scale. To address this, we propose an approach to generate synthetic decomposition data with only five annotated examples; we do this by (i) extending recent advancements in using LLM-as-judge and for reranking in novel ways, as well as (ii) using a panel of smaller-sized LLMs for data generation instead of resource-intensive larger models. Through careful validation of our approach over two benchmark datasets, we show that our data generation and modelling approaches bring consistent improvements over using few-shot prompting with LLMs for the task. 
Our code and models can be found at https://github.com/hankelvin/complex_question_decomposition.
 2025.naacl-long.55

@@ -698,13 +698,13 @@

 Diversify-verify-adapt: Efficient and Robust Retrieval-Augmented Ambiguous Question Answering
- YeonjunIn
- SungchulKimAdobe Systems
- Ryan A.RossiAdobe Research
+ YeonjunIn
+ SungchulKimAdobe Systems
+ Ryan A.RossiAdobe Research
 MehrabTanjimAdobe Research
- TongYuAdobe Research
+ TongYuAdobe Research
 RitwikSinha
- ChanyoungParkKorea Advanced Institute of Science and Technology
+ ChanyoungParkKorea Advanced Institute of Science and Technology
 1212-1233
 The retrieval augmented generation (RAG) framework addresses an ambiguity in user queries in QA systems by retrieving passages that cover all plausible interpretations and generating comprehensive responses based on the passages. However, our preliminary studies reveal that a single retrieval process often suffers from low-quality results, as the retrieved passages frequently fail to capture all plausible interpretations. Although the iterative RAG approach has been proposed to address this problem, it comes at the cost of significantly reduced efficiency. To address these issues, we propose the diversify-verify-adapt (DIVA) framework. DIVA first diversifies the retrieved passages to encompass diverse interpretations. Subsequently, DIVA verifies the quality of the passages and adapts the most suitable approach tailored to their quality. This approach improves the QA systems’ accuracy and robustness by handling the low-quality retrieval issue in ambiguous questions, while enhancing efficiency.
 2025.naacl-long.56
@@ -713,9 +713,9 @@

 Unifying <fixed-case>AI</fixed-case> Tutor Evaluation: An Evaluation Taxonomy for Pedagogical Ability Assessment of <fixed-case>LLM</fixed-case>-Powered <fixed-case>AI</fixed-case> Tutors
 Kaushal KumarMaurya
- Kv AdityaSrivatsaMohamed bin Zayed University of Artificial Intelligence
+ Kv AdityaSrivatsaMohamed bin Zayed University of Artificial Intelligence
 KseniiaPetukhova
- EkaterinaKochmarMohamed bin Zayed University of Artificial Intelligence
+ EkaterinaKochmarMohamed bin Zayed University of Artificial Intelligence
 1234-1251
 In this paper, we investigate whether current state-of-the-art large language models (LLMs) are effective as AI tutors and whether they demonstrate pedagogical abilities necessary for good AI tutoring in educational dialogues. Previous efforts towards evaluation have been limited to subjective protocols and benchmarks. To bridge this gap, we propose a unified evaluation taxonomy with eight pedagogical dimensions based on key learning sciences principles, which is designed to assess the pedagogical value of LLM-powered AI tutor responses grounded in student mistakes or confusions in the mathematical domain. We release MRBench – a new evaluation benchmark containing 192 conversations and 1,596 responses from seven state-of-the-art LLM-based and human tutors, providing gold annotations for eight pedagogical dimensions. We assess the reliability of the popular Prometheus2 and Llama-3.1-8B LLMs as evaluators and analyze each tutor’s pedagogical abilities, highlighting which LLMs are good tutors and which ones are more suitable as question-answering systems. We believe that the presented taxonomy, benchmark, and human-annotated labels will streamline the evaluation process and help track the progress in AI tutors’ development.
2025.naacl-long.57

@@ -726,7 +726,7 @@
 KuniakiSaitoOMRON SINICX
 Chen-YuLeeGoogle
 KihyukSohnFacebook
- YoshitakaUshikuOMRON SINIC X, NexaScience, RIKEN, Ridge-i and National Institute of Advanced Industrial Science and Technology
+ YoshitakaUshikuOMRON SINIC X, NexaScience, RIKEN, Ridge-i and National Institute of Advanced Industrial Science and Technology
 1252-1269
 Language models (LMs) store diverse factual knowledge in their parameters, which is learned during self-supervised training on unlabeled documents and is made extractable by instruction-tuning. For knowledge-intensive tasks, it is essential to memorize information in a way that makes it extractable from an LM’s parameters with diverse queries. However, LMs suffer from a phenomenon called “perplexity curse”; despite minimizing document perplexity during training, LMs struggle to extract information via a question prompt. In this paper, we study the problem by fine-tuning LMs for new data and find a very intriguing fact that all studied LMs suffer from positional bias in the training document, i.e., they struggle to answer questions about the information described in the middle or at the end of the training document. Our study indicates that this problem stems from the auto-regressive training, i.e., predicting the next token given all previous tokens; thus, adding regularization mitigates the issue. Our discoveries, supported by extensive analysis, will be an important key to extracting knowledge from the parameters of LMs. We will publish our code and dataset upon acceptance.
 2025.naacl-long.58
@@ -750,11 +750,11 @@

 Balancing Forget Quality and Model Utility: A Reverse <fixed-case>KL</fixed-case>-Divergence Knowledge Distillation Approach for Better Unlearning in <fixed-case>LLM</fixed-case>s
- BichenWang
+ BichenWang
 YuzheZi
 YixinSun
 YanyanZhaoHarbin Institute of Technology
- BingQinHarbin Institute of Technology
+ BingQinHarbin Institute of Technology
 1306-1321
 As concern for privacy rights has grown and the size of language model training datasets has expanded, research into machine unlearning for large language models (LLMs) has become crucial. Before the era of LLMs, research on machine unlearning mainly focused on classification tasks in small parameter models. However, as parameter sizes have grown and unlearning targets have become more complex, unlearning has become more challenging, especially in scenarios involving generation instead of classification, as the output space of such models is significantly larger and more diverse. Existing methods based on gradient ascent and its variants often struggle with balancing forget quality and model utility, leading to either over unlearning or partial unlearning. To address this challenge, we propose Reverse KL-Divergence based Knowledge Distillation for Unlearning (RKLU), a novel unlearning method for LLMs. RKLU focuses on precisely unlearning the components of the token distribution related to the unlearning target, allowing us to achieve significant forget quality while maintaining model utility in our experiments.
 2025.naacl-long.60

 <fixed-case>A</fixed-case>gent<fixed-case>M</fixed-case>ove: A Large Language Model based Agentic Framework for Zero-shot Next Location Prediction
- JieFengTsinghua University, Tsinghua University
+ JieFengTsinghua University, Tsinghua University
 YuweiDu
- JieZhao
+ JieZhao
 YongLi
 1322-1338
 Next location prediction plays a crucial role in various real-world applications.
Recently, due to the limitation of existing deep learning methods, attempts have been made to apply large language models (LLMs) to the zero-shot next location prediction task. However, they directly generate the final output using LLMs without systematic design, which limits the potential of LLMs to uncover complex mobility patterns and underestimates their extensive reserve of global geospatial knowledge. In this paper, we introduce AgentMove, a systematic agentic prediction framework to achieve generalized next location prediction. In AgentMove, we first decompose the mobility prediction task and design specific modules to complete its subtasks, including a spatial-temporal memory for individual mobility pattern mining, a world knowledge generator for modeling the effects of urban structure, and a collective knowledge extractor for capturing the shared patterns among the population. Finally, we combine the results of the three modules and conduct a reasoning step to generate the final predictions. Extensive experiments utilizing mobility data from two distinct sources reveal that AgentMove surpasses the leading baseline by 3.33% to 8.57% across 8 out of 12 metrics, and it shows robust predictions with various LLMs as the base and also less geographical bias across cities. Our codes are available via https://github.com/tsinghua-fib-lab/AgentMove.
 2025.naacl-long.62

@@ -782,12 +782,12 @@

 Generating Long-form Story Using Dynamic Hierarchical Outlining with Memory-Enhancement
 QianyueWang
- JinwuHu
+ JinwuHu
 ZhengpingLi
- YufengWang
- DaiyuanLi
- YuHuGuangdong University of Technology
- MingkuiTan
+ YufengWang
+ DaiyuanLi
+ YuHuGuangdong University of Technology
+ MingkuiTan
 1352-1391
 The long-form story generation task aims to produce coherent and sufficiently lengthy text, essential for applications such as novel writing and interactive storytelling. However, existing methods, including LLMs, rely on rigid outlines or lack macro-level planning, making it difficult to achieve both contextual consistency and coherent plot development in long-form story generation. To address these issues, we propose a Dynamic Hierarchical Outlining with Memory-Enhancement long-form story generation method, named DOME, to generate long-form stories with coherent content and plot. Specifically, the Dynamic Hierarchical Outline (DHO) mechanism incorporates the novel writing theory into outline planning and fuses the plan and writing stages together, improving the coherence of the plot by ensuring the plot completeness and adapting to the uncertainty during story generation. A Memory-Enhancement Module (MEM) based on temporal knowledge graphs is introduced to store and access the generated content, reducing contextual conflicts and improving story coherence. Finally, we propose a Temporal Conflict Analyzer leveraging temporal knowledge graphs to automatically evaluate the contextual consistency of long-form stories. Experiments demonstrate that DOME significantly improves the fluency, coherence, and overall quality of generated long stories compared to state-of-the-art methods.
2025.naacl-long.63 @@ -795,13 +795,13 @@ Little Giants: Synthesizing High-Quality Embedding Data at Scale - HaonanChen - LiangWangMicrosoft Research + HaonanChen + LiangWangMicrosoft Research NanYangMicrosoft Research Asia - YutaoZhu - ZiliangZhao + YutaoZhu + ZiliangZhao FuruWeiMicrosoft Research - ZhichengDouRenmin University of China + ZhichengDouRenmin University of China 1392-1411 Synthetic data generation has become an increasingly popular way of training models without the need for large, manually labeled datasets. For tasks like text embedding, synthetic data offers diverse and scalable training examples, significantly reducing the cost of human annotation. However, most current approaches rely heavily on proprietary models like GPT-4, which are expensive and inefficient for generating large-scale embedding data. In this paper, we introduce SPEED, a framework that aligns open-source small models (8B) to efficiently generate large-scale synthetic embedding data. Through supervised fine-tuning, preference optimization, and self-improvement, SPEED enables small open-source models to produce high-quality data. Remarkably, SPEED uses only less than 1/10 of the GPT API calls, outperforming the state-of-the-art embedding model E5_mistral when both are trained solely on their synthetic data. Using this efficient generator, we conduct a comprehensive study on how various factors within the alignment pipeline impact data quality and reveal the scaling law for synthetic embedding data. Our codes and models are released in https://github.com/haon-chen/SPEED. 2025.naacl-long.64 @@ -809,11 +809,11 @@ Can <fixed-case>LLM</fixed-case>s Convert Graphs to Text-Attributed Graphs? - ZehongWang + ZehongWang SidneyLiu - ZheyuanZhang - TianyiMaUniversity of Notre Dame - ChuxuZhangUniversity of Connecticut + ZheyuanZhang + TianyiMaUniversity of Notre Dame + ChuxuZhangUniversity of Connecticut YanfangYeUniversity of Notre Dame 1412-1432 Graphs are ubiquitous structures found in numerous real-world applications, such as drug discovery, recommender systems, and social network analysis. To model graph-structured data, graph neural networks (GNNs) have become a popular tool. However, existing GNN architectures encounter challenges in cross-graph learning where multiple graphs have different feature spaces. To address this, recent approaches introduce text-attributed graphs (TAGs), where each node is associated with a textual description, which can be projected into a unified feature space using textual encoders. While promising, this method relies heavily on the availability of text-attributed graph data, which is difficult to obtain in practice. To bridge this gap, we propose a novel method named Topology-Aware Node description Synthesis (TANS), leveraging large language models (LLMs) to convert existing graphs into text-attributed graphs. The key idea is to integrate topological information into LLMs to explain how graph topology influences node semantics. We evaluate our TANS on text-rich, text-limited, and text-free graphs, demonstrating its applicability. Notably, on text-free graphs, our method significantly outperforms existing approaches that manually design node features, showcasing the potential of LLMs for preprocessing graph-structured data in the absence of textual information. The code and data are available at https://github.com/Zehong-Wang/TANS. 
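A minimal sketch of the core idea in the TANS abstract above — feeding a node's local topology to an LLM as text so it can synthesize a node description. The function name and prompt wording here are illustrative assumptions, not code from TANS or from this patch:

def topology_prompt(node_id, neighbors, neighbor_labels=None):
    # Build a topology-aware text prompt for one node; an LLM would then
    # be asked to generate a textual attribute for the node from it.
    prompt = (
        f"Node {node_id} has degree {len(neighbors)} and is connected to "
        f"nodes {sorted(neighbors)}."
    )
    if neighbor_labels:
        prompt += f" Its neighbors have labels {sorted(set(neighbor_labels))}."
    return prompt + " Describe the likely role of this node in the graph."

print(topology_prompt(7, [1, 3, 12], ["citation", "citation"]))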
@@ -825,8 +825,8 @@ HaoranLiao ShaohuaHuShanghai Jiaotong University ZhihaoZhu - HaoHeShanghai Jiao Tong University - YaohuiJinShanghai Jiaotong University + HaoHeShanghai Jiao Tong University + YaohuiJinShanghai Jiaotong University 1433-1453 Chain-of-thought (CoT) and subsequent methods adopted a deductive paradigm that decomposes the reasoning process, demonstrating remarkable performances across NLP tasks. However, such a paradigm faces the challenge of getting bogged down in low-level semantic details, hindering large language models (LLMs) from correctly understanding, selecting, and compositing conditions. In this work, we present Overarching Prompting (OaP), a simple prompting method that elicits the high-level thinking of LLMs. Specifically, OaP first abstracts the whole problem into a simplified archetype and formulates strategies grounded in concepts and principles, establishing an overarching perspective for guiding reasoning. We conducted experiments with SoTA models, including ChatGPT, InstructGPT, and Llama3-70B-instruct, and received promising performances across tasks including Knowledge QA, Mathematical, and Open-Domain Reasoning. For instance, OaP improved ChatGPT and CoT by 19.0% and 3.1% on MMLU’s College Physics, 8.8% and 2.3% on GSM8k, and 10.3% and 2.5% on StrategyQA, respectively. 2025.naacl-long.66 @@ -834,14 +834,14 @@ On the Role of Speech Data in Reducing Toxicity Detection Bias - SamuelBellFacebook + SamuelBellFacebook Mariano CoriaMeglioliMeta MeganRichardsNew York University - EduardoSánchezUniversity College London, University of London and Meta + EduardoSánchezUniversity College London, University of London and Meta ChristopheRopersMeta and Syntexys Inc SkylerWang - AdinaWilliamsFAIR (Meta Platforms Inc.) - LeventSagunMeta + AdinaWilliamsFAIR (Meta Platforms Inc.) + LeventSagunMeta Marta R.Costa-jussàMeta 1454-1468 Text toxicity detection systems exhibit significant biases, producing disproportionate rates of false positives on samples mentioning demographic groups. But what about toxicity detection in speech? To investigate the extent to which text-based biases are mitigated by speech-based systems, we produce a set of high-quality group annotations for the multilingual MuTOX dataset, and then leverage these annotations to systematically compare speech- and text-based toxicity classifiers. Our findings indicate that access to speech data during inference supports reduced bias against group mentions, particularly for ambiguous and disagreement-inducing samples. Our results also suggest that improving classifiers, rather than transcription pipelines, is more helpful for reducing group bias. We publicly release our annotations and provide recommendations for future toxicity dataset construction. @@ -850,11 +850,11 @@ <fixed-case>ITALIC</fixed-case>: An <fixed-case>I</fixed-case>talian Culture-Aware Natural Language Benchmark - AndreaSeveso + AndreaSeveso DanielePotertì EdoardoFedericiUniversity of Milan - MarioMezzanzanica - FabioMercorio + MarioMezzanzanica + FabioMercorio 1469-1478 We present ITALIC, a large-scale benchmark dataset of 10,000 multiple-choice questions designed to evaluate the natural language understanding of the Italian language and culture. ITALIC spans 12 domains, exploiting public tests to score domain experts in real-world scenarios. We detail our data collection process, stratification techniques, and selection strategies. 
ITALIC provides a comprehensive assessment suite that captures commonsense reasoning and linguistic proficiency in a morphologically rich language. We establish baseline performances using 17 state-of-the-art LLMs, revealing current limitations in Italian language understanding and highlighting significant linguistic complexity and cultural specificity challenges. ITALIC serves as a benchmark for evaluating existing models and as a roadmap for future research, encouraging the development of more sophisticated and culturally aware natural language systems.
 2025.naacl-long.68

@@ -863,9 +863,9 @@

 <fixed-case>RAP</fixed-case>: A Metric for Balancing Repetition and Performance in Open-Source Large Language Models
 DonghaoHuang
- Thanh-SonNguyenAgency for Science, Technology and Research
+ Thanh-SonNguyenAgency for Science, Technology and Research
 FionaLiausviaInstitute of High Performance Computing, Singapore, A*STAR
- ZhaoxiaWangSingapore Management University
+ ZhaoxiaWangSingapore Management University
 1479-1496
 Large Language Models (LLMs) have significantly advanced natural language processing, but content repetition in open-source LLMs remains a critical challenge that adversely affects user experience. The repetition penalty parameter (RPP) aims to mitigate this issue by preventing repeated content generation, but excessive use of RPP can compromise the overall quality. In this paper, we propose Repetition-Aware Performance (RAP), a novel evaluation metric that quantifies and integrates repetition penalty into the assessment of model performance, enabling tuning of RPP. We evaluate our approach using twelve open-source LLMs, ranging from 2 billion to 70 billion parameters, tested on question answering and machine translation tasks across three datasets with varying prompting techniques. Experimental results show that RAP effectively tunes RPP, helping to identify a trade-off value that significantly reduces repetition while minimizing performance loss. Upon acceptance, we will release the code and the dataset of generated text, providing a valuable resource for further research on repetition detection and LLM evaluation.
 2025.naacl-long.69
@@ -873,11 +873,11 @@

 Improving Data Annotation for Low-Resource Relation Extraction with Logical Rule-Augmented Collaborative Language Models
- XiyangLiuBeihang University
+ XiyangLiuBeihang University
 ChunmingHuBeijing University of Aeronautics and Astronautics
- RichongZhang
- JunfanChenBeihang University
- BaowenXuAvic Digital Corporation
+ RichongZhang
+ JunfanChenBeihang University
+ BaowenXuAvic Digital Corporation
 1497-1510
 Low-resource relation extraction aims to identify semantic relationships between entities using scarce labeled data. Recent studies exploit large language models to recognize relations based on retrieved exemplars, yielding promising results. However, the reliability of predictions from these methods is constrained by the presence of irrelevant context within demonstrations and the inherent flaws of large language models in producing undesired outputs. Inspired by the precision and generalization of abstract logic, in this paper, we propose distilling logical rules to uniformly represent task knowledge sourced from distinct origins and facilitate deductive reasoning.
We develop a collaborative annotating framework that iteratively integrates high-confidence predictions of rule-enhanced relation extractors with varying scales, efficiently obtaining reliable pseudo annotations from massive unlabeled samples without human supervision. Experiments under two inference settings show that our approach achieves new state-of-the-art performance on benchmark datasets in few-shot scenarios. 2025.naacl-long.70 @@ -888,7 +888,7 @@ YaraShamshoumComputer Science Department, Technion - Israel Institute of Technology NitzanHodos YuvalSieradzkiComputer Science Department, Technion - Israel Institute of Technology - AssafSchusterTechnion - Israel Institute of Technology, Technion + AssafSchusterTechnion - Israel Institute of Technology, Technion 1511-1524 We introduce CompAct, a technique that reduces peak memory utilization on GPU by 25-30% for pretraining and 50% for fine-tuning of LLMs. Peak device memory is a major limiting factor in training LLMs, with various recent works aiming to reduce model memory. However most works don’t target the largest component of allocated memory during training: the model’s compute graph, which is stored for the backward pass. By storing low-rank, compressed activations to be used in the backward pass we greatly reduce the required memory, unlike previous methods which only reduce optimizer overheads or the number of trained parameters. Our compression uses random projection matrices, thus avoiding additional memory overheads. Comparisons with previous techniques for either pretraining or fine-tuning show that CompAct substantially improves existing compute-performance tradeoffs. We expect CompAct’s savings to scale even higher for larger models. 2025.naacl-long.71 @@ -899,10 +899,10 @@ PengHunanjing university SizheLiunanjing university ChangjiangGao - XinHuangChina Mobile Communications Company Limited Research Institute + XinHuangChina Mobile Communications Company Limited Research Institute XueHan - JunlanFeng - ChaoDengChina Mobile Research Institute + JunlanFeng + ChaoDengChina Mobile Research Institute ShujianHuangNanjing University 1525-1542 Large Language Models have demonstrated impressive reasoning capabilities across multiple languages. However, the relationship between capabilities in different languages is less explored. In this work, we decompose the process of reasoning tasks into two separated components: knowledge retrieval and knowledge-free reasoning, and analyze the relationship between cross-lingual transferability and these two components. With adapted commonsense reasoning datasets and constructed knowledge-free reasoning datasets, we show that the knowledge-free reasoning capability can be nearly perfectly transferred across various source-target language directions despite the secondary impact of resource in some specific target languages, while cross-lingual knowledge retrieval significantly hinders the transfer. Moreover, by analyzing the hidden states and feed-forward network neuron activation during the reasoning, we show that higher similarity of hidden representations and larger overlap of activated neurons could explain the better cross-lingual transferability of knowledge-free reasoning than knowledge retrieval. Thus, we hypothesize that knowledge-free reasoning shares similar neurons in different languages for reasoning, while knowledge is stored separately in different languages. @@ -911,7 +911,7 @@ What Did <fixed-case>I</fixed-case> Do Wrong? 
Quantifying <fixed-case>LLM</fixed-case>s’ Sensitivity and Consistency to Prompt Engineering - FedericoErricaNEC + FedericoErricaNEC DavideSanvito GiuseppeSiracusanoNEC RobertoBifulcoNEC @@ -927,7 +927,7 @@ XiaohangSun DhruvaPatilAmazon AvijitVajpayeeAmazon - ZhuLiuAmazon Prime Video + ZhuLiuAmazon Prime Video VimalBhatAmazon NajmehSadoughiAmazon 1559-1570 @@ -938,13 +938,13 @@ Mitigating Hallucinations in Multi-modal Large Language Models via Image Token Attention-Guided Decoding XinhaoXu - HuiChenTsinghua University, Tsinghua University - MengyaoLyuTsinghua University - SichengZhaoTsinghua University - YizheXiongSchool of Software, Tsinghua University - ZijiaLinKuaishou Technology - JungongHanThe University of Sheffield and University of Sheffield - GuiguangDingTsinghua University + HuiChenTsinghua University, Tsinghua University + MengyaoLyuTsinghua University + SichengZhaoTsinghua University + YizheXiongSchool of Software, Tsinghua University + ZijiaLinKuaishou Technology + JungongHanThe University of Sheffield and University of Sheffield + GuiguangDingTsinghua University 1571-1590 Multi-modal large language models (MLLMs) integrate the inherent text generation capabilities of large language models with an understanding of other modalities, promising wide applications in open-ended tasks. Despite their success, they often generate plausible but incorrect content. This phenomenon, known as hallucination, significantly impacts their practical deployment. In this paper, we delve into the intrinsic characteristics of hallucination from the perspective of interaction between input and output tokens. We find that the hallucination typically occurs with attention reduction of output tokens to image tokens. Based on this observation, we introduce image Token attention-guided Decoding (iTaD), a plug-and-play method which leverages MLLMs’ internal representations to mitigate their hallucinations. We first define an image token attention vector to measure the inter-layer differences in attention of output tokens to image tokens across different layers. Based on the vector, we design a novel layer selection strategy and conduct inter-layer contrastive decoding to highlight the progression in image understanding, thereby exploiting attention to image tokens to mitigate hallucinations. Extensive experiments well demonstrate iTaD’s effectiveness across different MLLMs and benchmarks. 2025.naacl-long.75 @@ -964,7 +964,7 @@ NadavBorenstein ArnavAroraUniversity of Copenhagen Lucie-AiméeKaffeeHugging Face - IsabelleAugensteinUniversity of Copenhagen + IsabelleAugensteinUniversity of Copenhagen 1607-1627 Studying human values is instrumental for cross-cultural research, enabling a better understanding of preferences and behaviour of society at large and communities therein. To study the dynamics of communities online, we propose a method to computationally analyse values present on Reddit. Our method allows analysis at scale, complementing survey based approaches. We train a value relevance and a value polarity classifier, which we thoroughly evaluate using in-domain and out-of-domain human annotations. Using these, we automatically annotate over nine million posts across 12k subreddits with Schwartz values. Our analysis unveils both previously recorded and novel insights into the values prevalent within various online communities. For instance, we discover a very negative stance towards conformity in the Vegan and AbolishTheMonarchy subreddits. 
Additionally, our study of geographically specific subreddits highlights the correlation between traditional values and conservative U.S. states. Through our work, we demonstrate how our dataset and method can be used as a complementary tool for qualitative study of online communication.
 2025.naacl-long.77

@@ -985,11 +985,11 @@

 <fixed-case>MATO</fixed-case>: A Model-Agnostic Training Optimization for Aspect Sentiment Triplet Extraction
- ShaopengTangWuhan University of Technology
- LinLiWuhan University of Technology
- XiaohuiTaoUniversity of Southern Queensland
- LeqiZhongWuhan University of Technology
- QingXieWuhan University of Technology
+ ShaopengTangWuhan University of Technology
+ LinLiWuhan University of Technology
+ XiaohuiTaoUniversity of Southern Queensland
+ LeqiZhongWuhan University of Technology
+ QingXieWuhan University of Technology
 1648-1662
 As an important fine-grained sentiment analysis task, aspect sentiment triplet extraction (ASTE) aims to identify three elements, i.e., aspect, opinion and sentiment polarity as a triplet. Advanced ASTE research has mostly explored triplet-wise ability to achieve superior improvement. However, existing models with strong in-house performances may struggle to generalize to the challenging cases with the diverse expression of inter-triplet and intra-triplet elements. To this end, we propose a **M**odel-**A**gnostic **T**raining **O**ptimization (**MATO**) to improve ASTE model inference consistent with expected results facing triplet element diversity. Specifically, we design inter-triplet and intra-triplet metamorphic relations (MRs), and calculate the violation rate (VR) on each element of one triplet through metamorphic testing (MT), indicating the capacity to accommodate the diverse elements. Moreover, we propose an element-wise diversity-aware loss based on the VRs of aspect, opinion and sentiment, which can be jointly trained with existing ASTE models via uncertainty weighting. Conducted on four benchmark datasets and seven ASTE models, experimental results show that our MATO can enhance their diversity capacity, decreasing the average element-wise VRs by 3.28% to 15.36%. Meanwhile, our MATO is comparable to or better than these models in terms of F1-score.
 2025.naacl-long.79

@@ -997,10 +997,10 @@

 Dynamic Data Mixing Maximizes Instruction Tuning for Mixture-of-Experts
- TongZhu
- DaizeDongShanghai Artificial Intelligence Laboratory
+ TongZhu
+ DaizeDongShanghai Artificial Intelligence Laboratory
 XiaoyeQuShanghai Artificial Intelligence Laboratory
- JiachengRuan
+ JiachengRuan
 WenliangChenSoochow University, China
 YuChengThe Chinese University of Hong Kong
 1663-1677
@@ -1021,7 +1021,7 @@

 <fixed-case>R</fixed-case>eas<fixed-case>VQA</fixed-case>: Advancing <fixed-case>V</fixed-case>ideo<fixed-case>QA</fixed-case> with Imperfect Reasoning Process
 JianxinLiang
- XiaojunMengNoah’s Ark Lab, Huawei Technologies Ltd.
+ XiaojunMengNoah’s Ark Lab, Huawei Technologies Ltd.
HuishuaiZhangPeking University
 YueqianWang
 JianshengWei
 1692-1709

 2025.naacl-long.82

 Divergent Thoughts toward One Goal: <fixed-case>LLM</fixed-case>-based Multi-Agent Collaboration System for Electronic Design Automation
 HaoyuanWu
- HaishengZhengShanghai Artificial Intelligence Laboratory
+ HaishengZhengShanghai Artificial Intelligence Laboratory
 ZhuolunHeDepartment of Computer Science and Engineering, The Chinese University of Hong Kong
- BeiYuDepartment of Computer Science and Engineering, The Chinese University of Hong Kong
+ BeiYuDepartment of Computer Science and Engineering, The Chinese University of Hong Kong
 1710-1721
 Recently, with the development of tool-calling capabilities in large language models (LLMs), these models have demonstrated significant potential for automating electronic design automation (EDA) flows by interacting with EDA tool APIs via EDA scripts. However, considering the limited understanding of EDA tools, LLMs face challenges in practical scenarios where diverse interfaces of EDA tools exist across different platforms. Additionally, EDA flow automation often involves intricate, long-chain tool-calling processes, increasing the likelihood of errors in intermediate steps. Any errors will lead to the instability and failure of EDA flow automation. To address these challenges, we introduce EDAid, a multi-agent collaboration system where multiple agents harboring divergent thoughts converge towards a common goal, ensuring reliable and successful EDA flow automation. Specifically, each agent is controlled by ChipLlama models, which are expert LLMs fine-tuned for EDA flow automation. Our experiments demonstrate the state-of-the-art (SOTA) performance of our ChipLlama models and validate the effectiveness of our EDAid in the automation of complex EDA flows, showcasing superior performance compared to single-agent systems.
 2025.naacl-long.83
@@ -1052,12 +1052,12 @@

 <fixed-case>S</fixed-case>afety<fixed-case>Q</fixed-case>uizzer: Timely and Dynamic Evaluation on the Safety of <fixed-case>LLM</fixed-case>s
 ZhichaoShiinstitute of computing technology, Chinese Academy of Sciences
- ShaolingJing
+ ShaolingJing
 YiCheng
 HaoZhang
- YuanzhuoWangChinese Academy of Sciences
+ YuanzhuoWangChinese Academy of Sciences
 JieZhangInstitute of Computing Technology, Chinese Academy of Sciences
- HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences
+ HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences
 XueqiChengInstitute of Computing Technology, Chinese Academy
 1733-1747
 With the expansion of the application of Large Language Models (LLMs), concerns about their safety have grown among researchers. Numerous studies have demonstrated the potential risks of LLMs generating harmful content and have proposed various safety assessment benchmarks to evaluate these risks. However, the evaluation questions in current benchmarks, especially for Chinese, are too straightforward, making them easily rejected by target LLMs, and difficult to update with practical relevance due to their lack of correlation with real-world events. This hinders the effective application of these benchmarks in continuous evaluation tasks. To address these limitations, we propose SafetyQuizzer, a question-generation framework designed to evaluate the safety of LLMs more sustainably in the Chinese context. SafetyQuizzer leverages a finetuned LLM and jailbreaking attack templates to generate subtly offensive questions, which reduces the decline rate.
Additionally, by utilizing retrieval-augmented generation, SafetyQuizzer incorporates the latest real-world events into evaluation questions, improving the adaptability of the benchmarks. Our experiments demonstrate that evaluation questions generated by SafetyQuizzer significantly reduce the decline rate compared to other benchmarks while maintaining a comparable attack success rate. Our code is available at https://github.com/zhichao-stone/SafetyQuizzer. Warning: this paper contains examples that may be offensive or upsetting. @@ -1066,8 +1066,8 @@ Privacy Checklist: Privacy Violation Detection Grounding on Contextual Integrity Theory - HaoranLi - WeiFanHong Kong University of Science and Technology + HaoranLi + WeiFanHong Kong University of Science and Technology YulinChenNational University of Singapore ChengJiayangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology TianshuChu @@ -1082,7 +1082,7 @@ Investigating the (De)Composition Capabilities of Large Language Models in Natural-to-Formal Language Conversion ZiyaoXuPeking University - HoufengWang + HoufengWang 1767-1783 Humans have strong capabilities of decomposition and composition in natural-to-formal language conversion (N2F) when faced with an unfamiliar formal language, and can easily cope with compositional gaps and counter-intuitive symbolic names. To investigate whether large language models (LLMs) have this set of basic capabilities in N2F, we propose the STD framework. This framework semi-automatically performs sample and task construction, allowing decoupled evaluation of the set of decomposition and composition capabilities of LLMs in N2F. Based on this framework, we evaluate and analyze the most advanced LLMs, and the main findings include that: (1) the LLMs are deficient in both decomposition and composition; (2) the LLMs show a wide coverage of error types that can be attributed to deficiencies in natural language understanding and the learning and use of symbolic systems; (3) compositional gaps and counter-intuitive symbolic names both affect the decomposition and composition of the LLMs. Our work provides a new perspective for investigating the basic capabilities of decomposition and composition of LLMs in N2F. The detailed analysis of deficiencies and attributions can help subsequent improvements of LLMs. 2025.naacl-long.87 @@ -1097,8 +1097,8 @@ YangXuHarbin Institute of Technology LiboQinCentral South University XiaomingShiEast China Normal University - ZemingLiu - XudongHanMohamed bin Zayed University of Artificial Intelligence + ZemingLiu + XudongHanMohamed bin Zayed University of Artificial Intelligence QiShi QingfuZhuHarbin Institute of Technology WanxiangCheHarbin Institute of Technology @@ -1112,7 +1112,7 @@ LingxiaoLuoComputer Science and Technology, Tsinghua University BingdaTang XuanzhongChen, Tsinghua University - RongHanTsinghua University, Tsinghua University + RongHanTsinghua University, Tsinghua University TingChenTsinghua University 1800-1821 Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable promise in generating visually grounded responses. However, their application in the medical domain is hindered by unique challenges. For instance, most VLMs rely on a single method of visual grounding, whereas complex medical tasks demand more versatile approaches. Additionally, while most VLMs process only 2D images, a large portion of medical images are 3D. The lack of medical data further compounds these obstacles. 
To address these challenges, we present VividMed, a vision language model with versatile visual grounding for medicine. Our model supports generating both semantic segmentation masks and instance-level bounding boxes, and accommodates various imaging modalities, including both 2D and 3D data. We design a three-stage training procedure and an automatic data synthesis pipeline based on open datasets and models. Besides visual grounding tasks, VividMed also excels in other common downstream tasks, including Visual Question Answering (VQA) and report generation. Ablation studies empirically show that the integration of visual grounding ability leads to improved performance on these tasks. Our code is publicly available at https://github.com/function2-llx/MMMM.

@@ -1122,10 +1122,10 @@

 Mixture of Multimodal Adapters for Sentiment Analysis
 KezhouChen
- ShuoWangUniversity of Science and Technology of China
- HuixiaBenAnhui University of Science and Technology
- ShengengTangHefei University of Technology
- YanbinHaoHefei University of Technology
+ ShuoWangUniversity of Science and Technology of China
+ HuixiaBenAnhui University of Science and Technology
+ ShengengTangHefei University of Technology
+ YanbinHaoHefei University of Technology
 1822-1833
 Pre-trained language models (PLMs) have achieved great success in text sentiment analysis. However, in practical applications, sentiment is not only conveyed through language but also hidden in other modalities. Therefore, multimodal sentiment analysis (MSA) has attracted increasing research interest. Compared to text sentiment analysis, MSA is challenging since (1) emotions hidden in body movements or vocal timbres eclipse traditional analytical methods, and (2) transferring a PLM to the MSA task requires huge numbers of training parameters. Thus, to solve these issues, we introduce the Mixture of Multimodal Adapters (MMA) into the PLM. Specifically, we first design a mixture-of-multimodal-experts module to capture and fuse emotional movements from different data. Meanwhile, we use a compression parameter for each expert to reduce the training burden. We apply our method to two benchmark datasets and achieve state-of-the-art performance with a tiny trainable parameter count. For example, compared to the current state-of-the-art method, AcFormer, we need only 1/22 of its training parameter count (130M → 6M) to achieve better results.
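The MMA abstract above combines two standard ingredients: a set of experts compressed through low-rank bottlenecks and a gate that mixes them. A minimal PyTorch sketch of a gated mixture of low-rank adapters in that spirit — the class name, rank, and gating scheme are generic assumptions, not the paper's exact architecture:

import torch
import torch.nn as nn

class MixtureOfAdapters(nn.Module):
    # Each expert is a low-rank bottleneck (the "compression"); a learned
    # gate mixes expert outputs, which are added residually to the input.
    def __init__(self, dim, num_experts=4, rank=8):
        super().__init__()
        self.down = nn.ModuleList(nn.Linear(dim, rank) for _ in range(num_experts))
        self.up = nn.ModuleList(nn.Linear(rank, dim) for _ in range(num_experts))
        self.gate = nn.Linear(dim, num_experts)

    def forward(self, x):
        weights = self.gate(x).softmax(dim=-1)            # [..., num_experts]
        experts = torch.stack(
            [up(torch.relu(down(x))) for down, up in zip(self.down, self.up)],
            dim=-1,
        )                                                  # [..., dim, num_experts]
        return x + (experts * weights.unsqueeze(-2)).sum(dim=-1)

With dim=768, num_experts=4 and rank=8, this adds roughly 50K trainable parameters per layer, which is the kind of budget the 130M → 6M comparison above implies.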
2025.naacl-long.90 @@ -1133,7 +1133,7 @@ The Impact of Inference Acceleration on Bias of <fixed-case>LLM</fixed-case>s - ElisabethKirstenRuhr-Universität Bochum + ElisabethKirstenRuhr-Universität Bochum IvanHabernalRuhr-Universität Bochum VedantNandaUniversity of Maryland, College Park, MPI-SWS and University of Maryland, College Park & MPI-SWS Muhammad BilalZafarRuhr-Universität Bochum and Research Center for Trustworthy Data Science and Security @@ -1144,32 +1144,32 @@ <fixed-case>A</fixed-case>fri<fixed-case>H</fixed-case>ate: A Multilingual Collection of Hate Speech and Abusive Language Datasets for <fixed-case>A</fixed-case>frican Languages - Shamsuddeen HassanMuhammadImperial College London and Bayero University, Kano-Nigeria - IdrisAbdulmuminAhmadu Bello University - Abinew AliAyeleBahir Dar University, Universität Hamburg - David IfeoluwaAdelaniMcGill University - Ibrahim SaidAhmadNortheastern University + Shamsuddeen HassanMuhammadImperial College London and Bayero University, Kano-Nigeria + IdrisAbdulmuminAhmadu Bello University + Abinew AliAyeleBahir Dar University, Universität Hamburg + David IfeoluwaAdelaniMcGill University + Ibrahim SaidAhmadNortheastern University Saminu MohammadAliyu - PaulRöttgerBocconi University - AbigailOppong - AndiswaBukula - Chiamaka IjeomaChukwunekeNnamdi Azikiwe University - Ebrahim ChekolJibril + PaulRöttgerBocconi University + AbigailOppong + AndiswaBukula + Chiamaka IjeomaChukwunekeNnamdi Azikiwe University + Ebrahim ChekolJibril Elyas AbdiIsmail - EsubalewAlemnehBahir Dar University - Hagos TesfahunGebremichael - Lukman JibrilAliyu + EsubalewAlemnehBahir Dar University + Hagos TesfahunGebremichael + Lukman JibrilAliyu MeriemBeloucifUppsala University OumaimaHourraneAl Akhawayn University - RooweitherMabuyaNorth-West University - SalomeyOsei + RooweitherMabuyaNorth-West University + SalomeyOsei SamuelRutunda Tadesse DestawBelay Tadesse KebedeGugeHaramaya University - Tesfa TegegneAsfaw - Lilian Diana AwuorWanzareMaseno University + Tesfa TegegneAsfaw + Lilian Diana AwuorWanzareMaseno University Nelson OdhiamboOnyango - Seid MuhieYimamUniversität Hamburg + Seid MuhieYimamUniversität Hamburg NedjmaOusidhoumCardiff University 1854-1871 Hate speech and abusive language are global phenomena that need socio-cultural background knowledge to be understood, identified, and moderated. However, in many regions of the Global South, there have been several documented occurrences of (1) absence of moderation and (2) censorship due to the reliance on keyword spotting out of context. Further, high-profile individuals have frequently been at the center of the moderation process, while large and targeted hate speech campaigns against minorities have been overlooked.These limitations are mainly due to the lack of high-quality data in the local languages and the failure to include local communities in the collection, annotation, and moderation processes. To address this issue, we present AfriHate: a multilingual collection of hate speech and abusive language datasets in 15 African languages. Each instance in AfriHate is a tweet annotated by native speakers familiar with the regional culture. We report the challenges related to the construction of the datasets and present various classification baseline results with and without using LLMs. We find that model performance highly depends on the language and that multilingual models can help boost performance in low-resource settings. 
@@ -1178,14 +1178,14 @@

 Revealing the Barriers of Language Agents in Planning
- JianXie
+ JianXie
 KexunZhangCarnegie Mellon University
 JiangjieChenByteDance Inc.
- SiyuYuan
+ SiyuYuan
 KaiZhang
 YikaiZhang
- LeiLiSchool of Computer Science, Carnegie Mellon University
- YanghuaXiaoFudan University
+ LeiLiSchool of Computer Science, Carnegie Mellon University
+ YanghuaXiaoFudan University
 1872-1888
 Autonomous planning has been an ongoing pursuit since the inception of artificial intelligence. Based on curated problem solvers, early planning agents could deliver precise solutions for specific tasks but lacked generalization. The emergence of large language models (LLMs) and their powerful reasoning capabilities has reignited interest in autonomous planning by automatically generating reasonable solutions for given tasks. However, prior research and our experiments show that current language agents still lack human-level planning abilities. Even the state-of-the-art reasoning model, OpenAI o1, achieves only 15.6% on one of the complex real-world planning benchmarks. This highlights a critical question: What hinders language agents from achieving human-level planning? Although existing studies have highlighted weak performance in agent planning, the deeper underlying issues and the mechanisms and limitations of the strategies proposed to address them remain insufficiently understood. In this work, we apply the feature attribution study and identify two key factors that hinder agent planning: the limited role of constraints and the diminishing influence of questions. We also find that although current strategies help mitigate these challenges, they do not fully resolve them, indicating that agents still have a long way to go before reaching human-level intelligence.
 2025.naacl-long.93
@@ -1208,8 +1208,8 @@

 Option Symbol Matters: Investigating and Mitigating Multiple-Choice Option Symbol Bias of Large Language Models
- ZhenYang
- PingJianBeijing Institute of Technology
+ ZhenYang
+ PingJianBeijing Institute of Technology
 ChengzhiLiBeijing Institute of Technology
 1902-1917
 Multiple-Choice Question Answering (MCQA) is a widely used task in the evaluation of Large Language Models (LLMs). In this work, we reveal that current LLMs’ performance in MCQA could be heavily influenced by the choice of option symbol sets, due to the option symbol bias. That is, when altering only the option symbols (e.g., A/B/C/D → i/ii/iii/iv), the results could vary sharply, leading to a margin of approximately 10% in accuracy. To uncover the mechanisms behind this, we investigate the internal components of LLMs from a causal perspective. By measuring the causal effects, we identify a small subset of attention heads responsible for the symbol bias. Subsequently, we interpret these key components in a human-understandable way, showing that attention heads with higher causal effects are more likely to focus on only option symbols, while those with lower causal effects tend to distribute their attention across the content of questions and options. It also motivates us to pursue debiasing based on the causal effects. Specifically, to mitigate such bias, we propose a tuning-free, causal effect driven debiasing method which intervenes the activations of identified components according to their causal effects, with stronger interventions corresponding to higher causal effects. Experimental results demonstrate that the proposed method not only alleviates the aforementioned bias, but also improves the MCQA performance of LLMs.
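The symbol-bias phenomenon just described is straightforward to probe: hold the question and options fixed, swap only the symbol set, and compare accuracy. A minimal harness sketch, where ask_model is a hypothetical stand-in for whatever LLM call is under evaluation:

SYMBOL_SETS = {"latin": ["A", "B", "C", "D"], "roman": ["i", "ii", "iii", "iv"]}

def format_mcq(question, options, symbols):
    lines = [question]
    lines += [f"{sym}. {opt}" for sym, opt in zip(symbols, options)]
    return "\n".join(lines)

def accuracy_by_symbol_set(items, ask_model):
    # items: dicts with "question", "options", and "gold_index";
    # ask_model is expected to return the chosen option symbol.
    results = {}
    for name, symbols in SYMBOL_SETS.items():
        correct = sum(
            ask_model(format_mcq(it["question"], it["options"], symbols))
            == symbols[it["gold_index"]]
            for it in items
        )
        results[name] = correct / len(items)
    return results  # a gap between the two accuracies reflects symbol bias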
@@ -1218,10 +1218,10 @@

 <fixed-case>DAWN</fixed-case>-<fixed-case>ICL</fixed-case>: Strategic Planning of Problem-solving Trajectories for Zero-Shot In-Context Learning
- XinyuTangRenmin University of China
- XiaoleiWangRenmin University of China
- XinZhaoRenmin University of China
- Ji-RongWenRenmin University of China
+ XinyuTangRenmin University of China
+ XiaoleiWangRenmin University of China
+ XinZhaoRenmin University of China
+ Ji-RongWenRenmin University of China
 1918-1934
 Zero-shot in-context learning (ZS-ICL) aims to conduct in-context learning (ICL) without using human-annotated demonstrations. Existing ZS-ICL methods either use large language models (LLMs) to generate (input, label) pairs as pseudo-demonstrations or leverage historical pseudo-demonstrations to help solve the current problem. They assume that all problems are from the same task and traverse them in a random order. However, in real-world scenarios, problems usually come from diverse tasks, and only a few belong to the same task. The random traversing order may generate unreliable pseudo-demonstrations and lead to error accumulation. To address this problem, we reformulate ZS-**ICL** as a planning problem and propose a **D**emonstration-**AW**are Mo**N**te Carlo Tree Search (MCTS) approach (DAWN-ICL), which leverages MCTS to strategically plan the problem-solving trajectories for ZS-ICL. In addition, to achieve effective and efficient Q-value estimation, we propose a demonstration-aware Q-value function and use it to enhance the selection phase and accelerate the expansion and simulation phases in MCTS. Extensive experiments demonstrate the effectiveness and efficiency of DAWN-ICL on in-domain and cross-domain scenarios, and it even outperforms ICL using human-annotated demonstrations. The code is available at https://github.com/txy77/MCTS4ZSICL.
 2025.naacl-long.96

@@ -1244,7 +1244,7 @@

 Towards Efficient and Multifaceted Computer-assisted Pronunciation Training Leveraging Hierarchical Selective State Space Model and Decoupled Cross-entropy Loss
 Fu-AnChaoNational Taiwan Normal University
- BerlinChenNational Taiwan Normal University
+ BerlinChenNational Taiwan Normal University
 1947-1961
 Prior efforts in building computer-assisted pronunciation training (CAPT) systems often treat automatic pronunciation assessment (APA) and mispronunciation detection and diagnosis (MDD) as separate fronts: the former aims to provide multiple pronunciation aspect scores across diverse linguistic levels, while the latter focuses instead on pinpointing the precise phonetic pronunciation errors made by non-native language learners. However, it is generally expected that a full-fledged CAPT system should perform both functionalities simultaneously and efficiently. In response to this surging demand, we in this work first propose HMamba, a novel CAPT approach that seamlessly integrates APA and MDD tasks in parallel. In addition, we introduce a novel loss function, decoupled cross-entropy loss (deXent), specifically tailored for MDD to facilitate better-supervised learning for detecting mispronounced phones, thereby enhancing overall performance. A comprehensive set of empirical results on the speechocean762 benchmark dataset demonstrates the effectiveness of our approach on APA. Notably, our proposed approach also yields a considerable improvement in MDD performance over a strong baseline, achieving an F1-score of 63.85%.
Our codes are made available at https://github.com/Fuann/hmamba 2025.naacl-long.98 @@ -1254,7 +1254,7 @@ Information-Guided Identification of Training Data Imprint in (Proprietary) Large Language Models AbhilashaRavichanderUniversity of Washington and School of Computer Science, Carnegie Mellon University JillianFisherUniversity of Washington - TaylorSorensenUniversity of Washington and Brigham Young University + TaylorSorensenUniversity of Washington and Brigham Young University XimingLuUniversity of Washington MariaAntoniak Bill YuchenLinxAI and University of Washington @@ -1268,13 +1268,13 @@ An Interpretable and Crosslingual Method for Evaluating Second-Language Dialogues - RenaGao + RenaGao JingxuanWu XuetongWu - CarstenRoever - JingWu - LongLv - Jey HanLau + CarstenRoever + JingWu + LongLv + Jey HanLau 1979-2008 We analyse the cross-lingual transferability of a dialogue evaluation framework that assesses the relationships between micro-level linguistic features (e.g. backchannels) and macro-level interactivity labels (e.g. topic management), originally designed for English-as-a-second-language dialogues. To this end, we develop CNIMA (**C**hinese **N**on-Native **I**nteractivity **M**easurement and **A**utomation), a Chinese-as-a-second-language labelled dataset with 10K dialogues. We found the evaluation framework to be robust across languages, revealing language-specific and language-universal relationships between micro-level and macro-level features. Next, we propose an automated, interpretable approach with low data requirements that scores the overall quality of a second-language dialogue based on the framework. Our approach is interpretable in that it reveals the key linguistic and interactivity features that contributed to the overall quality score. As our approach does not require labelled data, it can also be adapted to other languages for second-language dialogue evaluation. 2025.naacl-long.100 @@ -1286,8 +1286,8 @@ HaoweiWang JunjieWangInstitute of Software, Chinese Academy of Sciences MingyangLiInstitute of Software Chinese Academy of Sciences - YuekaiHuang - DandanWangInstitute of Software Chinese Academy of Sciences + YuekaiHuang + DandanWangInstitute of Software Chinese Academy of Sciences QingWangInstitute of Software, Chinese Academy of Sciences 2009-2028 Tool-calling has changed Large Language Model (LLM) applications by integrating external tools, significantly enhancing their functionality across diverse tasks. However, this integration also introduces new security vulnerabilities, particularly in the tool scheduling mechanisms of LLM, which have not been extensively studied. To fill this gap, we present ToolCommander, a novel framework designed to exploit vulnerabilities in LLM tool-calling systems through adversarial tool injection. Our framework employs a well-designed two-stage attack strategy. Firstly, it injects malicious tools to collect user queries, then dynamically updates the injected tools based on the stolen information to enhance subsequent attacks. These stages enable ToolCommander to execute privacy theft, launch denial-of-service attacks, and even manipulate business competition by triggering unscheduled tool-calling. Notably, the ASR reaches 91.67% for privacy theft and hits 100% for denial-of-service and unscheduled tool calling in certain cases. 
Our work demonstrates that these vulnerabilities can lead to severe consequences beyond simple misuse of tool-calling systems, underscoring the urgent need for robust defensive strategies to secure LLM Tool-calling systems. @@ -1296,7 +1296,7 @@ <fixed-case>COVE</fixed-case>: <fixed-case>CO</fixed-case>ntext and <fixed-case>VE</fixed-case>racity prediction for out-of-context images - JonathanTonglet + JonathanTonglet GabrielThiemTechnische Universität Darmstadt IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt 2029-2049 @@ -1307,7 +1307,7 @@ Discourse-Driven Evaluation: Unveiling Factual Inconsistency in Long Document Summarization YangZhongUniversity of Pittsburgh - DianeLitmanUniversity of Pittsburgh, University of Pittsburgh and University of Pittsburgh + DianeLitmanUniversity of Pittsburgh, University of Pittsburgh and University of Pittsburgh 2050-2073 Detecting factual inconsistency for long document summarization remains challenging, given the complex structure of the source article and long summary length. In this work, we study factual inconsistency errors and connect them with a line of discourse analysis. We find that errors are more common in complex sentences and are associated with several discourse features. We propose a framework that decomposes long texts into discourse-inspired chunks and utilizes discourse information to better aggregate sentence-level scores predicted by NLI models. Our approach shows improved performance on top of different model baselines over several evaluation benchmarks, covering rich domains of texts, focusing on long document summarization. This underscores the significance of incorporating discourse features in developing models for scoring summaries for long document factual inconsistency. 2025.naacl-long.103 @@ -1315,10 +1315,10 @@ Language Models are Crossword Solvers - SoumadeepSahaIndian Statistical Institute, Kolkata + SoumadeepSahaIndian Statistical Institute, Kolkata SutanoyaChakrabortyIndian Statistical Institute - SaptarshiSahaIndian Statistical Institute - UtpalGarainIndian Statistical Institute + SaptarshiSahaIndian Statistical Institute + UtpalGarainIndian Statistical Institute 2074-2090 Crosswords are a form of word puzzle that require a solver to demonstrate a high degree of proficiency in natural language understanding, wordplay, reasoning, and world knowledge, along with adherence to character and length constraints. In this paper we tackle the challenge of solving crosswords with large language models (LLMs). We demonstrate that the current generation of language models shows significant competence at deciphering cryptic crossword clues and outperforms previously reported state-of-the-art (SoTA) results by a factor of 2-3 in relevant benchmarks. We also develop a search algorithm that builds off this performance to tackle the problem of solving full crossword grids with out-of-the-box LLMs for the very first time, achieving an accuracy of 93% on New York Times crossword puzzles. Additionally, we demonstrate that LLMs generalize well and are capable of supporting answers with sound rationale. 
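Filling a full grid, as in the crossword paper above, ultimately reduces to repeatedly filtering candidate answers against length and crossing-letter constraints. A minimal sketch of that filtering step only — the search strategy itself is the paper's contribution and is not reproduced here:

def fits(candidate, slot_len, crossings):
    # crossings: (position_in_slot, required_letter) pairs fixed by
    # already-filled intersecting answers.
    return len(candidate) == slot_len and all(
        candidate[pos] == letter for pos, letter in crossings
    )

def prune(candidates, slot_len, crossings):
    return [c for c in candidates if fits(c, slot_len, crossings)]

print(prune(["PATCH", "DIFFS", "MERGE"], 5, [(0, "P"), (3, "C")]))  # ['PATCH']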
2025.naacl-long.104 @@ -1326,9 +1326,9 @@ <fixed-case>WH</fixed-case>o<fixed-case>W</fixed-case>: A Cross-domain Approach for Analysing Conversation Moderation - Ming-BinChen + Ming-BinChen LeaFrermannUniversity of Melbourne - Jey HanLauThe University of Melbourne + Jey HanLauThe University of Melbourne 2091-2126 We propose WHoW, an evaluation framework for analyzing the facilitation strategies of moderators across different domains/scenarios by examining their motives (Why), dialogue acts (How) and target speaker (Who). Using this framework, we annotated 5,657 moderation sentences with human judges and 15,494 sentences with GPT-4o from two domains: TV debates and radio panel discussions. Comparative analysis demonstrates the framework’s cross-domain generalisability and reveals distinct moderation strategies: debate moderators emphasise coordination and facilitate interaction through questions and instructions, while panel discussion moderators prioritize information provision and actively participate in discussions. Our analytical framework works for different moderation scenarios, enhances our understanding of moderation behaviour through automatic large-scale analysis, and facilitates the development of moderator agents. 2025.naacl-long.105 @@ -1336,9 +1336,9 @@ Uplifting Lower-Income Data: Strategies for Socioeconomic Perspective Shifts in Large Multi-modal Models - JoanNwatuUniversity of Michigan - Ann Arbor - OanaIgnatSanta Clara University - RadaMihalceaUniversity of Michigan + JoanNwatuUniversity of Michigan - Ann Arbor + OanaIgnatSanta Clara University + RadaMihalceaUniversity of Michigan 2127-2144 Recent work has demonstrated that the unequal representation of cultures and socioeconomic groups in training data leads to biased Large Multi-modal (LMM) models. To improve LMM model performance on underrepresented data, we propose and evaluate several prompting strategies using non-English, geographic, and socioeconomic attributes. We show that these geographic and socioeconomic integrated prompts favor retrieving topic appearances commonly found in data from low-income households across different countries leading to improved LMM model performance on lower-income data. Our analyses identify and highlight contexts where these strategies yield the most improvements. 2025.naacl-long.106 @@ -1352,7 +1352,7 @@ JiapengWuLayer 6 NoëlVouitsisLayer 6 AI GuangweiYuLayer6 AI - Jesse C.CresswellLayer 6 AI + Jesse C.CresswellLayer 6 AI RasaHosseinzadehLayer6 2145-2160 Text-to-SQL generation enables non-experts to interact with databases via natural language. Recent advances rely on large closed-source models like GPT-4 that present challenges in accessibility, privacy, and latency. To address these issues, we focus on developing small, efficient, and open-source text-to-SQL models. We demonstrate the benefits of sampling multiple candidate SQL generations and propose our method, MSc-SQL, to critique them using associated metadata. Our sample critiquing model evaluates multiple outputs simultaneously, achieving state-of-the-art performance compared to other open-source models while remaining competitive with larger models at a much lower cost. Full code can be found at github.com/layer6ai-labs/msc-sql. 
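MSc-SQL's gain above comes from sampling several candidate SQL generations and critiquing them with a learned model over associated metadata. As a much simpler stand-in for that learned critic, candidates can at least be screened by whether they execute at all — a sketch using the standard-library sqlite3 module (the function name and screening rule are assumptions, not the paper's method):

import sqlite3

def first_executable(candidates, db_path):
    # Return the first candidate query that runs without error, or None.
    con = sqlite3.connect(db_path)
    try:
        for sql in candidates:
            try:
                con.execute(sql).fetchmany(1)
                return sql
            except sqlite3.Error:
                continue
        return None
    finally:
        con.close()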
@@ -1393,7 +1393,7 @@ Analyzing and Evaluating Correlation Measures in <fixed-case>NLG</fixed-case> Meta-Evaluation MingqiGao - XinyuHuPeking University + XinyuHuPeking University LiLin XiaojunWan 2199-2222 @@ -1404,9 +1404,9 @@ Cascading Large Language Models for Salient Event Graph Generation XingweiTanUniversity of Sheffield - YuxiangZhouKing’s College London + YuxiangZhouKing’s College London GabrielePergolaUniversity of Warwick - YulanHeKing’s College London, University of London + YulanHeKing’s College London, University of London 2223-2245 Generating event graphs from long documents is challenging due to the inherent complexity of multiple tasks involved such as detecting events, identifying their relationships, and reconciling unstructured input with structured graphs. Recent studies typically consider all events with equal importance, failing to distinguish salient events crucial for understanding narratives. This paper presents CALLMSAE, a CAscading Large Language Model framework for SAlient Event graph generation, which leverages the capabilities of LLMs and eliminates the need for costly human annotations. We first identify salient events by prompting LLMs to generate summaries, from which salient events are identified. Next, we develop an iterative code refinement prompting strategy to generate event relation graphs, removing hallucinated relations and recovering missing edges. Powered by CALLMSAE, we present NYT-SEG, a large-scale automatically annotated event graph dataset which can serve as distant supervision signals. Fine-tuning contextualised graph generation models on NYT-SEG outperforms the models trained on CAEVO data. Results on a human-annotated test set show that the proposed method generates salient and more accurate graphs, outperforming competitive baselines. 2025.naacl-long.112 @@ -1415,12 +1415,12 @@ Token-Level Density-Based Uncertainty Quantification Methods for Eliciting Truthfulness of Large Language Models ArtemVazhentsevSkolkovo Institute of Science and Technology and Artificial Intelligence Research Institute - LyudmilaRvanovaITMO University + LyudmilaRvanovaITMO University IvanLazichny AlexanderPanchenkoSkoltech MaximPanovMohamed bin Zayed University of Artificial Intelligence - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne - ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence 2246-2262 Uncertainty quantification (UQ) is a prominent approach for eliciting truthful answers from large language models (LLMs). To date, information-based and consistency-based UQ have been the dominant UQ methods for text generation via LLMs. Density-based methods, despite being very effective for UQ in text classification with encoder-based models, have not been very successful with generative LLMs. In this work, we adapt Mahalanobis Distance (MD) – a well-established UQ technique in classification tasks – for text generation and introduce a new supervised UQ method. Our method extracts token embeddings from multiple layers of LLMs, computes MD scores for each token, and uses linear regression trained on these features to provide robust uncertainty scores. 
Through extensive experiments on eleven datasets, we demonstrate that our approach substantially improves over existing UQ methods, providing accurate and computationally efficient uncertainty scores for both sequence-level selective generation and claim-level fact-checking tasks. Our method also exhibits strong generalization to out-of-domain data, making it suitable for a wide range of LLM-based applications. 2025.naacl-long.113 @@ -1428,8 +1428,8 @@ How Can We Diagnose and Treat Bias in Large Language Models for Clinical Decision-Making? - KenzaBenkirane - JackieKayUniversity College London, University of London and DeepMind + KenzaBenkirane + JackieKayUniversity College London, University of London and DeepMind MariaPerez-OrtizUniversity College London, University of London 2263-2288 Recent advancements in Large Language Models (LLMs) have positioned them as powerful tools for clinical decision-making, with rapidly expanding applications in healthcare. However, concerns about bias remain a significant challenge in the clinical implementation of LLMs, particularly regarding gender and ethnicity. This research investigates the evaluation and mitigation of bias in LLMs applied to complex clinical cases, focusing on gender and ethnicity biases. We introduce a novel Counterfactual Patient Variations (CPV) dataset derived from the JAMA Clinical Challenge. Using this dataset, we built a framework for bias evaluation, employing both Multiple Choice Questions (MCQs) and corresponding explanations. We explore prompting with eight LLMs and fine-tuning as debiasing methods. Our findings reveal that addressing social biases in LLMs requires a multidimensional approach as mitigating gender bias can occur while introducing ethnicity biases, and that gender bias in LLM embeddings varies significantly across medical specialities. We demonstrate that evaluating both MCQ response and explanation processes is crucial, as correct responses can be based on biased reasoning. We provide a framework for evaluating LLM bias in real-world clinical cases, offer insights into the complex nature of bias in these models, and present strategies for bias mitigation. @@ -1438,15 +1438,15 @@ From Redundancy to Relevance: Information Flow in <fixed-case>LVLM</fixed-case>s Across Reasoning Tasks - XiaofengZhang + XiaofengZhang YihaoQuan ChenShenAlibaba Group - XiaosongYuan + XiaosongYuan ShaotianYanAlibaba Group - LiangXie + LiangXie WenxiaoWangZhejiang University ChaochenGu - HaoTangPeking University + HaoTangPeking University JiepingYeAlibaba Group 2289-2299 Large Vision Language Models (LVLMs) achieve great performance on visual-language reasoning tasks; however, the black-box nature of LVLMs hinders in-depth research on the reasoning mechanism. As all images need to be converted into image tokens to fit the input format of large language models (LLMs) along with natural language prompts, sequential visual representation is essential to the performance of LVLMs, and the information flow analysis approach can be an effective tool for determining interactions between these representations. In this paper, we propose integrating attention analysis with LLaVA-CAM: concretely, attention scores highlight relevant regions during forward propagation, while LLaVA-CAM captures gradient changes through backward propagation, revealing key image features. By exploring the information flow from the perspective of visual representation contribution, we observe that it tends to converge in shallow layers but diversify in deeper layers.
To validate our analysis, we conduct comprehensive experiments with truncation strategies across various LVLMs for visual question answering and image captioning tasks, and experimental results not only verify our hypothesis but also reveal a consistent pattern of information flow convergence in the corresponding layers, and the information flow cliff layer will be different due to different contexts. @@ -1457,7 +1457,7 @@ Patent-<fixed-case>CR</fixed-case>: A Dataset for Patent Claim Revision LekangJiang Pascal A.Scherz - StefanGoetzUniversity of Cambridge and Duke University + StefanGoetzUniversity of Cambridge and Duke University 2300-2314 This paper presents Patent-CR, the first dataset created for the patent claim revision task in English. It includes both initial patent applications rejected by patent examiners and the final granted versions. Unlike normal text revision tasks that predominantly focus on enhancing sentence quality, such as grammar correction and coherence improvement, patent claim revision aims at ensuring the claims meet stringent legal criteria. These criteria are beyond novelty and inventiveness, including clarity of scope, technical accuracy, language precision, and legal robustness. We assess various large language models (LLMs) through professional human evaluation, including general LLMs with different sizes and architectures, text revision models, and domain-specific models. Our results indicate that LLMs often bring ineffective edits that deviate from the target revisions. In addition, domain-specific models and the method of fine-tuning show promising results. Notably, GPT-4 outperforms other tested LLMs, but further revisions are still necessary to reach the examination standard. Furthermore, we demonstrate the inconsistency between automated and human evaluation results, suggesting that GPT-4-based automated evaluation has the highest correlation with human judgment. This dataset, along with our preliminary empirical research, offers invaluable insights for further exploration in patent claim revision. 2025.naacl-long.116 @@ -1495,7 +1495,7 @@ XiaoniDuanPurdue University ZhuoyanLiPurdue University Chien-JuHoWashington University in St. Louis - MingYinPurdue University + MingYinPurdue University 2359-2372 Crowdsourcing has been increasingly utilized to gather subjective assessment, such as evaluating the toxicity of texts. Since there does not exist a single “ground truth” answer for subjective annotations, obtaining annotations to accurately reflect the opinions of different subgroups becomes a key objective for these subjective assessment tasks. Traditionally, this objective is accomplished by directly soliciting a large number of annotations from each subgroup, which can be costly, especially when annotators of certain subgroups are hard to access. In this paper, using toxicity evaluation as an example, we explore the feasibility of using perspective taking—that is, asking annotators to take the points of view of a certain subgroup and estimate opinions within that subgroup—as a way to achieve this objective cost-efficiently. Our results show that compared to the baseline approach of directly soliciting annotations from the target subgroup, perspective taking could lead to better estimates of the subgroup-level opinion when annotations from the target subgroup are costly while the budget is limited. Moreover, prompting annotators to take the perspectives of contrasting subgroups simultaneously can further improve the quality of the estimates.
Finally, we find that aggregating multiple perspective-taking annotations while soliciting a small number of annotations directly from the target subgroup for calibration leads to the highest-quality estimates under limited budget. 2025.naacl-long.119 @@ -1516,9 +1516,9 @@ <fixed-case>L</fixed-case>i<fixed-case>PO</fixed-case>: Listwise Preference Optimization through Learning-to-Rank TianqiLiuGoogle DeepMind - ZhenQinGoogle Deepmind + ZhenQinGoogle Deepmind JunruWu - JiamingShenGoogle DeepMind + JiamingShenGoogle DeepMind MishaKhalmanGoogle RishabhJoshiGoogle YaoZhaoGoogle @@ -1534,13 +1534,13 @@ Adaptive Prompting: Ad-hoc Prompt Composition for Social Bias Detection - MaximilianSpliethöverLeibniz University Hannover + MaximilianSpliethöverLeibniz University Hannover TimKnebler - FabianFumagalli - MaximilianMuschalikInstitute of Computer Science, Ludwig-Maximilians-Universität München - BarbaraHammerUniversität Bielefeld - EykeHüllermeierLudwig-Maximilians-Universität München - HenningWachsmuthLeibniz Universität Hannover + FabianFumagalli + MaximilianMuschalikInstitute of Computer Science, Ludwig-Maximilians-Universität München + BarbaraHammerUniversität Bielefeld + EykeHüllermeierLudwig-Maximilians-Universität München + HenningWachsmuthLeibniz Universität Hannover 2421-2449 Recent advances on instruction fine-tuning have led to the development of various prompting techniques for large language models, such as explicit reasoning steps. However, the success of techniques depends on various parameters, such as the task, language model, and context provided. Finding an effective prompt is, therefore, often a trial-and-error process. Most existing approaches to automatic prompting aim to optimize individual techniques instead of compositions of techniques and their dependence on the input. To fill this gap, we propose an adaptive prompting approach that predicts the optimal prompt composition ad-hoc for a given input. We apply our approach to social bias detection, a highly context-dependent task that requires semantic understanding. We evaluate it with three large language models on three datasets, comparing compositions to individual techniques and other baselines. The results underline the importance of finding an effective prompt composition. Our approach robustly ensures high detection performance, and is best in several settings. Moreover, first experiments on other tasks support its generalizability. 2025.naacl-long.122 @@ -1573,7 +1573,7 @@ <fixed-case>MEDA</fixed-case>: Dynamic <fixed-case>KV</fixed-case> Cache Allocation for Efficient Multimodal Long-Context Inference ZhongweiWan HuiShen - XinWang + XinWang CheLiu ZhedaMai MiZhangThe Ohio State University @@ -1596,7 +1596,7 @@ <fixed-case>S</fixed-case>afe<fixed-case>Q</fixed-case>uant: <fixed-case>LLM</fixed-case> Safety Analysis via Quantized Gradient Inspection SindhuPadakandlaFujitsu Research India Pvt Ltd SadbhavanaBabarFujitsu Research and Development Center Co. Ltm. - Rathod DarshanD + Rathod DarshanD ManoharKaulFujitsu Research and Development Center Co. Ltm. 2522-2536 Contemporary jailbreak attacks on Large Language Models (LLMs) employ sophisticated techniques with obfuscated content to bypass safety guardrails. Existing defenses either use computationally intensive LLM verification or require adversarial fine-tuning, leaving models vulnerable to advanced attacks. We introduce SafeQuant, a novel defense framework that leverages quantized gradient patterns to identify harmful prompts efficiently. 
Our key insight is that when generating identical responses like “Sure”, LLMs exhibit distinctly different internal gradient patterns for safe versus harmful prompts, reflecting conflicts with safety training. By capturing these patterns through selective gradient masking and quantization, SafeQuant significantly outperforms existing defenses across multiple benchmarks while maintaining model utility. The method demonstrates particular effectiveness against sophisticated attacks like WordGame prompts and persuasive adversarial attacks, achieving an F1-score of 0.80 on the WordGame dataset and outperforming state-of-the-art (SoTA) methods like GradSafe by an absolute margin of 57%. @@ -1605,11 +1605,11 @@ Exploring Large Language Models for Effective Rumor Detection on Social Media - YirongZeng - XiaoDing + YirongZeng + XiaoDing BiboCaiHarbin Institute of Technology TingLiuHarbin Institute of Technology - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 2537-2552 In this paper, we explore using Large Language Models (LLMs) for rumor detection on social media. It involves assessing the veracity of claims on social media based on social context (e.g., comments, propagation patterns). LLMs, despite their impressive capabilities in text-based reasoning tasks, struggle to achieve promising rumor detection performance when facing long structured social contexts. Our preliminary analysis shows that large-scale contexts hinder LLMs’ reasoning abilities, while moderate contexts perform better for LLMs, highlighting the need for refined contexts. Accordingly, we propose a semantic-propagation collaboration-based framework that integrates small language models (e.g., graph attention network) with LLMs for effective rumor detection. It models contexts by enabling text semantics and propagation patterns to collaborate through graph attention mechanisms, and reconstructs the context by aggregating attention values during inference. Also, a cluster-based unsupervised method to refine context is proposed for generalization. Extensive experiments demonstrate the effectiveness of the proposed methods in rumor detection. This work bridges the gap for LLMs in facing long, structured data and offers a novel solution for rumor detection on social media. 2025.naacl-long.128 @@ -1617,9 +1617,9 @@ No Simple Answer to Data Complexity: An Examination of Instance-Level Complexity Metrics for Classification Tasks - Ryan A.Cook + Ryan A.Cook John P.LalorUniversity of Notre Dame - AhmedAbbasiUniversity of Notre Dame + AhmedAbbasiUniversity of Notre Dame 2553-2573 Natural Language Processing research has become increasingly concerned with understanding data quality and complexity at the instance level. Instance-level complexity scores can be used for tasks such as filtering out noisy observations and subsampling informative examples. However, there exists a diverse taxonomy of complexity metrics that can be used for a classification task, making metric selection itself a difficult task. We empirically examine the relationship between these metrics and find that simply storing training loss provides similar complexity rankings as other more computationally intensive techniques. Metric similarity allows us to subsample data with higher aggregate complexity along several metrics using a single a priori available meta-feature. Further, this choice of complexity metric does not impact demographic fairness, even in downstream predictions.
Researchers should consider metric availability and similarity, as using the wrong metric or sampling strategy may hurt performance. 2025.naacl-long.129 @@ -1639,10 +1639,10 @@ ThibaudLetenoUniversité Jean Monnet IrinaProskurina AntoineGourruUniversité Jean Monnet - JulienVelcinERIC + JulienVelcinERIC CharlotteLaclauTélecom Paris GuillaumeMetzlerUniversité Lumiére (Lyon II) - ChristopheGravierUniversity Jean Monnet + ChristopheGravierUniversity Jean Monnet 2590-2612 Aligning language models with human values is crucial, especially as they become more integrated into everyday life. While models are often adapted to user preferences, it is equally important to ensure they align with moral norms and behaviours in real-world social situations. Despite significant progress in languages like English and Chinese, French has seen little attention in this area, leaving a gap in understanding how LLMs handle moral reasoning in this language. To address this gap, we introduce HistoiresMorales, a French dataset derived from MoralStories, created through translation and subsequently refined with the assistance of native speakers to guarantee grammatical accuracy and adaptation to the French cultural context. We also rely on annotations of the moral values within the dataset to ensure their alignment with French norms. HistoiresMorales covers a wide range of social situations, including differences in tipping practices, expressions of honesty in relationships, and responsibilities toward animals. To foster future research, we also conduct preliminary experiments on the alignment of multilingual models on French and English data and the robustness of the alignment. We find that while LLMs are generally aligned with human moral norms by default, they can be easily influenced with user-preference optimization for both moral and immoral data. 2025.naacl-long.131 @@ -1653,7 +1653,7 @@ KwangheeChoi EunjungYeoCMU, Carnegie Mellon University KalvinChangSchool of Computer Science, Carnegie Mellon University - ShinjiWatanabeCarnegie Mellon University + ShinjiWatanabeCarnegie Mellon University David RMortensenCarnegie Mellon University 2613-2628 Allophony refers to the variation in the phonetic realization of a phoneme based on its phonetic environment. Modeling allophones is crucial for atypical pronunciation assessment, which involves distinguishing atypical from typical pronunciations. However, recent phoneme classifier-based approaches often simplify this by treating various realizations as a single phoneme, bypassing the complexity of modeling allophonic variation. Motivated by the acoustic modeling capabilities of frozen self-supervised speech model (S3M) features, we propose MixGoP, a novel approach that leverages Gaussian mixture models to model phoneme distributions with multiple subclusters. Our experiments show that MixGoP achieves state-of-the-art performance across four out of five datasets, including dysarthric and non-native speech. Our analysis further suggests that S3M features capture allophonic variation more effectively than MFCCs and Mel spectrograms, highlighting the benefits of integrating MixGoP with S3M features. 
@@ -1663,8 +1663,8 @@ <fixed-case>SAPIENT</fixed-case>: Mastering Multi-turn Conversational Recommendation with Strategic Planning and <fixed-case>M</fixed-case>onte <fixed-case>C</fixed-case>arlo Tree Search HanwenDu - BoPengGoogle - XiaNingOhio State University, Columbus + BoPengGoogle + XiaNingOhio State University, Columbus 2629-2648 Conversational Recommender Systems (CRS) proactively engage users in interactive dialogues to elicit user preferences and provide personalized recommendations. Existing methods train a Reinforcement Learning (RL)-based agent with a greedy action selection or sampling strategy, and may suffer from suboptimal conversational planning. To address this, we present a novel Monte Carlo Tree Search (MCTS)-based CRS framework SAPIENT. SAPIENT consists of a conversational agent (S-agent) and a conversational planner (S-planner). S-planner builds a conversational search tree with MCTS based on the initial actions proposed by S-agent to find conversation plans. The best conversation plans from S-planner are used to guide the training of S-agent, creating a self-training loop where S-agent can iteratively improve its capability for conversational planning. Furthermore, we propose an efficient variant SAPIENT for a trade-off between training efficiency and performance. Extensive experiments on four benchmark datasets validate the effectiveness of our approach, showing that SAPIENT outperforms the state-of-the-art baselines. Our code and data are accessible through https://github.com/ninglab/SAPIENT. 2025.naacl-long.133 @@ -1673,7 +1673,7 @@ Reliability of Topic Modeling KaylaSchroeder - ZachWood-DoughtyNorthwestern University + ZachWood-DoughtyNorthwestern University 2649-2662 Topic models allow researchers to extract latent factors from text data and use those variables in downstream statistical analyses. However, these methodologies can vary significantly due to initialization differences, randomness in sampling procedures, or noisy data. Reliability of these methods is of particular concern as many researchers treat learned topic models as ground truth for subsequent analyses. In this work, we show that the standard practice for quantifying topic model reliability fails to capture essential aspects of the variation in two widely-used topic models. Drawing from an extensive literature on measurement theory, we provide empirical and theoretical analyses of three other metrics for evaluating the reliability of topic models. On synthetic and real-world data, we show that McDonald’s ω provides the best encapsulation of reliability. This metric provides an essential tool for validation of topic model methodologies that should be a standard component of any topic model-based research. 2025.naacl-long.134 @@ -1682,7 +1682,7 @@ Style Transfer with Multi-iteration Preference Optimization ShuaiLiuUniversity of Southern California, Information Sciences Institute - JonathanMayUniversity of Southern California and USC/ISI + JonathanMayUniversity of Southern California and USC/ISI 2663-2681 Numerous recent techniques for text style transfer characterize their approaches as variants of reinforcement learning and preference optimization. In this work, we consider the relationship between these approaches and a class of optimization approaches developed primarily for (non-neural) statistical machine translation, formerly known as ‘tuning’.
Inspired by these techniques from the past, we improve upon established preference optimization approaches, incorporating multiple iterations of exploration and optimization, and choosing contrastive examples by following a ‘hope’ vs ‘fear’ sampling strategy. Cognizant of the difference between machine translation and style transfer, however, we further tailor our framework with a new pseudo-parallel data generation method and a dynamic weighted reward aggregation method to tackle the lack of parallel data and the need for a multi-objective reward. We evaluate our model on two commonly used text style transfer datasets. Through automatic and human evaluation results, we show the effectiveness and the superiority of our model compared to state-of-the-art baselines. 2025.naacl-long.135 @@ -1704,7 +1704,7 @@ <fixed-case>ALERT</fixed-case>: An <fixed-case>LLM</fixed-case>-powered Benchmark for Automatic Evaluation of Recommendation Explanations YichuanLi - XinyangZhangAmazon + XinyangZhangAmazon ChenweiZhangAmazon MaoLiAmazon TianyiLiuAmazon YifanGaoAmazon KyuminLeeWorcester Polytechnic Institute KaizeDingNorthwestern University - ZhengyangWangAmazon + ZhengyangWangAmazon ZhihanZhang JingboShangUniversity of California, San Diego XianLiAmazon @@ -1726,12 +1726,12 @@ <fixed-case>DETQUS</fixed-case>: Decomposition-Enhanced Transformers for <fixed-case>QU</fixed-case>ery-focused Summarization YasirKhan XinleiWu - SangpilYoumUniversity of Florida + SangpilYoumUniversity of Florida JustinHo Aryaan MehboobShaikh JairoGarciga RohanSharmaUniversity of Florida - Bonnie JDorrUniversity of Florida + Bonnie JDorrUniversity of Florida 2720-2731 Query-focused tabular summarization is an emerging task in table-to-text generation that synthesizes a summary response from tabular data based on user queries. Traditional transformer-based approaches face challenges due to token limitations and the complexity of reasoning over large tables. To address these challenges, we introduce DETQUS (Decomposition-Enhanced Transformers for QUery-focused Summarization), a system designed to improve summarization accuracy by leveraging tabular decomposition alongside a fine-tuned encoder-decoder model. DETQUS employs a large language model to selectively reduce table size, retaining only query-relevant columns while preserving essential information. This strategy enables more efficient processing of large tables and enhances summary quality. Our approach, equipped with the table-based QA model Omnitab, achieves a ROUGE-L score of 0.4437, outperforming the previous state-of-the-art REFACTOR model (ROUGE-L: 0.422). These results highlight DETQUS as a scalable and effective solution for query-focused tabular summarization, offering a structured alternative to more complex architectures.
2025.naacl-long.138 @@ -1739,29 +1739,29 @@ <fixed-case>I</fixed-case>roko<fixed-case>B</fixed-case>ench: A New Benchmark for <fixed-case>A</fixed-case>frican Languages in the Age of Large Language Models - David IfeoluwaAdelaniMcGill University + David IfeoluwaAdelaniMcGill University JessicaOjoLelapa AI - Israel AbebeAzime + Israel AbebeAzime Jian YunZhuang - Jesujoba OluwadaraAlabiUniversität des Saarlandes + Jesujoba OluwadaraAlabiUniversität des Saarlandes XuanliHeUniversity College London, University of London - MillicentOchiengMicrosoft + MillicentOchiengMicrosoft SaraHookerCohere For AI - AndiswaBukula - En-Shiun AnnieLee - Chiamaka IjeomaChukwunekeNnamdi Azikiwe University + AndiswaBukula + En-Shiun AnnieLee + Chiamaka IjeomaChukwunekeNnamdi Azikiwe University HappyBuzaabaPrinceton University - Blessing KudzaisheSibanda + Blessing KudzaisheSibanda Godson KoffiKalipe JonathanMukiibi - SalomonKabongo KabenamualuTIB/L3S - FoutseYuehgoh - MmasibidiSetaka + SalomonKabongo KabenamualuTIB/L3S + FoutseYuehgoh + MmasibidiSetaka LolwethuNdolela - NkirukaOduAfrican University of Science and Technology - RooweitherMabuyaNorth-West University - SalomeyOsei - Shamsuddeen HassanMuhammadImperial College London and Bayero University, Kano-Nigeria + NkirukaOduAfrican University of Science and Technology + RooweitherMabuyaNorth-West University + SalomeyOsei + Shamsuddeen HassanMuhammadImperial College London and Bayero University, Kano-Nigeria SokharSamb Tadesse KebedeGugeHaramaya University Tombekai VangoniSherman @@ -1773,9 +1773,9 @@ The Impact of Domain-Specific Terminology on Machine Translation for Finance in <fixed-case>E</fixed-case>uropean Languages - ArturoOncevayJ.P. Morgan Chase - ChareseSmileyJ.P. Morgan Chase - XiaomoLiuJP Morgan AI Research + ArturoOncevayJ.P. Morgan Chase + ChareseSmileyJ.P. Morgan Chase + XiaomoLiuJP Morgan AI Research 2758-2775 Domain-specific machine translation (MT) poses significant challenges due to specialized terminology, particularly when translating across multiple languages with scarce resources. In this study, we present the first impact analysis of domain-specific terminology on multilingual MT for finance, focusing on European languages within the subdomain of macroeconomics. To this end, we construct a multi-parallel corpus from the European Central Bank, aligned across 22 languages. Using this resource, we compare open-source multilingual MT systems with large language models (LLMs) that possess multilingual capabilities. Furthermore, by developing and curating an English financial glossary, we propose a methodology to analyze the relationship between translation performance (into English) and the accuracy of financial term matching, obtaining significant correlation results. Finally, using the multi-parallel corpus and the English glossary, we automatically align a multilingual financial terminology, validating the English-Spanish alignments and incorporating them into our discussion. Our findings provide valuable insights into the current state of financial MT for European languages and offer resources for future research and system improvements. 
2025.naacl-long.140 @@ -1783,12 +1783,12 @@ Benchmarking Language Model Creativity: A Case Study on Code Generation - YiningLuUniversity of Notre Dame + YiningLuUniversity of Notre Dame DixuanWang TianjianLiJohns Hopkins University DongweiJiang SanjeevKhudanpurWhiting School of Engineering - MengJiangUniversity of Notre Dame + MengJiangUniversity of Notre Dame DanielKhashabiJohns Hopkins University 2776-2794 As LLMs become increasingly prevalent, it is interesting to consider how “creative” these models can be. From cognitive science, creativity consists of at least two key characteristics: convergent thinking (purposefulness to achieve a given goal) and divergent thinking (adaptability to explore new environments or constraints) (CITATION). In this work, we introduce a framework for quantifying LLM creativity that incorporates the two design ingredients: (1) We introduce DENIAL PROMPTING which pushes LLMs to develop more creative solutions to a given problem by incrementally imposing new constraints on the previous solution, compelling LLMs to adopt new strategies. (2) We define NEOGAUGE, a metric that quantifies both convergent and divergent thinking in the generated creative responses by LLMs. We test the proposed framework on Codeforces problems, which serve as both a natural dataset for coding tasks and a collection of prior human solutions. We quantify NEOGAUGE for various proprietary and open-source models and find that even the most creative model, GPT-4, still falls short of demonstrating human-like creativity. We also experiment with advanced reasoning strategies (MCTS, self-correction, etc.) and observe no significant improvement in creativity. As a by-product of our analysis, we release NEOCODER dataset for reproducing our results on future models. @@ -1799,10 +1799,10 @@ Have <fixed-case>LLM</fixed-case>s Reopened the Pandora’s Box of <fixed-case>AI</fixed-case>-Generated Fake News? XinyuWangPennsylvania State University WenboZhang - SaiKoneruPennsylvania State University + SaiKoneruPennsylvania State University HangzhiGuoPennsylvania State University BonamMingolePennsylvania State University - S. ShyamSundarPennsylvania State University + S. ShyamSundarPennsylvania State University SarahRajtmajerPennsylvania State University AmulyaYadavPennsylvania State University 2795-2811 @@ -1813,8 +1813,8 @@ Probe-Free Low-Rank Activation Intervention ChongheJiang - BaoNguyen - Anthony Man-ChoSoThe Chinese University of Hong Kong + BaoNguyen + Anthony Man-ChoSoThe Chinese University of Hong Kong Viet AnhNguyenThe Chinese University of Hong Kong 2812-2824 Language models (LMs) can produce texts that appear accurate and coherent but contain untruthful or toxic content. Inference-time interventions that edit the hidden activations have shown promising results in steering the LMs towards desirable generations. Existing activation intervention methods often comprise an activation probe to detect undesirable generation, triggering the activation modification to steer subsequent generation. This paper proposes a probe-free intervention method FLORAIN for all attention heads in a specific activation layer. It eliminates the need to train classifiers for probing purposes. The intervention function is parametrized by a sample-wise nonlinear low-rank mapping, which is trained by minimizing the distance between the modified activations and their projection onto the manifold of desirable content. 
Under specific constructions of the manifold and projection distance, we show that the intervention strategy can be computed efficiently by solving a smooth optimization problem. The empirical results, benchmarked on multiple base models, demonstrate that FLORAIN consistently outperforms several baseline methods in enhancing model truthfulness and quality across generation and multiple-choice tasks. Our implementation can be found at https://github.com/nguyenngocbaocmt02/EFI. @@ -1837,7 +1837,7 @@ JuliusCheng MaikeZüfleKarlsruher Institut für Technologie VilémZouharDepartment of Computer Science, ETHZ - ETH Zurich - AndreasVlachosUniversity of Cambridge + AndreasVlachosUniversity of Cambridge 2849-2862 Reranking, or scoring a list of prediction candidates from a machine translation system with an external scoring model and returning the highest-scoring candidate, remains a simple and effective method for improving prediction quality. However, reranking with high quality scoring models can add substantial computational cost to the translation pipeline, which we address in this work by framing list reranking as a Bayesian optimization (BayesOpt) problem over the candidate list, where unknown scores are modeled with a Gaussian process. This algorithm scores candidates iteratively, choosing next candidates by balancing between exploration, choosing to score those that differ from candidates already scored, and exploitation, choosing to score those that resemble high-scoring candidates.This procedure finds high-scoring candidates while scoring only a fraction of the candidates list; given candidate lists of 200 random samples (before deduplication), our method achieves the same CometKiwi score using only 70 scoring evaluations on average compared to scoring a random subset of 180 candidates. We also propose multi-fidelity BayesOpt for list reranking, where scores obtained from a noisier but cheaper proxy scoring model are incorporated into the search process. We show that well-trained distilled proxy scorers can further improve the performance of BayesOpt. 2025.naacl-long.145 @@ -1857,8 +1857,8 @@ PengLu IvanKobyzevHuawei Noah’s Ark Lab MehdiRezagholizadehAdvanced Micro Devices - BoxingChenHuawei Technologies Ltd. - PhilippeLanglaisUniversité de Montréal + BoxingChenHuawei Technologies Ltd. + PhilippeLanglaisUniversité de Montréal 2884-2898 Recent advancements in Large Language Models (LLMs) have set themselves apart with their exceptional performance in complex language modelling tasks. However, these models are also known for their significant computational and storage requirements, primarily due to the quadratic computation complexity of softmax attention. To mitigate this issue, linear attention has been designed to reduce the quadratic space-time complexity that is inherent in standard transformers. In this work, we embarked on a comprehensive exploration of three key components that substantially impact the performance of the Gated Linear Attention module: feature maps, normalization, and the gating mechanism. We developed a feature mapping function to address some crucial issues that previous suggestions overlooked. Then we offered further rationale for the integration of normalization layers to stabilize the training process. Moreover, we explored the saturation phenomenon of the gating mechanism and augmented it with a refining module. 
We conducted extensive experiments and showed that our architecture outperforms previous Gated Linear Attention mechanisms in extensive tasks including training from scratch and post-linearization with continual pre-training. 2025.naacl-long.147 @@ -1867,7 +1867,7 @@ Intrinsic Bias is Predicted by Pretraining Data and Correlates with Downstream Performance in Vision-Language Encoders KshitishGhate - IsaacSlaughterUniversity of Washington + IsaacSlaughterUniversity of Washington KyraWilsonUniversity of Washington Mona T.DiabCarnegie Mellon University AylinCaliskanUniversity of Washington @@ -1878,7 +1878,7 @@ Benchmarking Failures in Tool-Augmented Language Models - EduardoTreviño + EduardoTreviño HugoContant JamesNgai GrahamNeubigCarnegie Mellon University @@ -1891,7 +1891,7 @@ Entity Decomposition with Filtering: A Zero-Shot Clinical Named Entity Recognition Framework RezaAverly, Ohio State University, Columbus - XiaNingOhio State University, Columbus + XiaNingOhio State University, Columbus 2935-2951 Clinical named entity recognition (NER) aims to retrieve important entities within clinical narratives. Recent works have demonstrated that large language models (LLMs) can achieve strong performance in this task. While previous works focus on proprietary LLMs, we investigate how open NER LLMs, trained specifically for entity recognition, perform in clinical NER. Our initial experiment reveals a significant contrast in performance for some clinical entities and how a simple exploitation of entity types can alleviate this issue. In this paper, we introduce a novel framework, entity decomposition with filtering, or EDF. Our key idea is to decompose the entity recognition task into several retrievals of entity sub-types and then filter them. Our experimental results demonstrate the efficacy of our framework and the improvements across all metrics, models, datasets, and entity types. Our analysis also reveals substantial improvement in recognizing previously missed entities using entity decomposition. We further provide a comprehensive evaluation of our framework and an in-depth error analysis to pave the way for future work. 2025.naacl-long.150 @@ -1902,14 +1902,14 @@ ShenglaiZengMichigan State University JiankunZhang BinghengLiMichigan State University - YupingLinMichigan State University + YupingLinMichigan State University TianqiZhengAmazon DanteEveraertAmazon HanqingLuAmazon HuiLiuAmazon - HuiLiu + HuiLiu YueXingMichigan State University - Monica XiaoChengAmazon + Monica XiaoChengAmazon JiliangTangMichigan State University 2952-2969 Retrieval-Augmented Generation (RAG) systems have shown promise in enhancing the performance of Large Language Models (LLMs). However, these systems face challenges in effectively integrating external knowledge with the LLM’s internal knowledge, often leading to issues with misleading or unhelpful information. This work aims to provide a systematic study on knowledge checking in RAG systems. We conduct a comprehensive analysis of LLM representation behaviors and demonstrate the significance of using representations in knowledge checking. Motivated by the findings, we further develop representation-based classifiers for knowledge filtering. We show substantial improvements in RAG performance, even when dealing with noisy knowledge databases. Our study provides new insights into leveraging LLM representations for enhancing the reliability and effectiveness of RAG systems.
@@ -1920,8 +1920,8 @@ The Power of Many: Multi-Agent Multimodal Models for Cultural Image Captioning LongjuBai AnganaBorah - OanaIgnatSanta Clara University - RadaMihalceaUniversity of Michigan + OanaIgnatSanta Clara University + RadaMihalceaUniversity of Michigan 2970-2993 Large Multimodal Models (LMMs) exhibit impressive performance across various multimodal tasks. However, their effectiveness in cross-cultural contexts remains limited due to the predominantly Western-centric nature of most data and models. Conversely, multi-agent models have shown significant capability in solving complex tasks. Our study evaluates the collective performance of LMMs in a multi-agent interaction setting for the novel task of cultural image captioning. Our contributions are as follows: (1) We introduce MosAIC, a Multi-Agent framework to enhance cross-cultural Image Captioning using LMMs with distinct cultural personas; (2) We provide a dataset of culturally enriched image captions in English for images from China, India, and Romania across three datasets: GeoDE, GD-VCR, CVQA; (3) We propose a culture-adaptable metric for evaluating cultural information within image captions; and (4) We show that the multi-agent interaction outperforms single-agent models across different metrics, and offer valuable insights for future research. 2025.naacl-long.152 @@ -1930,8 +1930,8 @@ Prepending or Cross-Attention for Speech-to-Text? An Empirical Comparison Tsz KinLamUniversity of Edinburgh, University of Edinburgh - MarcoGaidoFondazione Bruno Kessler - SaraPapiFondazione Bruno Kessler + MarcoGaidoFondazione Bruno Kessler + SaraPapiFondazione Bruno Kessler LuisaBentivogliFondazione Bruno Kessler BarryHaddowUniversity of Edinburgh 2994-3006 @@ -1941,8 +1941,8 @@ <fixed-case>CORRECT</fixed-case>: Context- and Reference-Augmented Reasoning and Prompting for Fact-Checking - Delvin CeZhangThe Pennsylvania State University - DongwonLeeThe Pennsylvania State University + Delvin CeZhangThe Pennsylvania State University + DongwonLeeThe Pennsylvania State University 3007-3019 Fact-checking the truthfulness of claims usually requires reasoning over multiple evidence sentences. Oftentimes, evidence sentences may not be always self-contained, and may require additional contexts and references from elsewhere to understand coreferential expressions, acronyms, and the scope of a reported finding. For example, evidence sentences from an academic paper may need contextual sentences in the paper and descriptions in its cited papers to determine the scope of a research discovery. However, most fact-checking models mainly focus on the reasoning within evidence sentences, and ignore the auxiliary contexts and references. To address this problem, we propose a novel method, Context- and Reference-augmented Reasoning and Prompting. For evidence reasoning, we construct a three-layer evidence graph with evidence, context, and reference layers. We design intra- and cross-layer reasoning to integrate three graph layers into a unified evidence embedding. For verdict prediction, we design evidence-conditioned prompt encoder, which produces unique prompt embeddings for each claim. These evidence-conditioned prompt embeddings and claims are unified for fact-checking. Experiments verify the strength of our model. 
2025.naacl-long.154 @@ -1961,12 +1961,12 @@ <fixed-case>DREAM</fixed-case>: Improving Video-Text Retrieval Through Relevance-Based Augmentation Using Large Foundation Models YimuWangUniversity of Waterloo - ShuaiYuanTikTok - BoXue + ShuaiYuanTikTok + BoXue XiangruJianServiceNow Research and University of Waterloo WeiPangVector Institute MushiWang - NingYuNetflix Eyeline Studios + NingYuNetflix Eyeline Studios 3037-3056 Recent progress in video-text retrieval has been driven largely by advancements in model architectures and training strategies. However, the representation learning capabilities of video-text retrieval models remain constrained by low-quality and limited training data annotations. To address this issue, we present a novel Video-Text Retrieval Paradigm with Relevance-based Augmentation, namely dReAm, which enhances video and text data using large foundation models to learn more generalized features. Specifically, we first adopt a simple augmentation method, which generates self-similar data by randomly duplicating or dropping subwords and frames. In addition, inspired by the recent advancement in visual and language generative models, we propose a more robust augmentation method through textual paraphrasing and video stylization using large language models (LLMs) and visual generative models (VGMs). To further enrich video and text information, we propose a relevance-based augmentation method, where LLMs and VGMs generate and integrate new relevant information into the original data. Leveraging this enriched data, extensive experiments on several video-text retrieval benchmarks demonstrate the superiority of dReAm over existing methods. Code will be available upon acceptance. 2025.naacl-long.156 @@ -1976,12 +1976,12 @@ <fixed-case>T</fixed-case>o<fixed-case>W</fixed-case>: Thoughts of Words Improve Reasoning in Large Language Models ZhikunXuArizona State University MingShen - JacobDineen + JacobDineen ZhaonanLiArizona State University XiaoYeArizona State University ShijieLuArizona State University AswinRrv - ChittaBaralArizona State University + ChittaBaralArizona State University BenZhouArizona State University 3057-3075 We introduce thoughts of words (ToW), a novel training-time data-augmentation method for next-word prediction. ToW views next-word prediction as a core reasoning task and injects fine-grained thoughts explaining what the next word should be and how it is related to the previous contexts in pre-training texts. Our formulation addresses two fundamental drawbacks of existing next-word prediction learning schemes: they induce factual hallucination and are inefficient for models to learn the implicit reasoning processes in raw texts. While there are many ways to acquire such thoughts of words, we explore the first step of acquiring ToW annotations through distilling from larger models. After continual pre-training with only 70K ToW annotations, we effectively improve models’ reasoning performances by 7% to 9% on average and reduce model hallucination by up to 10%. At the same time, ToW is entirely agnostic to tasks and applications, introducing no additional biases on labels or semantics. 
@@ -2001,7 +2001,7 @@ <fixed-case>ERAS</fixed-case>: Evaluating the Robustness of <fixed-case>C</fixed-case>hinese <fixed-case>NLP</fixed-case> Models to Morphological Garden Path Errors - QinchanLiSimon Fraser University + QinchanLiSimon Fraser University SophieHao 3100-3111 In languages without orthographic word boundaries, NLP models perform _word segmentation_, either as an explicit preprocessing step or as an implicit step in an end-to-end computation. This paper shows that Chinese NLP models are vulnerable to _morphological garden path errors_—errors caused by a failure to resolve local word segmentation ambiguities using sentence-level morphosyntactic context. We propose a benchmark, _ERAS_, that tests a model’s vulnerability to morphological garden path errors by comparing its behavior on sentences with and without local segmentation ambiguities. Using ERAS, we show that word segmentation models make morphological garden path errors on locally ambiguous sentences, but do not make equivalent errors on unambiguous sentences. We further show that sentiment analysis models with character-level tokenization make implicit garden path errors, even without an explicit word segmentation step in the pipeline. Our results indicate that models’ segmentation of Chinese text often fails to account for morphosyntactic context. @@ -2032,11 +2032,11 @@ Specializing Large Language Models to Simulate Survey Response Distributions for Global Populations YongCao - HaijiangLiu + HaijiangLiu ArnavAroraUniversity of Copenhagen - IsabelleAugensteinUniversity of Copenhagen - PaulRöttgerBocconi University - DanielHershcovichUniversity of Copenhagen + IsabelleAugensteinUniversity of Copenhagen + PaulRöttgerBocconi University + DanielHershcovichUniversity of Copenhagen 3141-3154 Large-scale surveys are essential tools for informing social science research and policy, but running surveys is costly and time-intensive. If we could accurately simulate group-level survey results, this would therefore be very valuable to social science research. Prior work has explored the use of large language models (LLMs) for simulating human behaviors, mostly through prompting. In this paper, we are the first to specialize LLMs for the task of simulating survey response distributions. As a testbed, we use country-level results from two global cultural surveys. We devise a fine-tuning method based on first-token probabilities to minimize divergence between predicted and actual response distributions for a given question. Then, we show that this method substantially outperforms other methods and zero-shot classifiers, even on unseen questions, countries, and a completely unseen survey. While even our best models struggle with the task, especially on unseen questions, our results demonstrate the benefits of specialization for simulation, which may accelerate progress towards sufficiently accurate simulation in the future. 
2025.naacl-long.162 @@ -2073,10 +2073,10 @@ Multimodal Needle in a Haystack: Benchmarking Long-Context Capability of Multimodal Large Language Models HengyiWang - HaizhouShiRutgers University, New Brunswick + HaizhouShiRutgers University, New Brunswick ShiweiTanRutgers University WeiyiQinRutgers University - WenyuanWangRutgers University + WenyuanWangRutgers University TunyuZhang AkshayNambiMicrosoft Research TanujaGanuMicrosoft @@ -2090,29 +2090,29 @@ <fixed-case>W</fixed-case>orld<fixed-case>C</fixed-case>uisines: A Massive-Scale Benchmark for Multilingual and Multicultural Visual Question Answering on Global Cuisines Genta IndraWinataCapital One FrederikusHudi - Patrick AmadeusIrawan + Patrick AmadeusIrawan DavidAnugraha - Rifki AfinaPutriUniversitas Gadjah Mada + Rifki AfinaPutriUniversitas Gadjah Mada WangYutong - AdamNohejlNara Institute of Science and Technology, Japan + AdamNohejlNara Institute of Science and Technology, Japan Ubaidillah AriqPrathama NedjmaOusidhoumCardiff University AfifaAmrianiNo Institution AnarRzayev - AnirbanDasCapital One + AnirbanDasCapital One AshmariPramodya AuliaAdila - BryanWilie - Candy OliviaMawalimJapan Advanced Institute of Science and Technology, Tokyo Institute of Technology + BryanWilie + Candy OliviaMawalimJapan Advanced Institute of Science and Technology, Tokyo Institute of Technology Cheng ChingLamSingapore Management University DaudAbolade - EmmanueleChersoniThe Hong Kong Polytechnic University + EmmanueleChersoniThe Hong Kong Polytechnic University EnricoSantus FarizIkhwantri GarryKuwantoBoston University, Boston University HanyangZhaoColumbia University Haryo AkbariantoWibowoMohamed bin Zayed University of Artificial Intelligence - HolyLoveniaAI Singapore + HolyLoveniaAI Singapore Jan Christian BlaiseCruzMohamed bin Zayed University of Artificial Intelligence Jan Wira GotamaPutra JunhoMyungKorea Advanced Institute of Science and Technology @@ -2125,20 +2125,20 @@ PeeratLimkonchotiwatAI Singapore RajDabreDepartment of Computer Science, Indian Institute of Technology, Madras, Indian Institute of Technology, Madras and National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology Rio AlexanderAudino - SamuelCahyawijayaCohere + SamuelCahyawijayaCohere Shi-XiongZhangCapitalOne Stephanie YuliaSalim - YiZhouCardiff University + YiZhouCardiff University YinxuanGuiFudan University - David IfeoluwaAdelaniMcGill University - En-Shiun AnnieLee - ShogoOkada - AyuPurwariantiInstitut Teknologi Bandung + David IfeoluwaAdelaniMcGill University + En-Shiun AnnieLee + ShogoOkada + AyuPurwariantiInstitut Teknologi Bandung Alham FikriAjiMohamed bin Zayed University of Artificial Intelligence - TaroWatanabeNara Institute of Science and Technology, Japan - Derry TantiWijayaMonash University and Boston University + TaroWatanabeNara Institute of Science and Technology, Japan + Derry TantiWijayaMonash University and Boston University AliceOhKorea Advanced Institute of Science and Technology - Chong-WahNgoSingapore Management University + Chong-WahNgoSingapore Management University 3242-3264 Vision Language Models (VLMs) often struggle with culture-specific knowledge, particularly in languages other than English and in underrepresented cultural contexts. To evaluate their understanding of such knowledge, we introduce WorldCuisines, a massive-scale benchmark for multilingual and multicultural, visually grounded language understanding. 
This benchmark includes a visual question answering (VQA) dataset with text-image pairs across 30 languages and dialects, spanning 9 language families and featuring over 1 million data points, making it the largest multicultural VQA benchmark to date. It includes tasks for identifying dish names and their origins. We provide evaluation datasets in two sizes (12k and 60k instances) alongside a training dataset (1 million instances). Our findings show that while VLMs perform better with correct location context, they struggle with adversarial contexts and predicting specific regional cuisines and languages. To support future research, we release a knowledge base with annotated food entries and images along with the VQA data. 2025.naacl-long.167 @@ -2147,12 +2147,12 @@ Extracting and Understanding the Superficial Knowledge in Alignment RunjinChen - Gabriel JacobPerin + Gabriel JacobPerin XuxiChenUniversity of Texas at Austin XilunChenArizona State University - YanHanAmazon - Nina S. T.HirataUniversidade de São Paulo - JunyuanHongUniversity of Texas at Austin + YanHanAmazon + Nina S. T.HirataUniversidade de São Paulo + JunyuanHongUniversity of Texas at Austin BhavyaKailkhuraLawrence Livermore National Laboratory 3265-3280 Alignment of large language models (LLMs) with human values and preferences, often achieved through fine-tuning based on human feedback, is essential for ensuring safe and responsible AI behaviors. However, the process typically requires substantial data and computation resources. Recent studies have revealed that alignment might be attainable at lower costs through simpler methods, such as in-context learning. This leads to the question: Is alignment predominantly superficial? In this paper, we delve into this question and provide a quantitative analysis. We formalize the concept of superficial knowledge, defining it as knowledge that can be acquired through easy token restyling, without affecting the model’s ability to capture underlying causal relationships between tokens. We propose a method to extract and isolate this superficial knowledge from aligned models, focusing on the shallow modifications to the final token selection process. By comparing models augmented only with superficial knowledge to fully aligned models, we quantify the superficial portion of alignment. Our findings reveal that while superficial knowledge constitutes a significant portion of alignment, particularly in safety and detoxification tasks, it is not the whole story. Tasks requiring reasoning and contextual understanding still rely on deeper knowledge. Additionally, we demonstrate two practical advantages of isolated superficial knowledge: (1) it can be transferred between models, enabling efficient offsite alignment of larger models using extracted superficial knowledge from smaller models, and (2) it is recoverable, allowing for the restoration of alignment in compromised models without sacrificing performance. @@ -2163,7 +2163,7 @@ Smurfs: Multi-Agent System using Context-Efficient <fixed-case>DFSDT</fixed-case> for Tool Planning JunzhiChen JuhaoLiang - BenyouWangThe Chinese University of Hong Kong, Shenzhen + BenyouWangThe Chinese University of Hong Kong, Shenzhen 3281-3298 Teaching large language models (LLMs) to use tools for solving complex problems can grant them human-like reasoning abilities. ReAct and its variants are popular frameworks for tool use in both single-agent and multi-agent systems.
To address issues like error propagation and limited exploration in ReAct, the Deep First Search Decision Tree (DFSDT) was proposed, but it faces challenges such as rollback instability, redundant context, and premature termination in single-agent settings. We introduce “Smurfs,” a novel multi-agent system (MAS) that enhances DFSDT with a modular, context-efficient, and training-free design. Smurfs surpasses baseline methods in both the open-ended StableToolBench and the closed-ended HotpotQA tasks, reducing token usage by 60.9% compared to DFSDT and enabling Mistral-7b to perform on par with GPT-4-DFSDT. Extensive ablation studies confirm the effectiveness of Smurfs’ core components, offering valuable insights for the construction and interpretation of MAS, and paving the way for future exploration. We release the code at https://github.com/FreedomIntelligence/Smurfs. 2025.naacl-long.169 @@ -2174,8 +2174,8 @@ NanXuUniversity of Southern California FeiWangUniversity of Southern California ShengZhangMicrosoft - HoifungPoonMicrosoft - MuhaoChenUniversity of California, Davis and University of Southern California + HoifungPoonMicrosoft + MuhaoChenUniversity of California, Davis and University of Southern California 3299-3324 Motivated by in-context learning (ICL) capabilities of Large Language Models (LLMs), multimodal LLMs with additional visual modality are also exhibited with similar ICL abilities when multiple image-text pairs are provided as demonstrations. However, relatively less work has been done to investigate the principles behind how and why multimodal ICL works. We conduct a systematic and principled evaluation of multimodal ICL for models of different scales on a broad spectrum of new yet critical tasks. Through perturbations over different modality information, we show that modalities matter differently across tasks in multimodal ICL. Guided by task-specific modality impact, we recommend modality-driven demonstration strategies to boost ICL performance. We also find that models may follow inductive biases from multimodal ICL even if they are rarely seen in or contradict semantic priors from pretraining data. Our principled analysis provides a comprehensive way of understanding the role of demonstrations in multimodal in-context learning, and sheds light on effectively improving multimodal ICL on a wide range of tasks. 2025.naacl-long.170 @@ -2186,7 +2186,7 @@ TianjianLiJohns Hopkins University HaoranXu WeitingTan - KentonMurrayJohns Hopkins University + KentonMurrayJohns Hopkins University DanielKhashabiJohns Hopkins University 3325-3343 Data abundance across different domains exhibits a long-tailed distribution: few domains have abundant data, while most face data scarcity. Our work focuses on a multilingual setting, where available data is heavily skewed toward high-resource languages, creating significant imbalances in training data sizes across languages. This disparity challenges training language models to perform uniformly well in all languages. Two common strategies to address this issue are upsampling low-resource languages (Temperature Sampling) and upweighting their loss functions (Scalarization). These methods are often assumed to be equivalent, but this equivalence has not been rigorously established, prompting our investigation.Through theoretical and empirical analysis, we identify when these two methods are equivalent and when they diverge. 
We prove that they are equivalent under full gradient descent but differ under stochastic gradient descent due to differences in gradient variance. Specifically, Temperature Sampling exhibits lower variance in gradient estimation, leading to faster convergence but a higher risk of overfitting. Based on these insights, we propose Cooldown, a strategy that starts by heavily upsampling low-resource languages to accelerate convergence and gradually reduces the upsampling to prevent overfitting—achieving the best of both worlds. Our method competes effectively with existing data re-weighting techniques while offering computational efficiency. @@ -2216,9 +2216,9 @@ <fixed-case>W</fixed-case>hen2<fixed-case>C</fixed-case>all: When (not) to Call Tools - HayleyRossHarvard University, Harvard University + HayleyRossHarvard University, Harvard University Ameya SunilMahabaleshwarkarNVIDIA - YoshiSuharaNVIDIA + YoshiSuharaNVIDIA 3391-3409 Leveraging external tools is a key feature for modern Language Models (LMs) to expand their capabilities and integrate them into existing systems. However, existing benchmarks primarily focus on the accuracy of tool calling—whether the correct tool is called with the correct parameters—and less on evaluating when LMs should (not) call tools. We develop a new benchmark, When2Call, which evaluates tool-calling decision-making: when to generate a tool call, when to ask follow-up questions and when to admit the question can’t be answered with the tools provided. We find that state-of-the-art tool-calling LMs show significant room for improvement on When2Call, indicating the importance of this benchmark. We also develop a training set for When2Call and leverage the multiple-choice nature of the benchmark to develop a preference optimization training regime, which shows considerably more improvement than traditional fine-tuning. We release the benchmark and training data as well as evaluation scripts. 2025.naacl-long.174 @@ -2247,7 +2247,7 @@ Who Relies More on World Knowledge and Bias for Syntactic Ambiguity Resolution: Humans or <fixed-case>LLM</fixed-case>s? - So YoungLeeMiami University + So YoungLeeMiami University RussellScheinberg AmberShorePortland State University AmeetaAgrawalPortland State University @@ -2262,7 +2262,7 @@ JingxuanTu BingyangYeBrandeis University XinruiHu - NianwenXueBrandeis University + NianwenXueBrandeis University JamesPustejovskyBrandeis University 3499-3513 Cross-Document Event Coreference (CDEC) annotation is challenging and difficult to scale, resulting in existing datasets being small and lacking diversity. We introduce a new approach leveraging large language models (LLMs) to decontextualize event mentions, by simplifying the document-level annotation task to sentence pairs with enriched context, enabling the creation of Richer EventCorefBank (RECB), a denser and more expressive dataset annotated at faster speed. Decontextualization has been shown to improve annotation speed without compromising quality and to enhance model performance. Our baseline experiment indicates that systems trained on RECB achieve comparable results on the EventCorefBank(ECB+) test set, showing the high quality of our dataset and its generalizability on other CDEC datasets. In addition, our evaluation shows that the strong baseline models are still struggling with RECB comparing to other CDEC datasets, suggesting that the richness and diversity of RECB present significant challenges to current CDEC systems. 
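For readers of the Temperature Sampling / Cooldown entry above, a minimal sketch of what temperature-based upsampling with a cooled temperature could look like; the function names, the linear schedule, and the corpus sizes are illustrative assumptions, not the authors' code:

    import numpy as np

    def sampling_probs(sizes, tau):
        # p_i is proportional to n_i^(1/tau): tau = 1 is proportional-to-size
        # sampling, larger tau flattens toward uniform, i.e. upsamples
        # low-resource languages.
        w = np.asarray(sizes, dtype=float) ** (1.0 / tau)
        return w / w.sum()

    def cooldown_tau(step, total_steps, tau_start=5.0, tau_end=1.0):
        # Linearly anneal the temperature, so upsampling is strongest early
        # (fast convergence) and fades out later (less overfitting).
        frac = min(step / max(total_steps, 1), 1.0)
        return tau_start + frac * (tau_end - tau_start)

    sizes = [1_000_000, 50_000, 2_000]  # tokens per language (hypothetical)
    for step in (0, 5_000, 10_000):
        print(step, sampling_probs(sizes, cooldown_tau(step, 10_000)).round(3))

With this schedule, the low-resource languages start heavily upsampled and drift back toward proportional sampling as training proceeds, which is the "best of both worlds" behavior the abstract describes.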
@@ -2283,7 +2283,7 @@ Beyond End-to-End <fixed-case>VLM</fixed-case>s: Leveraging Intermediate Text Representations for Superior Flowchart Understanding - JunyiYeNew Jersey Institute of Technology + JunyiYeNew Jersey Institute of Technology AnkanDash WenpengYinPennsylvania State University GuilingWangNew Jersey Institute of Technology @@ -2298,7 +2298,7 @@ CheyenneWing María XimenaJuárez Huerta ÁngelesMárquez Hernandez - FrancisTyers + FrancisTyers 3549-3562 The development of digital linguistic resources is essential for enhancing the inclusion of indigenous and marginalized languages in the digital domain. Indigenous languages of Mexico, despite representing vast typological diversity and millions of speakers, have largely been overlooked in NLP until recently. In this paper, we present a corpus of audio and annotated transcriptions of Western Sierra Puebla Nahuatl, an endangered variety of Nahuatl spoken in Puebla, Mexico. The data made available in this corpus are useful for ASR, spelling normalization, and word-level language identification. We detail the corpus-creation process, and describe experiments to report benchmark results for each of these important NLP tasks. The corpus audio and text is made freely available. 2025.naacl-long.181 @@ -2309,7 +2309,7 @@ HanjieChenRice University ZhouxiangFangJohns Hopkins University YashSinglaJohns Hopkins University and Johns Hopkins University - MarkDredzeDepartment of Computer Science, Whiting School of Engineering and Bloomberg + MarkDredzeDepartment of Computer Science, Whiting School of Engineering and Bloomberg 3563-3599 LLMs have demonstrated impressive performance in answering medical questions, such as achieving passing scores on medical licensing examinations. However, medical board exams or general clinical questions do not capture the complexity of realistic clinical cases. Moreover, the lack of reference explanations means we cannot easily evaluate the reasoning of model decisions, a crucial component of supporting doctors in making complex medical decisions. To address these challenges, we construct two new datasets: JAMA Clinical Challenge and Medbullets. JAMA Clinical Challenge consists of questions based on challenging clinical cases, while Medbullets comprises simulated clinical questions. Both datasets are structured as multiple-choice question-answering tasks, accompanied by expert-written explanations. We evaluate seven LLMs on the two datasets using various prompts. Experiments demonstrate that our datasets are harder than previous benchmarks. In-depth automatic and human evaluations of model-generated explanations provide insights into the promise and deficiency of LLMs for explainable medical QA. 2025.naacl-long.182 @@ -2319,7 +2319,7 @@ Unfamiliar Finetuning Examples Control How Language Models Hallucinate KatieKang EricWallaceOpenAI and University of California Berkeley - ClaireTomlinUC Berkeley + ClaireTomlinUC Berkeley AviralKumarCarnegie Mellon University and Google DeepMind SergeyLevineUniversity of California Berkeley 3600-3612 @@ -2332,7 +2332,7 @@ GuangyaWan YuqiWu JieChenUniversity of Alberta - ShengLiUniversity of Virginia, Charlottesville + ShengLiUniversity of Virginia, Charlottesville 3613-3635 Self-consistency mitigates hallucinations in Large Language Models (LLMs) by sampling multiple reasoning paths, but it lacks a systematic approach to determine the optimal number of samples or select the most faithful rationale. 
To address this limitation, we introduce Reasoning-Aware Self-Consistency (RASC), a novel framework that enhances sampling efficiency and reasoning faithfulness by dynamically evaluating both outputs and rationales. RASC assesses the quality of reasoning and the consistency of answers for each generated sample, using these assessments to guide early stopping decisions and rationale selection. The framework employs criteria-based stopping and weighted majority voting, enabling more informed choices on when to halt sampling and which rationale to select. Our comprehensive experiments across diverse question-answering datasets demonstrate that RASC outperforms existing methods, reducing sample usage by approximately 70% while maintaining accuracy. Moreover, RASC facilitates the selection of high-fidelity rationales, thereby improving the faithfulness of LLM outputs. Our approach effectively addresses the efficiency-accuracy trade-off in LLM reasoning tasks, offering a new perspective for more nuanced, faithful, and effective utilization of LLMs in resource-constrained environments. 2025.naacl-long.184 @@ -2344,8 +2344,8 @@ SharonScottP&G OllieLiuUniversity of Southern California Kelly L.Anderson - RickardStureborgDuke University - AmanTyagi + RickardStureborgDuke University + AmanTyagi BhuwanDhingraDuke University 3636-3655 Multimodal information extraction (MIE) is crucial for scientific literature, where valuable data is often spread across text, figures, and tables. In materials science, extracting structured information from research articles can accelerate the discovery of new materials. However, the multimodal nature and complex interconnections of scientific content present challenges for traditional text-based methods. We introduce MatViX, a benchmark consisting of 324 full-length research articles and 1,688 complex structured JSON files, carefully curated by domain experts in polymer nanocomposites and biodegradation. These JSON files are extracted from text, tables, and figures in full-length documents, providing a comprehensive challenge for MIE. We introduce a novel evaluation method to assess the accuracy of curve similarity and the alignment of hierarchical structures. Additionally, we benchmark vision-language models (VLMs) in a zero-shot manner, capable of processing long contexts and multimodal inputs. Our results demonstrate significant room for improvement in current models. @@ -2354,12 +2354,12 @@ Towards Rationality in Language and Multimodal Agents: A Survey - BowenJiangUniversity of Pennsylvania + BowenJiangUniversity of Pennsylvania YangxinyuXie XiaomengWang YuanYuan - ZhuoqunHao - XinyiBaiGoogle DeepMind + ZhuoqunHao + XinyiBaiGoogle DeepMind Weijie JSuUniversity of Pennsylvania, University of Pennsylvania Camillo JoseTaylorUniversity of Pennsylvania, University of Pennsylvania TanwiMallickArgonne National Laboratory @@ -2371,9 +2371,9 @@ <fixed-case>C</fixed-case>lu<fixed-case>S</fixed-case>an<fixed-case>T</fixed-case>: Differentially Private and Semantically Coherent Text Sanitization Ahmed MusaAwon - YunLuUniversity of Victoria + YunLuUniversity of Victoria SheraPotkaUniversity of Victoria - AlexThomoUniversity of Victoria + AlexThomoUniversity of Victoria 3676-3693 We introduce CluSanT, a novel text sanitization framework based on Metric Local Differential Privacy (MLDP). Our framework consists of three components: token clustering, cluster embedding, and token sanitization. 
For the first, CluSanT employs Large Language Models (LLMs) to create a set of potential substitute tokens, which we meaningfully cluster. Then, we develop a parameterized cluster embedding that balances the trade-off between privacy and utility. Lastly, we propose an MLDP algorithm which sanitizes/substitutes sensitive tokens in a text with the help of our embedding. Notably, our MLDP-based framework can be tuned with parameters such that (1) existing state-of-the-art (SOTA) token sanitization algorithms can be described—and improved—via our framework with extremal values of our parameters, and (2) by varying our parameters, we allow for a whole spectrum of privacy-utility tradeoffs between the two SOTA. Our experiments demonstrate CluSanT’s balance between privacy and semantic coherence, highlighting its capability as a valuable framework for privacy-preserving text sanitization.
 2025.naacl-long.187

@@ -2382,7 +2382,7 @@
 <fixed-case>T</fixed-case>urking<fixed-case>B</fixed-case>ench: A Challenge Benchmark for Web Agents
 Kevin Xu
- Yeganeh Kordi
+ Yeganeh Kordi
 Tanay Nayak
 Adi Asija
 Yizhong Wang (Department of Computer Science, University of Washington)

@@ -2411,13 +2411,13 @@
 <fixed-case>DPL</fixed-case>: Diverse Preference Learning Without A Reference Model
- Abhijnan Nath
+ Abhijnan Nath
 Andrey Volozin (Optum)
 Saumajit Saha
 Albert Aristotle Nanda
 Galina Grunin (Optum)
 Rahul Bhotika (Optum Labs)
- Nikhil Krishnaswamy (Colorado State University)
+ Nikhil Krishnaswamy (Colorado State University)
 3727-3747
 In direct preference alignment in LLMs, most existing methods seek to retrieve the reward function directly from preference data. However, real-world preference data often contains diversity in preference annotations reflective of true human preferences. Existing algorithms, including KTO, do not directly utilize such nuances in the annotations, which limits their applicability. In this work, we propose Diverse Preference Learning (DPL), a reference model-free method that simultaneously learns a baseline desirability in LLM responses while being robust to the diversity of preference annotations. Our experiments for instruction-following on Ultrafeedback and AlpacaEval 2.0 and for text-summarization on Reddit TL;DR suggest that DPL is consistently better at learning the diversity of preferences compared to existing methods, including those that require a reference model in memory. Apart from overall quality, we find that DPL’s completions, on average, are more honest, helpful, truthful and safe compared to existing methods.
 2025.naacl-long.190

@@ -2441,7 +2441,7 @@
 Ruipu Luo
 Jiwen Zhang (Fudan University)
 Minghui Qiu
- Xuanjing Huang (Fudan University)
+ Xuanjing Huang (Fudan University)
 Zhongyu Wei (Fudan University)
 3769-3798
 2025.naacl-long.192

@@ -2449,10 +2449,10 @@
 <fixed-case>ACCORD</fixed-case>: Closing the Commonsense Measurability Gap
- François Roewer-Després
+ François Roewer-Després
 Jinyue Feng
 Zining Zhu (Stevens Institute of Technology)
- Frank Rudzicz (Dalhousie University)
+ Frank Rudzicz (Dalhousie University)
 3799-3829
 We present ACCORD, a framework and benchmark suite for disentangling the commonsense grounding and reasoning abilities of large language models (LLMs) through controlled, multi-hop counterfactuals. ACCORD introduces formal elements to commonsense reasoning to explicitly control and quantify reasoning complexity beyond the typical 1 or 2 hops. Uniquely, ACCORD can automatically generate benchmarks of arbitrary reasoning complexity, so it scales with future LLM improvements.
Indeed, our experiments on state-of-the-art LLMs show performance degrading to below random chance with only moderate scaling, leaving substantial headroom for improvement. We release a leaderboard of the benchmark suite tested in this work, as well as code for automatically generating more complex benchmarks. 2025.naacl-long.193 @@ -2461,7 +2461,7 @@ <fixed-case>CRMA</fixed-case>rena: Understanding the Capacity of <fixed-case>LLM</fixed-case> Agents to Perform Professional <fixed-case>CRM</fixed-case> Tasks in Realistic Environments Kung-HsiangHuangSalesForce.com - AksharaPrabhakarSalesforce Research + AksharaPrabhakarSalesforce Research SidharthDhawanSalesForce.com YixinMaoSalesForce.com HuanWangSalesforce.com @@ -2476,7 +2476,7 @@ Mamba-Shedder: Post-Transformer Compression for Efficient Selective Structured State Space Models - Juan PabloMunozIntel + Juan PabloMunozIntel JinjieYuanIntel NileshJainIntel Corp 3851-3863 @@ -2487,11 +2487,11 @@ <fixed-case>CBT</fixed-case>-Bench: Evaluating Large Language Models on Assisting Cognitive Behavior Therapy MianZhang - XianjunYangFacebook + XianjunYangFacebook XinluZhangLinkedIn TravisLabrum Jamie C.ChiuPrinceton University - Shaun M.EackUniversity of Pittsburgh + Shaun M.EackUniversity of Pittsburgh FeiFangCarnegie Mellon University William YangWangUC Santa Barbara ZhiyuChen @@ -2504,7 +2504,7 @@ An Efficient Gloss-Free Sign Language Translation Using Spatial Configurations and Motion Dynamics with <fixed-case>LLM</fixed-case>s Eui JunHwangKorea Advanced Institute of Science & Technology SukminChoKorea Advanced Institute of Science and Technology - JunmyeongLee + JunmyeongLee Jong C.ParkKorea Advanced Institute of Science and Technology 3901-3920 Gloss-free Sign Language Translation (SLT) converts sign videos into spoken language sentences without relying on glosses, which are the written representations of signs. Recently, Large Language Models (LLMs) have shown remarkable translation performance in gloss-free methods by harnessing their powerful natural language generation capabilities. However, these methods often rely on domain-specific fine-tuning of visual encoders to achieve optimal results. By contrast, we emphasize the importance of capturing the spatial configurations and motion dynamics in sign language. With this in mind, we introduce Spatial and Motion-based Sign Language Translation (SpaMo), a novel LLM-based SLT framework. The core idea of SpaMo is simple yet effective: instead of domain-specific tuning, we use off-the-shelf visual encoders to extract spatial and motion features, which are then input into an LLM along with a language prompt. Additionally, we employ a visual-text alignment process as a lightweight warm-up step before applying SLT supervision. Our experiments demonstrate that SpaMo achieves state-of-the-art performance on three popular datasets—PHOENIX14T, CSL-Daily, and How2Sign—without visual fine-tuning. @@ -2536,9 +2536,9 @@ Temporal-Aware Soft Prompt Tuning for Automatic Text Dating - HaiWang + HaiWang YuzhiLiang - HanRenGuangdong University of Foreign Studies + HanRenGuangdong University of Foreign Studies 3975-3987 This paper presents Temporal-aware Soft Prompt Tuning (TASPT), a novel approach for automatic text dating. Unlike existing methods, which often overlook the evolution of word meanings in texts spanning long periods, TASPT incorporates the unique characteristics of historical texts. It introduces a temporal-aware text representation that dynamically captures both semantic variance and invariance. 
This representation is combined with a soft prompt, enabling efficient parameter tuning for automatic text dating. Experiments show that TASPT outperforms all existing methods on two diachronic datasets: the Twenty-Four Histories and the Royal Society Corpus. 2025.naacl-long.200 @@ -2547,14 +2547,14 @@ Sparser Mixture-of-Adapters with Cross-Layer Generalization ZiyueLi - TianyiZhouUniversity of Maryland, College Park + TianyiZhouUniversity of Maryland, College Park 3988-4002 2025.naacl-long.201 li-zhou-2025-sparser How to Align Multiple Signed Language Corpora for Better Sign-to-Sign Translations? - MertInanNortheastern University + MertInanNortheastern University YangZhongUniversity of Pittsburgh VidyaGanesh MaliheAlikhaniNortheastern University @@ -2565,17 +2565,17 @@ Communication Makes Perfect: Persuasion Dataset Construction via Multi-<fixed-case>LLM</fixed-case> Communication - WeichengMaGeorgia Institute of Technology - HefanZhangDartmouth College + WeichengMaGeorgia Institute of Technology + HefanZhangDartmouth College IvoryYang - ShiyuJi + ShiyuJi JoiceChen FarnooshHashemi ShubhamMohole EthanGearey MichaelMacy SaeedHassanpourDartmouth College - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College 4017-4045 Large Language Models (LLMs) have shown proficiency in generating persuasive dialogue, yet concerns about the fluency and sophistication of their outputs persist. This paper presents a multi-LLM communication framework designed to enhance the generation of persuasive data automatically. This framework facilitates the efficient production of high-quality, diverse linguistic content with minimal human oversight. Through extensive evaluations, we demonstrate that the generated data excels in naturalness, linguistic diversity, and the strategic use of persuasion, even in complex scenarios involving social taboos. The framework also proves adept at generalizing across novel contexts. Our results highlight the framework’s potential to significantly advance research in both computational and social science domains concerning persuasive communication. 2025.naacl-long.203 @@ -2585,7 +2585,7 @@ Soft Prompting for Unlearning in Large Language Models KarunaBhaila Minh-HaoVan - XintaoWu + XintaoWu 4046-4056 The widespread popularity of Large Language Models (LLMs), partly due to their emerging in-context learning ability, has highlighted the importance of ethical and safety considerations for deployment. Motivated by corresponding data protection guidelines, we investigate machine unlearning for LLMs. In contrast to the growing literature on fine-tuning methods to achieve unlearning, we focus on a comparatively lightweight alternative called soft prompting to realize unlearning in LLMs. With losses designed to enforce forgetting as well as utility preservation, our framework Soft Prompting for Unlearning (SPUL) learns prompt tokens that are prepended to a query to induce unlearning of specific training examples at inference time without updating LLM parameters. We conduct a rigorous evaluation of the proposed method, and results indicate that SPUL can significantly improve the trade-off between utility and forgetting for text classification and question-answering. We further validate our method with LLMs of varying parameter sizes to highlight its flexibility and provide detailed insights into the choice of hyperparameters and the influence of the size of unlearning data. 
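A minimal sketch of the soft-prompting mechanism the SPUL entry above relies on, assuming a frozen decoder-only model whose input embeddings we can compute; the class name and initialization scale are illustrative, and the unlearning and utility losses themselves are not shown:

    import torch
    import torch.nn as nn

    class SoftPrompt(nn.Module):
        # Trainable prompt vectors prepended to the input embeddings; only
        # these parameters are updated, the LLM itself stays frozen.
        def __init__(self, n_tokens: int, d_model: int):
            super().__init__()
            self.prompt = nn.Parameter(torch.randn(n_tokens, d_model) * 0.02)

        def forward(self, input_embeds: torch.Tensor) -> torch.Tensor:
            # input_embeds: (batch, seq_len, d_model)
            batch = input_embeds.size(0)
            p = self.prompt.unsqueeze(0).expand(batch, -1, -1)
            return torch.cat([p, input_embeds], dim=1)

At inference time the learned vectors are simply prepended to a query's embeddings (with the attention mask extended accordingly), which is what allows forgetting to be induced without updating any LLM parameters.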
2025.naacl-long.204 @@ -2594,7 +2594,7 @@ Mutual-pairing Data Augmentation for Fewshot Continual Relation Extraction Nguyen HoangAnh - QuyenTranQualcomm Inc, QualComm + QuyenTranQualcomm Inc, QualComm Thanh XuanNguyen Nguyen Thi NgocDiep Linh NgoVanHanoi University of Science and Technology @@ -2623,13 +2623,13 @@ Protecting Privacy in Multimodal Large Language Models with <fixed-case>MLLMU</fixed-case>-Bench - ZheyuanLiuUniversity of Notre Dame + ZheyuanLiuUniversity of Notre Dame GuangyaoDou - MengzhaoJia - ZhaoxuanTanUniversity of Notre Dame - QingkaiZengAmazon + MengzhaoJia + ZhaoxuanTanUniversity of Notre Dame + QingkaiZengAmazon YongleYuan - MengJiangUniversity of Notre Dame + MengJiangUniversity of Notre Dame 4105-4135 Generative models such as Large Language Models (LLM) and Multimodal Large Language models (MLLMs) trained on massive web corpora can memorize and disclose individuals’ confidential and private data, raising legal and ethical concerns. While many previous works have addressed this issue in LLM via machine unlearning, it remains largely unexplored for MLLMs. To tackle this challenge, we introduce Multimodal Large Language Model Unlearning Benchmark (MLLMU-Bench), a novel benchmark aimed at advancing the understanding of multimodal machine unlearning. MLLMU-Bench consists of 500 fictitious profiles and 153 profiles for public celebrities, each profile feature over 14 customized question-answer pairs, evaluated from both multimodal (image+text) and unimodal (text) perspectives. The benchmark is divided into four sets to assess unlearning algorithms in terms of efficacy, generalizability, and model utility. Finally, we provide baseline results using existing generative model unlearning algorithms. Surprisingly, our experiments show that unimodal unlearning algorithms excel in generation tasks, while multimodal unlearning approaches perform better in classification with multimodal inputs. 2025.naacl-long.207 @@ -2660,7 +2660,7 @@ ChengWang YiweiWangUniversity of California, Merced YujunCaiThe University of Queensland - BryanHooiNational University of Singapore + BryanHooiNational University of Singapore 4183-4194 Retrieval-augmented generation (RAG) systems enhance large language models by incorporating external knowledge, addressing issues like outdated internal knowledge and hallucination. However, their reliance on external knowledge bases makes them vulnerable to corpus poisoning attacks, where adversarial passages can be injected to manipulate retrieval results. Existing methods for crafting such passages, such as random token replacement or training inversion models, are often slow and computationally expensive, requiring either access to retriever’s gradients or large computational resources. To address these limitations, we propose Dynamic Importance-Guided Genetic Algorithm (DIGA), an efficient black-box method that leverages two key properties of retrievers: insensitivity to token order and bias towards influential tokens. By focusing on these characteristics, DIGA dynamically adjusts its genetic operations to generate effective adversarial passages with significantly reduced time and memory usage. Our experimental evaluation shows that DIGA achieves superior efficiency and scalability compared to existing methods, while maintaining comparable or better attack success rates across multiple datasets. 
2025.naacl-long.210 @@ -2681,7 +2681,7 @@ <fixed-case>CVE</fixed-case>-Bench: Benchmarking <fixed-case>LLM</fixed-case>-based Software Engineering Agent’s Ability to Repair Real-World <fixed-case>CVE</fixed-case> Vulnerabilities PeiranWang XiaogengLiuUniversity of Wisconsin - Madison - ChaoweiXiaoUniversity of Wisconsin - Madison and NVIDIA + ChaoweiXiaoUniversity of Wisconsin - Madison and NVIDIA 4207-4224 Automated vulnerability repair is a crucial field within software engineering and security research. Large Language Models (LLMs) and LLM agents have demonstrated significant potential in this domain by understanding descriptions in natural language and generating corresponding formal code. Although the coding capabilities of LLMs have advanced rapidly, evaluation benchmarks for real-world programming setups are still lagging, preventing the development of LLM and LLM agents in real-world vulnerability repair. To this end, we introduce CVE-Bench, an evaluation framework consisting of 509 Common Vulnerabilities and Exposures (CVEs) from four programming languages and 120 popular open-source repositories. Unlike previous vulnerability repair benchmarks, which only involve the code input and output, we provide LLM agents with a test environment that simulates the real-world vulnerability repair process. This environment provides multiple levels of CVE information modeling, such as black-box testing and white-box testing. It enables the agents to use static analysis tools to assist their repair process. Our evaluation reveals that the SWE-agent can only repair 21% of vulnerabilities at its best. Furthermore, they lack expert knowledge about how to use the analysis tool to assist in vulnerability repair. 2025.naacl-long.212 @@ -2693,7 +2693,7 @@ ShreyaShankarUniversity of California Berkeley HarrisonChaseLangChain WilliamHinthorn - AdityaParameswaranUniversity of California, Berkeley + AdityaParameswaranUniversity of California, Berkeley 4225-4245 Large language models (LLMs) are increasingly deployed in specialized production data processing pipelines across diverse domains—such as finance, marketing, and e-commerce. However, when running them in production across many inputs, they often fail to follow instructions or meet developer expectations. To improve reliability in these applications, creating assertions or guardrails for LLM outputs to run alongside the pipelines is essential. Yet, determining the right set of assertions that capture developer requirements for a task is challenging. In this paper, we introduce PROMPTEVALS, a dataset of 2087 LLM pipeline prompts with 12623 corresponding assertion criteria, sourced from developers using our open-source LLM pipeline tools. This dataset is larger than previous collections. Using a hold-out test split of PROMPTEVALS as a benchmark, we evaluated closed- and open-source models in generating relevant assertions. Notably, our fine-tuned Mistral and Llama 3 models outperform GPT-4o by 20.93% on average, offering both reduced latency and improved performance. We believe our dataset can spur further research in LLM reliability, alignment, and prompt engineering. 2025.naacl-long.213 @@ -2701,15 +2701,15 @@ <fixed-case>T</fixed-case>ool<fixed-case>F</fixed-case>low: Boosting <fixed-case>LLM</fixed-case> Tool-Calling Through Natural and Coherent Dialogue Synthesis - ZezhongWang + ZezhongWang XingshanZengHuawei Technologies Ltd. - WeiwenLiuHuawei Technologies Ltd. - LiangyouLiHuawei Noah’s Ark Lab + WeiwenLiuHuawei Technologies Ltd. 
+ LiangyouLiHuawei Noah’s Ark Lab YashengWang LifengShangHuawei Technologies Ltd. - XinJiang - QunLiuHuawei Noah’s Ark Lab - Kam-FaiWongThe Chinese University of Hong Kong + XinJiang + QunLiuHuawei Noah’s Ark Lab + Kam-FaiWongThe Chinese University of Hong Kong 4246-4263 Supervised fine-tuning (SFT) is a common method to enhance the tool calling capabilities of Large Language Models (LLMs), with the training data often being synthesized. The current data synthesis process generally involves sampling a set of tools, formulating a requirement based on these tools, and generating the call statements. However, tools sampled randomly lack relevance, making them difficult to combine and thus reducing the diversity of the data. Additionally, current work overlooks the coherence between turns of dialogues, leading to a gap between the synthesized data and real-world scenarios. To address these issues, we propose a Graph-based Sampling strategy to sample more relevant tool combinations, and a Planned-generation strategy to create plans that guide the synthesis of coherent dialogues. We integrate these two strategies and enable multiple agents to synthesize the dialogue data interactively, resulting in our tool-calling data synthesis pipeline ToolFlow. Data quality assessments demonstrate improvements in the naturalness and coherence of our synthesized dialogues. Finally, we apply SFT on LLaMA-3.1-8B using 8,000 synthetic dialogues generated with ToolFlow. Results show that the model achieves tool-calling performance comparable to or even surpassing GPT-4, while maintaining strong general capabilities. 2025.naacl-long.214 @@ -2717,8 +2717,8 @@ Fighting Spurious Correlations in Text Classification via a Causal Learning Perspective - YuqingZhouGeorge Mason University - ZiweiZhuGeorge Mason University + YuqingZhouGeorge Mason University + ZiweiZhuGeorge Mason University 4264-4274 In text classification tasks, models often rely on spurious correlations for predictions, incorrectly associating irrelevant features with the target labels. This issue limits the robustness and generalization of models, especially when faced with out-of-distribution data where such spurious correlations no longer hold. To address this challenge, we propose the Causally Calibrated Robust Classifier (CCR), which aims to reduce models’ reliance on spurious correlations and improve model robustness. Our approach integrates a causal feature selection method based on counterfactual reasoning, along with an unbiased inverse propensity weighting (IPW) loss function. By focusing on selecting causal features, we ensure that the model relies less on spurious features during prediction. We theoretically justify our approach and empirically show that CCR achieves state-of-the-art performance among methods without group labels, and in some cases, it can compete with the models that utilize group labels. Our code can be found at: https://github.com/yuqing-zhou/Causal-Learning-For-Robust-Classifier. 
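The CCR entry above combines causal feature selection with an unbiased inverse propensity weighting (IPW) loss; a minimal sketch of the weighting idea, with the propensity estimates treated as given and all names being illustrative rather than the authors' implementation:

    import torch
    import torch.nn.functional as F

    def ipw_cross_entropy(logits, labels, propensity):
        # propensity[i] estimates how over-represented example i's
        # (spurious feature, label) combination is in the training data;
        # dividing by it down-weights examples the spurious correlation
        # favors, so the classifier leans on causal features instead.
        per_example = F.cross_entropy(logits, labels, reduction="none")
        weights = 1.0 / propensity.clamp_min(1e-6)
        return (weights * per_example).sum() / weights.sum()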
2025.naacl-long.215 @@ -2728,11 +2728,11 @@ Knowledge-Aware Query Expansion with Large Language Models for Textual and Relational Retrieval YuXiaUniversity of California, San Diego JundaWuUniversity of California, San Diego - SungchulKimAdobe Systems - TongYuAdobe Research - Ryan A.RossiAdobe Research - HaoliangWangAdobe Research - JulianMcAuleyUniversity of California, San Diego, University of California, San Diego + SungchulKimAdobe Systems + TongYuAdobe Research + Ryan A.RossiAdobe Research + HaoliangWangAdobe Research + JulianMcAuleyUniversity of California, San Diego, University of California, San Diego 4275-4286 Large language models (LLMs) have been used to generate query expansions augmenting original queries for improving information search. Recent studies also explore providing LLMs with initial retrieval results to generate query expansions more grounded to document corpus. However, these methods mostly focus on enhancing textual similarities between search queries and target documents, overlooking document relations. For queries like “Find me a highly rated camera for wildlife photography compatible with my Nikon F-Mount lenses”, existing methods may generate expansions that are semantically similar but structurally unrelated to user intents. To handle such semi-structured queries with both textual and relational requirements, in this paper we propose a knowledge-aware query expansion framework, augmenting LLMs with structured document relations from knowledge graph (KG). To further address the limitation of entity-based scoring in existing KG-based methods, we leverage document texts as rich KG node representations and use document-based relation filtering for our Knowledge-Aware Retrieval (KAR). Extensive experiments on three datasets of diverse domains show the advantages of our method compared against state-of-the-art baselines on textual and relational semi-structured retrieval. 2025.naacl-long.216 @@ -2740,8 +2740,8 @@ <fixed-case>SVD</fixed-case>-<fixed-case>LLM</fixed-case> V2: Optimizing Singular Value Truncation for Large Language Model Compression - XinWang - SamiulAlam + XinWang + SamiulAlam ZhongweiWan HuiShen MiZhangThe Ohio State University @@ -2752,7 +2752,7 @@ <fixed-case>A</fixed-case>udio<fixed-case>B</fixed-case>ench: A Universal Benchmark for Audio Large Language Models - BinWangI2R, A*STAR + BinWangI2R, A*STAR XunlongZouA*STAR GeyuLinInstitute of Infocomm Research, A*STAR ShuoSun, A*STAR @@ -2760,7 +2760,7 @@ WenyuZhangI2R, A*STAR ZhengyuanLiuI2R AiTiAwI2R - Nancy F.Chen + Nancy F.Chen 4297-4316 We introduce AudioBench, a universal benchmark designed to evaluate Audio Large Language Models (AudioLLMs). It encompasses 8 distinct tasks and 26 datasets, among which, 7 are newly proposed datasets. The evaluation targets three main aspects: speech understanding, audio scene understanding, and voice understanding (paralinguistic). Despite recent advancements, there lacks a comprehensive benchmark for AudioLLMs on instruction following capabilities conditioned on audio signals. AudioBench addresses this gap by setting up datasets as well as desired evaluation metrics. Besides, we also evaluated the capabilities of five popular models and found that no single model excels consistently across all tasks. We outline the research outlook for AudioLLMs and anticipate that our open-sourced evaluation toolkit, data, and leaderboard will offer a robust testbed for future model developments. 
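As background for the SVD-LLM V2 entry above: plain singular value truncation of a weight matrix looks like the following. The paper optimizes the truncation beyond this naive baseline, so this sketch only illustrates the underlying operation:

    import numpy as np

    def truncate_svd(W: np.ndarray, rank: int):
        # Keep the top-`rank` singular triplets: W is approximated by A @ B,
        # storing (out + in) * rank values instead of out * in.
        U, S, Vt = np.linalg.svd(W, full_matrices=False)
        A = U[:, :rank] * S[:rank]
        B = Vt[:rank, :]
        return A, B

    W = np.random.randn(1024, 1024)
    A, B = truncate_svd(W, rank=128)
    print(np.linalg.norm(W - A @ B) / np.linalg.norm(W))  # relative error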
2025.naacl-long.218 @@ -2771,9 +2771,9 @@ ZirunGuo ShuleiWang WangLin - WeicaiYan - YangyangWuZhejiang University - TaoJinZhejiang University + WeicaiYan + YangyangWuZhejiang University + TaoJinZhejiang University 4317-4327 Missing modality issues are common in real-world applications, arising from factors such as equipment failures and privacy concerns. When fine-tuning pre-trained models on downstream datasets with missing modalities, performance can degrade significantly. Current methods often aggregate various missing cases to train recovery modules or align multimodal features, resulting in suboptimal performance, high computational costs, and the risk of catastrophic forgetting in continual environments where data arrives sequentially. In this paper, we formulate the dynamic missing modality problem as a continual learning task and introduce the continual multimodal missing modality task. To address this challenge efficiently, we introduce three types of prompts: modality-specific, task-aware, and task-specific prompts. These prompts enable the model to learn intra-modality, inter-modality, intra-task, and inter-task features. Furthermore, we propose a contrastive task interaction strategy to explicitly learn prompts correlating different modalities. We conduct extensive experiments on three public datasets, where our method consistently outperforms state-of-the-art approaches. 2025.naacl-long.219 @@ -2781,8 +2781,8 @@ Benchmarking and Building Zero-Shot <fixed-case>H</fixed-case>indi Retrieval Model with <fixed-case>H</fixed-case>indi-<fixed-case>BEIR</fixed-case> and <fixed-case>NLLB</fixed-case>-E5 - ArkadeepAcharya - RudraMurthyIBM India Ltd + ArkadeepAcharya + RudraMurthyIBM India Ltd VishwajeetKumarInternational Business Machines JaydeepSen 4328-4348 @@ -2795,11 +2795,11 @@ MuzhiLi CehaoYangThe Hong Kong University of Science and Technology ChengjinXuInternational Digital Economy Academy - XuhuiJiangInternational Digital Economy Academy, International Digital Economy Academy - YiyanQiIDEA + XuhuiJiangInternational Digital Economy Academy, International Digital Economy Academy + YiyanQiIDEA JianGuoInternational Digital Economy Academy, International Digital Economy Academy - Ho-fungLeungThe Chinese University of Hong Kong and - IrwinKing + Ho-fungLeungThe Chinese University of Hong Kong and + IrwinKing 4349-4363 The Knowledge Graph Completion (KGC) task aims to infer the missing entity from an incomplete triple. Existing embedding-based methods rely solely on triples in the KG, which is vulnerable to specious relation patterns and long-tail entities. On the other hand, text-based methods struggle with the semantic gap between KG triples and natural language. Apart from triples, entity contexts (e.g., labels, descriptions, aliases) also play a significant role in augmenting KGs. To address these limitations, we propose KGR3, a context-enriched framework for KGC. KGR3 is composed of three modules. Firstly, the Retrieval module gathers supporting triples from the KG, collects plausible candidate answers from a base embedding model, and retrieves context for each related entity. Then, the Reasoning module employs a large language model to generate potential answers for each query triple. Finally, the Re-ranking module combines candidate answers from the two modules mentioned above, and fine-tunes an LLM to provide the best answer. Extensive experiments on widely used datasets demonstrate that KGR3 consistently improves various KGC methods. 
Specifically, the best variant of KGR3 achieves absolute Hits@1 improvements of 12.3% and 5.6% on the FB15k237 and WN18RR datasets.
 2025.naacl-long.221

@@ -2807,9 +2807,9 @@
 See-Saw Modality Balance: See Gradient, and Sew Impaired Vision-Language Balance to Mitigate Dominant Modality Bias
- Junehyoung Kwon (Chung-Ang University)
+ Junehyoung Kwon (Chung-Ang University)
 MiHyeon Kim (Korea Telecom Research)
- Eunju Lee (Chung-Ang University)
+ Eunju Lee (Chung-Ang University)
 Juhwan Choi (AITRICS)
 YoungBin Kim (Chung-Ang University)
 4364-4378

@@ -2822,8 +2822,8 @@
 Jiawei Liu
 Yanjiao Liu
 Xun Gong
- Tingting Wang (Jilin University)
+ Tingting Wang (Jilin University)
- Hong Chen (Tongji University)
+ Hong Chen (Tongji University)
 Yunfeng Hu (Jilin University)
 4379-4391
 Emergent abilities of large language models (LLMs) have significantly advanced their application in autonomous vehicle (AV) research. Safe integration of LLMs into vehicles, however, necessitates their thorough understanding of dynamic traffic environments. Towards this end, this study introduces a framework leveraging LLMs’ built-in extrapolation capabilities for vehicle trajectory prediction, thereby evaluating their comprehension of the evolution of traffic agents’ behaviors and interactions over time. The framework employs a traffic encoder to extract spatial-level scene features from agents’ observed trajectories to facilitate efficient scene representation. To focus on LLM’s innate capabilities, scene features are then converted into LLM-compatible tokens through a reprogramming adapter and finally decoded into predicted trajectories with a linear decoder. Experimental results quantitatively demonstrate the framework’s efficacy in enabling off-the-shelf, frozen LLMs to achieve competitive trajectory prediction performance, with qualitative analyses revealing their enhanced understanding of complex, multi-agent traffic scenarios.

@@ -2832,9 +2832,9 @@
 Stronger Models are Not Always Stronger Teachers for Instruction Tuning
- Zhangchen Xu
+ Zhangchen Xu
- Fengqing Jiang (University of Washington)
+ Fengqing Jiang (University of Washington)
- Luyao Niu (University of Washington)
+ Luyao Niu (University of Washington)
 Bill Yuchen Lin (xAI and University of Washington)
 Radha Poovendran (University of Washington, Seattle)
 4392-4405

@@ -2846,11 +2846,11 @@
 Efficient and Effective Prompt Tuning via Prompt Decomposition and Compressed Outer Product
 Pengxiang Lan
 Haoyu Xu
- Enneng Yang
+ Enneng Yang
- Yuliang Liang
+ Yuliang Liang
 Guibing Guo (Northeastern University)
- Jianzhe Zhao (Northeastern University)
+ Jianzhe Zhao (Northeastern University)
- Xingwei Wang (Northeastern University)
+ Xingwei Wang (Northeastern University)
 4406-4421
 Prompt tuning (PT) offers a cost-effective alternative to fine-tuning large-scale pre-trained language models (PLMs), requiring only a few parameters in soft prompt tokens added before the input text. However, existing PT approaches face two significant issues: (i) they overlook intrinsic semantic associations between soft prompt tokens, leading to high discreteness and limited interactions, thus reducing the model’s comprehension and effectiveness in complex tasks; (ii) due to the complexity of downstream tasks, a long soft prompt is necessitated to improve performance, but prompt length correlates positively with memory usage and computational costs. Achieving high efficiency and performance remains an ongoing challenge. To address these issues, we propose a novel Low-parameters Prompt Tuning (LAMP) method, which leverages prompt decomposition and compressed outer product.
Specifically, the prompt decomposition module employs Truncated SVD to reduce training parameters and significantly lower the dimensionality of the soft prompt parameter space. It then utilizes a compressed outer product module to facilitate multiple interactions among prompt tokens, exploring their intrinsic associations to enhance knowledge representation. Finally, LAMP uses average pooling to reduce memory usage and training/inference time. Extensive experiments across six architectures and eight datasets demonstrate that LAMP outperforms state-of-the-art PT-based and LoRA-based methods in performance and efficiency. 2025.naacl-long.225 @@ -2858,10 +2858,10 @@ Threshold Filtering Packing for Supervised Fine-Tuning: Training Related Samples within Packs - JianchengDong + JianchengDong LeiJiangUniversity of Illinois at Chicago WeiJinEmory University - LuCheng + LuCheng 4422-4435 Packing for Supervised Fine-Tuning (SFT) in autoregressive models involves concatenating data points of varying lengths until reaching the designed maximum length to facilitate GPU processing. However, randomly concatenating data points can lead to cross-contamination of sequences due to the significant difference in their subject matter. The mainstream approaches in SFT ensure that each token in the attention calculation phase only focuses on tokens within its own short sequence, without providing additional learning signals for the preceding context. To address these challenges, we introduce Threshold Filtering Packing (TFP), a method that selects samples with related context while maintaining sufficient diversity within the same pack. Our experiments show that TFP offers a simple-to-implement and scalable approach that significantly enhances SFT performance, with observed improvements of up to 7% on GSM8K, 4% on HumanEval. Furthermore, results from bias benchmark datasets highlight TFP’s promising performance in improving fairness while also boosting prediction accuracy by 15%. 2025.naacl-long.226 @@ -2871,8 +2871,8 @@ Transferable Post-training via Inverse Value Learning XinyuLu XueruWen - YaojieLuInstitute of Software, Chinese Academy of Sciences - BowenYuAlibaba Group + YaojieLuInstitute of Software, Chinese Academy of Sciences + BowenYuAlibaba Group HongyuLinInstitute of Software, Chinese Academy of Sciences HaiyangYu LeSunInstitute of Software, Chinese Academy of Sciences @@ -2888,8 +2888,8 @@ HeegyuKimAjou University JeonTaeyangAjou University SeungHwanChoi - SeungtaekChoiYanolja - HyunsoukChoAjou University + SeungtaekChoiYanolja + HyunsoukChoAjou University 4448-4475 Text-to-SQL systems have become crucial for translating natural language into SQL queries in various industries, enabling non-technical users to perform complex data operations. The need for accurate evaluation methods has increased as these systems have grown more sophisticated. However, the Execution Accuracy (EX), the most prevalent evaluation metric, still shows many false positives and negatives. Thus, this paper introduces **FLEX(False-Less EXecution)**, a novel approach to evaluating text-to-SQL systems using large language models (LLMs) to emulate human expert-level evaluation of SQL queries. Our metric improves agreement with human experts (from 62 to 87.04 in Cohen’s kappa) with comprehensive context and sophisticated criteria. 
Our extensive experiments yield several key insights: (1) Models’ performance increases by over 2.6 points on average, substantially affecting rankings on Spider and BIRD benchmarks; (2) The underestimation of models in EX primarily stems from annotation quality issues; and (3) Model performance on particularly challenging questions tends to be overestimated. This work contributes to a more accurate and nuanced evaluation of text-to-SQL systems, potentially reshaping our understanding of state-of-the-art performance in this field. 2025.naacl-long.228 @@ -2898,7 +2898,7 @@ <fixed-case>AID</fixed-case>: Adaptive Integration of Detectors for Safe <fixed-case>AI</fixed-case> with Language Models XinranWangUniversity of Minnesota - Twin Cities - EnmaoDiaoColAI + EnmaoDiaoColAI QiLeUniversity of Minnesota - Twin Cities JieDing AliAnwarUniversity of Minnesota @@ -2912,9 +2912,9 @@ JiayangYu YihangZhang BinWang - PeiqinLinInstitut für Informatik - YongKangLiuNortheast University at Qinhuangdao Campus - ShiFengNortheastern University, China + PeiqinLinInstitut für Informatik + YongKangLiuNortheast University at Qinhuangdao Campus + ShiFengNortheastern University, China 4493-4506 Fine-tuning is a key approach for adapting language models to specific downstream tasks, but updating all model parameters becomes impractical as model sizes increase.Parameter-Efficient Fine-Tuning (PEFT) methods, such as Low-Rank Adaptation (LoRA), address this challenge by introducing additional adaptation parameters into pre-trained weight matrices.However, LoRA’s performance varies across different insertion points within the model, highlighting potential parameter inefficiency due to unnecessary insertions. To this end, we propose SSMLoRA (**S**tate **S**pace **M**odel **L**ow-**R**ank **A**daptation), an extension of LoRA that incorporates a State Space Model (SSM) to interconnect low-rank matrices. SSMLoRA ensures that performance is maintained even with sparser insertions. SSMLoRA allows the model to not only map inputs to a low-rank space for better feature extraction but also leverage the computations from the previous low-rank space. Our method achieves comparable performance to LoRA on the General Language Understanding Evaluation (GLUE) benchmark while using only half the parameters. Additionally, due to its structure, SSMLoRA shows promise in handling tasks with longer input sequences. 2025.naacl-long.230 @@ -2925,7 +2925,7 @@ TungNguyen TueLe Hoang TranVuong - Quang DucNguyenNanyang Technological University + Quang DucNguyenNanyang Technological University Duc AnhNguyenHanoi University of Science and Technology Linh NgoVanHanoi University of Science and Technology SangDinhHanoi University of Science and Technology @@ -2953,7 +2953,7 @@ A Top-down Graph-based Tool for Modeling Classical Semantic Maps: A Case Study of Supplementary Adverbs - ZhuLiu + ZhuLiu CunliangKong YingLiuTsinghua University, Tsinghua University MaosongSunTsinghua University @@ -2964,11 +2964,11 @@ <fixed-case>U</fixed-case>ni<fixed-case>HGKR</fixed-case>: Unified Instruction-aware Heterogeneous Knowledge Retrievers - DehaiMin + DehaiMin ZhiyangXu GuilinQi LifuHuangUniversity of California, Davis - ChenyuYouState University of New York at Stony Brook + ChenyuYouState University of New York at Stony Brook 4577-4594 Existing information retrieval (IR) models often assume a homogeneous structure for knowledge sources and user queries, limiting their applicability in real-world settings where retrieval is inherently heterogeneous and diverse. 
In this paper, we introduce UniHGKR, a unified instruction-aware heterogeneous knowledge retriever that (1) builds a unified retrieval space for heterogeneous knowledge and (2) follows diverse user instructions to retrieve knowledge in specified types. UniHGKR consists of three principal stages, including heterogeneous self-supervised pretraining, text-anchored embedding alignment, and instruction-aware retriever fine-tuning, enabling it to generalize across varied retrieval contexts. This framework is highly scalable, with a BERT-based version and a UniHGKR-7B version trained on large language models. Also, we introduce CompMix-IR, the first native heterogeneous knowledge retrieval benchmark. It includes two retrieval scenarios with various instructions, over 9,400 question answer (QA) pairs, and a corpus of 10 million entries, covering four different types of data. Extensive experiments show that UniHGKR consistently outperform state-of-the-art methods on CompMix-IR, achieving up to 6.36% and 54.23% relative improvements in two scenarios, respectively. Finally, by equipping our retriever for open-domain heterogeneous QA systems, we achieve a new state-of-the-art result on the popular ConvMix task, with an absolute improvement of up to 5.90 points. 2025.naacl-long.234 @@ -2979,9 +2979,9 @@ VipulGuptaPennsylvania State University CandaceRossMeta DavidPantoja - Rebecca J.PassonneauPennsylvania State University + Rebecca J.PassonneauPennsylvania State University MeganUngFacebook AI Research - AdinaWilliamsFAIR (Meta Platforms Inc.) + AdinaWilliamsFAIR (Meta Platforms Inc.) 4595-4615 One of the most challenging problems facing NLP today is evaluation. Some of the most pressing issues pertain to benchmark saturation, data contamination, and diversity in the quality of test examples. To address these concerns, we propose Selection Methodology for Accurate, Reduced, and Targeted (SMART) filtering, a novel approach to select a high-quality subset of examples from existing benchmark datasets by systematically removing less informative and lower quality examples. Our approach applies three filtering criteria, removing (i) easy examples, (ii) data-contaminated examples, and (iii) examples that are similar to each other based on distance in an embedding space. We demonstrate the effectiveness of SMART Filtering on three multiple choice QA datasets, where our methodology increases efficiency by reducing dataset size by 48% on average, while increasing Pearson correlation with rankings from ChatBot Arena, a more open-ended human evaluation setting. Our method enables us to be more efficient, whether we are using SMART Filtering to make new benchmarks more challenging, or to revitalize older, human generated datasets, while still preserving the relative model rankings. 2025.naacl-long.235 @@ -2991,10 +2991,10 @@ Entropy-Based Decoding for Retrieval-Augmented Large Language Models ZexuanQiuThe Chinese University of Hong Kong ZijingOuImperial College London - BinWu + BinWu JingjingLi AiweiLiuTsinghua University - IrwinKing + IrwinKing 4616-4627 Augmenting Large Language Models (LLMs) with retrieved external knowledge has proven effective in improving the factual accuracy of generated responses. Despite their success, retrieval-augmented LLMs still face the distractibility issue, where the generated responses are negatively influenced by noise from both external and internal knowledge sources. 
In this paper, we introduce a novel, training-free decoding method guided by entropy considerations to mitigate this issue. Our approach utilizes entropy-based document-parallel ensemble decoding to prioritize low-entropy distributions from retrieved documents, thereby enhancing the extraction of relevant information of context. Additionally, it incorporates a contrastive decoding mechanism that contrasts the obtained low-entropy ensemble distribution with the high-entropy distribution derived from the model’s internal knowledge across layers, which ensures a greater emphasis on reliable external information. Extensive experiments on open-domain question answering datasets demonstrate the superiority of our method. 2025.naacl-long.236 @@ -3003,7 +3003,7 @@ What We Talk About When We Talk About <fixed-case>LM</fixed-case>s: Implicit Paradigm Shifts and the Ship of Language Models ShengqiZhuCornell University - JeffreyRzeszotarskiCornell University + JeffreyRzeszotarskiCornell University 4628-4646 The term Language Models (LMs) as a time-specific collection of models of interest is constantly reinvented, with its referents updated much like the *Ship of Theseus* replaces its parts but remains the same ship in essence. In this paper, we investigate this *Ship of Language Models* problem, wherein scientific evolution takes the form of continuous, implicit retrofits of key *existing* terms. We seek to initiate a novel perspective of scientific progress, in addition to the more well-studied emergence of *new* terms. To this end, we construct the data infrastructure based on recent NLP publications. Then, we perform a series of text-based analyses toward a detailed, quantitative understanding of the use of Language Models as a term of art. Our work highlights how systems and theories influence each other in scientific discourse, and we call for attention to the transformation of this Ship that we all are contributing to. 2025.naacl-long.237 @@ -3013,8 +3013,8 @@ Diversity Helps Jailbreak Large Language Models WeiliangZhao DanielBen-Levi - WeiHao - JunfengYangColumbia University + WeiHao + JunfengYangColumbia University ChengzhiMaoGoogle 4647-4680 We have uncovered a powerful jailbreak technique that leverages large language models’ ability to diverge from prior context, enabling them to bypass safety constraints and generate harmful outputs. By simply instructing the LLM to deviate and obfuscate previous attacks, our method dramatically outperforms existing approaches, achieving up to a 62.83% higher success rate in compromising ten leading chatbots, including GPT-4, Gemini, and Llama, while using only 12.9% of the queries. This revelation exposes a critical flaw in current LLM safety training, suggesting that existing methods may merely mask vulnerabilities rather than eliminate them. Our findings sound an urgent alarm for the need to revolutionize testing methodologies to ensure robust and reliable LLM security. 
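For the entropy-guided decoding entry above (2025.naacl-long.236), the document-parallel ensembling step can be pictured as below; the shapes and the softmax weighting are assumptions for illustration, and the contrastive step against the model's internal distribution is omitted:

    import torch

    def entropy_weighted_ensemble(doc_probs: torch.Tensor) -> torch.Tensor:
        # doc_probs: (n_docs, vocab), one next-token distribution per
        # retrieved document. Low-entropy (confident) documents receive
        # larger ensemble weights.
        ent = -(doc_probs * doc_probs.clamp_min(1e-12).log()).sum(dim=-1)
        weights = torch.softmax(-ent, dim=0)
        return (weights.unsqueeze(-1) * doc_probs).sum(dim=0)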
@@ -3023,10 +3023,10 @@ Constrained Decoding with Speculative Lookaheads - Nishanth SridharNakshatriPurdue University + Nishanth SridharNakshatriPurdue University ShamikRoyAmazon RajarshiDasAWS AI Labs - SutheeChaidaroonAmazon + SutheeChaidaroonAmazon LeonidBoytsovAmazon RashmiGangadharaiahAmazon 4681-4700 @@ -3038,9 +3038,9 @@ <fixed-case>D</fixed-case>y<fixed-case>PCL</fixed-case>: Dynamic Phoneme-level Contrastive Learning for Dysarthric Speech Recognition WonjunLee SoleeIm - HeejinDoPohang University of Science and Technology + HeejinDoPohang University of Science and Technology YunsuKimaiXplain, Inc. - JungseulOkPOSTECH + JungseulOkPOSTECH GaryLee 4701-4712 Dysarthric speech recognition often suffers from performance degradation due to the intrinsic diversity of dysarthric severity and extrinsic disparity from normal speech. To bridge these gaps, we propose a Dynamic Phoneme-level Contrastive Learning (DyPCL) method, which leads to obtaining invariant representations across diverse speakers. We decompose the speech utterance into phoneme segments for phoneme-level contrastive learning, leveraging dynamic connectionist temporal classification alignment. Unlike prior studies focusing on utterance-level embeddings, our granular learning allows discrimination of subtle parts of speech. In addition, we introduce dynamic curriculum learning, which progressively transitions from easy negative samples to difficult-to-distinguishable negative samples based on phonetic similarity of phoneme. Our approach to training by difficulty levels alleviates the inherent variability of speakers, better identifying challenging speeches. Evaluated on the UASpeech dataset, DyPCL outperforms baseline models, achieving an average 22.10% relative reduction in word error rate (WER) across the overall dysarthria group. @@ -3050,10 +3050,10 @@ Revisiting Early Detection of Sexual Predators via Turn-level Optimization JinMyeongAn - SangwonRyuPohang University of Science and Technology - HeejinDoPohang University of Science and Technology + SangwonRyuPohang University of Science and Technology + HeejinDoPohang University of Science and Technology YunsuKimaiXplain, Inc. - JungseulOkPOSTECH + JungseulOkPOSTECH GaryLee 4713-4724 Online grooming is a severe social threat where sexual predators gradually entrap child victims with subtle and gradual manipulation. Therefore, timely intervention for online grooming is critical for proactive protection. However, previous methods fail to determine the optimal intervention points (i.e., jump to conclusions) as they rely on chat-level risk labels by causing weak supervision of risky utterances. For timely detection, we propose speed control reinforcement learning (SCoRL), incorporating a practical strategy derived from luring communication theory (LCT). To capture the predator’s turn-level entrapment, we use a turn-level risk label based on the LCT. Then, we design a novel speed control reward function that balances the trade-off between speed and accuracy based on turn-level risk label; thus, SCoRL can identify the optimal intervention moment. In addition, we introduce a turn-level metric for precise evaluation, identifying limitations in previously used chat-level metrics. Experimental results show that SCoRL effectively preempted online grooming, offering a more proactive and timely solution. Further analysis reveals that our method enhances performance while intuitively identifying optimal early intervention points. 
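A toy rendering of the speed-accuracy trade-off behind the SCoRL entry above; the linear delay penalty and its constant are assumptions for illustration, not the paper's reward definition:

    def speed_control_reward(correct: bool, turn: int, first_risky_turn: int,
                             delay_penalty: float = 0.05) -> float:
        # Reward a correct intervention, but discount it by how many turns
        # elapsed after the first turn-level risk label appeared, so the
        # policy is pushed toward earlier (but still accurate) decisions.
        base = 1.0 if correct else -1.0
        delay = max(turn - first_risky_turn, 0)
        return base - delay_penalty * delay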
@@ -3087,8 +3087,8 @@
 <fixed-case>R</fixed-case>each<fixed-case>A</fixed-case>gent: Enhancing Mobile Agent via Page Reaching and Operation
-Qinzhuo Wu
-Wei Liu
+Qinzhuo Wu
+Wei Liu
 Jian Luan (Xiaomi Corporation)
 Bin Wang (AI Lab, Xiaomi Inc.)
 4760-4775
@@ -3099,12 +3099,12 @@
 Learning to Solve Domain-Specific Calculation Problems with Knowledge-Intensive Programs Generator
 Chengyuan Liu
-Shihang Wang (Alibaba Group)
+Shihang Wang (Alibaba Group)
 Lizhi Qing (Alibaba Group)
 Jun Lin
 Ji Zhang (Alibaba Group)
 Fei Wu (Zhejiang University)
-Kun Kuang (Zhejiang University)
+Kun Kuang (Zhejiang University)
 4776-4791
 Domain Large Language Models (LLMs) are developed for domain-specific tasks based on general LLMs. But it still requires professional knowledge to facilitate the expertise for some domain-specific tasks. In this paper, we investigate into knowledge-intensive calculation problems. We find that the math problems to be challenging for LLMs, when involving complex domain-specific rules and knowledge documents, rather than simple formulations of terminologies. Therefore, we propose a pipeline to solve the domain-specific calculation problems with Knowledge-Intensive Programs Generator more effectively, named as KIPG. It generates knowledge-intensive programs according to the domain-specific documents. For each query, key variables are extracted, then outcomes which are dependent on domain knowledge are calculated with the programs. By iterative preference alignment, the code generator learns to improve the logic consistency with the domain knowledge. Taking legal domain as an example, we have conducted experiments to prove the effectiveness of our pipeline, and extensive analysis on the modules. We also find that the code generator is also adaptable to other domains, without training on the new knowledge.
 2025.naacl-long.245
@@ -3112,9 +3112,9 @@
 <fixed-case>SLIM</fixed-case>: Let <fixed-case>LLM</fixed-case> Learn More and Forget Less with Soft <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> and Identity Mixture
-Jiayi Han (Inspur Group Co, Ltd)
+Jiayi Han (Inspur Group Co, Ltd)
 Liang Du (Tencent)
-Hongwei Du
+Hongwei Du
 Xiangguo Zhou
 Yiwen Wu
 Yuanfang Zhang
@@ -3131,18 +3131,18 @@
 Liang Chen
 Taian Guo
 Fu Zeng
-Yusheng Zhao
+Yusheng Zhao
 Bohan Wu
 Ye Yuan
-Haozhe Zhao
+Haozhe Zhao
 Zhihui Guo
 Yichi Zhang
 Jingyang Yuan
-Wei Ju (Sichuan University)
+Wei Ju (Sichuan University)
 Luchen Liu
 Tianyu Liu
-Baobao Chang (Peking University)
-Ming Zhang (Peking University)
+Baobao Chang (Peking University)
+Ming Zhang (Peking University)
 4805-4822
 Large Multimodal Models (LMMs) exhibit impressive cross-modal understanding and reasoning abilities, often assessed through multiple-choice questions (MCQs) that include an image, a question, and several options. However, many benchmarks used for such evaluations suffer from systematic biases. Remarkably, Large Language Models (LLMs) without any visual perception capabilities achieve non-trivial performance, undermining the credibility of these evaluations. To address this issue while maintaining the efficiency of MCQ evaluations, we propose MMEVALPRO, a benchmark designed to avoid Type-I errors through a trilogy evaluation pipeline and more rigorous metrics. For each original question from existing benchmarks, human annotators augment it by creating one perception question and one knowledge anchor question through a meticulous annotation process. MMEVALPRO comprises 2,138 question triplets, totaling 6,414 distinct questions.
 Two-thirds of these questions are manually labeled by human experts, while the rest are sourced from existing benchmarks (MMMU, ScienceQA, and MathVista). Compared with the existing benchmarks, our experiments with the latest LLMs and LMMs demonstrate that MMEVALPRO is **more challenging** (the best LMM lags behind human performance by 31.73%, compared to an average gap of 8.03% in previous benchmarks) and **more trustworthy** (the best LLM trails the best LMM by 23.09%, whereas the gap for previous benchmarks is just 14.64%). Our in-depth analysis explains the reason for the large performance gap and justifies the trustworthiness of evaluation, underscoring its significant potential for advancing future research.
 2025.naacl-long.247
@@ -3151,10 +3151,10 @@
 <fixed-case>M</fixed-case>i<fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case>: Harnessing Minor Singular Components for Parameter-Efficient <fixed-case>LLM</fixed-case> Finetuning
 Hanqing Wang
-Yixia Li
-Shuo Wang
-Guanhua Chen (Southern University of Science and Technology)
-Yun Chen (Shanghai University of Finance and Economics)
+Yixia Li
+Shuo Wang
+Guanhua Chen (Southern University of Science and Technology)
+Yun Chen (Shanghai University of Finance and Economics)
 4823-4836
 Efficient finetuning of large language models (LLMs) aims to adapt the LLMs with reduced computational and memory costs. Previous LoRA-based approaches initialize the low-rank matrices with Gaussian distribution and zero values while keeping the original weight matrices frozen. However, the trainable model parameters optimized in an unguided subspace might interfere with the well-learned subspace of the pretrained weight matrices. In this paper, we propose MiLoRA, a simple yet effective LLM finetuning approach that only updates the minor singular components of the weight matrix while keeping the principal singular components frozen. It is observed that the minor matrix corresponds to the noisy or long-tail information, while the principal matrix contains important knowledge. The MiLoRA initializes the low-rank matrices within a subspace that is orthogonal to the principal matrix, thus the pretrained knowledge is expected to be well preserved. During finetuning, MiLoRA makes the most use of the less-optimized subspace for learning the labeled dataset. Extensive experiments on commonsense reasoning, math reasoning, instruction following and visual instruction following benchmarks present the superior performance of our method.
 2025.naacl-long.248
@@ -3163,7 +3163,7 @@
 Analyzing (In)Abilities of <fixed-case>SAE</fixed-case>s via Formal Languages
 Abhinav Menon
-Manish Shrivastava (International Institute of Information Technology Hyderabad, India)
+Manish Shrivastava (International Institute of Information Technology Hyderabad, India)
 David Krueger
 Ekdeep Singh Lubana (Harvard University, Harvard University)
 4837-4862
@@ -3173,9 +3173,9 @@
 Multimodal Cognitive Reframing Therapy via Multi-hop Psychotherapeutic Reasoning
-Subin Kim (Pohang University of Science and Technology)
-Hoonrae Kim
-Heejin Do (Pohang University of Science and Technology)
+Subin Kim (Pohang University of Science and Technology)
+Hoonrae Kim
+Heejin Do (Pohang University of Science and Technology)
 Gary Lee
 4863-4880
 Previous research has revealed the potential of large language models (LLMs) to support cognitive reframing therapy; however, their focus was primarily on text-based methods, often overlooking the importance of non-verbal evidence crucial in real-life therapy.
 To alleviate this gap, we extend the textual cognitive reframing to multimodality, incorporating visual clues. Specifically, we present a new dataset called Multi Modal-Cognitive Support Conversation (M2CoSC), which pairs each GPT-4-generated dialogue with an image that reflects the virtual client’s facial expressions.To better mirror real psychotherapy, where facial expressions lead to interpreting implicit emotional evidence, we propose a multi-hop psychotherapeutic reasoning approach that explicitly identifies and incorporates subtle evidence. Our comprehensive experiments with both LLMs and vision-language models (VLMs) demonstrate that the VLMs’ performance as psychotherapists is significantly improved with the M2CoSC dataset. Furthermore, the multi-hop psychotherapeutic reasoning method enables VLMs to provide more thoughtful and empathetic suggestions, outperforming standard prompting methods.
@@ -3184,10 +3184,10 @@
 Explanation based In-Context Demonstrations Retrieval for Multilingual Grammatical Error Correction
-Wei Li
-Wen Luo (Peking University)
-Guangyue Peng
-Houfeng Wang
+Wei Li
+Wen Luo (Peking University)
+Guangyue Peng
+Houfeng Wang
 4881-4897
 Grammatical error correction (GEC) aims to correct grammatical, spelling, and semantic errors in natural language text. With the growing of large language models (LLMs), direct text generation has gradually become the focus of the GEC methods, and few-shot in-context learning presents a cost-effective solution. However, selecting effective in-context examples remains challenging, as the similarity between input texts does not necessarily correspond to similar grammatical error patterns. In this paper, we propose a novel retrieval method based on natural language grammatical error explanations (GEE) to address this issue. Our method retrieves suitable few-shot demonstrations by matching the GEE of the test input with that of pre-constructed database samples, where explanations for erroneous samples are generated by LLMs. We conducted multilingual GEC few-shot experiments on both major open-source and closed-source LLMs. Experiments across five languages show that our method outperforms existing semantic and BM25-based retrieval techniques, without requiring additional training or language adaptation. This also suggests that matching error patterns is key to selecting examples. Our code and the constructed database will be publicly available after the paper is published.
 2025.naacl-long.251
@@ -3195,11 +3195,11 @@
 A Unified Supervised and Unsupervised Dialogue Topic Segmentation Framework Based on Utterance Pair Modeling
-Shihao Yang
-Ziyi Zhang
-Yue Jiang
-Chunsheng Qin
-Shuhua Liu
+Shihao Yang
+Ziyi Zhang
+Yue Jiang
+Chunsheng Qin
+Shuhua Liu
 4898-4908
 The Dialogue Topic Segmentation task aims to divide a dialogue into different topic paragraphs in order to better understand the structure and content of the dialogue. Due to the short sentences, serious references and non-standard language in the dialogue, it is difficult to determine the boundaries of the topic. Although the unsupervised approaches based on LLMs performs well, it is still difficult to surpass the supervised methods based on classical models in specific domains. To this end, this paper proposes UPS (Utterance Pair Segment), a dialogue topic segmentation method based on utterance pair relationship modeling, unifying the supervised and unsupervised network architectures.
 For supervised pre-training, the model predicts the adjacency and topic affiliation of utterances in dialogues. For unsupervised pre-training, the dialogue-level and utterance-level relationship prediction tasks are used to train the model. The pre-training and fine-tuning strategies are carried out in different scenarios, such as supervised, few-shot, and unsupervised data. By adding a domain adapter and a task adapter to the Transformer, the model learns in the pre-training and fine-tuning stages, respectively, which significantly improves the segmentation effect. As the result, the proposed method has achieved the best results on multiple benchmark datasets across various scenarios.
 2025.naacl-long.252
@@ -3207,11 +3207,11 @@
 Evaluating Small Language Models for News Summarization: Implications and Factors Influencing Performance
-Borui Xu (Shandong University)
-Yao Chen (National University of Singapore)
-Zeyi Wen (Hong Kong University of Science and Technology (Guangzhou))
-Weiguo Liu
-Bingsheng He (National University of Singapore)
+Borui Xu (Shandong University)
+Yao Chen (National University of Singapore)
+Zeyi Wen (Hong Kong University of Science and Technology (Guangzhou))
+Weiguo Liu
+Bingsheng He (National University of Singapore)
 4909-4922
 The increasing demand for efficient summarization tools in resource-constrained environments highlights the need for effective solutions. While large language models (LLMs) deliver superior summarization quality, their high computational resource requirements limit practical use applications. In contrast, small language models (SLMs) present a more accessible alternative, capable of real-time summarization on edge devices. However, their summarization capabilities and comparative performance against LLMs remain underexplored. This paper addresses this gap by presenting a comprehensive evaluation of 19 SLMs for news summarization across 2,000 news samples, focusing on relevance, coherence, factual consistency, and summary length. Our findings reveal significant variations in SLM performance, with top-performing models such as Phi3-Mini and Llama3.2-3B-Ins achieving results comparable to those of 70B LLMs while generating more concise summaries. Notably, SLMs are better suited for simple prompts, as overly complex prompts may lead to a decline in summary quality. Additionally, our analysis indicates that instruction tuning does not consistently enhance the news summarization capabilities of SLMs. This research not only contributes to the understanding of SLMs but also provides practical insights for researchers seeking efficient summarization solutions that balance performance and resource use.
 2025.naacl-long.253
@@ -3221,7 +3221,7 @@
 Dynamic Fisher-weighted Model Merging via <fixed-case>B</fixed-case>ayesian Optimization
 Sanwoo Lee (Peking University)
 Jiahao Liu (Meituan)
-Qifan Wang (Meta AI)
+Qifan Wang (Meta AI)
 Jingang Wang (Meituan)
 Xunliang Cai (Meituan)
 Yunfang Wu
 4923-4935
 2025.naacl-long.254
@@ -3233,7 +3233,7 @@
 <fixed-case>AI</fixed-case>-Assisted Human Evaluation of Machine Translation
 Vilém Zouhar (Department of Computer Science, ETHZ - ETH Zurich)
-Tom Kocmi (Cohere)
+Tom Kocmi (Cohere)
 Mrinmaya Sachan (Swiss Federal Institute of Technology)
 4936-4950
 2025.naacl-long.255
@@ -3244,7 +3244,7 @@
 Wentao Ge
 Shunian Chen (Shenzhen Research Institute of Big Data)
 Hardy Chen (University of Texas at Dallas)
-Nuo Chen (National University of Singapore and The Chinese University of Hong Kong, Shenzhen)
+Nuo Chen (National University of Singapore and The Chinese University of Hong Kong, Shenzhen)
 Junying Chen
 Zhihong Chen (Stanford University)
 Wenya Xie (University of Minnesota - Twin Cities)
@@ -3257,7 +3257,7 @@
 Zhang Zhiyi
 Jianquan Li
 Xiang Wan (Shenzhen Research Institute of Big Data)
-Benyou Wang (The Chinese University of Hong Kong, Shenzhen)
+Benyou Wang (The Chinese University of Hong Kong, Shenzhen)
 4951-4974
 Multimodal large language models (MLLMs) have broadened the scope of AI applications. Existing automatic evaluation methodologies for MLLMs are mainly limited in evaluating objective queries without considering real-world user experiences, inadequately addressing the nuances of creative and associative multimodal tasks. However, the open-ended and subjective nature of such tasks poses a significant challenge to the evaluation methodology, where it is difficult to define the ground-truth answers for them. To this end, in our paper, we propose a new evaluation paradigm for MLLMs, which is evaluating MLLMs with per-sample criteria using potent MLLM as the judge. To validate the feasibility and effectiveness of this paradigm, we design a benchmark, dubbed MLLM-Bench, by curating the evaluation samples across six comprehensive cognitive levels. We benchmark 26 popular MLLMs in a pairwise-comparison fashion, showing diverse performance across models. Moreover, the validity of our benchmark manifests itself in reaching 88.02% agreement with human evaluation. We contend that the proposed paradigm explores the potential of MLLMs as effective evaluation tools with the help of per-sample criteria.
 2025.naacl-long.256
@@ -3266,7 +3266,7 @@
 <fixed-case>A</fixed-case>gent<fixed-case>S</fixed-case>ense: Benchmarking Social Intelligence of Language Agents through Interactive Scenarios
 Xinyi Mou
-Jingcong Liang (Fudan University)
+Jingcong Liang (Fudan University)
 Jiayu Lin (Fudan University)
 Xinnong Zhang (Fudan University)
 Xiawei Liu
@@ -3274,7 +3274,7 @@
 Rong Ye (ByteDance)
 Lei Chen
 Haoyu Kuang (Fudan University)
-Xuanjing Huang (Fudan University)
+Xuanjing Huang (Fudan University)
 Zhongyu Wei (Fudan University)
 4975-5001
 Large language models (LLMs) are increasingly leveraged to empower autonomous agents to simulate human beings in various fields of behavioral research. However, evaluating their capacity to navigate complex social interactions remains a challenge. Previous studies face limitations due to insufficient scenario diversity, complexity, and a single-perspective focus. To this end, we introduce AgentSense: Benchmarking Social Intelligence of Language Agents through Interactive Scenarios. Drawing on Dramaturgical Theory, AgentSense employs a bottom-up approach to create 1,225 diverse social scenarios constructed from extensive scripts.
 We evaluate LLM-driven agents through multi-turn interactions, emphasizing both goal completion and implicit reasoning. We analyze goals using ERG theory and conduct comprehensive experiments. Our findings highlight that LLMs struggle with goals in complex social scenarios, especially high-level growth needs, and even GPT-4o requires improvement in private information reasoning.
@@ -3317,7 +3317,7 @@
 Hao Li
 Yan Zhang (Tencent)
 Yucheng Huang
-Jun Lang
+Jun Lang
 Wenqiang Liu (Tencent)
 5041-5053
 Aspect Sentiment Triplet Extraction (ASTE) is a thriving research area with impressive outcomes being achieved on high-resource languages. However, the application of cross-lingual transfer to the ASTE task has been relatively unexplored, and current code-switching methods still suffer from term boundary detection issues and out-of-dictionary problems. In this study, we introduce a novel Test-Time Code-SWitching (TT-CSW) framework, which bridges the gap between the bilingual training phase and the monolingual test-time prediction. During training, a generative model is developed based on bilingual code-switched training data and can produce bilingual ASTE triplets for bilingual inputs. In the testing stage, we employ an alignment-based code-switching technique for test-time augmentation. Extensive experiments on cross-lingual ASTE datasets validate the effectiveness of our proposed method. We achieve an average improvement of 3.7% in terms of weighted-averaged F1 in four datasets with different languages. Additionally, we set a benchmark using ChatGPT and GPT-4, and demonstrate that even smaller generative models fine-tuned with our proposed TT-CSW framework surpass ChatGPT and GPT-4 by 14.2% and 5.0% respectively.
@@ -3329,32 +3329,32 @@
 Xiaoman Wang
 Dan Yuan (East China Normal University)
 Xin Liu (East China Normal University)
-Yike Zhao
+Yike Zhao
 Xiaoxiao Zhang (East China Normal University)
 Xizhi Chen
-Yunshi Lan (East China Normal University)
+Yunshi Lan (East China Normal University)
 5054-5068
 2025.naacl-long.261
 wang-etal-2025-viscgec
 Are We Done with <fixed-case>MMLU</fixed-case>?
-Aryo Pradipta Gema (Anthropic and University of Edinburgh, University of Edinburgh)
+Aryo Pradipta Gema (Anthropic and University of Edinburgh, University of Edinburgh)
 Joshua Ong Jun Leang
 Giwon Hong (University of Edinburgh, University of Edinburgh)
-Alessio Devoto
-Alberto Carlo Maria Mancino
+Alessio Devoto
+Alberto Carlo Maria Mancino
 Rohit Saxena (University of Edinburgh, University of Edinburgh)
 Xuanli He (University College London, University of London)
-Yu Zhao (University of Edinburgh)
+Yu Zhao (University of Edinburgh)
 Xiaotang Du
 Mohammad Reza Ghasemi Madani (University of Trento)
 Claire Barale
 Robert McHardy (AssemblyAI)
 Joshua Harris (UK Health Security Agency)
 Jean Kaddour
-Emile Van Krieken (Edinburgh University, University of Edinburgh)
-Pasquale Minervini (University of Edinburgh, University of Edinburgh)
+Emile Van Krieken (Edinburgh University, University of Edinburgh)
+Pasquale Minervini (University of Edinburgh, University of Edinburgh)
 5069-5096
 Maybe not. We identify and analyse errors in the popular Massive Multitask Language Understanding (MMLU) benchmark. Even though MMLU is widely adopted, our analysis demonstrates numerous ground truth errors that obscure the true capabilities of LLMs. For example, we find that 57% of the analysed questions in the Virology subset contain errors. To address this issue, we introduce a comprehensive framework for identifying dataset errors using a novel error annotation protocol.
 Then, we create MMLU-Redux, which is a subset of 5,700 manually re-annotated questions across all 57 MMLU subjects. Using MMLU-Redux, we demonstrate significant discrepancies with the model performance metrics that were originally reported. Our results strongly advocate for revising MMLU’s error-ridden questions to enhance its future utility and reliability as a benchmark. Therefore, we open up MMLU-Redux for additional annotation.
 2025.naacl-long.262
@@ -3366,7 +3366,7 @@
 Shaohang Wei (Peking University)
 Xu Wang
 Kui Xue (Shanghai Artificial Intelligence Laboratory)
-Shaoting Zhang (Shanghai Artificial Intelligence Laboratory)
+Shaoting Zhang (Shanghai Artificial Intelligence Laboratory)
 Xiaofan Zhang (Shanghai Jiaotong University)
 5097-5116
 Integrating tools into Large Language Models (LLMs) has facilitated the widespread application. Despite this, in specialized downstream task contexts, reliance solely on tools is insufficient to fully address the complexities of the real world. This particularly restricts the effective deployment of LLMs in fields such as medicine. In this paper, we focus on the downstream tasks of medical calculators, which use standardized tests to assess an individual’s health status. We introduce MeNTi, a universal agent architecture for LLMs. MeNTi integrates a specialized medical toolkit and employs meta-tool and nested calling mechanisms to enhance LLM tool utilization. Specifically, it achieves flexible tool selection and nested tool calling to address practical issues faced in intricate medical scenarios, including calculator selection, slot filling, and unit conversion. To assess the capabilities of LLMs for quantitative assessment throughout the clinical process of calculator scenarios, we introduce CalcQA. This benchmark requires LLMs to use medical calculators to perform calculations and assess patient health status. CalcQA is constructed by professional physicians and includes 100 case-calculator pairs, complemented by a toolkit of 281 medical tools. The experimental results demonstrate significant performance improvements with our framework. This research paves new directions for applying LLMs in demanding scenarios of medicine.
@@ -3375,15 +3375,15 @@
 Steering Knowledge Selection Behaviours in <fixed-case>LLM</fixed-case>s via <fixed-case>SAE</fixed-case>-Based Representation Engineering
-Yu Zhao (University of Edinburgh)
-Alessio Devoto
+Yu Zhao (University of Edinburgh)
+Alessio Devoto
 Giwon Hong (University of Edinburgh, University of Edinburgh)
 Xiaotang Du
-Aryo Pradipta Gema (Anthropic and University of Edinburgh, University of Edinburgh)
-Hongru Wang (The Chinese University of Hong Kong)
+Aryo Pradipta Gema (Anthropic and University of Edinburgh, University of Edinburgh)
+Hongru Wang (The Chinese University of Hong Kong)
 Xuanli He (University College London, University of London)
-Kam-Fai Wong (The Chinese University of Hong Kong)
-Pasquale Minervini (University of Edinburgh, University of Edinburgh)
+Kam-Fai Wong (The Chinese University of Hong Kong)
+Pasquale Minervini (University of Edinburgh, University of Edinburgh)
 5117-5136
 Large language models (LLMs) can store a significant amount of factual knowledge in their parameters. However, their parametric knowledge may conflict with the information provided in the context—this phenomenon, known as context-memory knowledge conflicts, can lead to undesirable model behaviour, such as reliance on outdated or incorrect information.
 Analysing the internal activations of LLMs, we find that they can internally register the signals of knowledge conflict at mid-layers. Such signals allow us to detect whether a knowledge conflict occurs and use inference-time intervention strategies to resolve it. In this work, we propose SpARE, a training-free representation engineering method that uses pre-trained sparse auto-encoders (SAEs) to control the knowledge selection behaviour of LLMs. SpARE identifies the functional features that control the knowledge selection behaviours and applies them to edit the internal activations of LLMs at inference time. Our experimental results show that SpARE can effectively control the usage of either knowledge source to resolve knowledge conflict in open-domain question-answering tasks, surpassing existing representation engineering methods (+10%) as well as contrastive decoding methods (+15%).
 2025.naacl-long.264
@@ -3393,13 +3393,13 @@
 <fixed-case>M</fixed-case>o<fixed-case>D</fixed-case>ification: Mixture of Depths Made Easy
 Chen Zhang (Beijing Institute of Technology)
 Meizhi Zhong
-Qimeng Wang (Xiaohongshu)
+Qimeng Wang (Xiaohongshu)
 Xuantao Lu
 Zheyu Ye (Xiaohongshu Inc)
 Chengqiang Lu (Xiaohongshu)
-Yan Gao
+Yan Gao
 Yao Hu (Xiaohongshu)
-Kehai Chen (Harbin Institute of Technology (Shenzhen))
+Kehai Chen (Harbin Institute of Technology (Shenzhen))
 Min Zhang (Harbin Institute of Technology, Shenzhen)
 Dawei Song (Beijing Institute of Technology and Open University)
 5137-5149
@@ -3410,12 +3410,12 @@
 On the Vulnerability of Text Sanitization
 Meng Tong
-Kejiang Chen (University of Science and Technology of China)
+Kejiang Chen (University of Science and Technology of China)
 Xiaojian Yuan (University of Science and Technology of China)
 Jiayang Liu (Nanyang Technological University)
-Weiming Zhang (University of Science and Technology of China)
+Weiming Zhang (University of Science and Technology of China)
 Nenghai Yu (University of Science and Technology of China)
-Jie Zhang (A*STAR)
+Jie Zhang (A*STAR)
 5150-5164
 Text sanitization, which employs differential privacy to replace sensitive tokens with new ones, represents a significant technique for privacy protection. Typically, its performance in preserving privacy is evaluated by measuring the attack success rate (ASR) of reconstruction attacks, where attackers attempt to recover the original tokens from the sanitized ones. However, current reconstruction attacks on text sanitization are developed empirically, making it challenging to accurately assess the effectiveness of sanitization. In this paper, we aim to provide a more accurate evaluation of sanitization effectiveness. Inspired by the works of Palamidessi et al., we implement theoretically optimal reconstruction attacks targeting text sanitization. We derive their bounds on ASR as benchmarks for evaluating sanitization performance. For real-world applications, we propose two practical reconstruction attacks based on these theoretical findings. Our experimental results underscore the necessity of reassessing these overlooked risks. Notably, one of our attacks achieves a 46.4% improvement in ASR over the state-of-the-art baseline, with a privacy budget of \epsilon=4.0 on the SST-2 dataset. Our code is available at: https://github.com/mengtong0110/On-the-Vulnerability-of-Text-Sanitization.
 2025.naacl-long.266
@@ -3426,7 +3426,7 @@
 Amey Hengle
 Prasoon Bajpai
 Soham Dan (Microsoft)
-Tanmoy Chakraborty (Indian Institute of Technology, Delhi)
+Tanmoy Chakraborty (Indian Institute of Technology, Delhi)
 5165-5180
 While recent large language models (LLMs) demonstrate remarkable abilities in responding to queries in diverse languages, their ability to handle long multilingual contexts is unexplored. As such, a systematic evaluation of the long-context capabilities of LLMs in multilingual settings is crucial, specifically in the context of information retrieval. To address this gap, we introduce the MultiLingual Needle-in-a-Haystack (MLNeedle) test, designed to assess a model’s ability to retrieve relevant information (the needle) from a collection of multilingual distractor texts (the haystack). This test serves as an extension of the multilingual question-answering task, encompassing both monolingual and cross-lingual retrieval. We evaluate four state-of-the-art LLMs on MLNeedle. Our findings reveal that model performance can vary significantly with language and needle position. Specifically, we observe that model performance is the lowest when the needle is (i) in a language outside the English language family, and (ii) located in the middle of the input context. Furthermore, although some models claim a context size of 8k tokens or greater, none demonstrate satisfactory cross-lingual retrieval performance as the context length increases. Our analysis provides key insights into the long-context behavior of LLMs in multilingual settings to guide future evaluation protocols. To our knowledge, this is the first study to investigate the multilingual long-context behavior of LLMs.
 2025.naacl-long.267
@@ -3436,7 +3436,7 @@
 Verify-in-the-Graph: Entity Disambiguation Enhancement for Complex Claim Verification with Interactive Graph Representation
 Hoang Pham (Viettel AI & Data Service Center)
 Thanh-Do Nguyen
-Khac-Hoai Nam Bui (Viettel Group)
+Khac-Hoai Nam Bui (Viettel Group)
 5181-5197
 Claim verification is a long-standing and challenging task that demands not only high accuracy but also explainability and thoroughness of the verification process. This task becomes an emerging research issue in the era of large language models (LLMs) since real-world claims are often complex, featuring intricate semantic structures or obfuscated entities. Traditional approaches typically address this by decomposing claims into sub-claims and querying a knowledge base to resolve hidden or ambiguous entities. However, the absence of effective disambiguation strategies for these entities can compromise the entire verification process. To address these challenges, we propose Verify-in-the-Graph (VeGraph), a novel framework leveraging the reasoning and comprehension abilities of LLM agents. VeGraph operates in three phases: (1) Graph Representation - an input claim is decomposed into structured triplets, forming a graph-based representation that integrates both structured and unstructured information; (2) Entity Disambiguation -VeGraph iteratively interacts with the knowledge base to resolve ambiguous entities within the graph for deeper sub-claim verification; and (3) Verification - remaining triplets are verified to complete the fact-checking process. Experiments using Meta-Llama-3-70B (instruct version) show that VeGraph achieves competitive performance compared to baselines across benchmarks (HoVer and FEVEROUS), effectively addressing claim verification challenges. Our source code and data are available for further exploitation.
 2025.naacl-long.268
@@ -3445,9 +3445,9 @@
 Exploring the Potential of Large Language Models for Heterophilic Graphs
 Yuxia Wu (Singapore Management University)
-Shujie Li (Beijing University of Posts and Telecommunications)
-Yuan Fang (Singapore Management University)
-Chuan Shi (Beijing University of Post and Telecommunication, Tsinghua University)
+Shujie Li (Beijing University of Posts and Telecommunications)
+Yuan Fang (Singapore Management University)
+Chuan Shi (Beijing University of Post and Telecommunication, Tsinghua University)
 5198-5211
 Large language models (LLMs) have presented significant opportunities to enhance various machine learning applications, including graph neural networks (GNNs). By leveraging the vast open-world knowledge within LLMs, we can more effectively interpret and utilize textual data to better characterize heterophilic graphs, where neighboring nodes often have different labels. However, existing approaches for heterophilic graphs overlook the rich textual data associated with nodes, which could unlock deeper insights into their heterophilic contexts. In this work, we explore the potential of LLMs for modeling heterophilic graphs and propose a novel two-stage framework: LLM-enhanced edge discriminator and LLM-guided edge reweighting. In the first stage, we fine-tune the LLM to better identify homophilic and heterophilic edges based on the textual content of their nodes. In the second stage, we adaptively manage message propagation in GNNs for different edge types based on node features, structures, and heterophilic or homophilic characteristics. To cope with the computational demands when deploying LLMs in practical scenarios, we further explore model distillation techniques to fine-tune smaller, more efficient models that maintain competitive performance. Extensive experiments validate the effectiveness of our framework, demonstrating the feasibility of using LLMs to enhance node classification on heterophilic graphs.
 2025.naacl-long.269
@@ -3466,10 +3466,10 @@
 <fixed-case>DIRAS</fixed-case>: Efficient <fixed-case>LLM</fixed-case> Annotation of Document Relevance for Retrieval Augmented Generation
 Jingwei Ni (ETHZ - ETH Zurich)
-Tobias Schimanski
+Tobias Schimanski
 Meihong Lin
 Mrinmaya Sachan (Swiss Federal Institute of Technology)
-Elliott Ash (Swiss Federal Institute of Technology)
+Elliott Ash (Swiss Federal Institute of Technology)
 Markus Leippold (University of Zurich)
 5238-5258
 Retrieval Augmented Generation (RAG) is widely employed to ground responses to queries on domain-specific documents. But do RAG implementations leave out important information when answering queries that need an integrated analysis of information (e.g., Tell me good news in the stock market today.)? To address these concerns, RAG developers need to annotate information retrieval (IR) data for their domain of interest, which is challenging because (1) domain-specific queries usually need nuanced definitions of relevance beyond shallow semantic relevance; and (2) human or GPT-4 annotation is costly and cannot cover all (query, document) pairs (i.e., annotation selection bias), thus harming the effectiveness in evaluating IR recall. To address these challenges, we propose DIRAS (**D**omain-specific **I**nformation **R**etrieval **A**nnotation with **S**calability), a manual-annotation-free schema that fine-tunes open-sourced LLMs to consider nuanced relevance definition and annotate (partial) relevance labels with calibrated relevance scores.
 Extensive evaluation shows that DIRAS enables smaller (8B) LLMs to achieve GPT-4-level performance on annotating and ranking unseen (query, document) pairs, and is helpful for real-world RAG development.
 2025.naacl-long.271
@@ -3478,12 +3478,12 @@
 Hello Again! <fixed-case>LLM</fixed-case>-powered Personalized Agent for Long-term Dialogue
-Hao Li
+Hao Li
 Chenghao Yang
 An Zhang (National University of Singapore)
 Yang Deng (Singapore Management University)
-Xiang Wang (University of Science and Technology of China)
-Tat-Seng Chua (National University of Singapore)
+Xiang Wang (University of Science and Technology of China)
+Tat-Seng Chua (National University of Singapore)
 5259-5276
 Open-domain dialogue systems have seen remarkable advancements with the development of large language models (LLMs). Nonetheless, most existing dialogue systems predominantly focus on brief single-session interactions, neglecting the real-world demands for long-term companionship and personalized interactions with chatbots. Crucial to addressing this real-world need are event summary and persona management, which enable reasoning for appropriate long-term dialogue responses. Recent progress in the human-like cognitive and reasoning capabilities of LLMs suggests that LLM-based agents could significantly enhance automated perception, decision-making, and problem-solving. In response to this potential, we introduce a model-agnostic framework, the Long-term Dialogue Agent (LD-Agent), which incorporates three independently tunable modules dedicated to event perception, persona extraction, and response generation. For the event memory module, long and short-term memory banks are employed to separately focus on historical and ongoing sessions, while a topic-based retrieval mechanism is introduced to enhance the accuracy of memory retrieval. Furthermore, the persona module conducts dynamic persona modeling for both users and agents. The integration of retrieved memories and extracted personas is subsequently fed into the generator to induce appropriate responses. The effectiveness, generality, and cross-domain capabilities of LD-Agent are empirically demonstrated across various illustrative benchmarks, models, and tasks. The code is released at https://github.com/leolee99/LD-Agent.
 2025.naacl-long.272
@@ -3493,7 +3493,7 @@
 My <fixed-case>LLM</fixed-case> might Mimic <fixed-case>AAE</fixed-case> - But When Should It?
 Sandra Camille Sandoval
 Christabel Acquaye (University of Maryland, College Park)
-Kwesi Adu Cobbina (University of Maryland, College Park)
+Kwesi Adu Cobbina (University of Maryland, College Park)
 Mohammad Nayeem Teli (University of Maryland, College Park)
 Hal Daumé Iii (University of Maryland, College Park)
 5277-5302
@@ -3503,13 +3503,13 @@
 High-Dimension Human Value Representation in Large Language Models
-Samuel Cahyawijaya (Cohere)
+Samuel Cahyawijaya (Cohere)
 Delong Chen (Hong Kong University of Science and Technology)
 Yejin Bang
-Leila Khalatbari
-Bryan Wilie
-Ziwei Ji
-Etsuko Ishii (Amazon)
+Leila Khalatbari
+Bryan Wilie
+Ziwei Ji
+Etsuko Ishii (Amazon)
 Pascale Fung (HKUST)
 5303-5330
 The widespread application of Large Language Models (LLMs) across various tasks and fields has necessitated the alignment of these models with human values and preferences. Given various approaches of human value alignment, such as Reinforcement Learning with Human Feedback (RLHF), constitutional learning, and safety fine-tuning etc., there is an urgent need to understand the scope and nature of human values injected into these LLMs before their deployment and adoption.
 We propose UniVar, a high-dimensional neural representation of symbolic human value distributions in LLMs, orthogonal to model architecture and training data. This is a continuous and scalable representation, self-supervised from the value-relevant output of 8 LLMs and evaluated on 15 open-source and commercial LLMs. Through UniVar, we visualize and explore how LLMs prioritize different values in 25 languages and cultures, shedding light on the complex interplay between human values and language modeling.
@@ -3519,8 +3519,8 @@
 Not all Hallucinations are Good to Throw Away When it Comes to Legal Abstractive Summarization
 Nihed Bendahman
-Karen Pinel-Sauvagnat (IRIT - Université Paul Sabatier)
-Gilles Hubert (IRIT, University of Toulouse)
+Karen Pinel-Sauvagnat (IRIT - Université Paul Sabatier)
+Gilles Hubert (IRIT, University of Toulouse)
 Mokhtar Boumedyen Billami
 5331-5344
 Automatic summarization of legal documents requires a thorough understanding of their specificities, mainly with respect to the vocabulary used by legal experts. Indeed, the latter rely heavily on their external knowledge when writing summaries, in order to contextualize the main entities of the source document. This leads to reference summaries containing many abstractions, that sota models struggle to generate. In this paper, we propose an entity-driven approach aiming at learning the model to generate factual hallucinations, as close as possible to the abstractions of the reference summaries. We evaluated our approach on two different datasets, with legal documents in English and French. Results show that our approach allows to reduce non-factual hallucinations and maximize both summary coverage and factual hallucinations at entity-level. Moreover, the overall quality of summaries is also improved, showing that guiding summarization with entities is a valuable solution for legal documents summarization.
@@ -3547,7 +3547,7 @@
 Token-based Decision Criteria Are Suboptimal in In-context Learning
-Hakaze Cho
+Hakaze Cho
 Yoshihiro Sakai (Japan Advanced Institute of Science and Technology)
 Mariko Kato
 Kenshiro Tanaka
@@ -3563,7 +3563,7 @@
 Amey Hengle
 Aswini Kumar Padhi
 Anil Bandhakavi
-Tanmoy Chakraborty (Indian Institute of Technology, Delhi)
+Tanmoy Chakraborty (Indian Institute of Technology, Delhi)
 5402-5419
 Counterspeech has emerged as a popular and effective strategy for combating online hate speech, sparking growing research interest in automating its generation using language models. However, the field still lacks standardised evaluation protocols and reliable automated evaluation metrics that align with human judgement. Current automatic evaluation methods, primarily based on similarity metrics, do not effectively capture the complex and independent attributes of counterspeech quality, such as contextual relevance, aggressiveness, or argumentative coherence. This has led to an increased dependency on labor-intensive human evaluations to assess automated counter-speech generation methods. To address these challenges, we introduce ‘CSEval‘, a novel dataset and framework for evaluating counterspeech quality across four dimensions: *contextual-relevance*, *aggressiveness*, *argument-coherence*, and *suitableness*. Furthermore, we propose *Auto-Calibrated COT for Counterspeech Evaluation* (‘Auto-CSEval‘), a prompt-based method with auto-calibrated chain-of-thoughts (CoT) for scoring counterspeech using large language models.
 Our experiments show that ‘Auto-CSEval‘ outperforms traditional metrics like ROUGE, METEOR, and BertScore in correlating with human judgement, indicating a significant improvement in automated counterspeech evaluation.
 2025.naacl-long.279
@@ -3573,7 +3573,7 @@
 Multilingual Machine Translation with Open Large Language Models at Practical Scale: An Empirical Study
 Menglong Cui (Xiaomi Corporation)
 Pengzhi Gao (Xiaomi Corporation)
-Wei Liu
+Wei Liu
 Jian Luan (Xiaomi Corporation)
 Bin Wang (AI Lab, Xiaomi Inc.)
 5420-5443
@@ -3585,7 +3585,7 @@
 <fixed-case>RAG</fixed-case> <fixed-case>LLM</fixed-case>s are Not Safer: A Safety Analysis of Retrieval-Augmented Generation for Large Language Models
 Bang An (University of Maryland, College Park)
 Shiyue Zhang (Bloomberg)
-Mark Dredze (Department of Computer Science, Whiting School of Engineering and Bloomberg)
+Mark Dredze (Department of Computer Science, Whiting School of Engineering and Bloomberg)
 5444-5474
 Efforts to ensure the safety of large language models (LLMs) include safety fine-tuning, evaluation, and red teaming.However, despite the widespread use of the Retrieval-Augmented Generation (RAG) framework, AI safety work focuses on standard LLMs, which means we know little about how RAG use cases change a model’s safety profile. We conduct a detailed comparative analysis of RAG and non-RAG frameworks with eleven LLMs. We find that RAG can make models less safe and change their safety profile. We explore the causes of this change and find that even combinations of safe models with safe documents can cause unsafe generations. In addition, we evaluate some existing red teaming methods for RAG settings and show that they are less effective than when used for non-RAG settings. Our work highlights the need for safety research and red-teaming methods specifically tailored for RAG LLMs.
 2025.naacl-long.281
@@ -3593,9 +3593,9 @@
 Evaluating Evidence Attribution in Generated Fact Checking Explanations
-Rui Xing (Mohamed bin Zayed University of Artificial Intelligence and University of Melbourne)
-Timothy Baldwin (Mohamed bin Zayed University of Artificial Intelligence and The University of Melbourne)
-Jey Han Lau (The University of Melbourne)
+Rui Xing (Mohamed bin Zayed University of Artificial Intelligence and University of Melbourne)
+Timothy Baldwin (Mohamed bin Zayed University of Artificial Intelligence and The University of Melbourne)
+Jey Han Lau (The University of Melbourne)
 5475-5496
 Automated fact-checking systems often struggle with trustworthiness, as their generated explanations can include hallucinations. In this work, we explore evidence attribution for fact-checking explanation generation. We introduce a novel evaluation protocol, citation masking and recovery, to assess attribution quality in generated explanations. We implement our protocol using both human annotators and automatic annotators and found that LLM annotation correlates with human annotation, suggesting that attribution assessment can be automated. Finally, our experiments reveal that: (1) the best-performing LLMs still generate explanations that are not always accurate in their attribution; and (2) human-curated evidence is essential for generating better explanations.
 2025.naacl-long.282
@@ -3605,10 +3605,10 @@
 <fixed-case>ETHIC</fixed-case>: Evaluating Large Language Models on Long-Context Tasks with High Information Coverage
 Taewhoo Lee (Korea University)
 Chanwoong Yoon (Korea University)
-Kyochul Jang (Korea University)
-Donghyeon Lee
-Minju Song
-Hyunjae Kim (Yale University)
+Kyochul Jang (Korea University)
+Donghyeon Lee
+Minju Song
+Hyunjae Kim (Yale University)
 Jaewoo Kang (Korea University)
 5497-5512
 Recent advancements in large language models (LLM) capable of processing extremely long texts highlight the need for a dedicated evaluation benchmark to assess their long-context capabilities. However, existing methods, like the needle-in-a-haystack test, do not effectively assess whether these models fully utilize contextual information, raising concerns about the reliability of current evaluation techniques. To thoroughly examine the effectiveness of existing benchmarks, we introduce a new metric called information coverage (IC), which quantifies the proportion of the input context necessary for answering queries. Our findings indicate that current benchmarks exhibit low IC; although the input context may be extensive, the actual usable context is often limited. To address this, we present ETHIC, a novel benchmark designed to assess LLMs’ ability to leverage the entire context. Our benchmark comprises 1,986 test instances spanning four long-context tasks with high IC scores in the domains of books, debates, medicine, and law. Our evaluations reveal significant performance drops in contemporary LLMs, highlighting a critical challenge in managing long contexts. Our benchmark is available at https://github.com/dmis-lab/ETHIC.
@@ -3619,8 +3619,8 @@
 Aggregation Artifacts in Subjective Tasks Collapse Large Language Models’ Posteriors
 Georgios Chochlakis (University of Southern California)
 Alexandros Potamianos (Amazon, University of Southern California and National Technical University of Athens)
-Kristina Lerman (University of Southern California and USC Information Sciences Institute)
-Shrikanth Narayanan (University of Southern California)
+Kristina Lerman (University of Southern California and USC Information Sciences Institute)
+Shrikanth Narayanan (University of Southern California)
 5513-5528
 In-context Learning (ICL) has become the primary method for performing natural language tasks with Large Language Models (LLMs). The knowledge acquired during pre-training is crucial for this few-shot capability, providing the model with task priors. However, recent studies have shown that ICL predominantly relies on retrieving task priors rather than “learning” to perform tasks. This limitation is particularly evident in complex subjective domains such as emotion and morality, where priors significantly influence posterior predictions. In this work, we examine whether this is the result of the aggregation used in corresponding datasets, where trying to combine low-agreement, disparate annotations might lead to annotation artifacts that create detrimental noise in the prompt. Moreover, we evaluate the posterior bias towards certain annotators by grounding our study in appropriate, quantitative measures of LLM priors. Our results indicate that aggregation is a confounding factor in the modeling of subjective tasks, and advocate focusing on modeling individuals instead. However, aggregation does not explain the entire gap between ICL and the state of the art, meaning other factors in such tasks also account for the observed phenomena.
 Finally, by rigorously studying annotator-level labels, we find that it is possible for minority annotators to both better align with LLMs and have their perspectives further amplified.
 2025.naacl-long.284
@@ -3630,9 +3630,9 @@
 <fixed-case>A</fixed-case>rabic Dataset for <fixed-case>LLM</fixed-case> Safeguard Evaluation
 Yasser Ashraf (Mohamed bin Zayed University of Artificial Intelligence)
 Yuxia Wang
-Bin Gu (Mohamed bin Zayed University of Artificial Intelligence)
-Preslav Nakov (Mohamed bin Zayed University of Artificial Intelligence)
-Timothy Baldwin (Mohamed bin Zayed University of Artificial Intelligence and The University of Melbourne)
+Bin Gu (Mohamed bin Zayed University of Artificial Intelligence)
+Preslav Nakov (Mohamed bin Zayed University of Artificial Intelligence)
+Timothy Baldwin (Mohamed bin Zayed University of Artificial Intelligence and The University of Melbourne)
 5529-5546
 The growing use of large language models (LLMs) has raised concerns regarding their safety. While many studies have focused on English, the safety of LLMs in Arabic, with its linguistic and cultural complexities, remains under-explored. Here, we aim to bridge this gap. In particular, we present an Arab-region-specific safety evaluation dataset consisting of 5,799 questions, including direct attacks, indirect attacks, and harmless requests with sensitive words, adapted to reflect the socio-cultural context of the Arab world. To uncover the impact of different stances in handling sensitive and controversial topics, we propose a dual-perspective evaluation framework. It assesses the LLM responses from both governmental and opposition viewpoints. Experiments over five leading Arabic-centric and multilingual LLMs reveal substantial disparities in their safety performance. This reinforces the need for culturally specific datasets to ensure the responsible deployment of LLMs.
 2025.naacl-long.285
@@ -3645,7 +3645,7 @@
 Zhehuai Chen
 Vitaly Lavrukhin (NVIDIA)
 Jagadeesh Balam (NVIDIA)
-Lei Li (School of Computer Science, Carnegie Mellon University)
+Lei Li (School of Computer Science, Carnegie Mellon University)
 Boris Ginsburg (NVIDIA)
 5547-5557
 Simultaneous machine translation (SMT) takes streaming input utterances and incrementally produces target text. Existing SMT methods only use the partial utterance that has already arrived at the input and the generated hypothesis. Motivated by human interpreters’ technique to forecast future words before hearing them, we propose Translation by Anticipating Future (TAF), a method to improve translation quality while retaining low latency. Its core idea is to use a large language model (LLM) to predict future source words and opportunistically translate without introducing too much risk. We evaluate our TAF and multiple baselines of SMT on four language directions. Experiments show that TAF achieves the best translation quality-latency trade-off and outperforms the baselines by up to 5 BLEU points at the same latency (three words).
@@ -3655,19 +3655,19 @@
 <fixed-case>G</fixed-case>uide<fixed-case>LLM</fixed-case>: Exploring <fixed-case>LLM</fixed-case>-Guided Conversation with Applications in Autobiography Interviewing
 Jinhao Duan
-Xinyu Zhao
+Xinyu Zhao
 Zhuoxuan Zhang (Columbia University and Brown University)
 Eunhye Grace Ko
 Lily Boddy
 Chenan Wang
 Tianhao Li
 Alexander Rasgon
-Junyuan Hong (University of Texas at Austin)
+Junyuan Hong (University of Texas at Austin)
 Min Kyung Lee (University of Texas at Austin)
 Chenxi Yuan (New Jersey Institute of Technology)
-Qi Long (University of Pennsylvania)
+Qi Long (University of Pennsylvania)
 Ying Ding (University of Texas, Austin)
-Tianlong Chen (University of North Carolina at Chapel Hill)
+Tianlong Chen (University of North Carolina at Chapel Hill)
 Kaidi Xu (Drexel University)
 5558-5588
 Although Large Language Models (LLMs) succeed in human-guided conversations such as instruction following and question answering, the potential of LLM-guided conversations—where LLMs direct the discourse and steer the conversation’s objectives—remains under-explored. In this study, we first characterize LLM-guided conversation into three fundamental components: (i) Goal Navigation; (ii) Context Management; (iii) Empathetic Engagement, and propose GuideLLM as an installation. We then implement an interviewing environment for the evaluation of LLM-guided conversation. Specifically, various topics are involved in this environment for comprehensive interviewing evaluation, resulting in around 1.4k turns of utterances, 184k tokens, and over 200 events mentioned during the interviewing for each chatbot evaluation. We compare GuideLLM with 6 state-of-the-art LLMs such as GPT-4o and Llama-3-70b-Instruct, from the perspective of interviewing quality, and autobiography generation quality. For automatic evaluation, we derive user proxies from multiple autobiographies and employ LLM-as-a-judge to score LLM behaviors. We further conduct a human-involved experiment by employing 45 human participants to chat with GuideLLM and baselines. We then collect human feedback, preferences, and ratings regarding the qualities of conversation and autobiography. Experimental results indicate that GuideLLM significantly outperforms baseline LLMs in automatic evaluation and achieves consistent leading performances in human ratings.
@@ -3679,7 +3679,7 @@
 Hanxu Hu
 Simon Yu
 Pinzhen Chen (University of Edinburgh)
-Edoardo Ponti (University of Edinburgh)
+Edoardo Ponti (University of Edinburgh)
 5589-5610
 We find that existing instruction-tuned models usually struggle to adhere to a query with multiple intentions, which impairs their performance when the completion of several tasks is demanded by a single command. Hence, this paper teaches models to respond to sequential instructions. Our first attempt stems from a task-driven perspective, manually creating additional intermediate tasks to train multilingual and visual question answering. Next, we develop an automatic and generic process that turns instructions in existing data into diverse and complex task chains. Models that underwent sequential instruction tuning follow a list of instructions better and deliver higher results in coding, maths, and open-ended generation. Moreover, we put forward a new benchmark named SeqEval to evaluate a model’s ability to follow all the instructions in a sequence, which further corroborates the benefits of our sequential instruction tuning method.
 2025.naacl-long.288
@@ -3699,11 +3699,11 @@
 Elevating Legal <fixed-case>LLM</fixed-case> Responses: Harnessing Trainable Logical Structures and Semantic Knowledge with Legal Reasoning
-Rujing Yao
-Yang Wu
+Rujing Yao
+Yang Wu
 Chenghao Wang (Peking University)
 Jingwei Xiong
-Fang Wang (Nankai University)
+Fang Wang (Nankai University)
 Xiaozhong Liu (Worcester Polytechnic Institute)
 5630-5642
 Large Language Models (LLMs) have achieved impressive results across numerous domains, yet they experience notable deficiencies in legal question-answering tasks. LLMs often generate generalized responses that lack the logical specificity required for expert legal advice and are prone to hallucination, providing answers that appear correct but are unreliable. Retrieval-Augmented Generation (RAG) techniques offer partial solutions to address this challenge, but existing approaches typically focus only on semantic similarity, neglecting the logical structure essential to legal reasoning. In this paper, we propose the Logical-Semantic Integration Model (LSIM), a novel supervised framework that bridges semantic and logical coherence. LSIM comprises three components: reinforcement learning predicts a structured fact-rule chain for each question, a trainable Deep Structured Semantic Model (DSSM) retrieves the most relevant candidate questions by integrating semantic and logical features, and in-context learning generates the final answer using the retrieved content. Our experiments on a real-world legal QA dataset-validated through both automated metrics and human evaluation-demonstrate that LSIM significantly enhances accuracy and reliability compared to existing methods.
@@ -3732,9 +3732,9 @@
 <fixed-case>C</fixed-case>on<fixed-case>QR</fixed-case>et: A New Benchmark for Fine-Grained Automatic Evaluation of Retrieval Augmented Computational Argumentation
-Kaustubh Dhole (Emory University)
+Kaustubh Dhole (Emory University)
 Kai Shu (Emory University)
-Eugene Agichtein (Emory University)
+Eugene Agichtein (Emory University)
 5687-5713
 Computational argumentation, which involves generating answers or summaries for controversial topics like abortion bans and vaccination, has become increasingly important in today’s polarized environment. Sophisticated LLM capabilities offer the potential to provide nuanced, evidence-based answers to such questions through Retrieval-Augmented Argumentation (RAArg), leveraging real-world evidence for high-quality, grounded arguments. However, evaluating RAArg remains challenging, as human evaluation is costly and difficult for complex, lengthy answers on complicated topics. At the same time, re-using existing argumentation datasets is no longer sufficient, as they lack long, complex arguments and realistic evidence from potentially misleading sources, limiting holistic evaluation of retrieval effectiveness and argument quality. To address these gaps, we investigate automated evaluation methods using multiple fine-grained LLM judges, providing better and more interpretable assessments than traditional single-score metrics and even previously reported human crowdsourcing. To validate the proposed techniques, we introduce ConQRet, a new benchmark featuring long and complex human-authored arguments on debated topics, grounded in real-world websites, allowing an exhaustive evaluation across retrieval effectiveness, argument quality, and groundedness. We validate our LLM Judges on a prior dataset and the new ConQRet benchmark.
Our proposed LLM Judges and the ConQRet benchmark can enable rapid progress in computational argumentation and can be naturally extended to other complex retrieval-augmented generation tasks. 2025.naacl-long.293 @@ -3742,10 +3742,10 @@ <fixed-case>S</fixed-case>ynth<fixed-case>D</fixed-case>etox<fixed-case>M</fixed-case>: <fixed-case>M</fixed-case>odern <fixed-case>LLM</fixed-case>s are Few-Shot Parallel Detoxification Data Annotators - DaniilMoskovskiy + DaniilMoskovskiy NikitaSushko SergeyPletenev - ElenaTutubalinaKazan Federal University + ElenaTutubalinaKazan Federal University AlexanderPanchenkoSkoltech 5714-5733 Existing approaches to multilingual text detoxification are hampered by the scarcity of parallel multilingual datasets. In this work, we introduce a pipeline for the generation of multilingual parallel detoxification data. We also introduce SynthDetoxM, a manually collected and synthetically generated multilingual parallel text detoxification dataset comprising 16,000 high-quality detoxification sentence pairs across German, French, Spanish and Russian. The data was sourced from different toxicity evaluation datasets and then rewritten with nine modern open-source LLMs in few-shot setting. Our experiments demonstrate that models trained on the produced synthetic datasets have superior performance to those trained on the human-annotated MultiParaDetox dataset even in data limited setting. Models trained on SynthDetoxM outperform all evaluated LLMs in few-shot setting. We release our dataset and code to help further research in multilingual text detoxification. @@ -3759,7 +3759,7 @@ OghenevovweIkumariegbeUniversity of Arizona DaniyalKashif EduardoBlancoUniversity of Arizona - StevenCorman + StevenCorman 5734-5749 Event Argument Extraction (EAE) is a key task in natural language processing, focusing on identifying and classifying event arguments in text. However, the widely adopted exact span match (ESM) evaluation metric has notable limitations due to its rigid span constraints, often misidentifying valid predictions as errors and underestimating system performance. In this paper, we evaluate nine state-of-the-art EAE models on the RAMS and GENEVA datasets, highlighting ESM’s limitations. To address these issues, we introduce BEMEAE (Beyond Exact Span Match for Event Argument Extraction), a novel evaluation metric that recognizes predictions that are semantically equivalent to or improve upon the reference. BEMEAE integrates deterministic components with a semantic matching component for more accurate assessment. Our experiments demonstrate that BEMEAE aligns more closely with human judgments. We show that BEMEAE not only leads to higher F1 scores compared to ESM but also results in significant changes in model rankings, underscoring ESM’s inadequacy for comprehensive evaluation of EAE. 2025.naacl-long.295 @@ -3768,9 +3768,9 @@ u<fixed-case>D</fixed-case>istil-Whisper: Label-Free Data Filtering for Knowledge Distillation in Low-Data Regimes AbdulWaheedSchool of Computer Science, Carnegie Mellon University - KarimaKadaouiMohamed bin Zayed University of Artificial Intelligence + KarimaKadaouiMohamed bin Zayed University of Artificial Intelligence BhikshaRajCarnegie Mellon University - MuhammadAbdul-MageedUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia 5750-5767 Recent work on distilling Whisper’s knowledge into small models using pseudo-labels shows promising performance while reducing the size by up to 50%. 
This results in small, efficient, and dedicated models. However, a critical step of distillation using pseudo-labels involves filtering high-quality predictions and using only those during training. This step requires ground truth labels to compare with and filter low-quality examples, making the process dependent on human labels. Additionally, the distillation process requires a large amount of data thereby limiting its applicability in low-resource settings. To address this, we propose a distillation framework that does not require any labeled data. Through experimentation, we show that our best-distilled models outperform the teacher model by 5-7 WER points and are on par with or outperform similar supervised data filtering setups. When scaling the data, our models significantly outperform all zero-shot and supervised models. Our models are also 25-50% more compute- and memory-efficient while maintaining performance equal to or better than that of the teacher model. For more details about our models, dataset, and other resources, please visit our GitHub page: https://github.com/UBC-NLP/uDistilWhisper. 2025.naacl-long.296 @@ -3782,9 +3782,9 @@ XiaodongLiuMicrosoft Research WeiweiYangMicrosoft Tsui-WeiWengUniversity of California, San Diego - HaoChengMicrosoft Research + HaoChengMicrosoft Research AidanSanUniversity of Virginia, Charlottesville - MichelGalleyMicrosoft + MichelGalleyMicrosoft JianfengGao 5768-5786 Recent research has shown that Large Language Models (LLMs) are vulnerable to automated jailbreak attacks, where adversarial suffixes crafted by algorithms appended to harmful queries bypass safety alignment and trigger unintended responses. Current methods for generating these suffixes are computationally expensive and have low Attack Success Rates (ASR), especially against well-aligned models like Llama2 and Llama3. To overcome these limitations, we introduce **ADV-LLM**, an iterative self-tuning process that crafts adversarial LLMs with enhanced jailbreak ability. Our framework significantly reduces the computational cost of generating adversarial suffixes while achieving nearly 100% ASR on various open-source LLMs. Moreover, it exhibits strong attack transferability to closed-source models, achieving 99% ASR on GPT-3.5 and 49% ASR on GPT-4, despite being optimized solely on Llama3. Beyond improving jailbreak ability, ADV-LLM provides valuable insights for future safety alignment research through its ability to generate large datasets for studying LLM safety. 
@@ -3793,14 +3793,14 @@
<fixed-case>V</fixed-case>oice<fixed-case>T</fixed-case>ext<fixed-case>B</fixed-case>lender: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning
- YifanPengCarnegie Mellon University
+ YifanPengCarnegie Mellon University
Krishna CPuvvadaNVIDIA
ZhehuaiChen
- PiotrZelaskoNVIDIA
+ PiotrZelaskoNVIDIA
HeHuangNVIDIA
KunalDhawanNVIDIA
KeHuNVIDIA
- ShinjiWatanabeCarnegie Mellon University
+ ShinjiWatanabeCarnegie Mellon University
JagadeeshBalamNVIDIA
BorisGinsburgNVIDIA
5787-5802
@@ -3810,10 +3810,10 @@
Rethinking Word Similarity: Semantic Similarity through Classification Confusion
- KaitlynZhouStanford University
+ KaitlynZhouStanford University
HaishanGao
Sarah LiChen
- DanEdelsteinStanford University
+ DanEdelsteinStanford University
DanJurafskyStanford University
ChenShani
5803-5817
@@ -3849,7 +3849,7 @@
AvidanShah
AlexandreAraujoNew York University
DavidWagnerUniversity of California Berkeley
- ChawinSitawarin
+ ChawinSitawarin
5850-5876
Making large language models (LLMs) safe for mass deployment is a complex and ongoing challenge. Efforts have focused on aligning models to human preferences (RLHF), essentially embedding a “safety feature” into the model’s parameters. The Greedy Coordinate Gradient (GCG) algorithm (Zou et al., 2023b) emerges as one of the most popular automated jailbreaks, an attack that circumvents this safety training. So far, it is believed that such optimization-based attacks (unlike hand-crafted ones) are sample-specific. To make them universal and transferable, one has to incorporate multiple samples and models into the objective function. Contrary to this belief, we find that the adversarial prompts discovered by such optimizers are inherently prompt-universal and transferable, even when optimized on a single model and a single harmful request. To further exploit this phenomenon, we introduce IRIS, a new objective for these optimizers to explicitly deactivate the safety feature to create an even stronger universal and transferable attack. Without requiring a large number of queries or accessing output token probabilities, our universal and transferable attack achieves a 25% success rate against the state-of-the-art Circuit Breaker defense (Zou et al., 2024), compared to 2.5% by white-box GCG. Crucially, IRIS also attains state-of-the-art transfer rates on frontier models: GPT-3.5-Turbo (90%), GPT-4o-mini (86%), GPT-4o (76%), o1-mini (54%), o1-preview (48%), o3-mini (66%), and deepseek-reasoner (90%).
2025.naacl-long.302
@@ -3862,11 +3862,11 @@
Ji YongChoLG Corporation
ShayneLongpre
ChaeeunKim
- DongkeunYoonKorea Advanced Institute of Science & Technology
+ DongkeunYoonKorea Advanced Institute of Science & Technology
GuijinSon
YejinChoKorea Advanced Institute of Science & Technology
SheikhShafayatKAIST
- JinheonBaekKorea Advanced Institute of Science & Technology
+ JinheonBaekKorea Advanced Institute of Science & Technology
Sue HyunParkKorea Advanced Institute of Science & Technology
HyeonbinHwang
JinkyungJoKorea Advanced Institute of Science & Technology
@@ -3878,7 +3878,7 @@
NamgyuHo
Se JuneJooKorea Advanced Institute of Science & Technology
MiyoungKoKorea Advanced Institute of Science and Technology
- YoonjooLeeKorea Advanced Institute of Science & Technology
+ YoonjooLeeKorea Advanced Institute of Science & Technology
HyungjooChae
JaminShinNAVER
JoelJang
@@ -3886,8 +3886,8 @@
Bill YuchenLinxAI and University of Washington
SeanWelleckCarnegie Mellon University
GrahamNeubigCarnegie Mellon University
- MoontaeLeeLG Corporation and University of Illinois, Chicago
+ MoontaeLeeLG Corporation and University of Illinois, Chicago
- KyungjaeLeeUniversity of Seoul
+ KyungjaeLeeUniversity of Seoul
MinjoonSeoKorea Advanced Institute of Science & Technology and Config Intelligence
5877-5919
As language models (LMs) become capable of handling a wide range of tasks, their evaluation is becoming as challenging as their development. Most generation benchmarks currently assess LMs using abstract evaluation criteria, like helpfulness and harmlessness, which often lack the flexibility and granularity of human assessment. Additionally, these benchmarks tend to focus disproportionately on specific capabilities such as instruction following, leading to coverage bias. To overcome these limitations, we introduce the BiGGen Bench, a principled generation benchmark designed to thoroughly evaluate nine distinct capabilities of LMs across 77 diverse tasks. A key feature of the BiGGen Bench is its use of instance-specific evaluation criteria, closely mirroring the nuanced discernment of human evaluation. We apply this benchmark to assess 100 frontier LMs using five evaluator LMs. Our code, data, and evaluation results are all publicly available at https://github.com/prometheus-eval/prometheus-eval.
@@ -3915,9 +3915,9 @@
Uncovering Bias in Large Vision-Language Models at Scale with Counterfactuals
PhillipHowardIntel
- Kathleen C.FraserNational Research Council Canada
+ Kathleen C.FraserNational Research Council Canada
AnahitaBhiwandiwalla
- SvetlanaKiritchenkoNational Research Council Canada
+ SvetlanaKiritchenkoNational Research Council Canada
5946-5991
With the advent of Large Language Models (LLMs) possessing increasingly impressive capabilities, a number of Large Vision-Language Models (LVLMs) have been proposed to augment LLMs with visual inputs. Such models condition generated text on both an input image and a text prompt, enabling a variety of use cases such as visual question answering and multimodal chat. While prior studies have examined the social biases contained in text generated by LLMs, this topic has been relatively unexplored in LVLMs. Examining social biases in LVLMs is particularly challenging due to the confounding contributions of bias induced by information contained across the text and visual modalities. To address this challenging problem, we conduct a large-scale study of text generated by different LVLMs under counterfactual changes to input images, producing over 57 million responses from popular models. Our multi-dimensional bias evaluation framework reveals that social attributes such as perceived race, gender, and physical characteristics depicted in images can significantly influence the generation of toxic content, competency-associated words, harmful stereotypes, and numerical ratings of individuals.
2025.naacl-long.305
@@ -3925,12 +3925,12 @@
<fixed-case>AEGIS</fixed-case>2.0: A Diverse <fixed-case>AI</fixed-case> Safety Dataset and Risks Taxonomy for Alignment of <fixed-case>LLM</fixed-case> Guardrails
- ShaonaGhoshNVIDIA
+ ShaonaGhoshNVIDIA
- PrasoonVarshneyNVIDIA
+ PrasoonVarshneyNVIDIA
Makesh NarsimhanSreedharNVIDIA
AishwaryaPadmakumarNVIDIA
- TraianRebedeaNVIDIA and University Politehnica of Bucharest
+ TraianRebedeaNVIDIA and University Politehnica of Bucharest
- Jibin RajanVargheseNVIDIA
+ Jibin RajanVargheseNVIDIA
ChristopherParisien
5992-6026
As Large Language Models (LLMs) and generative AI become increasingly widespread, concerns about content safety have grown in parallel. Currently, there is a clear lack of high-quality, human-annotated datasets that address the full spectrum of LLM-related safety risks and are usable for commercial applications. To bridge this gap, we propose a comprehensive and adaptable taxonomy for categorizing safety risks, structured into 12 top-level hazard categories with an extension to 9 fine-grained subcategories. This taxonomy is designed to meet the diverse requirements of downstream users, offering more granular and flexible tools for managing various risk types. Using a hybrid data generation pipeline that combines human annotations with a multi-LLM “jury” system to assess the safety of responses, we obtain Aegis2.0, a carefully curated collection of 34,248 samples of human-LLM interactions, annotated according to our proposed taxonomy. To validate its effectiveness, we demonstrate that several lightweight models, trained using parameter-efficient techniques on Aegis2.0, achieve performance competitive with leading safety models fully fine-tuned on much larger, non-commercial datasets generated leveraging GPT-4. Additionally, we introduce a novel training blend that combines topic following data with safety data. This approach enhances the adaptability of guard models, enabling them to generalize to new risk categories defined during inference. We plan to open-source Aegis2.0 data and models to the research community to aid in safety guardrailing of LLMs.
@@ -3952,7 +3952,7 @@
YuchenZhuangGeorgia Institute of Technology
JingfengYangAmazon
HaomingJiangAmazon
- XinLiuAmazon
+ XinLiuAmazon
KeweiCheng
SanketLokegaonkarAmazon
YifanGaoAmazon
@@ -3960,14 +3960,14 @@
TianyiLiuAmazon
BinxuanHuangAmazon
ZhengLiAmazon
- ZhengyangWangAmazon
+ ZhengyangWangAmazon
PeiChenTexas A&M University - College Station
RuijieWang
RongzhiZhangGeorgia Institute of Technology and Zhejiang University
NasserZalmoutAmazon
PriyankaNigam
- BingYinAmazon
+ BingYinAmazon
- ChaoZhangGeorgia Institute of Technology
+ ChaoZhangGeorgia Institute of Technology
6041-6068
Due to the scarcity of agent-oriented pre-training data, LLM-based autonomous agents typically rely on complex prompting or extensive fine-tuning, which often fails to introduce new capabilities while preserving strong generalizability. We introduce Hephaestus-Forge, the first large-scale pre-training corpus designed to enhance the fundamental capabilities of LLM agents in API function calling, intrinsic reasoning and planning, and adapting to environmental feedback. Hephaestus-Forge comprises 103B agent-specific data encompassing 76,537 APIs, including both tool documentation to introduce knowledge of API functions and function calling trajectories to strengthen intrinsic reasoning. To explore effective training protocols, we investigate scaling laws to identify the optimal recipe in data mixing ratios. By continual pre-training on Hephaestus-Forge, Hephaestus outperforms small- to medium-scale open-source LLMs and rivals commercial LLMs on three agent benchmarks, demonstrating the effectiveness of our pre-training corpus in enhancing fundamental agentic capabilities and generalization of LLMs to new tasks or environments.
2025.naacl-long.308
@@ -3975,8 +3975,8 @@
<fixed-case>T</fixed-case>iny<fixed-case>T</fixed-case>hinker: Distilling Reasoning through Coarse-to-Fine Knowledge Internalization with Self-Reflection
- ShengminPiao
+ ShengminPiao
- SanghyunParkYonsei University
+ SanghyunParkYonsei University
6069-6087
Large Language Models exhibit impressive reasoning capabilities across diverse tasks, motivating efforts to distill these capabilities into smaller models through generated reasoning data. However, direct training on such synthesized reasoning data may lead to superficial imitation of reasoning process, rather than fostering a genuine integration of reasoning capabilities with underlying knowledge. To address this, we propose TinyThinker, a framework introducing two novel approaches. First, we introduce a three-stage process that incrementally guides the student model through the reasoning process, progressively refining knowledge from coarse to fine granularity. Second, we develop a two-phase training framework comprising an initial reasoning acquisition phase followed by a self-reflection phase utilizing self-generated data. Experiments on commonsense reasoning benchmarks demonstrate that TinyThinker achieves superior performance compared to baselines. Ablation studies further validate the effectiveness of each component in our framework. We expect that TinyThinker can be extended to other knowledge-intensive reasoning tasks, offering an alternative strategy for developing effective reasoning capabilities in smaller language models. Codes are available at https://github.com/shengminp/TinyThinker.
2025.naacl-long.309
@@ -3986,10 +3986,10 @@
<fixed-case>V</fixed-case>is<fixed-case>D</fixed-case>o<fixed-case>M</fixed-case>: Multi-Document <fixed-case>QA</fixed-case> with Visually Rich Elements Using Multimodal Retrieval-Augmented Generation
MananSuri
PuneetMathurAdobe Systems
- FranckDernoncourt
+ FranckDernoncourt
KanikaGoswamiIndira Gandhi Delhi Technical University for Women
- Ryan A.RossiAdobe Research
+ Ryan A.RossiAdobe Research
- DineshManochaUniversity of Maryland, College Park
+ DineshManochaUniversity of Maryland, College Park
6088-6109
Understanding information from a collection of multiple documents, particularly those with visually rich elements, is important for document-grounded question answering. This paper introduces VisDoMBench, the first comprehensive benchmark designed to evaluate QA systems in multi-document settings with rich multimodal content, including tables, charts, and presentation slides. We propose VisDoMRAG, a novel multimodal Retrieval Augmented Generation (RAG) approach that simultaneously utilizes visual and textual RAG, combining robust visual retrieval capabilities with sophisticated linguistic reasoning. VisDoMRAG employs a multi-step reasoning process encompassing evidence curation and chain-of-thought reasoning for concurrent textual and visual RAG pipelines. A key novelty of VisDoMRAG is its consistency-constrained modality fusion mechanism, which aligns the reasoning processes across modalities at inference time to produce a coherent final answer. This leads to enhanced accuracy in scenarios where critical information is distributed across modalities and improved answer verifiability through implicit context attribution. Through extensive experiments involving open-source and proprietary large language models, we benchmark state-of-the-art document QA methods on VisDoMBench. Extensive results show that VisDoMRAG outperforms unimodal and long-context LLM baselines for end-to-end multimodal document QA by 12-20%.
2025.naacl-long.310
@@ -4000,9 +4000,9 @@
MingCheng
JiayingGongeBay Inc.
ChenhanYuanAlibaba Group
- William AIngramVirginia Polytechnic Institute and State University
+ William AIngramVirginia Polytechnic Institute and State University
- EdwardFoxVirginia Polytechnic Institute and State University
+ EdwardFoxVirginia Polytechnic Institute and State University
- HodaEldardiry, Virginia Polytechnic Institute and State University
+ HodaEldardiry, Virginia Polytechnic Institute and State University
6110-6130
Existing text simplification or paraphrase datasets mainly focus on sentence-level text generation in a general domain. These datasets are typically developed without using domain knowledge. In this paper, we release a novel dataset, VTechAGP, which is the first academic-to-general-audience text paraphrase dataset consisting of document-level thesis and dissertation academic and general-audience abstract pairs from 8 colleges authored over 25 years. We also propose a novel dynamic soft prompt generative language model, DSPT5. For training, we leverage a contrastive-generative loss function to learn the keyword vectors in the dynamic prompt. For inference, we adopt a crowd-sampling decoding strategy at both semantic and structural levels to further select the best output candidate. We evaluate DSPT5 and various state-of-the-art large language models (LLMs) from multiple perspectives. Results demonstrate that the SOTA LLMs do not provide satisfactory outcomes, while the lightweight DSPT5 can achieve competitive results. To the best of our knowledge, we are the first to build a benchmark dataset and solutions for academic-to-general-audience text paraphrasing. Models will be public after acceptance.
2025.naacl-long.311
@@ -4012,7 +4012,7 @@
Large Language Models Share Representations of Latent Grammatical Concepts Across Typologically Diverse Languages
JannikBrinkmannNew York University and University of Mannheim
ChrisWendlerNortheastern University
- ChristianBarteltTechnische Universität Clausthal
+ ChristianBarteltTechnische Universität Clausthal
AaronMuellerNortheastern University and Technion - Israel Institute of Technology
6131-6150
Human bilinguals often use similar brain regions to process multiple languages, depending on when they learned their second language and their proficiency. In large language models (LLMs), how are multiple languages learned and encoded? In this work, we explore the extent to which LLMs share representations of morphosyntactic concepts such as grammatical number, gender, and tense across languages. We train sparse autoencoders on Llama-3-8B and Aya-23-8B, and demonstrate that abstract grammatical concepts are often encoded in feature directions shared across many languages. We use causal interventions to verify the multilingual nature of these representations; specifically, we show that ablating only multilingual features decreases classifier performance to near-chance across languages. We then use these features to precisely modify model behavior in a machine translation task; this demonstrates both the generality and selectivity of these features’ roles in the network. Our findings suggest that even models trained predominantly on English data can develop robust, cross-lingual abstractions of morphosyntactic concepts.
2025.naacl-long.312
@@ -4021,9 +4021,9 @@
Examining and Adapting Time for Multilingual Classification via Mixture of Temporal Experts
- WeisiLiuUniversity of Memphis
+ WeisiLiuUniversity of Memphis
GuangzengHanUniversity of Memphis
- XiaoleiHuangUniversity of Memphis
+ XiaoleiHuangUniversity of Memphis
6151-6166
Time is implicitly embedded in the classification process: classifiers are usually built on existing data but applied to future data whose distributions (e.g., label and token) may change. However, existing state-of-the-art classification models merely consider the temporal variations and primarily focus on English corpora, which leaves temporal studies less explored, let alone under multilingual settings. In this study, we fill the gap by treating time as domains (e.g., 2024 vs. 2025), examining temporal effects, and developing a domain adaptation framework to generalize classifiers over time on four languages, English, Danish, French, and German. Our framework proposes Mixture of Temporal Experts (MoTE) to leverage both semantic and data distributional shifts to learn and adapt temporal trends into classification models. Our analysis shows classification performance varies over time across different languages, and we experimentally demonstrate that MoTE can enhance classifier generalizability over temporal data shifts. Our study provides analytic insights and addresses the need for time-aware models that perform robustly in multilingual scenarios.
2025.naacl-long.313
@@ -4039,12 +4039,12 @@
<fixed-case>E</fixed-case>vo<fixed-case>A</fixed-case>gent: Towards Automatic Multi-Agent Generation via Evolutionary Algorithms
- SiyuYuan
+ SiyuYuan
KaitaoSongMicrosoft
JiangjieChenByteDance Inc.
- XuTan
+ XuTan
- DongshengLiMicrosoft Research Asia
+ DongshengLiMicrosoft Research Asia
- DeqingYangFudan University
+ DeqingYangFudan University
6192-6217
The rise of powerful large language models (LLMs) has spurred a new trend in building LLM-based autonomous agents for solving complex tasks, especially multi-agent systems. Despite the remarkable progress, we notice that existing works are heavily dependent on human-designed frameworks, which greatly limits the functional scope and scalability of agent systems. How to automatically extend the specialized agent to multi-agent systems to improve task-solving capability still remains a significant challenge. In this paper, we introduce EVOAGENT, a generic method to automatically extend specialized agents to multi-agent systems via the evolutionary algorithm, thereby improving the effectiveness of LLM-based agents in solving tasks. Specifically, we consider the existing agent frameworks as the initial individual and then apply a series of evolutionary operators (e.g., mutation, crossover, selection, etc.) to generate multiple agents with diverse settings. Experimental results across various tasks show that EVOAGENT can significantly enhance the task-solving capability of LLM-based agents, and can be generalized to any LLM-based agent framework to extend them into multi-agent systems. Resources are available at https://evo-agent.github.io/.
2025.naacl-long.315
@@ -4056,10 +4056,10 @@
QiujieXieFudan University
XiaolongWang
QingqiuLi
- YuejieZhang
+ YuejieZhang
- RuiFengFudan University
+ RuiFengFudan University
TaoZhang
- ShangGaoDeakin University
+ ShangGaoDeakin University
6218-6240
Role-playing agents (RPAs) powered by large language models (LLMs) have been widely utilized in dialogue systems for their capability to deliver personalized interactions. Current evaluations of RPAs mainly focus on personality fidelity, tone imitation, and knowledge consistency, while overlooking emotional fidelity, a key factor that affects user experience. To this end, we propose a benchmark called EmoCharacter to assess emotional fidelity of RPAs in dialogues. EmoCharacter includes two benchmark datasets (single-turn and multi-turn dialogues), three evaluation settings, and six metrics to measure the emotional fidelity between RPAs and the characters they portray. Based on EmoCharacter, we conduct extensive evaluations on RPAs powered by seven widely used LLMs with representative role-playing methods. Our empirical findings reveal that: (1) Contrary to intuition, current role-playing methods often reduce the emotional fidelity of LLMs in dialogues; (2) Enhancing the general capabilities of LLMs does not necessarily improve the emotional fidelity of RPAs; (3) Fine-tuning or In-Context Learning based on real dialogue data can enhance emotional fidelity.
2025.naacl-long.316
@@ -4069,7 +4069,7 @@
Language Models can Categorize System Inputs for Performance Analysis
DominicSobhani
RuiqiZhong
- EdisonMarrese-TaylorThe Univesity of Tokyo and AIST, National Institute of Advanced Industrial Science and Technology
+ EdisonMarrese-TaylorThe Univesity of Tokyo and AIST, National Institute of Advanced Industrial Science and Technology
KeisukeSakaguchiTohoku University
YutakaMatsuoThe University of Tokyo and The University of Tokyo
6241-6257
@@ -4083,7 +4083,7 @@
HaotianXiaUniversity of California, Irvine
ZhaoweiLiu
HanyangCao
- ZhiYang
+ ZhiYang
ZhiqiangLiu
SizheWang
JinyiNiu
@@ -4093,7 +4093,7 @@
XiaomingHuang
BingZhuHSBC Lab
ZhongyuWeiFudan University
- YunChenShanghai University of Finance and Economics
+ YunChenShanghai University of Finance and Economics
WeiningShenUniversity of California, Irvine
LiwenZhangShanghai University of Finance and Economics
6258-6292
@@ -4103,10 +4103,10 @@
Rethinking the Role of <fixed-case>LLM</fixed-case>s for Document-level Relation Extraction: a Refiner with Task Distribution and Probability Fusion
- FuZhangNortheastern University
+ FuZhangNortheastern University
- XinlongJinNortheastern University
+ XinlongJinNortheastern University
- JingweiChengNortheastern University, China
+ JingweiChengNortheastern University, China
- HongsenYu
+ HongsenYu
HuangmingXu
6293-6312
Document-level relation extraction (DocRE) provides a broad context for extracting one or more relations for each entity pair. Large language models (LLMs) have made great progress in relation extraction tasks. However, one of the main challenges we face is that LLMs have difficulty in multi-label relation prediction tasks. Additionally, another noteworthy challenge and discovery we reveal: the small language models (SLMs) for DocRE tend to classify existing relations as “no relation” (NA), while LLMs tend to predict existing relations for all entity pairs. To address these challenges, we propose a novel method that utilizes LLMs as a refiner, employing task distribution and probability fusion. The task distribution we carefully designed aims to distinguish hard and easy tasks, and feed hard tasks to our LLMs-based framework to reevaluate and refine. Further, in order to effectively solve the multi-label relation prediction problem in the refinement process, we propose a probability fusion method, ensuring and enhancing fusion predictions by maintaining a balance between SLMs and LLMs. Extensive experiments on widely-used datasets demonstrate that our method outperforms existing LLM-based methods without fine-tuning by an average of 25.2% F1. Refining SLMs using our method consistently boosts the performance of the SLMs, achieving new state-of-the-art results compared to existing SLMs and LLMs. Our code: https://github.com/Drasick/Drell.
2025.naacl-long.319
@@ -4117,7 +4117,7 @@
Decomposition Dilemmas: Does Claim Decomposition Boost or Burden Fact-Checking Performance?
QishengHuNanyang Technological University
QuanyuLong
- WenyaWangNanyang Technological University
+ WenyaWangNanyang Technological University
6313-6336
Fact-checking pipelines increasingly adopt the Decompose-Then-Verify paradigm, where texts are broken down into smaller claims for individual verification and subsequently combined for a veracity decision. While decomposition is widely adopted in such pipelines, its effects on final fact-checking performance remain underexplored. Some studies have reported improvements from decomposition, while others have observed performance declines, indicating its inconsistent impact. To date, no comprehensive analysis has been conducted to understand this variability. To address this gap, we present an in-depth analysis that explicitly examines the impact of decomposition on downstream verification performance. Through error case inspection and experiments, we introduce a categorization of decomposition errors and reveal a trade-off between accuracy gains and the noise introduced through decomposition. Our analysis provides new insights into understanding current systems’ instability and offers guidance for future studies toward improving claim decomposition in fact-checking pipelines.
2025.naacl-long.320
@@ -4149,14 +4149,14 @@
<fixed-case>C</fixed-case>haracter<fixed-case>B</fixed-case>ox: Evaluating the Role-Playing Capabilities of <fixed-case>LLM</fixed-case>s in Text-Based Virtual Worlds
- LeiWang
+ LeiWang
- JianxunLian
+ JianxunLian
YiHuang
YanqiDaiRenmin University of China
- HaoxuanLi
+ HaoxuanLi
- XuChenRenmin University of China
+ XuChenRenmin University of China
- XingXieMicrosoft Research Asia
+ XingXieMicrosoft Research Asia
- Ji-RongWenRenmin University of China
+ Ji-RongWenRenmin University of China
6372-6391
Role-playing is a crucial capability of Large Language Models (LLMs), enabling a wide range of practical applications, including intelligent non-player characters, digital twins, and emotional companions. Evaluating this capability in LLMs is challenging due to the complex dynamics involved in role-playing, such as maintaining character fidelity throughout a storyline and navigating open-ended narratives without a definitive ground truth. Current evaluation methods, which primarily focus on question-answering or conversational snapshots, fall short of adequately capturing the nuanced character traits and behaviors essential for authentic role-playing. In this paper, we propose CharacterBox, which is a simulation sandbox designed to generate situational fine-grained character behavior trajectories. These behavior trajectories enable a more comprehensive and in-depth evaluation of role-playing capabilities. CharacterBox consists of two main components: the character agent and the narrator agent. The character agent, grounded in psychological and behavioral science, exhibits human-like behaviors, while the narrator agent coordinates interactions between character agents and environmental changes. Additionally, we introduce two trajectory-based methods that leverage CharacterBox to enhance LLM performance. To reduce costs and facilitate the adoption of CharacterBox by public communities, we fine-tune two smaller models, CharacterNR and CharacterRM, as substitutes for GPT API calls, and demonstrate their competitive performance compared to advanced GPT APIs. The code is available at https://github.com/Paitesanshi/CharacterBox.
2025.naacl-long.323
@@ -4176,10 +4176,10 @@
<fixed-case>C</fixed-case>o<fixed-case>ME</fixed-case>: An Unlearning-based Approach to Conflict-free Model Editing
- DahyunJungKorea University
+ DahyunJungKorea University
- JaehyungSeo
+ JaehyungSeo
- JaewookLeeKorea University
+ JaewookLeeKorea University
- ChanjunParkKorea University
+ ChanjunParkKorea University
HeuiseokLim
6410-6422
Large language models (LLMs) often retain outdated or incorrect information from pre-training, which undermines their reliability. While model editing methods have been developed to address such errors without full re-training, they frequently suffer from knowledge conflicts, where outdated information interferes with new knowledge. In this work, we propose Conflict-free Model Editing (CoME), a novel framework that enhances the accuracy of knowledge updates in LLMs by selectively removing outdated knowledge. CoME leverages unlearning to mitigate knowledge interference, allowing new information to be integrated without compromising relevant linguistic features. Through experiments on GPT-J and LLaMA-3 using Counterfact and ZsRE datasets, we demonstrate that CoME improves both editing accuracy and model reliability when applied to existing editing methods. Our results highlight that the targeted removal of outdated knowledge is crucial for enhancing model editing effectiveness and maintaining the model’s generative performance.
@@ -4188,7 +4188,7 @@
On The Origin of Cultural Biases in Language Models: From Pre-training Data to Linguistic Phenomena
- TarekNaous
+ TarekNaous
WeiXuGeorgia Institute of Technology
6423-6443
Language Models (LMs) have been shown to exhibit a strong preference towards entities associated with Western culture when operating in non-Western languages. In this paper, we aim to uncover the origins of entity-related cultural biases in LMs by analyzing several contributing factors, including the representation of entities in pre-training data and the impact of variations in linguistic phenomena across languages. We introduce CAMeL-2, a parallel Arabic-English benchmark of 58,086 entities associated with Arab and Western cultures and 367 masked natural contexts for entities. Our evaluations using CAMeL-2 reveal that LMs exhibit smaller performance gaps between cultures when tested in English than in Arabic. We find that LMs struggle in Arabic with entities that appear at high frequencies in pre-training, where entities can hold multiple word senses. This also extends to entities that exhibit high lexical overlap with languages that are not Arabic but use the Arabic script. Further, we show how frequency-based tokenization leads to this issue in LMs, which gets worse with larger Arabic vocabularies. We will make CAMeL-2 available at: https://github.com/tareknaous/camel2
2025.naacl-long.326
@@ -4198,7 +4198,7 @@
Adapting Sentence-level Automatic Metrics for Document-level Simplification Evaluation
MounicaMaddelaBloomberg
- FernandoAlva-ManchegoCardiff University
+ FernandoAlva-ManchegoCardiff University
6444-6459
Text simplification aims to enhance the clarity and comprehensibility of a complex text while preserving its original meaning. Previous research on the automatic evaluation of text simplification has primarily focused on sentence simplification, with commonly used metrics such as SARI and advanced metrics such as LENS being trained and evaluated at the sentence level. However, these metrics often underperform on longer texts. In our study, we propose a novel approach to adapt existing sentence-level metrics for paragraph- or document-level simplification. We benchmark our approach against a wide variety of existing reference-based and reference-less metrics across multiple domains. Empirical results demonstrate that our approach outperforms traditional sentence-level metrics in terms of correlation with human judgment. Furthermore, we evaluate the sensitivity and robustness of various metrics to different types of errors produced by existing text simplification systems.
2025.naacl-long.327
@@ -4208,7 +4208,7 @@
Decoding Speculative Decoding
MinghaoYanDepartment of Computer Science, University of Wisconsin - Madison
SaurabhAgarwalUniversity of Wisconsin, Madison
- ShivaramVenkataramanMicrosoft and University of Wisconsin, Madison
+ ShivaramVenkataramanMicrosoft and University of Wisconsin, Madison
6460-6473
Speculative Decoding is a widely used technique to speed up inference for Large Language Models (LLMs) without sacrificing quality. When performing inference, speculative decoding uses a smaller draft model to generate speculative tokens and then uses the target LLM to verify those draft tokens. The speedup provided by speculative decoding heavily depends on the choice of the draft model. In this work, we perform a detailed study comprising over 350 experiments with LLaMA-65B and OPT-66B using speculative decoding and delineate the factors that affect the performance gain provided by speculative decoding. Our experiments indicate that the performance of speculative decoding depends heavily on the latency of the draft model, and the draft model’s capability in language modeling does not correlate strongly with its performance in speculative decoding. Based on these insights we explore a new design space for draft models and design hardware-efficient draft models for speculative decoding. Our newly designed draft model can provide 111% higher throughput than existing draft models and our approach generalizes further to all LLaMA models (1/2/3.1) and supervised fine-tuned models.
2025.naacl-long.328
@@ -4237,14 +4237,14 @@
Self-<fixed-case>DC</fixed-case>: When to Reason and When to Act? Self Divide-and-Conquer for Compositional Unknown Questions
- HongruWangThe Chinese University of Hong Kong
+ HongruWangThe Chinese University of Hong Kong
- BoyangXue
+ BoyangXue
BaohangZhouNankai University
TianhuaZhangChinese University of Hong Kong, The Chinese University of Hong Kong
CunxiangWang
- HuiminWangJarvis Research Center, Tencent YouTu Lab
+ HuiminWangJarvis Research Center, Tencent YouTu Lab
- GuanhuaChenSouthern University of Science and Technology
+ GuanhuaChenSouthern University of Science and Technology
- Kam-FaiWongThe Chinese University of Hong Kong
+ Kam-FaiWongThe Chinese University of Hong Kong
6510-6525
Previous research has typically concentrated on leveraging the internal knowledge of Large Language Models (LLMs) to answer known questions (i.e., internal reasoning such as generate-then-read). In contrast, for questions that fall outside their known scope, these models rely on external knowledge retrieval to provide accurate responses (i.e., external acting such as retrieve-then-read). However, few previous works consider compositional questions, which consist of several known and unknown sub-questions, necessitating the dynamic combination of the previous two methods (i.e., internal reasoning and external acting) to achieve a better trade-off between effectiveness and efficiency. To this end, we introduce a Self Divide-and-Conquer (Self-DC) framework, accompanied by the first Compositional unknown Question-Answering dataset (CuQA). This framework enables LLMs to adaptively choose between using internal knowledge and retrieving external knowledge as needed, resulting in a better trade-off between effectiveness and efficiency. Experimental results on two datasets demonstrate that Self-DC can achieve comparable or even better performance with much fewer external calls compared with several strong baselines.
2025.naacl-long.331
@@ -4252,7 +4252,7 @@
<fixed-case>TRANSIENTTABLES</fixed-case>: Evaluating <fixed-case>LLM</fixed-case>s’ Reasoning on Temporally Evolving Semi-structured Tables
- AbhilashShankarampetaUniversity of California, San Diego
+ AbhilashShankarampetaUniversity of California, San Diego
HarshMahajan
TusharKatariaUniversity of Utah
DanRoth
@@ -4266,8 +4266,8 @@
<fixed-case>A</fixed-case>dvisor<fixed-case>QA</fixed-case>: Towards Helpful and Harmless Advice-seeking Question Answering with Collective Intelligence
MinbeomKim
HwanheeLeeChung-Ang University
- JoonsukParkUniversity of Richmond
+ JoonsukParkUniversity of Richmond
- HwaranLeeSogang University
+ HwaranLeeSogang University
KyominJung
6545-6565
As the integration of large language models into daily life is on the rise, there is still a lack of dataset for *advising on subjective and personal dilemmas*. To address this gap, we introduce AdvisorQA, which aims to improve LLMs’ capability to offer advice for deeply subjective concerns, utilizing the LifeProTips Reddit forum. This forum features a dynamic interaction where users post advice-seeking questions, receiving an average of 8.9 pieces of advice per query, with 164.2 upvotes from hundreds of users, embodying a *collective intelligence*. Therefore, we’ve compiled a dataset encompassing daily life questions, diverse corresponding responses, and majority vote ranking, which we use to train a helpfulness metric. In baseline experiments, models aligned with the AdvisorQA dataset demonstrated improved helpfulness through our automatic metric, as well as GPT-4 and human evaluations. Additionally, we expanded the independent evaluation axis to include harmlessness. AdvisorQA marks a significant leap in enhancing QA systems to provide subjective, helpful, and harmless advice, showcasing LLMs’ improved understanding of human subjectivity.
@@ -4277,10 +4277,10 @@
t<fixed-case>RAG</fixed-case>: Term-level Retrieval-Augmented Generation for Domain-Adaptive Retrieval
DohyeonLeeSeoul National University
- JongyoonKimSeoul National University
+ JongyoonKimSeoul National University
JihyukKimLG Corporation
Seung-wonHwangSeoul National University
- JoonsukParkUniversity of Richmond
+ JoonsukParkUniversity of Richmond
6566-6578
Neural retrieval models have emerged as an effective tool for information retrieval, but their performance suffers when there is a domain shift between training and test data distributions. Recent work aims to construct pseudo-training data for the target domain by generating domain-adapted pseudo-queries using large language models (LLMs). However, we identify that LLMs exhibit a “seen term bias” where the generated pseudo-queries fail to include relevant “unseen” terms as expected for domain adaptation purposes. To address this limitation, we propose to improve the term recall of unseen query terms, by using term-level Retrieval-Augmented Generation (tRAG). Specifically, unlike existing document-level RAG, we propose to generate domain-specific keywords from all documents in the corpus, including those unseen in any individual document. To filter hallucination, generated keywords are retrieved and reranked, leveraging relevance feedback from both retrievers and LLMs. Experiments on the BEIR benchmark show tRAG significantly improves recall for unseen terms by 10.6% and outperforms LLM and retrieval-augmented generation baselines on overall retrieval performance.
2025.naacl-long.334
@@ -4290,7 +4290,7 @@
<fixed-case>JRE</fixed-case>-<fixed-case>L</fixed-case>: Journalist, Reader, and Editor <fixed-case>LLM</fixed-case>s in the Loop for Science Journalism for the General Audience
GongyaoJiang
XinranShi
- QiongLuoThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology
+ QiongLuoThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology
6579-6594
Science journalism reports current scientific discoveries to non-specialists, aiming to enable public comprehension of the state of the art. This task is challenging as the audience often lacks specific knowledge about the presented research. We propose JRE-L, a framework that integrates three LLMs mimicking the writing-reading-feedback-revision loop. In JRE-L, one LLM acts as the journalist, another LLM as the general public reader, and the third LLM as an editor. The journalist’s writing is iteratively refined by feedback from the reader and suggestions from the editor. Our experiments demonstrate that by leveraging the collaboration of two 7B and one 1.8B open-source LLMs, we can generate articles that are more accessible than those generated by existing methods, including prompting single advanced models such as GPT-4 and other LLM-collaboration strategies. Our code is publicly available at github.com/Zzoay/JRE-L.
2025.naacl-long.335
@@ -4301,8 +4301,8 @@
ZicheLiu
RuiKe
YajiaoLiu
- FengJiang
+ FengJiang
- HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore
+ HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore
6595-6611
Data selection for fine-tuning large language models (LLMs) aims to choose a high-quality subset from existing datasets, allowing the trained model to outperform baselines trained on the full dataset. However, the expanding body of research lacks a clear, unified framework, and the variability in experimental settings complicates systematic comparisons. While existing surveys comprehensively overview the stages and methods of data selection, they often overlook an in-depth exploration of the fine-tuning phase. In this paper, we conduct a focused review of recent data selection techniques for fine-tuning LLMs, analyzing a dozen key studies. We introduce a novel three-stage scheme—comprising feature extraction, criteria design, and selector evaluation—to systematically categorize and evaluate these methods. Additionally, we propose a unified comparison approach that incorporates ratio-based efficiency and ranking-based feasibility metrics to address inconsistencies across experiments. Our findings reveal that methods emphasizing more targeted quality measurement achieve higher efficiency but at the cost of feasibility. Finally, we discuss trends and highlight four key challenges in fine-tuning data selection, offering potential directions for future research.
2025.naacl-long.336
@@ -4312,10 +4312,10 @@
Graph Neural Network Enhanced Retrieval for Question Answering of Large Language Models
ZijianLiHong Kong University of Science and Technology
QingyanGuo
- JiaweiShaoChina Telecom
+ JiaweiShaoChina Telecom
LeiSongMicrosoft
- JiangBianMicrosoft
+ JiangBianMicrosoft
- JunZhangThe Hong Kong University of Science and Technology
+ JunZhangThe Hong Kong University of Science and Technology
RuiWangMicrosoft
6612-6633
Retrieval augmented generation has revolutionized large language model (LLM) outputs by providing factual supports. Nevertheless, it struggles to capture all the necessary knowledge for complex reasoning questions. Existing retrieval methods typically divide reference documents into passages, treating them in isolation. These passages, however, are often interrelated, such as passages that are contiguous or share the same keywords. Therefore, it is crucial to recognize such relatedness for enhancing the retrieval process. In this paper, we propose a novel retrieval method, called GNN-Ret, which leverages graph neural networks (GNNs) to enhance retrieval by exploiting the relatedness between passages. Specifically, we first construct a graph of passages by connecting passages that are structure-related or keyword-related. A graph neural network (GNN) is then leveraged to exploit the relationships between passages and improve the retrieval of supporting passages. Furthermore, we extend our method to handle multi-hop reasoning questions using a recurrent graph neural network (RGNN), named RGNN-Ret. At each step, RGNN-Ret integrates the graphs of passages from previous steps, thereby enhancing the retrieval of supporting passages. Extensive experiments on benchmark datasets demonstrate that GNN-Ret achieves higher accuracy for question answering with a single query of LLMs than strong baselines that require multiple queries, and RGNN-Ret further improves accuracy and achieves state-of-the-art performance, with up to 10.4 accuracy improvement on the 2WikiMQA dataset.
2025.naacl-long.337
@@ -4325,7 +4325,7 @@
Pula: Training Large Language Models for Setswana
NathanBrown
- VukosiMarivateUniversity of Pretoria
+ VukosiMarivateUniversity of Pretoria
6634-6656
In this work we present Pula, a suite of bilingual language models proficient in both Setswana and English. Leveraging recent advancements in data availability and efficient fine-tuning, Pula 8B and Pula 14B outperform GPT-4o and Gemini 1.5 Pro on English-Setswana translation tasks and achieve state-of-the-art performance on Setswana reasoning tasks for their size. We release the weights for Pula 1B, 3B, 8B, and 14B as well as training logs and training and evaluation code. Alongside Pula, we release the largest-ever Setswana text corpus, Marothodi, and the first comprehensive Setswana instruction-tuning dataset, Medupi, consisting of reformatted datasets, translated corpora, and synthetic LLM-generated text. To accompany this data, we release the code used for dataset construction, formatting, filtering, and scraping. Last, we release two Setswana LLM-translated benchmarks, MMLU-tsn and GSM8K-tsn, to measure Setswana knowledge and reasoning capabilities.
2025.naacl-long.338
@@ -4335,8 +4335,8 @@
<fixed-case>L</fixed-case>egal<fixed-case>V</fixed-case>iz: Legal Text Visualization by Text To Diagram Generation
EriOnami
TaikiMiyanishiThe University of Tokyo
- KokiMaedaInstitute of Science Tokyo
+ KokiMaedaInstitute of Science Tokyo
- ShuheiKuritaNational Institute of Informatics and New York University
+ ShuheiKuritaNational Institute of Informatics and New York University
6657-6676
Legal documents including judgments and court orders require highly sophisticated legal knowledge for understanding. To disclose expert knowledge for non-experts, we explore the problem of visualizing legal texts with easy-to-understand diagrams and propose a novel dataset of LegalViz with 23 languages and 7,010 cases of legal document and visualization pairs, using the DOT graph description language of Graphviz. LegalViz provides a simple diagram from a complicated legal corpus identifying legal entities, transactions, legal sources, and statements at a glance, that are essential in each judgment. In addition, we provide new evaluation metrics for the legal diagram visualization by considering graph structures, textual similarities, and legal contents. We conducted empirical studies on few-shot prompting and fine-tuning of large language models for generating legal diagrams and evaluated them with these metrics, including legal content-based evaluation within 23 languages. Models trained with LegalViz outperform existing models including GPTs, confirming the effectiveness of our dataset.
2025.naacl-long.339 @@ -4344,10 +4344,10 @@ Active Few-Shot Learning for Text Classification - SaeedAhmadniaUniversity of Illinois at Chicago - ArashYousefi JordehiUniversity of Guilan + SaeedAhmadniaUniversity of Illinois at Chicago + ArashYousefi JordehiUniversity of Guilan MahsaHosseini Khasheh HeyranUniversity of Guilan - Seyed AbolghasemMirroshandelUniversity of Guilan + Seyed AbolghasemMirroshandelUniversity of Guilan OwenRambowStony Brook University CorneliaCarageaUniversity of Illinois at Chicago 6677-6694 @@ -4360,7 +4360,7 @@ Cong-Duy TNguyenSchool of Computer Science and Engineering, Nanyang Technological University XiaobaoWuNanyang Technological University Thong ThanhNguyen - ShuaiZhao + ShuaiZhao Khoi M.Le Nguyen VietAnhNanyang Technological University FengYichao @@ -4372,7 +4372,7 @@ <fixed-case>R</fixed-case>esearch<fixed-case>A</fixed-case>gent: Iterative Research Idea Generation over Scientific Literature with Large Language Models - JinheonBaekKorea Advanced Institute of Science & Technology + JinheonBaekKorea Advanced Institute of Science & Technology Sujay KumarJauharMicrosoft Research SilviuCucerzanMicrosoft Sung JuHwangKorea Advanced Institute of Science and Technology and AITRICS @@ -4383,9 +4383,9 @@ Logit Separability-Driven Samples and Multiple Class-Related Words Selection for Advancing In-Context Learning - ZixiaoZhu - ZijianFeng - HanzhangZhou + ZixiaoZhu + ZijianFeng + HanzhangZhou JunlangQianNanyang Technological University KezhiMaoNanyang Technological University 6739-6759 @@ -4395,8 +4395,8 @@ Identifying Emerging Concepts in Large Corpora - SiboMaStanford University - JulianNyarkoStanford University + SiboMaStanford University + JulianNyarkoStanford University 6760-6778 We introduce a new method to identify emerging concepts in large text corpora. By analyzing changes in the heatmaps of the underlying embedding space, we are able to detect these concepts with high accuracy shortly after they originate, in turn outperforming common alternatives. We further demonstrate the utility of our approach by analyzing speeches in the U.S. Senate from 1941 to 2015. Our results suggest that the minority party is more active in introducing new concepts into the Senate discourse. We also identify specific concepts that closely correlate with the Senators’ racial, ethnic, and gender identities. An implementation of our method is publicly available. 2025.naacl-long.344 @@ -4415,7 +4415,7 @@ From Distributional to Overton Pluralism: Investigating Large Language Model Alignment ThomLakeUniversity of Texas at Austin and Indeed - EunsolChoiNew York University + EunsolChoiNew York University GregDurrettUniversity of Texas at Austin 6794-6814 The alignment process changes several properties of a large language model’s (LLM’s) output distribution. We analyze two aspects of post-alignment distributional shift of LLM responses. First, we re-examine previously reported reductions in response diversity post-alignment. Our analysis suggests that an apparent drop in the diversity of responses is largely explained by quality control and information aggregation. Alignment suppresses irrelevant and unhelpful content while shifting the output distribution toward longer responses that cover information spanning several responses from the base LLM, essentially presenting diverse information in a single response. 
Finding little evidence that alignment suppresses useful information, it is natural to ask the opposite question: do aligned models surface information that cannot be recovered from base models? Our second investigation shows this is not the case and the behavior of aligned models is recoverable from base models without fine-tuning. A combination of in-context examples and lower-resolution semantic hints about response content can elicit responses from base LLMs that are as similar to alignment-tuned LLM responses as alignment-tuned LLM responses are to each other. Taken together, these results indicate that current alignment techniques capture but do not extend the useful subset of assistant-like base LLM behavior, providing further evidence for the Superficial Alignment Hypothesis. They also show that in-context alignment can go surprisingly far as a strategy for imitating aligned LLMs without fine-tuning. Our code and data is available at [github.com/thomlake/investigating-alignment](https://github.com/thomlake/investigating-alignment). @@ -4427,19 +4427,19 @@ MohanZhang PingzhiLi JiePengUniversity of Science and Technology of China - MufanQiuDepartment of Computer Science, University of North Carolina at Chapel Hill - TianlongChenUniversity of North Carolina at Chapel Hill + MufanQiuDepartment of Computer Science, University of North Carolina at Chapel Hill + TianlongChenUniversity of North Carolina at Chapel Hill 6815-6825 2025.naacl-long.347 zhang-etal-2025-advancing <fixed-case>L</fixed-case>ib<fixed-case>E</fixed-case>volution<fixed-case>E</fixed-case>val: A Benchmark and Study for Version-Specific Code Generation - SachitKuharAmazon + SachitKuharAmazon Wasi UddinAhmadNVIDIA ZijianWangAmazon AWS AI Labs NihalJainAmazon - HaifengQianNVIDIA + HaifengQianNVIDIA BaishakhiRayColumbia University Murali KrishnaRamanathanAmazon XiaofeiMaAmazon Web Services @@ -4453,12 +4453,12 @@ Evaluating and Mitigating Object Hallucination in Large Vision-Language Models: Can They Still See Removed Objects? YixiaoHeBeijing University of Posts and Telecommunications HaifengSunBeijing University of Posts and Telecommunications, Beijing University of Posts and Telecommunications and Beijing University of Posts and Telecommunications - PengfeiRen - JingyuWang + PengfeiRen + JingyuWang HuazhengWang QiQiBeijing University of Posts and Telecommunications - ZiruiZhuangBeijing University of Posts and Telecommunications - JingWang + ZiruiZhuangBeijing University of Posts and Telecommunications + JingWang 6841-6858 Large Vision-Language Models (LVLMs) have a significant issue with object hallucinations, where researchers have noted that LVLMs often mistakenly determine objects as present in images where they do not actually exist. Some recent studies evaluate the occurrence of object hallucinations by asking LVLMs whether they see objects that do not exist in input images. However, we observe that these evaluation methods have some limitations, such as the objects being questioned potentially having little relevance to the image. In this paper, we introduce a more challenging benchmark for evaluating object hallucinations by removing objects from images and then asking the model whether it can still see the removed objects. Our evaluation result reveals that LVLMs suffer from severe hallucinations, as they often still claim to see the removed objects. 
Through our analysis, we find that biases in training result in LVLMs lacking guidance on learning about the absence of objects, which in turn leads to a lack of ability to determine that objects do not exist in images. To address this issue, we further propose oDPO, a direct preference optimization objective based on visual objects. By guiding LVLMs to learn to determine the existence of objects, oDPO effectively alleviates object hallucinations. It achieves more competitive results than other hallucination mitigation approaches across multiple object hallucination benchmarks and enhances the performance of LVLMs in various vision-language tasks.
 2025.naacl-long.349
 
 
 ShaoyangXu
 YongqiLeng
 LinhaoYu
- DeyiXiongTianjin University
+ DeyiXiongTianjin University
 6859-6877
 As large language models (LLMs) become increasingly accessible in many countries, it is essential to align them to serve pluralistic human values across cultures. However, pluralistic culture alignment in LLMs remains an open problem. In this paper, we propose CultureSPA, a Self-Pluralising Culture Alignment framework that allows LLMs to simultaneously align to pluralistic cultures. The framework first generates questions on various culture topics, then yields LLM outputs in response to these generated questions under both culture-aware and culture-unaware settings. By comparing culture-aware/unaware outputs, we are able to detect and collect culture-related instances. These instances are employed to fine-tune LLMs to serve pluralistic cultures in either a culture-joint or culture-specific way. Extensive experiments demonstrate that CultureSPA significantly improves the alignment of LLMs to diverse cultures without compromising general abilities, and further improvements can be achieved if CultureSPA is combined with advanced prompt engineering techniques. Comparisons between culture-joint and culture-specific tuning strategies, along with variations in data quality and quantity, illustrate the robustness of our method. We also explore the mechanisms underlying CultureSPA and the relations between different cultures it reflects.
 2025.naacl-long.350
 
 
@@ -4486,11 +4486,11 @@
 
 <fixed-case>D</fixed-case>raw<fixed-case>E</fixed-case>du<fixed-case>M</fixed-case>ath: Evaluating Vision Language Models with Expert-Annotated Students’ Hand-Drawn Math Images
 SamiBaralWorcester Polytechnic Institute
- LiLucyUniversity of California Berkeley
+ LiLucyUniversity of California Berkeley
 RyanKnightInsource Services, Inc
 AliceNg
- LucaSoldainiAllen Institute for Artificial Intelligence
- NeilHeffernan
+ LucaSoldainiAllen Institute for Artificial Intelligence
+ NeilHeffernan
 KyleLoAllen Institute for Artificial Intelligence
 6902-6920
 In real-world settings, vision language models (VLMs) should robustly handle naturalistic, noisy visual content as well as domain-specific language and concepts. For example, K-12 educators using digital learning platforms may need to examine and provide feedback across many images of students’ math work. To assess the potential of VLMs to support educators in settings like this one, we introduce DrawEduMath, an English-language dataset of 2,030 images of students’ handwritten responses to K-12 math problems. Teachers provided detailed annotations, including free-form descriptions of each image and 11,661 question-answer (QA) pairs.
These annotations capture a wealth of pedagogical insights, ranging from students’ problem-solving strategies to the composition of their drawings, diagrams, and writing. We evaluate VLMs on teachers’ QA pairs, as well as 44,362 synthetic QA pairs derived from teachers’ descriptions using language models (LMs). We show that even state-of-the-art VLMs leave much room for improvement on DrawEduMath questions. We also find that synthetic QAs, though imperfect, can yield similar model rankings as teacher-written QAs. We release DrawEduMath to support the evaluation of VLMs’ abilities to reason mathematically over images gathered with educational contexts in mind.
 
 
 
@@ -4519,15 +4519,15 @@
 
 
 Legal Judgment Prediction based on Knowledge-enhanced Multi-Task and Multi-Label Text Classification
- AngLi
+ AngLi
 YiquanWuZhejiang University
- MingCaiZhejiang University
+ MingCaiZhejiang University
 AdamJatowtUniversität Innsbruck
 XiangZhou
 WeimingLuZhejiang University
 ChanglongSunAlibaba Group
 FeiWuZhejiang University
- KunKuangZhejiang University
+ KunKuangZhejiang University
 6957-6970
 Legal judgment prediction (LJP) is an essential task for legal AI, aiming at predicting judgments based on the facts of a case. Legal judgments can involve multiple law articles and charges. Although recent methods in LJP have made notable progress, most are constrained to single-task settings (e.g., only predicting charges) or single-label settings (e.g., not accommodating cases with multiple charges), diverging from the complexities of real-world scenarios. In this paper, we address the challenge of predicting relevant law articles and charges within the framework of legal judgment prediction, treating it as a multi-task and multi-label text classification problem. We introduce a knowledge-enhanced approach, called K-LJP, that incorporates (i) “label-level knowledge” (such as definitions and relationships among labels) to enhance the representation of case facts for each task, and (ii) “task-level knowledge” (such as the alignment between law articles and corresponding charges) to improve task synergy. Comprehensive experiments demonstrate our method’s effectiveness in comparison to state-of-the-art (SOTA) baselines.
2025.naacl-long.355 @@ -4536,12 +4536,12 @@ <fixed-case>SP</fixed-case>e<fixed-case>C</fixed-case>trum: A Grounded Framework for Multidimensional Identity Representation in <fixed-case>LLM</fixed-case>-Based Agent KeyeunLeeSeoul National University - Seo HyeongKim - SeolheeLee - JinsuEunSeoul National University - YenaKoSeoul National University + Seo HyeongKim + SeolheeLee + JinsuEunSeoul National University + YenaKoSeoul National University HayeonJeonSeoul National University - Esther HehsunKim + Esther HehsunKim SeonghyeChoSeoul National University SoeunYangSeoul National University Eun-meeKimSeoul National University @@ -4554,7 +4554,7 @@ Beemo: Benchmark of Expert-edited Machine-generated Outputs EkaterinaArtemovaToloka AI - Jason SLucas + Jason SLucas SaranyaVenkatramanAmazon JooyoungLeeAmazon SergeiTilgaToloka AI @@ -4567,9 +4567,9 @@ <fixed-case>SANDW</fixed-case>i<fixed-case>CH</fixed-case>: Semantical Analysis of Neighbours for Disambiguating Words in Context ad Hoc - DanielGuzman Olivares - LaraQuijanoUniversidad Autónoma de Madrid - FedericoLiberatore + DanielGuzman Olivares + LaraQuijanoUniversidad Autónoma de Madrid + FedericoLiberatore 7019-7033 The rise of generative chat-based Large Language Models (LLMs) over the past two years has spurred a race to develop systems that promise near-human conversational and reasoning experiences. However, recent studies indicate that the language understanding offered by these models remains limited and far from human-like performance, particularly in grasping the contextual meanings of words—an essential aspect of reasoning. In this paper, we present a simple yet computationally efficient framework for multilingual Word Sense Disambiguation (WSD). Our approach reframes the WSD task as a cluster discrimination analysis over a semantic network refined from BabelNet using group algebra. We validate our methodology across multiple WSD benchmarks, achieving a new state of the art for all languages and tasks, as well as in individual assessments by part of speech. Notably, our model significantly surpasses the performance of current alternatives, even in low-resource languages, while reducing the parameter count by 72%. 2025.naacl-long.358 @@ -4578,7 +4578,7 @@ Towards Automatic Evaluation for Image Transcreation SimranKhanujaCMU, Carnegie Mellon University - VivekIyerUniversity of Edinburgh, University of Edinburgh + VivekIyerUniversity of Edinburgh, University of Edinburgh XiaoyuHe GrahamNeubigCarnegie Mellon University 7034-7047 @@ -4590,8 +4590,8 @@ <fixed-case>I</fixed-case>mg<fixed-case>T</fixed-case>rojan: Jailbreaking Vision-Language Models with <fixed-case>ONE</fixed-case> Image XijiaTao ShuaiZhong - LeiLiUniversity of Hong Kong - QiLiuUniversity of Hong Kong + LeiLiUniversity of Hong Kong + QiLiuUniversity of Hong Kong LingpengKongDepartment of Computer Science, The University of Hong Kong 7048-7063 There has been an increasing interest in the alignment of large language models (LLMs) with human values. However, the safety issues of their integration with a vision module, or vision language models (VLMs), remain relatively underexplored. In this paper, we propose a novel jailbreaking attack against VLMs, aiming to bypass their safety barrier when a user inputs harmful instructions. A scenario where our poisoned (image, text) data pairs are included in the training data is assumed. By replacing the original textual captions with malicious jailbreak prompts, our method can perform jailbreak attacks with the poisoned images. 
Moreover, we analyze the effect of poison ratios and positions of trainable parameters on our attack’s success rate. For evaluation, we design two metrics to quantify the success rate and the stealthiness of our attack. Together with a list of curated harmful instructions, a benchmark for measuring attack efficacy is provided. We demonstrate the efficacy of our attack by comparing it with baseline methods.
 
 
 
@@ -4605,9 +4605,9 @@
 JunyiLi
 RuiyangRen
 ShijieWang
- XinZhaoRenmin University of China
- YangSongBOSS Zhipin
- TaoZhang
+ XinZhaoRenmin University of China
+ YangSongBOSS Zhipin
+ TaoZhang
 7064-7074
 Existing large language models (LLMs) show exceptional problem-solving capabilities but might struggle with complex reasoning tasks. Despite the successes of chain-of-thought and tree-based search methods, they mainly depend on the internal knowledge of LLMs to search over intermediate reasoning steps, limited to dealing with simple tasks involving fewer reasoning steps. In this paper, we propose RAG-Star, a novel RAG approach that integrates the retrieved information to guide the tree-based deliberative reasoning process that relies on the inherent knowledge of LLMs. By leveraging Monte Carlo Tree Search, RAG-Star iteratively plans intermediate sub-queries and answers for reasoning based on the LLM itself. To consolidate internal and external knowledge, we propose a retrieval-augmented verification that utilizes query- and answer-aware reward modeling to provide feedback for the inherent reasoning of LLMs. Our experiments involving Llama-3.1-8B-Instruct and GPT-4o demonstrate that RAG-Star significantly outperforms previous RAG and reasoning methods. Our codes and data are publicly available at https://github.com/RUCAIBox/RAG-Star.
 2025.naacl-long.361
 
 
 Mitigating Biases of Large Language Models in Stance Detection with Counterfactual Augmented Calibration
- AngLi
- JingqianZhaoHarbin Institute of Technology
- BinLiangThe Chinese University of Hong Kong
+ AngLi
+ JingqianZhaoHarbin Institute of Technology
+ BinLiangThe Chinese University of Hong Kong
 LinGuiKing’s College London, University of London
 HuiWang
 XiZeng
 XingweiLiang
- Kam-FaiWongThe Chinese University of Hong Kong
- RuifengXuHarbin Institute of Technology
+ Kam-FaiWongThe Chinese University of Hong Kong
+ RuifengXuHarbin Institute of Technology
 7075-7092
 Stance detection is critical for understanding the underlying position or attitude expressed toward a topic. Large language models (LLMs) have demonstrated significant advancements across various natural language processing tasks including stance detection; however, their performance in stance detection is limited by biases and spurious correlations inherent in their data-driven nature. Our statistical experiment reveals that LLMs are prone to generate biased stances due to sentiment-stance spurious correlations and preference towards certain individuals and topics. Furthermore, the results demonstrate a strong negative correlation between stance bias and stance detection performance, underscoring the importance of mitigating bias to enhance the utility of LLMs in stance detection. Therefore, in this paper, we propose a Counterfactual Augmented Calibration Network (FACTUAL), in which a novel calibration network is devised to calibrate potential bias in the stance prediction of LLMs.
Further, to address the challenge of effectively learning bias representations and the difficulty in the generalizability of debiasing, we construct counterfactual augmented data. This approach enhances the calibration network, facilitating the debiasing and out-of-domain generalization. Experimental results on in-target and zero-shot stance detection tasks show that the proposed FACTUAL can effectively mitigate biases of LLMs, achieving state-of-the-art results. 2025.naacl-long.362 @@ -4632,9 +4632,9 @@ Beyond the Next Token: Towards Prompt-Robust Zero-Shot Classification via Efficient Multi-Token Prediction JunlangQianNanyang Technological University - ZixiaoZhu - HanzhangZhou - ZijianFeng + ZixiaoZhu + HanzhangZhou + ZijianFeng ZepengZhaiBeijing University of Posts and Telecommunications KezhiMaoNanyang Technological University 7093-7115 @@ -4666,8 +4666,8 @@ Towards Inducing Long-Context Abilities in Multilingual Neural Machine Translation Models - VarunGummaMicrosoft - Pranjal AChitaleMicrosoft Research + VarunGummaMicrosoft + Pranjal AChitaleMicrosoft Research KalikaBaliMicrosoft Research Labs 7158-7170 Neural Machine Translation (NMT) models have traditionally used Sinusoidal Positional Embeddings (PEs), which often struggle to capture long-range dependencies and are inefficient for handling extended context or document-level translation tasks. This work addresses the challenge of transitioning pre-trained NMT models from absolute Sinusoidal PEs to Relative PEs, such as RoPE and ALiBi, without compromising performance. We demonstrate that parameter-efficient fine-tuning, using only a small amount of high-quality data, can successfully facilitate this transition. Experimental results indicate that switching from Sinusoidal to Relative PEs results in competitive translation quality on sentence-level evaluation benchmarks. Additionally, models trained with RoPE consistently outperform those using ALiBi and Sinusoidal PEs on document-level benchmarks across both string-based metrics and qualitative evaluations. Moreover, we find that a small amount of long-context data in a few languages is sufficient for cross-lingual length generalization, thereby inducing long-context capabilities. 
@@ -4678,8 +4678,8 @@ Yeah, Un, Oh: Continuous and Real-time Backchannel Prediction with Fine-tuning of Voice Activity Projection KojiInoueKyoto University DiveshLalaKyoto University - GabrielSkantzeKTH Royal Institute of Technology, Stockholm, Sweden - TatsuyaKawaharaKyoto University, Tokyo Institute of Technology + GabrielSkantzeKTH Royal Institute of Technology, Stockholm, Sweden + TatsuyaKawaharaKyoto University, Tokyo Institute of Technology 7171-7181 In human conversations, short backchannel utterances such as “yeah” and “oh” play a crucial role in facilitating smooth and engaging dialogue.These backchannels signal attentiveness and understanding without interrupting the speaker, making their accurate prediction essential for creating more natural conversational agents.This paper proposes a novel method for real-time, continuous backchannel prediction using a fine-tuned Voice Activity Projection (VAP) model.While existing approaches have relied on turn-based or artificially balanced datasets, our approach predicts both the timing and type of backchannels in a continuous and frame-wise manner on unbalanced, real-world datasets.We first pre-train the VAP model on a general dialogue corpus to capture conversational dynamics and then fine-tune it on a specialized dataset focused on backchannel behavior.Experimental results demonstrate that our model outperforms baseline methods in both timing and type prediction tasks, achieving robust performance in real-time environments.This research offers a promising step toward more responsive and human-like dialogue systems, with implications for interactive spoken dialogue applications such as virtual assistants and robots. 2025.naacl-long.367 @@ -4687,10 +4687,10 @@ Prompt Compression for Large Language Models: A Survey - ZongqianLi + ZongqianLi YinhongLiu - YixuanSuCohere - NigelCollierUniversity of Cambridge + YixuanSuCohere + NigelCollierUniversity of Cambridge 7182-7195 Leveraging large language models (LLMs) for complex natural language tasks typically requires long-form prompts to convey detailed requirements and information, which results in increased memory usage and inference costs. To mitigate these challenges, multiple efficient methods have been proposed, with prompt compression gaining significant research interest. This survey provides an overview of prompt compression techniques, categorized into hard prompt methods and soft prompt methods. First, the technical approaches of these methods are compared, followed by an exploration of various ways to understand their mechanisms, including the perspectives of attention optimization, Parameter-Efficient Fine-Tuning (PEFT), modality integration, and new synthetic language. We also examine the downstream adaptations of various prompt compression techniques. Finally, the limitations of current prompt compression methods are analyzed, and several future directions are outlined, such as optimizing the compression encoder, combining hard and soft prompts methods, and leveraging insights from multimodality. 2025.naacl-long.368 @@ -4714,7 +4714,7 @@ TaoGeTencent AI Lab XunWangMicrosoft YanXiaResearch, Microsoft - ManLan + ManLan FuruWeiMicrosoft Research 7212-7234 Strategic reasoning is a complex yet essential capability for intelligent agents. It requires Large Language Model (LLM) agents to adapt their strategies dynamically in multi-agent environments. 
Unlike static reasoning tasks, success in these contexts depends on anticipating other agents’ beliefs and actions while continuously adjusting strategies to achieve individual goals. LLMs and LLM agents often struggle with strategic reasoning due to the absence of a reasoning framework that enables them to dynamically infer others’ perspectives and adapt to changing environments. Inspired by the Level-K framework from game theory and behavioral economics, which extends reasoning from simple reactions to structured strategic depth, we propose a novel framework: “K-Level Reasoning with Large Language Models (K-R).” This framework employs recursive mechanisms to enable LLMs to achieve varying levels of strategic depth, allowing agents to form higher order beliefs—beliefs about others’ beliefs. We validate this framework through rigorous testing on four testbeds: two classical game theory problems and two social intelligence tasks. The results demonstrate the advantages of K-R in strategic reasoning. Our work presents the first recursive implementation of strategic depth in large language models (LLMs). It establishes a foundation for future research into theory of mind and strategic reasoning in LLMs. @@ -4724,8 +4724,8 @@ <fixed-case>S</fixed-case>yllo<fixed-case>B</fixed-case>io-<fixed-case>NLI</fixed-case>: Evaluating Large Language Models on Biomedical Syllogistic Reasoning MagdalenaWysockaCRUK NBC Manchester Institute and Technical University of Gdansk - DaniloCarvalhoUniversity of Manchester - OskarWysocki + DaniloCarvalhoUniversity of Manchester + OskarWysocki MarcoValentinoUniversity of Sheffield AndreFreitasIdiap Research Institute and University of Manchester 7235-7258 @@ -4744,10 +4744,10 @@ <fixed-case>MGM</fixed-case>: Global Understanding of Audience Overlap Graphs for Predicting the Factuality and the Bias of News Media - Muhammad ArslanManzoorMohamed bin Zayed University of Artificial Intelligence - RuihongZeng + Muhammad ArslanManzoorMohamed bin Zayed University of Artificial Intelligence + RuihongZeng DilshodAzizov - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + PreslavNakovMohamed bin Zayed University of Artificial Intelligence ShangsongLiangSUN YAT-SEN UNIVERSITY 7279-7295 In the current era of rapidly growing digital data, evaluating the political bias and factuality of news outlets has become more important for seeking reliable information online. In this work, we study the classification problem of profiling news media from the lens of political bias and factuality. Traditional profiling methods, such as Pre-trained Language Models (PLMs) and Graph Neural Networks (GNNs) have shown promising results, but they face notable challenges. PLMs focus solely on textual features, causing them to overlook the complex relationships between entities, while GNNs often struggle with media graphs containing disconnected components and insufficient labels. To address these limitations, we propose MediaGraphMind (MGM), an effective solution within a variational Expectation-Maximization (EM) framework. Instead of relying on limited neighboring nodes, MGM leverages features, structural patterns, and label information from globally similar nodes. Such a framework not only enables GNNs to capture long-range dependencies for learning expressive node representations but also enhances PLMs by integrating structural information and therefore improving the performance of both models. 
The extensive experiments demonstrate the effectiveness of the proposed framework and achieve new state-of-the-art results. Further, we share our repository which contains the dataset, code, and documentation. @@ -4756,8 +4756,8 @@ A Logical Fallacy-Informed Framework for Argument Generation - LucaMouchel - DebjitPaulEPFL - EPF Lausanne + LucaMouchel + DebjitPaulEPFL - EPF Lausanne ShaoboCui RobertWestEPFL - EPF Lausanne and Microsoft AntoineBosselutSwiss Federal Institute of Technology Lausanne @@ -4770,16 +4770,16 @@ <fixed-case>LL</fixed-case>a<fixed-case>MA</fixed-case>-Berry: Pairwise Optimization for Olympiad-level Mathematical Reasoning via O1-like <fixed-case>M</fixed-case>onte <fixed-case>C</fixed-case>arlo Tree Search DiZhangShanghai Artificial Intelligence Laboratory - JianboWuUniversity of California, Merced + JianboWuUniversity of California, Merced JingdiLeiShanghai Artificial Intelligence Laboratory and Beijing Institute of Technology TongCheNVIDIA JiatongLiHong Kong Polytechnic University - TongXieUniversity of New South Wales and GreenDynamics + TongXieUniversity of New South Wales and GreenDynamics XiaoshuiHuangShanghai Jiaotong University ShufeiZhangShanghai AI Lab MarcoPavoneNVIDIA and Stanford University YuqiangLi - WanliOuyangShanghai AI Lab + WanliOuyangShanghai AI Lab DongzhanZhouShanghai Artificial Intelligence Laboratory 7315-7337 This paper presents LLaMA-Berry, an advanced mathematical reasoning framework to enhance the problem-solving ability of large language models (LLMs). The framework combines Monte Carlo Tree Search with Self-Refine (SR-MCTS) to optimize the reasoning paths and utilizes a pairwise reward model to evaluate different paths globally. By leveraging the self-critique and rewriting capabilities of LLMs, our SR-MCTS overcomes the inefficiencies and limitations of conventional step-wise and greedy search algorithms, enabling a more efficient exploration of solution spaces. To guide the search process, we propose the Pairwise Preference Reward Model (PPRM), which predicts pairwise preferences between solutions through instruction-following capabilities trained by Reinforcement Learning from Human Feedback (RLHF). Finally, the Enhanced Borda Count (EBC) method is adopted to synthesize pairwise preferences into global quantile scores for evaluations. This approach mitigates the challenges of scoring variability and non-independent distributions in mathematical reasoning tasks. The framework has been tested on general and advanced benchmarks, showing superior search efficiency and performance compared to existing open-source and closed-source methods, particularly in complex Olympiad-level benchmarks, including AIME24 and AMC23. @@ -4803,7 +4803,7 @@ Script-Agnosticism and its Impact on Language Identification for <fixed-case>D</fixed-case>ravidian Languages MilindAgarwal JoshuaOtten - AntoniosAnastasopoulosAthena Research Center and George Mason University + AntoniosAnastasopoulosAthena Research Center and George Mason University 7364-7384 Language identification is used as the first step in many data collection and crawling efforts because it allows us to sort online text into language-specific buckets. However, many modern languages, such as Konkani, Kashmiri, Punjabi etc., are synchronically written in several scripts. 
Moreover, languages with different writing systems do not share significant lexical, semantic, and syntactic properties in neural representation spaces, which is a disadvantage for closely related languages and low-resource languages, especially those from the Indian Subcontinent. To counter this, we propose learning script-agnostic representations using several different experimental strategies (upscaling, flattening, and script mixing) focusing on four major Dravidian languages (Tamil, Telugu, Kannada, and Malayalam). We find that word-level script randomization and exposure to a language written in multiple scripts is extremely valuable for downstream script-agnostic language identification, while also maintaining competitive performance on naturally occurring text. 2025.naacl-long.377 @@ -4812,10 +4812,10 @@ <fixed-case>NAT</fixed-case>: Enhancing Agent Tuning with Negative Samples RenxiWang - XudongHanMohamed bin Zayed University of Artificial Intelligence + XudongHanMohamed bin Zayed University of Artificial Intelligence YixuanZhang - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne - HaonanLi + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + HaonanLi 7385-7398 Interaction trajectories between agents and environments have proven effective in tuning LLMs into task-specific agents. However, constructing these trajectories, especially successful trajectories, is often computationally and time intensive due to the relatively low success rates of even the most advanced LLMs, such as GPT-4 and Claude. Additionally, common training paradigms like supervised fine-tuning (SFT) and reinforcement learning (RL) not only require large volumes of data but also have specific demands regarding the trajectories used. For instance, existing SFT approaches typically utilize only positive examples, limiting their efficiency in low-resource scenarios. To address this, we introduce Negative-Aware Training (NAT), a straightforward yet effective method that leverages both successful and failed trajectories for fine-tuning, maximizing the utility of limited resources. Experimental results demonstrate that NAT consistently surpasses existing methods, including SFT, DPO, and PPO, across various tasks. 2025.naacl-long.378 @@ -4830,7 +4830,7 @@ ZijingShi ZhenhaoChen FuYujie - ZeyuZhangThe Australian National University + ZeyuZhangThe Australian National University ShiyuJiang MiaoFangNortheastern University at Qinhuangdao LingChenUniversity of Technology Sydney @@ -4847,8 +4847,8 @@ JustinVasselli MiyuOba YusukeSakaiNara Institute of Science and Technology, Japan - HidetakaKamigaitoNara Institute of Science and Technology - TaroWatanabeNara Institute of Science and Technology, Japan + HidetakaKamigaitoNara Institute of Science and Technology + TaroWatanabeNara Institute of Science and Technology, Japan 7416-7432 The grammatical knowledge of language models (LMs) is often measured using a benchmark of linguistic minimal pairs, where LMs are presented with a pair of acceptable and unacceptable sentences and required to judge which is more acceptable. Conventional approaches compare sentence probabilities directly, but large language models (LLMs) provide nuanced evaluation methods using prompts and templates. We therefore investigate how to derive the most accurate acceptability judgments from LLMs to comprehensively evaluate their grammatical knowledge. 
Through extensive experiments in both English and Chinese, we compare nine judgment methods and demonstrate that two of them, in-template LP (a probability readout method) and Yes/No probability computing (a prompting-based method), achieve higher accuracy than the conventional approach. Our analysis reveals that the top two methods excel in different linguistic phenomena, suggesting they access different aspects of the LLMs’ grammatical knowledge. We find that ensembling the two methods achieves even higher accuracy. Consequently, we recommend these techniques, either individually or ensembled, as more effective alternatives to conventional approaches for assessing grammatical knowledge in LLMs.
 2025.naacl-long.380
 
 
 Is Your <fixed-case>LLM</fixed-case> Outdated? A Deep Look at Temporal Generalization
 ChenghaoZhu
- NuoChenNational University of Singapore and The Chinese University of Hong Kong, Shenzhen
+ NuoChenNational University of Singapore and The Chinese University of Hong Kong, Shenzhen
 YufeiGao
 YunyiZhangThe Chinese University of Hong Kong
- PrayagTiwariHalmstad University
- BenyouWangThe Chinese University of Hong Kong, Shenzhen
+ PrayagTiwariHalmstad University
+ BenyouWangThe Chinese University of Hong Kong, Shenzhen
 7433-7457
 The rapid advancement of Large Language Models (LLMs) has led to the development of benchmarks that consider temporal dynamics; however, there remains a gap in understanding how well these models can generalize across temporal contexts due to the inherent dynamic nature of language and information. This paper introduces the concept of temporal generalization in LLMs, including bias in past and future generalizations. Then we introduce FreshBench, a new evaluation framework that employs fresh text and event prediction for assessing LLMs’ temporal adaptability, ensuring the evaluation process is free from data leakage and subjective bias. The experiment shows significant temporal biases and a decline in performance over time.
 2025.naacl-long.381
 
 
 Towards a Perspectivist Turn in Argument Quality Assessment
- JuliaRombergGESIS Leibniz Institute for the Social Sciences
+ JuliaRombergGESIS Leibniz Institute for the Social Sciences
 MaximilianMaurerGESIS Leibniz Institute for the Social Sciences
- HenningWachsmuthLeibniz Universität Hannover
- GabriellaLapesaGESIS – Leibniz Institute for the Social Sciences and Heinrich-Heine University Düsseldorf
+ HenningWachsmuthLeibniz Universität Hannover
+ GabriellaLapesaGESIS – Leibniz Institute for the Social Sciences and Heinrich-Heine University Düsseldorf
 7458-7485
 The assessment of argument quality depends on well-established logical, rhetorical, and dialectical properties that are unavoidably subjective: multiple valid assessments may exist; there is no unequivocal ground truth. This aligns with recent paths in machine learning, which embrace the co-existence of different perspectives. However, this potential remains largely unexplored in NLP research on argument quality. One crucial reason seems to be the yet unexplored availability of suitable datasets. We fill this gap by conducting a systematic review of argument quality datasets. We assign them to a multi-layered categorization targeting two aspects: (a) What has been annotated: we collect the quality dimensions covered in datasets and consolidate them in an overarching taxonomy, increasing dataset comparability and interoperability.
(b) Who annotated: we survey what information is given about annotators, enabling perspectivist research and grounding our recommendations for future actions. To this end, we discuss datasets suitable for developing perspectivist models (i.e., those containing individual, non-aggregated annotations), and we showcase the importance of a controlled selection of annotators in a pilot study.
 2025.naacl-long.382
 
 
 
 A Picture is Worth A Thousand Numbers: Enabling <fixed-case>LLM</fixed-case>s Reason about Time Series via Visualization
 HaoxinLiu
 ChenghaoLiuSalesforce AI Research
- B. AdityaPrakashGeorgia Institute of Technology
+ B. AdityaPrakashGeorgia Institute of Technology
 7486-7518
 Large language models (LLMs), with demonstrated reasoning abilities across multiple domains, have been largely underexplored for time-series reasoning (TsR), which is ubiquitous in the real world. In this work, we propose TimerBed, the first comprehensive testbed for evaluating LLMs’ TsR performance. Specifically, TimerBed includes stratified reasoning patterns with real-world tasks, diverse combinations of LLMs and reasoning strategies, and various supervised models as comparison anchors. We perform extensive experiments with TimerBed, test multiple current beliefs, and observe the initial failures of LLMs in TsR, as evidenced by the ineffectiveness of zero-shot (ZST) and performance degradation of few-shot in-context learning (ICL). Further, we identify one possible root cause: the numerical modeling of data. To address this, we propose a prompt-based solution VL-Time, with visualization-modeled data and language-guided reasoning. Experimental results demonstrate that VL-Time enables multimodal LLMs to be non-trivial ZST and powerful ICL reasoners for time series, achieving about 140% average performance improvement and 99% average token cost reduction. TimerBed and VL-Time are available at https://github.com/AdityaLab/DeepTime/.
 2025.naacl-long.383
 
 
 
 JooyoungLeeAmazon
 ToshiniAgrawal
 AdakuUchenduMIT Lincoln Laboratory
- ThaiLeIndiana University
+ ThaiLeIndiana University
 JinghuiChenPennsylvania State University
- DongwonLeeThe Pennsylvania State University
+ DongwonLeeThe Pennsylvania State University
 7519-7534
 Recent studies have raised concerns about the potential threats large language models (LLMs) pose to academic integrity and copyright protection. Yet, their investigation is predominantly focused on literal copies of original texts. Also, how LLMs can facilitate the detection of LLM-generated plagiarism remains largely unexplored. To address these gaps, we introduce PlagBench, a dataset of 46.5K synthetic text pairs that represent three major types of plagiarism: verbatim copying, paraphrasing, and summarization. These samples are generated by three advanced LLMs. We rigorously validate the quality of PlagBench through a combination of fine-grained automatic evaluation and human annotation. We then utilize this dataset for two purposes: (1) to examine LLMs’ ability to transform original content into accurate paraphrases and summaries, and (2) to evaluate the plagiarism detection performance of five modern LLMs alongside three specialized plagiarism checkers. Our results show that GPT-3.5 Turbo can produce high-quality paraphrases and summaries without significantly increasing text complexity compared to GPT-4 Turbo.
However, in terms of detection, GPT-4 outperforms other LLMs and commercial detection tools by 20%, highlighting the evolving capabilities of LLMs not only in content generation but also in plagiarism detection. Data and source code are available at https://github.com/Brit7777/plagbench.
 2025.naacl-long.384
 
 
 Commonality and Individuality! Integrating Humor Commonality with Speaker Individuality for Humor Recognition
 HaohaoZhu
- XiaokunZhangCity University of HongKong
+ XiaokunZhangCity University of HongKong
 ZeyuanZeng
- JunyuLu
+ JunyuLu
 ZewenBai
 LiangYangDalian University of Technology
- HongfeiLin
+ HongfeiLin
 7535-7547
 Humor recognition aims to identify whether a specific speaker’s text is humorous. Current methods for humor recognition mainly suffer from two limitations: (1) they solely focus on one aspect of humor commonalities, ignoring the multifaceted nature of humor; and (2) they typically overlook the critical role of speaker individuality, which is essential for a comprehensive understanding of humor expressions. To bridge these gaps, we introduce the Commonality and Individuality Incorporated Network for Humor Recognition (CIHR), a novel model designed to enhance humor recognition by integrating multifaceted humor commonalities with the distinctive individuality of speakers. The CIHR features a Humor Commonality Analysis module that explores various perspectives of multifaceted humor commonality within user texts, and a Speaker Individuality Extraction module that captures both static and dynamic aspects of a speaker’s profile to accurately model their distinctive individuality. Additionally, Static and Dynamic Fusion modules are introduced to effectively incorporate the humor commonality with speaker’s individuality in the humor recognition process. Extensive experiments demonstrate the effectiveness of CIHR, underscoring the importance of concurrently addressing both multifaceted humor commonality and distinctive speaker individuality in humor recognition.
 2025.naacl-long.385
 
 
 
 YananMa
 ChenghaoXiaoDurham University
 ChenhanYuanAlibaba Group
- Sabine N Van DerVeerUniversity of Manchester
+ Sabine N Van DerVeerUniversity of Manchester
 LamieceHassan
 ChenghuaLinUniversity of Manchester
 GoranNenadicUniversity of Manchester
 
 
 
 A Zero-Shot Open-Vocabulary Pipeline for Dialogue Understanding
- AbdulfattahSafa
- Gözde GülŞahinKoç University
+ AbdulfattahSafa
+ Gözde GülŞahinKoç University
 7562-7579
 Dialogue State Tracking (DST) is crucial for understanding user needs and executing appropriate system actions in task-oriented dialogues. The majority of existing DST methods are designed to work within predefined ontologies and assume the availability of gold domain labels, struggling to adapt to new slot values. While Large Language Models (LLMs)-based systems show promising zero-shot DST performance, they either require extensive computational resources or underperform existing fully-trained systems, limiting their practicality. To address these limitations, we propose a zero-shot, open-vocabulary system that integrates domain classification and DST in a single pipeline. Our approach includes reformulating DST as a question-answering task for less capable models and employing self-refining prompts for more adaptable ones. Our system does not rely on fixed slot values defined in the ontology, allowing the system to adapt dynamically.
We compare our approach with existing SOTA, and show that it provides up to 20% better Joint Goal Accuracy (JGA) over previous methods on datasets like MultiWOZ 2.1, with up to 90% fewer requests to the LLM API. 2025.naacl-long.387 @@ -4946,7 +4946,7 @@ RajarshiMandalIndian Institute of Technology Kharagpur AvikHalder ShanuKumarMicrosoft - SagnikBasuIndian Institute of Technology Kharagpur + SagnikBasuIndian Institute of Technology Kharagpur ParagAgrawalMicrosoft RimaHazraSingapore University of Technology and Design AnimeshMukherjeeIndian Institute of Technology Kharagpur @@ -4962,7 +4962,7 @@ HadasOrgadComputer Science Department, Technion - Israel Institute of Technology RinonGalNVIDIA and Tel Aviv University, Tel Aviv YoadTewel - GalChechikBar Ilan University and NVIDIA + GalChechikBar Ilan University and NVIDIA YonatanBelinkovTechnion, Technion 7618-7632 Text-to-image (T2I) diffusion models rely on encoded prompts to guide the image generation process. Typically, these prompts are extended to a fixed length by appending padding tokens to the input. Despite being a default practice, the influence of padding tokens on the image generation process has not been investigated. In this work, we conduct the first in-depth analysis of the role padding tokens play in T2I models. We develop two causal techniques to analyze how information is encoded in the representation of tokens across different components of the T2I pipeline. Using these techniques, we investigate when and how padding tokens impact the image generation process. Our findings reveal three distinct scenarios: padding tokens may affect the model’s output during text encoding, during the diffusion process, or be effectively ignored. Moreover, we identify key relationships between these scenarios and the model’s architecture (cross or self-attention) and its training process (frozen or trained text encoder). These insights contribute to a deeper understanding of the mechanisms of padding tokens, potentially informing future model design and training practices in T2I systems. @@ -4971,7 +4971,7 @@ In-Context Learning (and Unlearning) of Length Biases - StephanieSchoch + StephanieSchoch YangfengJiUniversity of Virginia 7633-7671 Large language models have demonstrated strong capabilities to learn in-context, where exemplar input-output pairings are appended to the prompt for demonstration. However, existing work has demonstrated the ability of models to learn lexical and label biases in-context, which negatively impacts both performance and robustness of models. The impact of other statistical data biases remains under-explored, which this work aims to address. We specifically investigate the impact of length biases on in-context learning. We demonstrate that models do learn length biases in the context window for their predictions, and further empirically analyze the factors that modulate the level of bias exhibited by the model. In addition, we show that learning length information in-context can be used to counter the length bias that has been encoded in models (e.g., via fine-tuning). This reveals the power of in-context learning in debiasing model prediction behaviors without the need for costly parameter updates. @@ -4980,11 +4980,11 @@ <fixed-case>A</fixed-case>d<fixed-case>TEC</fixed-case>: A Unified Benchmark for Evaluating Text Quality in Search Engine Advertising - PeinanZhangCyberAgent AI Lab + PeinanZhangCyberAgent AI Lab YusukeSakaiNara Institute of Science and Technology, Japan MasatoMitaCyberAgent Inc. 
HirokiOuchiNAIST
- TaroWatanabeNara Institute of Science and Technology, Japan
+ TaroWatanabeNara Institute of Science and Technology, Japan
 7672-7691
 As the fluency of ad texts automatically generated by natural language generation technologies continues to improve, there is an increasing demand to assess the quality of these creatives in real-world settings. We propose AdTEC, the first public benchmark to evaluate ad texts from multiple perspectives within practical advertising operations. Our contributions are as follows: (i) Defining five tasks for evaluating the quality of ad texts, as well as constructing a Japanese dataset based on the practical operational experiences of advertising agencies, which are typically maintained in-house. (ii) Validating the performance of existing pre-trained language models (PLMs) and human evaluators on this dataset. (iii) Analyzing the characteristics and challenges of the benchmark. Our results show that while PLMs have a practical level of performance in several tasks, humans continue to outperform them in certain domains, indicating that there remains significant potential for further improvement in this area.
 2025.naacl-long.391
 
 
 Empowering Retrieval-based Conversational Recommendation with Contrasting User Preferences
- HeejinKook
+ HeejinKook
 JunyoungKimSung Kyun Kwan University
- SeongminPark
+ SeongminPark
 JongwukLeeSungkyunkwan University
 7692-7707
 Conversational recommender systems (CRSs) are designed to suggest the target item that the user is likely to prefer through multi-turn conversations. Recent studies stress that capturing sentiments in user conversations improves recommendation accuracy. However, they employ a single user representation, which may fail to distinguish between contrasting user intentions, such as likes and dislikes, potentially leading to suboptimal performance. To this end, we propose a novel conversational recommender model, called COntrasting user pReference expAnsion and Learning (CORAL). Firstly, CORAL extracts the user’s hidden preferences through contrasting preference expansion using the reasoning capacity of the LLMs. Based on the potential preference, CORAL explicitly differentiates the contrasting preferences and leverages them into the recommendation process via preference-aware learning. Extensive experiments show that CORAL significantly outperforms existing methods in three benchmark datasets, improving up to 99.72% in Recall@10. The code and datasets are available at https://github.com/kookeej/CORAL.
 2025.naacl-long.392
 
 
 <fixed-case>LRQ</fixed-case>: Optimizing Post-Training Quantization for Large Language Models by Learning Low-Rank Weight-Scaling Matrices
 Jung HyunLeeNAVER CLOVA
- JeonghoonKimKorea Advanced Institute of Science & Technology and NAVER
+ JeonghoonKimKorea Advanced Institute of Science & Technology and NAVER
 June YongYang
 Se JungKwonNAVER Cloud
 EunhoYangKorea Advanced Institute of Science & Technology
@@ -5029,10 +5029,10 @@
 <fixed-case>LLM</fixed-case>s as Meta-Reviewers’ Assistants: A Case Study
 EftekharHossainUniversity of Central Florida
- Sanjeev KumarSinhaAuburn University
+ Sanjeev KumarSinhaAuburn University
 NamanBansalAuburn University
 R. AlexanderKnipperAuburn University
- SouvikaSarkarWichita State University
+ SouvikaSarkarWichita State University
 JohnSalvadorUniversity of Central Florida
 YashMahajan
 Sri Ram Pavan KumarGuttikonda
 
 Md. MahadiHassanUniversity of Central Florida
 MatthewFreestone
 Matthew C. WilliamsJr. 
- DongjiFengGustavus Adolphus College
- SantuKarmakerUniversity of Central Florida
+ DongjiFengGustavus Adolphus College
+ SantuKarmakerUniversity of Central Florida
 7763-7803
 One of the most important yet onerous tasks in the academic peer-reviewing process is composing meta-reviews, which involves assimilating diverse opinions from multiple expert peers, formulating one’s self-judgment as a senior expert, and then summarizing all these perspectives into a concise holistic overview to make an overall recommendation. This process is time-consuming and can be compromised by human factors like fatigue, inconsistency, missing tiny details, etc. Given the latest major developments in Large Language Models (LLMs), it is very compelling to rigorously study whether LLMs can help meta-reviewers perform this important task better. In this paper, we perform a case study with three popular LLMs, i.e., GPT-3.5, LLaMA2, and PaLM2, to assist meta-reviewers in better comprehending multiple experts’ perspectives by generating a controlled multi-perspective-summary (MPS) of their opinions. To achieve this, we prompt three LLMs with different types/levels of prompts based on the recently proposed TELeR taxonomy. Finally, we perform a detailed qualitative study of the MPSs generated by the LLMs and report our findings.
 2025.naacl-long.395
 
 
 A Survey of <fixed-case>NLP</fixed-case> Progress in <fixed-case>S</fixed-case>ino-<fixed-case>T</fixed-case>ibetan Low-Resource Languages
 ShuhengLiuGeorgia Institute of Technology
- MichaelBestGeorgia Institute of Technology
+ MichaelBestGeorgia Institute of Technology
 7804-7825
 Despite the increasing effort in including more low-resource languages in NLP/CL development, most of the world’s languages are still absent. In this paper, we take the example of the Sino-Tibetan language family, which consists of hundreds of low-resource languages, and we look at the representation of these low-resource languages in papers archived on ACL Anthology. Our findings indicate that while more techniques and discussions on more languages are present in more publication venues over the years, the overall focus on this language family has been minimal. The lack of attention might be owing to the small number of native speakers and governmental support of these languages. The current development of large language models, albeit successful in a few quintessential rich-resource languages, is still trailing when tackling these low-resource languages. Our paper calls for attention in NLP/CL research to the inclusion of low-resource languages, especially as increasing resources are poured into the development of data-driven language models.
 2025.naacl-long.396
 
 
 Enhancing Language Model Hypernetworks with Restart: A Study on Optimization
 YihanZhangPeking University
- JieFuShanghai Artificial Intelligence Laboratory
+ JieFuShanghai Artificial Intelligence Laboratory
 RongrongJi
 JieChenPeking University
 7826-7838
 
 
 
 Functional Lexicon in Subword Tokenization
- Zachary WilliamHoptonUniversity of Zurich and University of Zurich
+ Zachary WilliamHoptonUniversity of Zurich and University of Zurich
 YvesScherrerUniversity of Oslo
 TanjaSamardzicUniversity of Zurich
 7839-7853
 
 
 
 Getting More Juice Out of Your Data: Hard Pair Refinement Enhances Visual-Language Models Without Extra Data
- HaonanWang
+ HaonanWang
 MinbinHuang
 RunhuiHuangUniversity of Hong Kong
 LanqingHongHuawei Technologies Ltd. 
- HangXuHuawei Noah‘s Ark Lab
+ HangXuHuawei Noah‘s Ark Lab
 TianyangHu
 XiaodanLiangSUN YAT-SEN UNIVERSITY
 ZhenguoLiDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology and Huawei Noah’s Ark Lab
- HongChengThe Chinese University of Hong Kong
+ HongChengThe Chinese University of Hong Kong
 KenjiKawaguchiNational University of Singapore
 7854-7873
 Contrastive Language-Image Pre-training (CLIP) has become the standard for cross-modal image-text representation learning. Improving CLIP typically requires additional data and retraining with new loss functions, but these demands raise resource and time costs, limiting practical use. In this work, we introduce HELIP, a cost-effective strategy that improves CLIP models by exploiting challenging text-image pairs within existing datasets in continuous training. This eliminates the need for additional data or extensive retraining. Moreover, HELIP integrates effortlessly into current training pipelines with minimal code modifications, allowing for quick and seamless implementation. On comprehensive benchmarks, HELIP consistently boosts existing models. In particular, within just two epochs of training, it improves zero-shot classification accuracy on ImageNet for SLIP models pre-trained on CC3M, CC12M, and YFCC15M datasets by 3.05%, 4.47%, and 10.1%, respectively. In addition, on fine-grained classification datasets, HELIP improves the zero-shot performance of CLIP and SLIP by an average of 8.4% and 18.6%, and their linear probe performance by an average of 9.5% and 3.0%.
 2025.naacl-long.399
 
 
 Evaluating the Prompt Steerability of Large Language Models
- ErikMiehlingIBM Research
+ ErikMiehlingIBM Research
 MichaelDesmond
 KarthikeyanNatesan RamamurthyInternational Business Machines
 Elizabeth M.DalyIBM Research
 Kush R.VarshneyInternational Business Machines
 EitanFarchiInternational Business Machines
- PierreDogninInternational Business Machines
+ PierreDogninInternational Business Machines
 JesusRiosInternational Business Machines
 DjallelBouneffouf
 MiaoLiuInternational Business Machines
- PrasannaSattigeriIBM Research
+ PrasannaSattigeriIBM Research
 7874-7900
 Building pluralistic AI requires designing models that are able to be shaped to represent a wide range of value systems and cultures. Achieving this requires first being able to evaluate the degree to which a given model is capable of reflecting various personas. To this end, we propose a benchmark for evaluating the steerability of model personas as a function of prompting. Our design is based on a formal definition of prompt steerability, which analyzes the degree to which a model’s joint behavioral distribution can be shifted from its baseline. By defining steerability indices and inspecting how these indices change as a function of steering effort, we can estimate the steerability of a model across various persona dimensions and directions. Our benchmark reveals that the steerability of many current models is limited — due to both a skew in their baseline behavior and an asymmetry in their steerability across many persona dimensions. We release an implementation of our benchmark at https://github.com/IBM/prompt-steering.
 2025.naacl-long.400
 
 
 A Data-Driven Method for Analyzing and Quantifying Lyrics-Dance Motion Relationships
 KentoWatanabeAIST, National Institute of Advanced Industrial Science and Technology
- MasatakaGotoAIST, National Institute of Advanced Industrial Science and Technology
+ MasatakaGotoAIST, National Institute of Advanced Industrial Science and Technology
 7901-7916
 Dancing to music with lyrics is a popular form of expression. While it is generally accepted that there are relationships between lyrics and dance motions, previous studies have not explored these relationships. A major challenge is that the relationships between lyrics and dance motions are not constant throughout a song but are instead localized to specific parts. To address this challenge, we hypothesize that lyrics and dance motions that co-occur across multiple songs are related. Based on this hypothesis, we propose a novel data-driven method to detect the parts of songs where meaningful relationships between lyrics and dance motions exist. We use clustering to transform lyrics and dance motions into symbols, enabling the calculation of co-occurrence frequencies and detection of significant correlations. The effectiveness of our method is validated by a dataset of time-synchronized lyrics and dance motions, which showed high correlation values for emotionally salient lyrics such as “love”, which is expressed in heart-shaped motions. Furthermore, using our relationship detection method, we propose a method for retrieving dance motions from lyrics that outperforms previous text-to-motion retrieval methods, which focus on prose and non-dance motions.
 2025.naacl-long.401
 
 
 
 GeorgiosPantazopoulos
 NikolasVitsakis
 IoannisKonstasHeriot-Watt University
- AlessandroSugliaHeriot-Watt University
+ AlessandroSugliaHeriot-Watt University
 7917-7936
 As Vision and Language models (VLMs) become accessible across the globe, it is important that they demonstrate cultural knowledge. In this paper, we introduce CROPE, a visual question answering benchmark designed to probe the knowledge of culture-specific concepts and evaluate the capacity for cultural adaptation through contextual information. This allows us to distinguish between parametric knowledge acquired during training and contextual knowledge provided during inference via visual and textual descriptions. Our evaluation of several state-of-the-art open VLMs shows large performance disparities between culture-specific and common concepts in the parametric setting. Moreover, experiments with contextual knowledge indicate that models struggle to effectively utilize multimodal information and bind culture-specific concepts to their depictions. Our findings reveal limitations in the cultural understanding and adaptability of current VLMs that need to be addressed toward more culturally inclusive models.
 2025.naacl-long.402
 
 
 <fixed-case>P</fixed-case>ic<fixed-case>P</fixed-case>ersona-<fixed-case>TOD</fixed-case> : A Dataset for Personalizing Utterance Style in Task-Oriented Dialogue with Image Persona
 JihyunLee
 YejinJeon
- SeungyeonSeo
+ SeungyeonSeo
 GaryLee
 7937-7958
 Task-Oriented Dialogue (TOD) systems are designed to fulfill user requests through natural language interactions, yet existing systems often produce generic, monotonic responses that lack individuality and fail to adapt to users’ personal attributes.
To address this, we introduce PicPersona-TOD, a novel dataset that incorporates user images as part of the persona, enabling personalized responses tailored to user-specific factors such as age or emotional context. This is facilitated by first impressions, dialogue policy-guided prompting, and the use of external knowledge to reduce hallucinations. Human evaluations confirm that our dataset enhances user experience, with personalized responses contributing to a more engaging interaction. Additionally, we introduce a new NLG model, Pictor, which not only personalizes responses, but also demonstrates robust performance across unseen domains.
 
 
 
 
 ShangZhou
 DanqingWangCMU, Carnegie Mellon University
 William YangWangUC Santa Barbara
- LeiLiSchool of Computer Science, Carnegie Mellon University
+ LeiLiSchool of Computer Science, Carnegie Mellon University
 7959-7973
 Sampling is a basic operation for large language models (LLMs). In reinforcement learning rollouts and meta generation algorithms such as Best-of-N, it is essential to sample correct trajectories within a given compute budget. To find an optimal allocation for sample compute budgets, several choices need to be made: Which sampling configurations (model, temperature, language, etc.) to use? How many samples to generate in each configuration? We formulate these choices as a learning problem and propose OSCA, an algorithm that Optimizes Sample Compute Allocation by finding an optimal mix of different inference configurations. Our experiments show that with our learned mixed allocation, we can achieve accuracy better than the best single configuration with 128x less compute on code generation and 25x less compute on 4 reasoning tasks. OSCA is also shown to be effective in agentic workflows beyond single-turn tasks, achieving a better accuracy on SWE-Bench with 3x less compute than the default configuration. Our code and generations are released at https://github.com/LeiLiLab/OSCA.
 2025.naacl-long.404
 
 
 Large Language Models for <fixed-case>P</fixed-case>ersian-<fixed-case>E</fixed-case>nglish Idiom Translation
 SaraRezaeimanesh
 FaezehHosseiniTeias
- YadollahYaghoobzadeh
+ YadollahYaghoobzadeh
 7974-7985
 Large language models (LLMs) have shown superior capabilities in translating figurative language compared to neural machine translation (NMT) systems. However, the impact of different prompting methods and LLM-NMT combinations on idiom translation has yet to be thoroughly investigated. This paper introduces two parallel datasets of sentences containing idiomatic expressions for Persian→English and English→Persian translations, with Persian idioms sampled from our PersianIdioms resource, a collection of 2,200 idioms and their meanings, with 700 including usage examples. Using these datasets, we evaluate various open- and closed-source LLMs, NMT models, and their combinations. Translation quality is assessed through idiom translation accuracy and fluency. We also find that automatic evaluation methods like LLM-as-a-judge, BLEU, and BERTScore are effective for comparing different aspects of model performance. Our experiments reveal that Claude-3.5-Sonnet delivers outstanding results in both translation directions. For English→Persian, combining weaker LLMs with Google Translate improves results, while Persian→English translations benefit from single prompts for simpler models and complex prompts for advanced ones.
2025.naacl-long.405 @@ -5168,9 +5168,9 @@ Follow the Beaten Path: The Role of Route Patterns on Vision-Language Navigation Agents Generalization Abilities - Kourosh TBaghaeiGeorge Mason University - DieterPfoserGeorge Mason University - AntoniosAnastasopoulosAthena Research Center and George Mason University + Kourosh TBaghaeiGeorge Mason University + DieterPfoserGeorge Mason University + AntoniosAnastasopoulosAthena Research Center and George Mason University 7986-8005 Vision and language navigation (VLN) is a challenging task towards the creation of embodied agents that requires spatial and temporal reasoning over the instructions provided in natural language and aligning them with the visual perception of an environment. Although a number of methods and approaches have been developed, none achieves human-level performance in outdoor settings (falling short by up to 75 percent). The contributions of visual and language modalities to the success of VLN have been studied; here, however, we focus on an overlooked property of routes and show that navigational instructions can be represented as patterns of actions that also describe trajectory shapes. Through carefully crafted experiments, we show that agents’ generalization to unseen environments depends not only on visual and linguistic features, but also on the shape of trajectories presented to the model during fine-tuning. Our experiments show that the diversity of patterns of actions during training is a key contributor to high success rates for agents. Lastly, we propose a solution based on data augmentation that fills the gap in missing patterns of training data. Our findings will guide researchers towards improved practices in the development and evaluation of VLN datasets and agents. 2025.naacl-long.406 @@ -5179,7 +5179,7 @@ Sneaking Syntax into Transformer Language Models with Tree Regularization AnanjanNandiStanford University - Christopher DManningComputer Science Department, Stanford University + Christopher DManningComputer Science Department, Stanford University ShikharMurtyStanford University 8006-8024 While compositional accounts of human language understanding are based on a hierarchical tree-like process, neural models like transformers lack a direct inductive bias for such tree structures. Introducing syntactic inductive biases could unlock more robust and data-efficient learning in transformer language models (LMs), but existing methods for incorporating such structure greatly restrict models, either limiting their expressivity or increasing inference complexity. This work instead aims to softly inject syntactic inductive biases into given transformer circuits, through a structured regularizer. We introduce TreeReg, an auxiliary loss function that converts bracketing decisions from silver parses into a set of differentiable orthogonality constraints on vector hidden states. TreeReg integrates seamlessly with the standard LM objective, requiring no architectural changes. LMs pre-trained with TreeReg on natural language corpora such as WikiText-103 achieve up to 10% lower perplexities on out-of-distribution data and up to 9.5 point improvements in syntactic generalization, requiring less than half the training data to outperform standard LMs. TreeReg still provides gains for pre-trained LLMs: Continued pre-training of Sheared Llama with TreeReg results in improved syntactic generalization, and fine-tuning on MultiNLI with TreeReg mitigates degradation of performance on adversarial NLI benchmarks by 41.2 points.
We release all code to guide future research. @@ -5189,7 +5189,7 @@ Meta-Cultural Competence: Climbing the Right Hill of Cultural Awareness SougataSahaMohamed bin Zayed University of Artificial Intelligence - Saurabh KumarPandeyMohamed bin Zayed University of Artificial Intelligence + Saurabh KumarPandeyMohamed bin Zayed University of Artificial Intelligence MonojitChoudhuryMohamed bin Zayed University of Artificial Intelligence 8025-8042 Numerous recent studies have shown that Large Language Models (LLMs) are biased towards a Western and Anglo-centric worldview, which compromises their usefulness in non-Western cultural settings. However, “culture” is a complex, multifaceted topic, and its awareness, representation, and modeling in LLMs and LLM-based applications can be defined and measured in numerous ways. In this position paper, we ask what it means for an LLM to possess “cultural awareness”, and through a thought experiment, which is an extension of the Octopus test proposed by Bender and Koller (2020), we argue that it is not cultural awareness or knowledge but rather meta-cultural competence that is required of an LLM and LLM-based AI system to make it useful across various, including completely unseen, cultures. We lay out the principles of meta-cultural competence for AI systems, and discuss ways to measure and model those. @@ -5199,7 +5199,7 @@ Reading between the Lines: Can <fixed-case>LLM</fixed-case>s Identify Cross-Cultural Communication Gaps? SougataSahaMohamed bin Zayed University of Artificial Intelligence - Saurabh KumarPandeyMohamed bin Zayed University of Artificial Intelligence + Saurabh KumarPandeyMohamed bin Zayed University of Artificial Intelligence HarshitGupta MonojitChoudhuryMohamed bin Zayed University of Artificial Intelligence 8043-8067 @@ -5214,7 +5214,7 @@ ZongyueQinUniversity of California, Los Angeles NehaPrakriya YizhouSunUniversity of California, Los Angeles - JasonCongUniversity of California, Los Angeles + JasonCongUniversity of California, Los Angeles 8068-8089 Transformer-based large language models (LLMs) have been widely used in language processing applications. However, due to the memory constraints of the devices, most of them restrict the context window. Even though recurrent models in previous works can memorize past tokens to enable unlimited context and maintain effectiveness, they have “flat” memory architectures. Such architectures have limitations in selecting and filtering information. Since humans are good at learning and self-adjustment, we believe that imitating brain memory hierarchy is beneficial for model memorization. Thus, we propose the Hierarchical Memory Transformer (HMT), a novel framework that facilitates a model’s long-context processing ability by imitating human memorization behavior. Leveraging memory-augmented segment-level recurrence, we organize the memory hierarchy by preserving tokens from early input segments, passing memory embeddings along the sequence, and recalling relevant information from history. Evaluating general language modeling, question-answering tasks, and the summarization task, we show that HMT consistently improves the long-context processing ability of existing models. Furthermore, HMT achieves a comparable or superior generation quality to long-context LLMs with 2∼57× fewer parameters and 2.5∼116× less inference memory, significantly outperforming previous memory-augmented models.
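The memory mechanism sketched in the HMT abstract (summarize each segment, pass memory embeddings along the sequence, recall from history) can be pictured with a toy loop like the one below. The dimensions, mean-pooling, and attention-based recall are illustrative assumptions, not the paper's actual architecture.

# Toy sketch of memory-augmented segment-level recurrence: each segment is
# summarized into a memory embedding, and later segments recall from the
# bank of past summaries via attention. Sizes here are arbitrary.
import torch

d_model, seg_len = 64, 16
tokens = torch.randn(8 * seg_len, d_model)   # stand-in for hidden states
memory_bank = []                             # one embedding per past segment

for seg in tokens.split(seg_len):
    if memory_bank:
        mem = torch.stack(memory_bank)                        # (n_mem, d)
        attn = torch.softmax(seg @ mem.T / d_model**0.5, dim=-1)
        recalled = attn @ mem                                 # (seg_len, d)
        seg = seg + recalled        # fuse recalled history into the segment
    # summarize the (augmented) segment and pass it along the sequence
    memory_bank.append(seg.mean(dim=0))

print(f"{len(memory_bank)} memory embeddings retained")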
2025.naacl-long.410 @@ -5222,8 +5222,8 @@ Faux Polyglot: A Study on Information Disparity in Multilingual Large Language Models - NikhilSharmaJohns Hopkins University - KentonMurrayJohns Hopkins University + NikhilSharmaJohns Hopkins University + KentonMurrayJohns Hopkins University ZiangXiaoDepartment of Computer Science, Whiting School of Engineering 8090-8107 Although the multilingual capability of LLMs offers new opportunities to overcome the language barrier, do these capabilities translate into real-life scenarios where linguistic divides and knowledge conflicts between multilingual sources are known occurrences? In this paper, we studied LLMs’ linguistic preference in a cross-language RAG-based information search setting. We found that LLMs displayed systemic bias towards information in the same language as the query language in both document retrieval and answer generation. Furthermore, in scenarios where no information is in the language of the query, LLMs prefer documents in high-resource languages during generation, potentially reinforcing the dominant views. Such bias exists for both factual and opinion-based queries. Our results highlight the linguistic divide within multilingual LLMs in information search systems. The seemingly beneficial multilingual capability of LLMs may backfire on information parity by reinforcing language-specific filter bubbles, further marginalizing low-resource views. @@ -5232,7 +5232,7 @@ Teaching Models to Balance Resisting and Accepting Persuasion - EliasStengel-Eskin + EliasStengel-Eskin PeterHaseAnthropic MohitBansalUniversity of North Carolina at Chapel Hill 8108-8122 @@ -5307,8 +5307,8 @@ <fixed-case>C</fixed-case>om<fixed-case>PO</fixed-case>: Community Preferences for Language Model Personalization SachinKumarOhio State University, Columbus Chan YoungPark - YuliaTsvetkovDepartment of Computer Science, University of Washington - Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence + YuliaTsvetkovDepartment of Computer Science, University of Washington + Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence HannanehHajishirzi 8246-8279 Conventional algorithms for training language models (LMs) with human feedback rely on preferences that are assumed to account for an “average” user, disregarding subjectivity and finer-grained variations. Recent studies have raised concerns that aggregating such diverse and often contradictory human feedback to finetune models results in generic models that generate outputs not preferred by many user groups, as they tend to average out styles and norms. To address this issue, we draw inspiration from recommendation systems and propose ComPO, a method to personalize preference optimization in LMs by contextualizing the probability distribution of model outputs with the preference provider. Focusing on group-level preferences rather than individuals, we collect and release ComPRed, a question answering dataset with community-level preferences from Reddit. This dataset facilitates studying diversity in preferences without incurring privacy concerns associated with individual feedback. Our experiments reveal that conditioning language models on a community identifier (i.e., subreddit name) during preference tuning substantially enhances model performance.
Conversely, replacing this context with random subreddit identifiers significantly diminishes performance, highlighting the effectiveness of our approach in tailoring responses to communities’ preferences. @@ -5317,7 +5317,7 @@ <fixed-case>G</fixed-case>round<fixed-case>C</fixed-case>ocoa: A Benchmark for Evaluating Compositional & Conditional Reasoning in Language Models - HarshKohliThe Ohio State University, Columbus + HarshKohliThe Ohio State University, Columbus SachinKumarOhio State University, Columbus HuanSunThe Ohio State University, Columbus 8280-8295 @@ -5330,8 +5330,8 @@ Aly M.Kassem OmarMahmoud NiloofarMireshghallah - HyunwooKimNVIDIA - YuliaTsvetkovDepartment of Computer Science, University of Washington + HyunwooKimNVIDIA + YuliaTsvetkovDepartment of Computer Science, University of Washington YejinChoiComputer Science Department, Stanford University and NVIDIA SherifSaadUniversity of Windsor SantuRanaDeakin University @@ -5342,7 +5342,7 @@ Evaluating Contextualized Representations of (<fixed-case>S</fixed-case>panish) Ambiguous Words: A New Lexical Resource and Empirical Analysis - Pamela DRiviereUniversity of California, San Diego + Pamela DRiviereUniversity of California, San Diego Anne L.Beatty-MartínezUniversity of California, San Diego SeanTrottUniversity of California, San Diego 8322-8338 @@ -5352,11 +5352,11 @@ Understanding <fixed-case>LLM</fixed-case>s’ Fluid Intelligence Deficiency: An Analysis of the <fixed-case>ARC</fixed-case> Task - JunjieWuHKUST + JunjieWuHKUST MoYuWeChat AI, Tencent LemaoLiuTencent - Dit-YanYeungHong Kong University of Science and Technology - JieZhou + Dit-YanYeungHong Kong University of Science and Technology + JieZhou 8339-8360 While LLMs have exhibited strong performance on various NLP tasks, it is noteworthy that most of these tasks rely on utilizing the vast amount of knowledge encoded in LLMs’ parameters, rather than solving new problems without prior knowledge. In cognitive research, the latter ability is referred to as fluid intelligence, which is considered to be critical for assessing human intelligence. Recent research on fluid intelligence assessments has highlighted significant deficiencies in LLMs’ abilities. In this paper, we analyze the challenges LLMs face in demonstrating fluid intelligence through controlled experiments, using the most representative ARC task as an example. Our study revealed three major limitations in existing LLMs: limited ability for skill composition, unfamiliarity with abstract input formats, and the intrinsic deficiency of left-to-right decoding. Our data and code will be publicly released, and the data is also attached in the submission. 2025.naacl-long.423 @@ -5364,11 +5364,11 @@ <fixed-case>F</fixed-case>ed<fixed-case>S</fixed-case>pa<fixed-case>LLM</fixed-case>: Federated Pruning of Large Language Models - GuangjiBai - YijiangLiArgonne National Laboratory + GuangjiBai + YijiangLiArgonne National Laboratory ZilinghanLiArgonne National Laboratory - LiangZhaoEmory University - KibaekKimArgonne National Laboratory + LiangZhaoEmory University + KibaekKimArgonne National Laboratory 8361-8373 Large Language Models (LLMs) achieve state-of-the-art performance but are challenging to deploy due to their high computational and storage demands. Pruning can reduce model size, yet existing methods assume public access to calibration data, which is impractical for privacy-sensitive applications. 
To address the challenge of pruning LLMs in privacy-preserving settings, we propose FedSpaLLM, the first federated learning framework designed specifically for pruning LLMs. FedSpaLLM enables clients to locally prune their models based on private data while accounting for system heterogeneity and maintaining communication efficiency. Our framework introduces several key innovations: (1) a novel ℓ₀-norm aggregation function that ensures only non-zero weights are averaged across clients, preserving important model parameters; (2) an adaptive mask expansion technique that meets global sparsity targets while accommodating client-specific pruning decisions; and (3) a layer sampling strategy that reduces communication overhead and personalizes the pruning process based on client resources. Extensive experiments show that FedSpaLLM improves pruning performance in diverse federated settings. 2025.naacl-long.424 @@ -5379,17 +5379,17 @@ ZhihanZhang ShiyangLiAmazon ZixuanZhang - XinLiuAmazon + XinLiuAmazon HaomingJiangAmazon XianfengTangAmazon YifanGaoAmazon ZhengLiAmazon HaodongWangAmazon - ZhaoxuanTanUniversity of Notre Dame + ZhaoxuanTanUniversity of Notre Dame YichuanLi QingyuYinAmazon - BingYinAmazon - MengJiangUniversity of Notre Dame + BingYinAmazon + MengJiangUniversity of Notre Dame 8374-8398 The instruction hierarchy, which establishes a priority order from system messages to user messages, conversation history, and tool outputs, is essential for ensuring consistent and safe behavior in language models (LMs). Despite its importance, this topic receives limited attention, and there is a lack of comprehensive benchmarks for evaluating models’ ability to follow the instruction hierarchy. We bridge this gap by introducing IHEval, a novel benchmark comprising 3,538 examples across nine tasks, covering cases where instructions of different priorities either align or conflict. Our evaluation of popular LMs highlights their struggle to recognize instruction priorities. All evaluated models experience a sharp performance decline when facing conflicting instructions, compared to their original instruction-following performance. Moreover, the most competitive open-source model only achieves 48% accuracy in resolving such conflicts. Our results underscore the need for targeted optimization in the future development of LMs. 2025.naacl-long.425 @@ -5398,17 +5398,17 @@ Afrispeech-Dialog: A Benchmark Dataset for Spontaneous <fixed-case>E</fixed-case>nglish Conversations in Healthcare and Beyond MardhiyahSanniIntron Health - TassallahAbdullahi + TassallahAbdullahi Devendra DeepakKayande EmmanuelAyodeleIntron Health Naome AEtori Michael SamwelMollel Moshood O.YekiniMasakhane - ChibuzorOkocha - Lukman EnegiIsmailaJohns Hopkins University - FolafunmiOmofoyeUniversity of North Carolina at Chapel Hill - Boluwatife A.Adewale - TobiOlatunji + ChibuzorOkocha + Lukman EnegiIsmailaJohns Hopkins University + FolafunmiOmofoyeUniversity of North Carolina at Chapel Hill + Boluwatife A.Adewale + TobiOlatunji 8399-8417 Speech technologies are transforming interactions across various sectors, from healthcare to call centers and robots, yet their performance on African-accented conversations remains underexplored. We introduce Afrispeech-Dialog, a benchmark dataset of 50 simulated medical and non-medical African-accented English conversations, designed to evaluate automatic speech recognition (ASR) and related technologies.
We assess state-of-the-art (SOTA) speaker diarization and ASR systems on long-form, accented speech, comparing their performance with native accents and discover a 10%+ performance degradation. Additionally, we explore medical conversation summarization capabilities of large language models (LLMs) to demonstrate the impact of ASR errors on downstream medical summaries, providing insights into the challenges and opportunities for speech technologies in the Global South. Our work highlights the need for more inclusive datasets to advance conversational AI in low-resource settings. 2025.naacl-long.426 @@ -5419,7 +5419,7 @@ PhilipSchroeder Nathaniel W.Morgan HongyinLuoMassachusetts Institute of Technology - James R.GlassMassachusetts Institute of Technology + James R.GlassMassachusetts Institute of Technology 8418-8442 Large language models (LLMs) have shown impressive capabilities across diverse settings, but still struggle as the length and complexity of the context increases. To address this challenge, we propose Thinking Recursively and Dynamically (ThReaD). THREAD frames model generation as a thread of execution that, based on the context, can run to completion or dynamically spawn new threads. By spawning, threads can offload work (e.g., thinking, retrieving information) to child threads, which only return tokens needed for the parent thread to do its work. We apply THREAD in the settings of LLM task solving and question answering, where the dynamic threading allows the model to recursively decompose the given task or question into progressively simpler sub-problems that can be solved by separate child threads. We test THREAD, implemented using a few-shot learning approach, on diverse benchmarks for agent tasks and data-grounded question answering. THREAD achieves state-of-the-art performance with GPT-4 and GPT-3.5 on these benchmarks, including ALFWorld, TextCraft, and WebShop, along with two new benchmarks, DataCommons QA and MIMIC-III ICU QA. In addition, THREAD outperforms existing frameworks by 10% to 50% absolute points with smaller models, including Llama-3-8b and CodeLlama-7b. 2025.naacl-long.427 @@ -5428,9 +5428,9 @@ <fixed-case>CORG</fixed-case>: Generating Answers from Complex, Interrelated Contexts HyunjiLeeKorea Advanced Institute of Science & Technology - FranckDernoncourt - TrungBuiAdobe Research - SeunghyunYoonAdobe Research + FranckDernoncourt + TrungBuiAdobe Research + SeunghyunYoonAdobe Research 8443-8460 In a real-world corpus, knowledge frequently recurs across documents but often contains inconsistencies due to ambiguous naming, outdated information, or errors, leading to complex interrelationships between contexts. Previous research has shown that language models struggle with these complexities, typically focusing on single factors in isolation. We classify these relationships into four types: distracting, ambiguous, counterfactual, and duplicated. Our analysis reveals that no single approach effectively addresses all these interrelationships simultaneously. Therefore, we introduce Context Organizer (COrg), a framework that organizes multiple contexts into independently processed groups. This design allows the model to efficiently find all relevant answers while ensuring disambiguation. COrg consists of three key components: a graph constructor, a reranker, and an aggregator. 
Our results demonstrate that COrg balances performance and efficiency effectively, outperforming existing grouping methods and achieving comparable results to more computationally intensive, single-context approaches. 2025.naacl-long.428 @@ -5441,7 +5441,7 @@ Kang-ilLeeSeoul National University HyukhunKohSeoul National University DongryeolLeeSeoul National University - SeunghyunYoonAdobe Research + SeunghyunYoonAdobe Research MinsungKim KyominJung 8461-8474 @@ -5451,7 +5451,7 @@ On the Analysis and Distillation of Emergent Outlier Properties in Pre-trained Language Models - TianyangZhaoAmazon + TianyangZhaoAmazon Kunwar YashrajSinghAmazon SrikarAppalarajuAmazon PengTangAmazon @@ -5465,7 +5465,7 @@ Open-World Evaluation for Retrieving Diverse Perspectives Hung-TingChenNew York University - EunsolChoiNew York University + EunsolChoiNew York University 8508-8528 We study retrieving a set of documents that covers various perspectives on a complex and contentious question (e.g., will ChatGPT do more harm than good?). We curate a Benchmark for Retrieval Diversity for Subjective questions (BERDS), where each example consists of a question and diverse perspectives associated with the question, sourced from survey questions and debate websites. On this data, retrievers paired with a corpus are evaluated to surface a document set that contains diverse perspectives. Our framing diverges from most retrieval tasks in that document relevancy cannot be decided by simple string matches to references. Instead, we build a language model-based automatic evaluator that decides whether each retrieved document contains a perspective. This allows us to evaluate the performance of three different types of corpus (Wikipedia, web snapshot, and corpus constructed on the fly with retrieved pages from the search engine) paired with retrievers. Retrieving diverse documents remains challenging, with the outputs from existing retrievers covering all perspectives on only 33.74% of the examples. We further study the impact of query expansion and diversity-focused reranking approaches and analyze retriever sycophancy. Together, we lay the foundation for future studies in retrieval diversity handling complex queries. 2025.naacl-long.431 @@ -5474,7 +5474,7 @@ Analyzing the Inner Workings of Transformers in Compositional Generalization RyomaKumon - HitomiYanakathe University of Tokyo + HitomiYanakathe University of Tokyo 8529-8540 The compositional generalization abilities of neural models have been sought after for human-like linguistic competence. The popular method to evaluate such abilities is to assess the models’ input-output behavior. However, that does not reveal the internal mechanisms, and the underlying competence of such models in compositional generalization remains unclear. To address this problem, we explore the inner workings of a Transformer model by finding an existing subnetwork that contributes to the generalization performance and by performing causal analyses on how the model utilizes syntactic features. We find that the model depends on syntactic features to output the correct answer, but that the subnetwork with much better generalization performance than the whole model relies on a non-compositional algorithm in addition to the syntactic features. We also show that the subnetwork improves its generalization performance relatively slowly during the training compared to the in-distribution one, and the non-compositional solution is acquired in the early stages of the training.
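The subnetwork probing described in that last abstract can be pictured with a generic masking probe: zero out part of a layer's weights and compare the masked model's behavior with the full model's. The magnitude-based mask below is only an illustrative stand-in; the paper's actual subnetwork-discovery and causal-analysis procedures are not reproduced here.

# Generic sketch of probing a masked subnetwork against the full model.
# The magnitude criterion is a hypothetical stand-in for whatever
# subnetwork-discovery method a given study uses.
import torch
import torch.nn as nn

torch.manual_seed(0)
layer = nn.Linear(32, 32)
x = torch.randn(4, 32)

full_out = layer(x)

# Keep only the largest-magnitude half of the weights.
with torch.no_grad():
    threshold = layer.weight.abs().median()
    mask = (layer.weight.abs() >= threshold).float()
    layer.weight.mul_(mask)

sub_out = layer(x)
print("output drift under masking:", (full_out - sub_out).norm().item())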
2025.naacl-long.432 @@ -5482,10 +5482,10 @@ Substance Beats Style: Why Beginning Students Fail to Code with <fixed-case>LLM</fixed-case>s - FrancescaLucchetti + FrancescaLucchetti ZixuanWu ArjunGuhaNortheastern University - Molly QFeldmanOberlin College + Molly QFeldmanOberlin College Carolyn JaneAndersonWellesley College 8541-8610 Although LLMs are increasing the productivity of professional programmers, existing work shows that beginners struggle to prompt LLMs to solve text-to-code tasks (Nguyen et al., 2024; Prather et al., 2024b; Mordechai et al., 2024). Why is this the case? This paper explores two competing hypotheses about the cause of student-LLM miscommunication: (1) students simply lack the technical vocabulary needed to write good prompts, and (2) students do not understand the extent of information that LLMs need to solve code generation tasks. We study (1) with a causal intervention experiment on technical vocabulary and (2) by analyzing graphs that abstract how students edit prompts and the different failures that they encounter. We find that substance beats style: a poor grasp of technical vocabulary is merely correlated with prompt failure; that the information content of prompts predicts success; that students get stuck making trivial edits; and more. Our findings have implications for the use of LLMs in programming education, and for efforts to make computing more accessible with LLMs. @@ -5494,7 +5494,7 @@ Reverse Thinking Makes <fixed-case>LLM</fixed-case>s Stronger Reasoners - JustinChen + JustinChen ZifengWangGoogle HamidPalangiGoogle RujunHanGoogle @@ -5504,7 +5504,7 @@ SwaroopMishraGoogle MohitBansalUniversity of North Carolina at Chapel Hill Chen-YuLeeGoogle - TomasPfisterGoogle + TomasPfisterGoogle 8611-8630 Reverse thinking plays a crucial role in human reasoning. Humans can reason not only from a problem to a solution but also in reverse, i.e., start from the solution and reason towards the problem. This often enhances overall reasoning performance as it enables consistency checks between their forward and backward thinking. To enable Large Language Models (LLMs) to perform reverse thinking, we introduce Reverse-Enhanced Thinking (RevThink), a framework composed of data augmentation and learning objectives. In RevThink, we augment the dataset by collecting structured forward-backward reasoning from a teacher model, consisting of: (1) the original question, (2) forward reasoning, (3) backward question, and (4) backward reasoning. We then employ three objectives to train a smaller student model in a multi-task learning fashion: (a) generate forward reasoning from a question, (b) generate a backward question from a question, and (c) generate backward reasoning from the backward question. Experiments across 12 datasets covering commonsense, math, and logical reasoning show an average 13.53% improvement over the student model’s zero-shot performance and a 6.84% improvement over the strongest knowledge distillation baselines. Moreover, our method demonstrates sample efficiency – using only 10% of the correct forward reasoning from the training data, it outperforms a standard fine-tuning method trained on 10x more forward reasoning. RevThink also exhibits strong generalization to out-of-distribution held-out datasets. 
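The RevThink augmentation above turns one teacher-generated example into three student training objectives. A minimal sketch of that data assembly, with invented field names, task prefixes, and content:

# Sketch of assembling RevThink-style multi-task training pairs from one
# teacher-augmented example, following the abstract's description.
example = {
    "question": "Tom has 3 apples and buys 2 more. How many apples?",
    "forward_reasoning": "3 + 2 = 5, so Tom has 5 apples.",
    "backward_question": "Tom has 5 apples after buying 2. How many at first?",
    "backward_reasoning": "5 - 2 = 3, so Tom started with 3 apples.",
}

training_pairs = [
    # (a) generate forward reasoning from the question
    ("[forward] " + example["question"], example["forward_reasoning"]),
    # (b) generate the backward question from the question
    ("[invert] " + example["question"], example["backward_question"]),
    # (c) generate backward reasoning from the backward question
    ("[backward] " + example["backward_question"], example["backward_reasoning"]),
]

for source, target in training_pairs:
    print(source, "->", target)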
2025.naacl-long.434 @@ -5513,14 +5513,14 @@ Towards Lifelong Dialogue Agents via Timeline-based Memory Management Kai Tzu-iunnOng - NamyoungKimYonsei University + NamyoungKimYonsei University MinjuGwakYonsei University HyungjooChae TaeyoonKwonYonsei University YohanJoSeoul National University Seung-wonHwangSeoul National University - DonghaLeeYonsei University - JinyoungYeoYonsei University + DonghaLeeYonsei University + JinyoungYeoYonsei University 8631-8661 To achieve lifelong human-agent interaction, dialogue agents need to constantly memorize perceived information and properly retrieve it for response generation (RG). While prior studies focus on getting rid of outdated memories to improve retrieval quality, we argue that such memories provide rich, important contextual cues for RG (e.g., changes in user behaviors) in long-term conversations. We present THEANINE, a framework for LLM-based lifelong dialogue agents. THEANINE discards memory removal and manages large-scale memories by linking them based on their temporal and cause-effect relation. Enabled by this linking structure, THEANINE augments RG with memory timelines - series of memories representing the evolution or causality of relevant past events. Along with THEANINE, we introduce TeaFarm, a counterfactual-driven evaluation scheme, addressing the limitation of G-Eval and human efforts when assessing agent performance in integrating past memories into RG. A supplementary video for THEANINE and data for TeaFarm are at https://huggingface.co/spaces/ResearcherScholar/Theanine. 2025.naacl-long.435 @@ -5529,7 +5529,7 @@ <fixed-case>S</fixed-case>tyle<fixed-case>D</fixed-case>istance: Stronger Content-Independent Style Embeddings with Synthetic Parallel Examples AjayPatel - JiachengZhu + JiachengZhu JustinQiu ZacharyHorvitz MariannaApidianakiUniversity of Pennsylvania, University of Pennsylvania @@ -5548,7 +5548,7 @@ LiXiaoqing KaiSong YaqianZhouFudan University, Tsinghua University - XipengQiuFudan University + XipengQiuFudan University 8686-8707 2025.naacl-long.437 he-etal-2025-fine @@ -5562,7 +5562,7 @@ ShaohuiKuang KaiSong YaqianZhouFudan University, Tsinghua University - XipengQiuFudan University + XipengQiuFudan University 8708-8733 With the rapid development of large language models (LLMs), due to their strong performance across various fields, LLM-based evaluation methods (LLM-as-a-Judge) have become widely used in natural language generation (NLG) evaluation. However, these methods encounter the following challenges: (1) distinguishing instruction-following ability, (2) being applicable across diverse NLG tasks, and (3) identifying low-quality outputs. To address these issues, we propose CAMIEval, a multidimensional comparative evaluation method based on instruction-following. Specifically, we define three fundamental dimensions of instruction-following: relevance, factuality, and adherence. Subsequently, we introduce a concrete Chain-of-Thoughts (ConcreteCoT) process to enhance the accuracy of evaluations. In addition, we trained a “regrettable model” RegretLM to generate low-quality outputs, which helps the evaluator better identify the potential shortcomings of the candidate output by comparing low-quality outputs with reference outputs. Through this comparison, the evaluator can generate instruction-specific dimensions that complement the fundamental dimensions, forming a more comprehensive evaluation metric system. 
Experiments on two NLG evaluation benchmarks demonstrate that CAMIEval consistently outperforms existing methods in terms of correlation with human evaluations, providing a general and accurate framework for evaluating the outputs of LLMs. 2025.naacl-long.438 @@ -5579,9 +5579,9 @@ ZhaoyuZhangAmazon QinLuAmazon KaiwenMen - NingXieAmazon + NingXieAmazon HuashengLi - BingYinAmazon + BingYinAmazon HanLiAmazon LingyunWangAmazon 8734-8750 @@ -5591,10 +5591,10 @@ Language Models Can Infer Action Semantics for Symbolic Planners from Environment Feedback - Wang BillZhuUniversity of Southern California + Wang BillZhuUniversity of Southern California IshikaSinghUniversity of Southern California RobinJiaUniversity of Southern California - JesseThomasonUniversity of Southern California and Amazon + JesseThomasonUniversity of Southern California and Amazon 8751-8773 Symbolic planners can discover a sequence of actions from initial to goal states given expert-defined, domain-specific logical action semantics. Large Language Models (LLMs) can directly generate such sequences, but limitations in reasoning and state-tracking often result in plans that are insufficient or unexecutable. We propose Predicting Semantics of Actions with Language Models (PSALM), which automatically learns action semantics by leveraging the strengths of both symbolic planners and LLMs. PSALM repeatedly proposes and executes plans, using the LLM to partially generate plans and to infer domain-specific action semantics based on execution outcomes. PSALM maintains a belief over possible action semantics that is iteratively updated until a goal state is reached. Experiments on 7 environments show that when learning just from one goal, PSALM boosts plan success rate from 36.4% (on Claude-3.5) to 100%, and explores the environment more efficiently than prior work to infer ground truth domain action semantics. 2025.naacl-long.440 @@ -5602,10 +5602,10 @@ <fixed-case>SLM</fixed-case>-Mod: Small Language Models Surpass <fixed-case>LLM</fixed-case>s at Content Moderation - XianyangZhan - AgamGoyalUniversity of Illinois at Urbana-Champaign + XianyangZhan + AgamGoyalUniversity of Illinois at Urbana-Champaign YilunChen - EshwarChandrasekharanUniversity of Illinois at Urbana-Champaign + EshwarChandrasekharanUniversity of Illinois at Urbana-Champaign KoustuvSahaDepartment of Computer Science 8774-8790 Large language models (LLMs) have shown promise in many natural language understanding tasks, including content moderation. However, these models can be expensive to query in real-time and do not allow for a community-specific approach to content moderation. To address these challenges, we explore the use of open-source small language models (SLMs) for community-specific content moderation tasks. We fine-tune and evaluate SLMs (less than 15B parameters) by comparing their performance against much larger open- and closed-sourced models in both a zero-shot and few-shot setting. Using 150K comments from 15 popular Reddit communities, we find that SLMs outperform zero-shot LLMs at content moderation: 11.5% higher accuracy and 25.7% higher recall on average across all communities. Moreover, few-shot in-context learning leads to only a marginal increase in the performance of LLMs, still lacking compared to SLMs. We further show the promise of cross-community content moderation, which has implications for new communities and the development of cross-platform moderation techniques.
Finally, we outline directions for future work on language model-based content moderation. @@ -5630,7 +5630,7 @@ HengyuanZhang DaweiLi XinZhangAnt Group - TianlongChenUniversity of North Carolina at Chapel Hill + TianlongChenUniversity of North Carolina at Chapel Hill 8811-8826 Reinforcement Learning with Human Feedback (RLHF) is the key to the success of large language models (LLMs) in recent years. In this work, we first introduce the concepts of knowledge breadth and knowledge depth, which measure the comprehensiveness and depth of an LLM or knowledge source respectively. We reveal that the imbalance in the number of prompts and responses can lead to a potential disparity in breadth and depth learning within alignment tuning datasets by showing that even a simple uniform method for balancing the number of instructions and responses can lead to significant improvements. Building on this, we further propose Balanced Preference Optimization (BPO), designed to dynamically augment the knowledge depth of each sample. BPO is motivated by the observation that the usefulness of knowledge varies across samples, necessitating tailored learning of knowledge depth. To achieve this, we introduce gradient-based clustering, estimating the knowledge informativeness and usefulness of each augmented sample based on the model’s optimization direction. Our experimental results across various benchmarks demonstrate that BPO outperforms other baseline methods in alignment tuning while maintaining training efficiency. Furthermore, we conduct a detailed analysis of each component of BPO, providing guidelines for future research in preference data optimization. 2025.naacl-long.443 @@ -5661,12 +5661,12 @@ Kill two birds with one stone: generalized and robust <fixed-case>AI</fixed-case>-generated text detection via dynamic perturbations - YinghanZhou - JuanWenChina Agricultural University + YinghanZhou + JuanWenChina Agricultural University WanliPengChina Agricultural University XueYimingChina Agricultural University - ZiWeiZhang - WuZhengxianChina Agricultural University + ZiWeiZhang + WuZhengxianChina Agricultural University 8864-8875 The growing popularity of large language models has raised concerns regarding the potential misuse of AI-generated text (AIGT). It becomes increasingly critical to establish an AIGT detection method with high generalization and robustness. However, existing methods focus either on model generalization or on robustness. A unified mechanism that simultaneously addresses both challenges remains underexplored. In this paper, we first empirically reveal an intrinsic mechanism underlying model generalization and robustness in the AIGT detection task. Then, we propose a novel AIGT detection method (DP-Net) via dynamic perturbations introduced by reinforcement learning with an elaborated reward and action design. Extensive experimental results show that the proposed DP-Net significantly outperforms state-of-the-art AIGT detection methods in generalization capacity in three cross-domain scenarios. Meanwhile, DP-Net achieves the best robustness under two text adversarial attacks. 2025.naacl-long.446 @@ -5679,7 +5679,7 @@ FangzhiXuXi’an Jiaotong University JianbingZhangNanjing University HaoZhou - YangLiu + YangLiu 8876-8892 Chain-of-thought (CoT) has proven to improve the reasoning capability of large language models (LLMs).
However, due to the complexity of multimodal scenarios and the difficulty in collecting high-quality CoT data, CoT reasoning in multimodal LLMs has been largely overlooked. To this end, we propose a simple yet effective self-training framework, R³V, which iteratively enhances the model’s Vision-language Reasoning by Reflecting on CoT Rationales. Our framework consists of two interleaved parts: (1) iteratively bootstrapping positive and negative solutions for reasoning datasets, and (2) reflection on rationales for learning from mistakes. Specifically, we introduce the self-refine and self-select losses, enabling the model to refine flawed rationales and derive the correct answer by comparing rationale candidates. Experiments on a wide range of vision-language tasks show that R³V consistently improves multimodal LLM reasoning, achieving a relative improvement of 23% to 60% over GPT-distilled baselines. Additionally, our approach supports self-reflection on generated solutions, further boosting performance through test-time computation. Our code is available at https://github.com/njucckevin/MM-Self-Improve. 2025.naacl-long.447 @@ -5687,7 +5687,7 @@ Emergence of Episodic Memory in Transformers: Characterizing Changes in Temporal Structure of Attention Scores During Training - Deven MaheshMistry + Deven MaheshMistry AnooshkaBajaj YashAggarwal Sahaj SinghMaini @@ -5699,11 +5699,11 @@ Knowledge Graph-Guided Retrieval Augmented Generation - XiangrongZhu - YuexiangXieAlibaba Group - YiLiunanjing university - YaliangLiAlibaba Group - WeiHuNanjing University + XiangrongZhu + YuexiangXieAlibaba Group + YiLiunanjing university + YaliangLiAlibaba Group + WeiHuNanjing University 8912-8924 Retrieval-augmented generation (RAG) has emerged as a promising technology for addressing hallucination issues in the responses generated by large language models (LLMs). Existing studies on RAG primarily focus on applying semantic-based approaches to retrieve isolated relevant chunks, which ignore their intrinsic relationships. In this paper, we propose a novel Knowledge Graph-Guided Retrieval Augmented Generation (KG²RAG) framework that utilizes knowledge graphs (KGs) to provide fact-level relationships between chunks, improving the diversity and coherence of the retrieved results. Specifically, after performing a semantic-based retrieval to provide seed chunks, KG²RAG employs a KG-guided chunk expansion process and a KG-based chunk organization process to deliver relevant and important knowledge in well-organized paragraphs. Extensive experiments conducted on the HotpotQA dataset and its variants demonstrate the advantages of KG²RAG compared to existing RAG-based approaches, in terms of both response quality and retrieval quality. 2025.naacl-long.449 @@ -5715,12 +5715,12 @@ XinlongYang ZihengGao JiLiuAMD - GuanchenLi + GuanchenLi ZhuangLiuAdvanced Micro Devices DongLi JinzhangPeng LuTian - EmadBarsoumAMD + EmadBarsoumAMD 8925-8938 Large Language Models (LLMs) inherently use autoregressive decoding, which lacks parallelism in inference and results in significantly slow inference speed. While methods such as Medusa construct parallelized heads, they lack adequate information interaction across different prediction positions. To overcome this limitation, we introduce Amphista, an enhanced speculative decoding framework that builds upon Medusa.
Specifically, Amphista models an Auto-embedding Block capable of parallel inference, incorporating bi-directional attention to enable interaction between different drafting heads. Additionally, Amphista integrates Staged Adaptation Layers, which ensure a seamless transition of semantic information from the target model’s autoregressive inference to the drafting heads’ non-autoregressive inference, effectively achieving paradigm shift and feature fusion. Experimental results on Vicuna models using MT-Bench and Spec-Bench demonstrate that Amphista achieves substantial acceleration while maintaining generation quality. On MT-Bench, Amphista delivers up to 2.75× speedup over vanilla autoregressive decoding and 1.40× over Medusa on Vicuna 33B in wall-clock time. 2025.naacl-long.450 @@ -5742,7 +5742,7 @@ DongryeolLeeSeoul National University YerinHwangSeoul National University YongilKimLG Corporation - JoonsukParkUniversity of Richmond + JoonsukParkUniversity of Richmond KyominJung 8962-8984 In line with the principle of honesty, there has been a growing effort to train large language models (LLMs) to generate outputs containing epistemic markers. However, evaluation in the presence of epistemic markers has been largely overlooked, raising a critical question: Could the use of epistemic markers in LLM-generated outputs lead to unintended negative consequences? To address this, we present EMBER, a benchmark designed to assess the robustness of LLM-judges to epistemic markers in both single and pairwise evaluation settings. Our findings, based on evaluations using EMBER, reveal that all tested LLM-judges, including GPT-4o, show a notable lack of robustness in the presence of epistemic markers. Specifically, we observe a negative bias toward epistemic markers, with a stronger bias against markers expressing uncertainty. This suggests that LLM-judges are influenced by the presence of these markers and do not focus solely on the correctness of the content. @@ -5751,11 +5751,11 @@ Dynamic Uncertainty Ranking: Enhancing Retrieval-Augmented In-Context Learning for Long-Tail Knowledge in <fixed-case>LLM</fixed-case>s - ShuyangYu + ShuyangYu RunxueBaoGE HealthCare ParminderBhatiaGEHC - TahaKass-HoutGE HealthCare - JiayuZhouUniversity of Michigan - Ann Arbor and Michigan State University + TahaKass-HoutGE HealthCare + JiayuZhouUniversity of Michigan - Ann Arbor and Michigan State University CaoXiaoGE Healthcare 8985-8997 Large language models (LLMs) can learn vast amounts of knowledge from diverse domains during pre-training. However, long-tail knowledge from specialized domains is often scarce and underrepresented, rarely appearing in the models’ memorization. Prior work has shown that in-context learning (ICL) with retriever augmentation can help LLMs better capture long-tail knowledge, reducing their reliance on pre-trained data. Despite these advances, we observe that LLM predictions for long-tail questions remain sensitive to variations in the retrieved samples. To take advantage of the uncertainty in ICL for guiding LLM predictions toward correct answers on long-tail samples, we propose a reinforcement learning-based dynamic uncertainty ranking method for retrieval-augmented ICL that accounts for the varying impact of each retrieved sample on LLM predictions. Our approach prioritizes more informative and stable samples while demoting misleading ones, updating rankings based on the feedback from the LLM w.r.t. each retrieved sample.
To enhance training efficiency and reduce query costs, we introduce a learnable dynamic ranking threshold, adjusted when the model encounters negative prediction shifts. Experimental results on various question-answering datasets from different domains show that our method outperforms the best baseline by 2.76%, with a notable 5.96% boost in accuracy on long-tail questions that elude zero-shot inference. Our code is available at https://github.com/Yu-shuyan/uncertian_ranker. @@ -5765,12 +5765,12 @@ <fixed-case>S</fixed-case>eq1<fixed-case>F</fixed-case>1<fixed-case>B</fixed-case>: Efficient Sequence-Level Pipeline Parallelism for Large Language Model Training SunAo - WeilinZhaoTsinghua University, Tsinghua University + WeilinZhaoTsinghua University, Tsinghua University XuHanTsinghua University, Tsinghua University - ChengYangBeijing University of Posts and Telecommunications - XinrongZhangByteDance Inc. - ZhiyuanLiuTsinghua University - ChuanShiBeijing University of Post and Telecommunication, Tsinghua University + ChengYangBeijing University of Posts and Telecommunications + XinrongZhangByteDance Inc. + ZhiyuanLiuTsinghua University + ChuanShiBeijing University of Post and Telecommunication, Tsinghua University MaosongSunTsinghua University 8998-9008 Training large language models (LLMs) heavily relies on distributed training strategies, among which pipeline parallelism (PP) plays a crucial role. As training sequences extend to 32k or even 128k tokens, current PP methods face severe bottlenecks, including substantial pipeline bubbles and high memory footprint, greatly hindering training throughput and model scalability. This paper introduces a sequence-level one-forward-one-backward (1F1B) PP method, named Seq1F1B, tailored for training LLMs on long sequences with high training throughput and memory efficiency. Unlike typical PP methods, which adopt a batch-level pipeline schedule, Seq1F1B schedules the pipeline of training LLMs at the sequence level. It uses a computational strategy to partition sequences appropriately, significantly reducing pipeline bubbles and memory footprint. Compared to competitive PP baselines such as Megatron 1F1B PP, Seq1F1B achieves 1.14× training throughput with half the memory footprint. Notably, Seq1F1B trains an LLM with 30B parameters on sequences up to 64k tokens using 64 NVIDIA A100 GPUs without using recomputation strategies, a feat unachievable with existing methods. We have released our code on GitHub to facilitate further research and development in LLM training on long sequences: https://github.com/thunlp/Seq1F1B. @@ -5779,7 +5779,7 @@ Differentially Private Learning Needs Better Model Initialization and Self-Distillation - Ivoline C.NgongUniversity of Vermont + Ivoline C.NgongUniversity of Vermont JosephNearUniversity of Vermont NiloofarMireshghallah 9009-9027 @@ -5793,7 +5793,7 @@ TaehyunLeeSeoul National University JaewooAhnSeoul National University Jae HyukSung - GunheeKimSeoul National University + GunheeKimSeoul National University 9028-9048 Conceptual combination is a cognitive process that merges basic concepts, enabling the creation of complex expressions. During this process, the properties of combination (e.g., the whiteness of a peeled apple) can be inherited from basic concepts, newly emerge, or be canceled.
However, previous studies have evaluated a limited set of properties and have not examined the generative process. To address this gap, we introduce the Conceptual Combination with Property Type dataset (CCPT), which consists of 12.3K annotated triplets of noun phrases, properties, and property types. Using CCPT, we establish three types of tasks to evaluate LLMs for conceptual combination thoroughly. Our key findings are threefold: (1) Our automatic metric grading property emergence and cancellation closely corresponds with human judgments. (2) LLMs, including OpenAI’s o1, struggle to generate noun phrases which possess given emergent properties. (3) Our proposed method, inspired by a cognitive psychology model that explains how relationships between concepts are formed, improves performance in all generative tasks. The dataset and experimental code are available at https://github.com/seokwon99/CCPT.git. 2025.naacl-long.456 @@ -5801,10 +5801,10 @@ <fixed-case>CRS</fixed-case>core: Grounding Automated Evaluation of Code Review Comments in Code Claims and Smells - AtharvaNaik + AtharvaNaik MarcusAlenius DanielFriedMeta AI and Carnegie Mellon University - CarolynRoseSchool of Computer Science, Carnegie Mellon University + CarolynRoseSchool of Computer Science, Carnegie Mellon University 9049-9076 The task of automated code review has recently gained a lot of attention from the machine learning community. However, current review comment evaluation metrics rely on comparisons with a human-written reference for a given code change (also called a diff). Furthermore, code review is a one-to-many problem, like generation and summarization, with many “valid reviews” for a diff. Thus, we develop CRScore — a reference-free metric to measure dimensions of review quality like conciseness, comprehensiveness, and relevance. We design CRScore to evaluate reviews in a way that is grounded in claims and potential issues detected in the code by LLMs and static analyzers. We demonstrate that CRScore can produce valid, fine-grained scores of review quality that have the greatest alignment with human judgment among open-source metrics (0.54 Spearman correlation) and are more sensitive than reference-based metrics. We also release a corpus of 2.9k human-annotated review quality scores for machine-generated and GitHub review comments to support the development of automated metrics. 2025.naacl-long.457 @@ -5814,9 +5814,9 @@ <fixed-case>KS</fixed-case>-Lottery: Finding Certified Lottery Tickets for Multilingual Transfer in Large Language Models FeiYuan ChangMa - ShuaiYuan - QiushiSunUniversity of Hong Kong - LeiLiSchool of Computer Science, Carnegie Mellon University + ShuaiYuan + QiushiSunUniversity of Hong Kong + LeiLiSchool of Computer Science, Carnegie Mellon University 9077-9090 The lottery ticket hypothesis posits the existence of “winning tickets” within a randomly initialized neural network. Do winning tickets exist for LLMs in fine-tuning scenarios? How can we find such winning tickets? In this paper, we propose KS-Lottery, a method to identify a small subset of LLM parameters highly effective in multilingual fine-tuning. Our key idea is to use the Kolmogorov-Smirnov test to analyze the distribution shift of parameters before and after fine-tuning. We further theoretically prove that KS-Lottery can find the certified winning tickets in the embedding layer; fine-tuning on the found parameters is guaranteed to perform as well as full fine-tuning.
Comparing KS-Lottery with other tuning algorithms on translation tasks, the experimental results show that KS-Lottery finds a much smaller set of parameters for fine-tuning while achieving performance comparable to full fine-tuning of the LLM. Surprisingly, we find that fine-tuning the embeddings of just 18 tokens in LLaMA suffices to reach full fine-tuning translation performance. 2025.naacl-long.458 @@ -5824,14 +5824,14 @@ <fixed-case>PA</fixed-case>-<fixed-case>RAG</fixed-case>: <fixed-case>RAG</fixed-case> Alignment via Multi-Perspective Preference Optimization - JiayiWuEast China Normal University - HengyiCai + JiayiWuEast China Normal University + HengyiCai LingyongYanBaidu Inc. - HaoSun - XiangLiEast China Normal University - ShuaiqiangWang - DaweiYinBaidu - MingGao + HaoSun + XiangLiEast China Normal University + ShuaiqiangWang + DaweiYinBaidu + MingGao 9091-9112 The emergence of Retrieval-augmented generation (RAG) has alleviated the issues of outdated and hallucinatory content in the generation of large language models (LLMs), yet it still reveals numerous limitations. When a general-purpose LLM serves as the RAG generator, it often suffers from inadequate response informativeness, response robustness, and citation quality. Past approaches to tackle these limitations, either by incorporating additional steps beyond generating responses or optimizing the generator through supervised fine-tuning (SFT), still failed to align with the RAG requirements thoroughly. Consequently, optimizing the RAG generator from multiple preference perspectives while maintaining its end-to-end LLM form remains a challenge. To bridge this gap, we propose Multiple Perspective Preference Alignment for Retrieval-Augmented Generation (PA-RAG), a method for optimizing the generator of RAG systems to align with RAG requirements comprehensively. Specifically, we construct high-quality instruction fine-tuning data and multi-perspective preference data by sampling varied quality responses from the generator across different prompt document quality scenarios. Subsequently, we optimize the generator using SFT and Direct Preference Optimization (DPO). Extensive experiments conducted on four question-answer datasets across three LLMs demonstrate that PA-RAG can significantly enhance the performance of RAG generators. Our code and datasets are available at https://github.com/wujwyi/PA-RAG. 2025.naacl-long.459 @@ -5849,10 +5849,10 @@ <fixed-case>IMRRF</fixed-case>: Integrating Multi-Source Retrieval and Redundancy Filtering for <fixed-case>LLM</fixed-case>-based Fake News Detection - DayangLi + DayangLi FanxiaoLi BingbingSong - LiTang + LiTang WeiZhouYunnan University 9127-9142 The widespread use of social networks has significantly accelerated the dissemination of information but has also facilitated the rapid spread of fake news, leading to various negative consequences. Recently, with the emergence of large language models (LLMs), researchers have focused on leveraging LLMs for automated fake news detection. Unfortunately, many issues remain to be addressed. First, the evidence retrieved to verify given fake news is often insufficient, limiting the performance of LLMs when reasoning directly from this evidence. Additionally, the retrieved evidence frequently contains substantial redundant information, which can interfere with the LLMs’ judgment. To address these limitations, we propose a Multiple Knowledge Sources Retrieval and LLM Knowledge Conversion framework, which enriches the evidence available for claim verification.
We also introduce a Redundant Information Filtering Strategy, which minimizes the influence of irrelevant information on the LLM reasoning process. Extensive experiments conducted on two challenging fact-checking datasets demonstrate that our proposed method outperforms state-of-the-art fact-checking baselines. Our code is available at https://github.com/quark233/IMRRF/tree/main. @@ -5874,7 +5874,7 @@ <fixed-case>SMAB</fixed-case>: <fixed-case>MAB</fixed-case> based word Sensitivity Estimation Framework and its Applications in Adversarial Text Generation - Saurabh KumarPandeyMohamed bin Zayed University of Artificial Intelligence + Saurabh KumarPandeyMohamed bin Zayed University of Artificial Intelligence SachinVashistha DebrupDasUniversity of Massachusetts at Amherst SomakAdityaIndian Institute of Technology Kharagpur MonojitChoudhuryMohamed bin Zayed University of Artificial Intelligence @@ -5886,9 +5886,9 @@ <fixed-case>M</fixed-case>ana<fixed-case>TTS</fixed-case> <fixed-case>P</fixed-case>ersian: a recipe for creating <fixed-case>TTS</fixed-case> datasets for lower resource languages - Mahta FetratQharabaghSharif University of Technology - ZahraDehghanian - Hamid R.Rabiee + Mahta FetratQharabaghSharif University of Technology + ZahraDehghanian + Hamid R.Rabiee 9177-9206 In this study, we introduce ManaTTS, the most extensive publicly accessible single-speaker Persian corpus, and a comprehensive framework for collecting transcribed speech datasets for the Persian language. ManaTTS, released under the open CC-0 license, comprises approximately 86 hours of audio with a sampling rate of 44.1 kHz. The dataset is supported by a fully transparent, MIT-licensed pipeline, a testament to innovation in the field. It includes unique tools for sentence tokenization, bounded audio segmentation, and a novel forced alignment method. This alignment technique is specifically designed for low-resource languages, addressing a crucial need in the field. With this dataset, we trained a Tacotron2-based TTS model, achieving a Mean Opinion Score (MOS) of 3.76, which is remarkably close to the MOS of 3.86 for the utterances generated by the same vocoder and natural spectrogram, and the MOS of 4.01 for the natural waveform, demonstrating the exceptional quality and effectiveness of the corpus. 2025.naacl-long.464 @@ -5897,8 +5897,8 @@ <fixed-case>C</fixed-case>ulture<fixed-case>I</fixed-case>nstruct: Curating Multi-Cultural Instructions at Scale Viet ThanhPhamMonash University - ZhuangLiRoyal Melbourne Institute of Technology - LizhenQuMonash University + ZhuangLiRoyal Melbourne Institute of Technology + LizhenQuMonash University GholamrezaHaffariMonash University, Monash University and Monash University 9207-9228 Large language models, despite their remarkable success in recent years, still exhibit severe cultural bias. Therefore, in this paper, we introduce CultureInstruct, a large-scale instruction-tuning dataset designed to reduce cultural bias in LLMs. CultureInstruct is constructed with an automatic pipeline, utilizing public web sources and a specialized LLM to generate instructions. Our data comprises 430K instructions, ranging from classic NLP tasks to complex reasoning. CultureInstruct also covers the 11 topics most relevant to cultural knowledge, making it highly diverse. Our experiments show that fine-tuning LLMs with CultureInstruct results in consistent improvements across three types of cultural benchmarks, including (i) general cultural knowledge, (ii) human opinions and values, and (iii) linguistic cultural bias.
Our best model, Qwen2-Instruct 72B + CultureInstruct, outperforms GPT-4o Mini and GPT-4o with 18.47% and 13.07% average relative improvements on cultural benchmarks. @@ -5920,12 +5920,12 @@ <fixed-case>D</fixed-case>ense<fixed-case>SSM</fixed-case>: State Space Models with Dense Hidden Connection for Efficient Large Language Models WeiHeHuawei Noah’s Ark Lab - KaiHanHuawei Noah’s Ark Lab + KaiHanHuawei Noah’s Ark Lab YehuiTangHuawei Technologies Ltd. ChengchengWangHuawei Technologies Ltd. YujieYang TianyuGuoHuawei Technologies Ltd. - YunheWangHuawei Noah’s Ark Lab + YunheWangHuawei Noah’s Ark Lab 9243-9254 Large language models (LLMs) face a significant challenge due to the excessive computational and memory requirements of the commonly used Transformer architecture. While state space models (SSMs) are a new type of foundational network architecture offering lower computational complexity, their performance has yet to fully rival that of Transformers. This paper introduces DenseSSM, a novel approach to enhance the flow of hidden information between layers in SSMs. By selectively integrating shallow-layer hidden states into deeper layers, DenseSSM retains fine-grained information crucial for the final output. This incremental improvement maintains the training parallelizability and inference efficiency of SSMs while significantly boosting performance. The proposed method is broadly applicable to various SSM types, including RetNet and Mamba, and DenseSSM achieves significant performance improvements on public benchmarks, demonstrating its effectiveness and versatility. 2025.naacl-long.467 @@ -5935,10 +5935,10 @@ A Mixed-Language Multi-Document News Summarization Dataset and a Graphs-Based Extract-Generate Model ShengxiangGaoKunming University of Science and Technology FangNanKunming University of Science and Technology - YongbingZhang + YongbingZhang YuxinHuang KaiwenTanKunming University of Science and Technology - ZhengtaoYuKunming University of Science and Technology + ZhengtaoYuKunming University of Science and Technology 9255-9265 Existing research on news summarization primarily focuses on single-language single-document (SLSD), single-language multi-document (SLMD) or cross-language single-document (CLSD). However, in real-world scenarios, news about an international event often involves multiple documents in different languages, i.e., mixed-language multi-document (MLMD). Therefore, summarizing MLMD news is of great significance. However, the lack of datasets for MLMD news summarization has constrained the development of research in this area. To fill this gap, we construct a mixed-language multi-document news summarization dataset (MLMD-news), which contains four different languages and 10,992 source document cluster and target summary pairs. Additionally, we propose a graph-based extract-generate model and benchmark various methods on the MLMD-news dataset and publicly release our dataset and code, aiming to advance research in summarization within MLMD scenarios. 2025.naacl-long.468 @@ -5954,7 +5954,7 @@ MiladNasrGoogle Christopher A.Choquette-ChooGoogle DeepMind KatherineLeeGoogle - A. FederCooperResearch, Microsoft and Computer Science Department, Stanford University + A. FederCooperResearch, Microsoft and Computer Science Department, Stanford University 9266-9291 Large language models (LLMs) are susceptible to memorizing training data, raising concerns about the potential extraction of sensitive information at generation time.
Discoverable extraction is the most common method for measuring this issue: split a training example into a prefix and suffix, then prompt the LLM with the prefix, and deem the example extractable if the LLM generates the matching suffix using greedy sampling. This definition yields a yes-or-no determination of whether extraction was successful with respect to a single query. Though this definition is efficient to compute, we show that it is unreliable because it does not account for non-determinism present in more realistic (non-greedy) sampling schemes, for which LLMs produce a range of outputs for the same prompt. We introduce probabilistic discoverable extraction, which, without additional cost, relaxes discoverable extraction by considering multiple queries to quantify the probability of extracting a target sequence. We evaluate our probabilistic measure across different models, sampling schemes, and training-data repetitions, and find that this measure provides more nuanced information about extraction risk compared to traditional discoverable extraction. 2025.naacl-long.469 @@ -5963,7 +5963,7 @@ Audio Is the Achilles’ Heel: Red Teaming Audio Large Multimodal Models HaoYangMonash University - LizhenQuMonash University + LizhenQuMonash University EhsanShareghiMonash University GholamrezaHaffariMonash University, Monash University and Monash University 9292-9306 @@ -5976,8 +5976,8 @@ YunshengNi ChuanjianLiuHuawei Technologies Ltd. YehuiTangHuawei Technologies Ltd. - KaiHanHuawei Noah’s Ark Lab - YunheWangHuawei Noah’s Ark Lab + KaiHanHuawei Noah’s Ark Lab + YunheWangHuawei Noah’s Ark Lab 9307-9320 Speculative decoding emerges as a pivotal technique for enhancing the inference speed of Large Language Models (LLMs). Despite recent research aiming to improve prediction efficiency, multi-sample speculative decoding has been overlooked due to varying numbers of accepted tokens within a batch in the verification phase. The vanilla method adds padding tokens in order to ensure that the number of new tokens remains consistent across samples. However, this increases the computational and memory access overhead, thereby reducing the speedup ratio. We propose a novel method that can resolve the issue of inconsistent tokens accepted by different samples without necessitating an increase in memory or computing overhead. Furthermore, our proposed method can handle the situation where the prediction tokens of different samples are inconsistent without the need to add padding tokens. Extensive experiments demonstrate the efficacy of our method. Our code will be released later.
2025.naacl-long.471 @@ -5998,8 +5998,8 @@ <fixed-case>MAPW</fixed-case>ise: Evaluating Vision-Language Models for Advanced Map Queries SrijaMukhopadhyay AbhishekRajgaria - PreranaKhatiwada - ManishShrivastavaInternational Institute of Information Technology Hyderabad, India + PreranaKhatiwada + ManishShrivastavaInternational Institute of Information Technology Hyderabad, India DanRoth VivekGuptaArizona State University 9348-9378 @@ -6010,7 +6010,7 @@ Pay More Attention to Images: Numerous Images-Oriented Multimodal Summarization MinXiao - JunnanZhuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + JunnanZhuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences FeifeiZhaiInstitute of automation, Chinese academy of science, Chinese Academy of Sciences ChengqingZongInstitute of automation, Chinese academy of science, Chinese Academy of Sciences YuZhouInstitute of Automation, Chinese Academy of Sciences @@ -6023,11 +6023,11 @@ S<tex-math>^2</tex-math>-<fixed-case>MAD</fixed-case>: Breaking the Token Barrier to Enhance Multi-Agent Debate Efficiency YutingZengUniversity of Science and Technology of China WeizheHuangUniversity of Science and Technology of China - LeiJiang - TongxuanLiuJD.com + LeiJiang + TongxuanLiuJD.com XiTaiJin - Chen TianyingTiana - JingLi + Chen TianyingTiana + JingLi XiaohuaXuUniversity of Science and Technology of China 9393-9408 Large language models (LLMs) have demonstrated remarkable capabilities across various natural language processing (NLP) scenarios, but they still face challenges when handling complex arithmetic and logical reasoning tasks. While Chain-Of-Thought (CoT) reasoning, self-consistency (SC) and self-correction strategies have attempted to guide models in sequential, multi-step reasoning, Multi-agent Debate (MAD) has emerged as a viable approach for enhancing the reasoning capabilities of LLMs. By increasing both the number of agents and the frequency of debates, the performance of LLMs improves significantly. However, this strategy results in a significant increase in token costs, presenting a barrier to scalability. To address this challenge, we introduce a novel sparsification strategy designed to reduce token costs within MAD. This approach minimizes ineffective exchanges of information and unproductive discussions among agents, thereby enhancing the overall efficiency of the debate process. We conduct comparative experiments on multiple datasets across various models, demonstrating that our approach significantly reduces the token costs in MAD to a considerable extent. Specifically, compared to MAD, our approach achieves an impressive reduction of up to 94.5% in token costs while maintaining performance degradation below 2.0%. @@ -6039,7 +6039,7 @@ BingzhengGanHuawei Technologies Ltd. YufanZhaoHuawei International Pte. Ltd. TianyiZhang - JingHuangHuawei Technologies Ltd. + JingHuangHuawei Technologies Ltd. 
LiYusu Shu XianTeo ChangwangZhangCCF Theoretical Computer Science Technical Committee and OPPO Research Institute @@ -6053,7 +6053,7 @@ <fixed-case>S</fixed-case>creen<fixed-case>QA</fixed-case>: Large-Scale Question-Answer Pairs Over Mobile App Screenshots Yu-ChungHsiaoCisco FedirZubachGoogle - GillesBaechlerGoogle DeepMind + GillesBaechlerGoogle DeepMind SrinivasSunkara VictorCarbuneGoogle JasonLinStanford University @@ -6068,7 +6068,7 @@ Cross-Lingual and Cross-Cultural Variation in Image Descriptions UriBergerHebrew University of Jerusalem - EdoardoPontiUniversity of Edinburgh + EdoardoPontiUniversity of Edinburgh 9453-9465 Do speakers of different languages talk differently about what they see? Behavioural and cognitive studies report cultural effects on perception; however, these are mostly limited in scope and hard to replicate. In this work, we conduct the first large-scale empirical study of cross-lingual variation in image descriptions. Using a multimodal dataset with 31 languages and images from diverse locations, we develop a method to accurately identify entities mentioned in captions and present in the images, then measure how they vary across languages. Our analysis reveals that pairs of languages that are geographically or genetically closer tend to mention the same entities more frequently. We also identify entity categories whose saliency is universally high (such as animate beings), low (clothing accessories) or displaying high variance across languages (landscape). In a case study, we measure the differences in a specific language pair (e.g., Japanese mentions clothing far more frequently than English). Furthermore, our method corroborates previous small-scale studies, including 1) Rosch et al. (1976)’s theory of basic-level categories, demonstrating a preference for entities that are neither too generic nor too specific, and 2) Miyamoto et al. (2006)’s hypothesis that environments afford patterns of perception, such as entity counts. Overall, our work reveals the presence of both universal and culture-specific patterns in entity mentions. 2025.naacl-long.478 @@ -6088,7 +6088,7 @@ Not All Adapters Matter: Selective Adapter Freezing for Memory-Efficient Fine-Tuning of Language Models HyegangSonKorea University - YonglakSon + YonglakSon ChanghoonKimSoongsil University Young GeunKimKorea University 9479-9496 @@ -6098,11 +6098,11 @@ Bridging the Gap between Expert and Language Models: Concept-guided Chess Commentary Generation and Evaluation - JaechangKimPOSTECH + JaechangKimPOSTECH JinminGoh InseokHwangPohang University of Science and Technology JaewoongChoKRAFTON - JungseulOkPOSTECH + JungseulOkPOSTECH 9497-9516 Deep learning-based expert models have reached superhuman performance in decision-making domains such as chess and Go. However, explaining or commenting on given decisions remains under-explored, although it is important for model explainability and human education. The outputs of expert models are accurate, yet difficult for humans to interpret. On the other hand, large language models (LLMs) can produce fluent commentary but are prone to hallucinations due to their limited decision-making capabilities. To bridge this gap between expert models and LLMs, we focus on chess commentary as a representative task of explaining complex decision-making processes through language and address both the generation and evaluation of commentary.
We introduce Concept-guided Chess Commentary generation (CCC) for producing commentary and GPT-based Chess Commentary Evaluation (GCC-Eval) for assessing it. CCC integrates the decision-making strengths of expert models with the linguistic fluency of LLMs through prioritized, concept-based explanations. GCC-Eval leverages expert knowledge to evaluate chess commentary based on informativeness and linguistic quality. Experimental results, validated by both human judges and GCC-Eval, demonstrate that CCC generates commentary which is accurate, informative, and fluent. 2025.naacl-long.481 @@ -6121,10 +6121,10 @@ Culture-<fixed-case>TRIP</fixed-case>: Culturally-Aware Text-to-Image Generation with Iterative Prompt Refinement - SuchaeJeong + SuchaeJeong InseongChoi - YoungsikYunDongguk University - JihieKimDongguk University + YoungsikYunDongguk University + JihieKimDongguk University 9543-9573 2025.naacl-long.483 jeong-etal-2025-culture @@ -6133,7 +6133,7 @@ Behavior-<fixed-case>SD</fixed-case>: Behaviorally Aware Spoken Dialogue Generation with Large Language Models SehunLeeSeoul National University Kang-wookKim - GunheeKimSeoul National University + GunheeKimSeoul National University 9574-9593 Spoken dialogue involves behaviors like turn-taking, interruptions, filler words, and backchannels, which make interactions more natural and engaging but are often overlooked in language models. These models struggle to explicitly model these behavioral traits, resulting in a less natural and personalized communication style that aligns with user needs. To address this challenge, we make two key contributions. First, we introduce Behavior-SD, a large-scale dataset containing over 100K spoken dialogues (2,164 hours) annotated with various conversational behaviors, synthesized via LLMs to model diverse full-duplex interactions. Second, we propose BeDLM, the first dialogue model capable of generating natural conversations conditioned on specific behavioral and narrative contexts, supporting simultaneous contributions from both speakers. Through human evaluations and behavior-adherence metrics, we demonstrate that BeDLM outperforms baseline models in generating natural, coherent, and behaviorally rich dialogues. Our work opens new possibilities for developing behaviorally-aware dialogue systems that more closely mimic human conversational dynamics, enhancing user engagement and communication effectiveness. 2025.naacl-long.484 @@ -6141,7 +6141,7 @@ Is Translation All You Need? 
A Study on Solving Multilingual Tasks with Large Language Models - ChaoqunLiu + ChaoqunLiu WenxuanZhangSingapore University of Technology and Design YiranZhaoNational University of Singapore Anh TuanLuuNanyang Technological University @@ -6164,7 +6164,7 @@ Towards Quantifying Commonsense Reasoning with Mechanistic Insights - AbhinavJoshiIndian Institute of Technology, Kanpur + AbhinavJoshiIndian Institute of Technology, Kanpur AreebAhmad DivyakshShukla AshutoshModiIIT Kanpur @@ -6211,7 +6211,7 @@ Grounding Fallacies Misrepresenting Scientific Publications in Evidence MaxGlocknerTechnische Universität Darmstadt YufangHouIT:U Interdisciplinary Transformation University Austria, Technische Universität Darmstadt and IBM Research Ireland - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + PreslavNakovMohamed bin Zayed University of Artificial Intelligence IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt 9732-9767 Health-related misinformation claims often falsely cite a credible biomedical publication as evidence. These publications only superficially seem to support the false claim, when logical fallacies are applied. In this work, we aim to detect and to highlight such fallacies, which requires assessing the exact content of the misrepresented publications. To achieve this, we introduce MissciPlus, an extension of the fallacy detection dataset Missci. MissciPlus extends Missci by grounding the applied fallacies in real-world passages from misrepresented studies. This creates a realistic test-bed for detecting and verbalizing fallacies under real-world input conditions, and enables new and realistic passage-retrieval tasks. MissciPlus is the first logical fallacy dataset which pairs the real-world misrepresented evidence with incorrect claims, identical to the input to evidence-based fact-checking models. With MissciPlus, we i) benchmark retrieval models in identifying passages that support claims only with fallacious reasoning, ii) evaluate how well LLMs verbalize fallacious reasoning based on misrepresented scientific passages, and iii) assess the effectiveness of fact-checking models in refuting claims that misrepresent biomedical research. Our findings show that current fact-checking models struggle to use misrepresented scientific passages to refute misinformation. Moreover, these passages can mislead LLMs into accepting false claims as true. @@ -6220,10 +6220,10 @@ Has this Fact been Edited? Detecting Knowledge Edits in Language Models - PaulYoussef - ZhixueZhaoUniversity of Sheffield, University of Sheffield - ChristinSeifertPhillips-Universität Marburg and University of Twente - JörgSchlöttererUniversität Mannheim and Phillips-Universität Marburg + PaulYoussef + ZhixueZhaoUniversity of Sheffield, University of Sheffield + ChristinSeifertPhillips-Universität Marburg and University of Twente + JörgSchlöttererUniversität Mannheim and Phillips-Universität Marburg 9768-9784 Knowledge editing methods (KEs) can update language models’ obsolete or inaccurate knowledge learned from pre-training. However, KEs can be used for malicious applications, e.g., inserting misinformation and toxic content. Knowing whether a generated output is based on edited knowledge or first-hand knowledge from pre-training can increase users’ trust in generative models and provide more transparency. 
Driven by this, we propose a novel task: detecting knowledge edits in language models. Given an edited model and a fact retrieved from it by a prompt, the objective is to classify the knowledge as either unedited (based on the pre-training) or edited (based on subsequent editing). We instantiate the task with four KEs, two large language models (LLMs), and two datasets. Additionally, we propose using hidden state representations and probability distributions as features for the detection model. Our results reveal that using these features as inputs to a simple AdaBoost classifier establishes a strong baseline. This baseline classifier requires a small amount of training data and maintains its performance even in cross-domain settings. Our work lays the groundwork for addressing potential malicious model editing, which is a critical challenge associated with the strong generative capabilities of LLMs. 2025.naacl-long.492 @@ -6255,7 +6255,7 @@ Grammar Control in Dialogue Response Generation for Language Learning Chatbots DominikGlandorfEPFL - EPF Lausanne PengCuiETHZ - ETH Zurich - DetmarMeurersEberhard-Karls-Universität Tübingen + DetmarMeurersEberhard-Karls-Universität Tübingen MrinmayaSachanSwiss Federal Institute of Technology 9820-9839 Chatbots based on large language models offer cheap conversation practice opportunities for language learners. However, they are hard to control for linguistic forms that correspond to learners’ current needs, such as grammar. We control grammar in chatbot conversation practice by grounding a dialogue response generation model in a pedagogical repository of grammar skills. We also explore how this control helps learners to produce specific grammar. We comprehensively evaluate prompting, fine-tuning, and decoding strategies for grammar-controlled dialogue response generation. Strategically decoding Llama3 outperforms GPT-3.5 when tolerating minor response quality losses. Our simulation predicts grammar-controlled responses to support grammar acquisition adapted to learner proficiency. Existing language learning chatbots and research on second language acquisition benefit from these affordances. Code available on GitHub. @@ -6269,9 +6269,9 @@ WanlongLiu NicolasGarneau YongCao - WenyuChen - HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore - DanielHershcovichUniversity of Copenhagen + WenyuChen + HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore + DanielHershcovichUniversity of Copenhagen 9840-9867 Recent studies have highlighted the presence of cultural biases in Large Language Models (LLMs), yet often lack a robust methodology to dissect these phenomena comprehensively. Our work aims to bridge this gap by delving into the Food domain—a universally relevant yet culturally diverse aspect of human life. We introduce FmLAMA, a multilingual dataset centered on food-related cultural facts and variations in food practices. We analyze LLMs across various architectures and configurations, evaluating their performance in both monolingual and multilingual settings. By leveraging templates in six different languages, we investigate how LLMs interact with language-specific and cultural knowledge.
Our findings reveal that (1) LLMs demonstrate a pronounced bias towards food knowledge prevalent in the United States; (2) Incorporating relevant cultural context significantly improves LLMs’ ability to access cultural knowledge; (3) The efficacy of LLMs in capturing cultural nuances is highly dependent on the interplay between the probing language, the specific model architecture, and the cultural context in question. This research underscores the complexity of integrating cultural understanding into LLMs and emphasizes the importance of culturally diverse datasets to mitigate biases and enhance model performance across different cultural domains. 2025.naacl-long.496 @@ -6281,10 +6281,10 @@ Palette of Language Models: A Solver for Controlled Text Generation ZheYangChina Mobile Research Institute YiHuangChina Mobile Research Institute - YaqinChen + YaqinChen XiaotingWuXiaotingWu - JunlanFeng - ChaoDengChina Mobile Research Institute + JunlanFeng + ChaoDengChina Mobile Research Institute 9868-9881 Recent advancements in large language models have revolutionized text generation with their remarkable capabilities. These models can produce controlled texts that closely adhere to specific requirements when prompted appropriately. However, designing an optimal prompt to control multiple attributes simultaneously can be challenging. A common approach is to linearly combine single-attribute models, but this strategy often overlooks attribute overlaps and can lead to conflicts. Therefore, we propose a novel combination strategy inspired by the Law of Total Probability and Conditional Mutual Information Minimization on generative language models. This method has been adapted for the single-attribute control scenario and is termed the Palette of Language Models due to its theoretical linkage between attribute strength and generation style, akin to blending colors on an artist’s palette. Moreover, positive correlation and attribute enhancement are advanced as theoretical properties to guide a rational combination strategy design. We conduct experiments on both single control and multiple control settings, and achieve superior results. 2025.naacl-long.497 @@ -6293,8 +6293,8 @@ <fixed-case>MAMM</fixed-case>-Refine: A Recipe for Improving Faithfulness in Generation with Multi-Agent Collaboration DavidWanDepartment of Computer Science, University of North Carolina at Chapel Hill - JustinChen - EliasStengel-Eskin + JustinChen + EliasStengel-Eskin MohitBansalUniversity of North Carolina at Chapel Hill 9882-9901 Multi-agent collaboration among models has shown promise in reasoning tasks but is underexplored in long-form generation tasks like summarization and question-answering. We extend multi-agent multi-model reasoning to generation, specifically to improving faithfulness through refinement, i.e., revising model-generated outputs to remove factual inconsistencies. We investigate how iterative collaboration among multiple instances and types of large language models (LLMs) enhances subtasks in the refinement process, such as error detection, critiquing unfaithful sentences, and making corrections based on critiques. We design intrinsic evaluations for each subtask, with our findings indicating that both multi-agent (multiple instances) and multi-model (diverse LLM types) approaches benefit error detection and critiquing. Additionally, reframing critiquing and refinement as reranking rather than generation tasks improves multi-agent performance.
We consolidate these insights into a final “recipe” called **M**ulti-**A**gent **M**ulti-**M**odel **Refine**ment (MAMM-Refine), where multi-agent and multi-model collaboration significantly boosts performance on three summarization datasets as well as on long-form question answering, demonstrating the effectiveness and generalizability of our recipe. Our code is publicly available. @@ -6306,7 +6306,7 @@ JunqingHeInternational Digital Economy Academy LiangZhu RuiWangInternational Digital Economy Academy, International Digital Economy Academy - XiWangUniversity of Sheffield + XiWangUniversity of Sheffield GholamrezaHaffariMonash University, Monash University and Monash University JiaxingZhangIDEA 9902-9921 @@ -6317,7 +6317,7 @@ Assessing the State of the Art in Scene Segmentation AlbinZeheUniversity of Würzburg - ElisabethFischerBayerische Julius-Maximilians-Universität Würzburg + ElisabethFischerBayerische Julius-Maximilians-Universität Würzburg AndreasHothoBayerische Julius-Maximilians-Universität Würzburg 9922-9941 The detection of scenes in literary texts is a recently introduced segmentation task in computational literary studies. Its goal is to partition a fictional text into segments that are coherent across the dimensions time, space, action and character constellation. This task is very challenging for automatic methods, since it requires a high-level understanding of the text. In this paper, we provide a thorough analysis of the State of the Art and challenges in this task, identifying and solving a problem in the training procedure for previous approaches, analysing the generalisation capabilities of the models and comparing the BERT-based SotA to current Llama models, as well as providing an analysis of what causes errors in the models. Our change in training procedure provides a significant increase in performance. We find that Llama-based models are more robust to different types of texts, while their overall performance is slightly worse than that of BERT-based models. @@ -6327,9 +6327,9 @@ <fixed-case>DCE</fixed-case>-<fixed-case>LLM</fixed-case>: Dead Code Elimination with Large Language Models MinyuChenShanghai Jiaotong University - GuoqiangLiShanghai Jiao Tong University - Ling-IWu - RuibangLiu + GuoqiangLiShanghai Jiao Tong University + Ling-IWu + RuibangLiu 9942-9955 Dead code introduces several challenges in software development, such as increased binary size and maintenance difficulties. It can also obscure logical errors and be exploited for obfuscation in malware. For LLM-based code-related tasks, dead code introduces vulnerabilities that can mislead these models, raising security concerns. Although modern compilers and IDEs offer dead code elimination, sophisticated patterns can bypass these tools. A universal approach that includes classification, location, explanation, and correction is needed, yet current tools often require significant manual effort. We present DCE-LLM, a framework for automated dead code elimination using a small CodeBERT model with an attribution-based line selector to efficiently locate suspect code. LLMs, fine-tuned on a large-scale, annotated dead code dataset, then generate judgments and explanations to provide detailed explanations and patches. DCE-LLM outperforms existing tools, with advanced unreachability detection, automated correction, and support for multiple programming languages. Experimental results show DCE-LLM achieves over 94% F1 scores for unused and unreachable code, significantly surpassing GPT-4o by 30%.
2025.naacl-long.501 @@ -6339,10 +6339,10 @@ Instruct-of-Reflection: Enhancing Large Language Models Iterative Reflection Capabilities via Dynamic-Meta Instruction LipingLiu ChunhongZhang - LikangWuTianjin University and Tianjin University - ChuangZhao - ZhengHu - MingHeLenovo Group Limited + LikangWuTianjin University and Tianjin University + ChuangZhao + ZhengHu + MingHeLenovo Group Limited JianpingFanAI Lab at Lenovo Research, Hangzhou Dianzi University and Northwest University 9956-9978 Self-reflection for Large Language Models (LLMs) has gained significant attention. Existing approaches involve models iterating and improving their previous responses based on LLMs’ internal reflection ability or external feedback. However, recent research has raised doubts about whether intrinsic self-correction without external feedback may even degrade performance. Based on our empirical evidence, we find that current static reflection methods may lead to redundant, drift, and stubborn issues. To mitigate this, we introduce **I**nstruct-**o**f-**R**eflec**t**ion (**IoRT**), a novel and general reflection framework that leverages dynamic-meta instruction to enhance the iterative reflection capability of LLMs. Specifically, we propose the instructor which, driven by the meta-thoughts and self-consistency classifier, generates various instructions, including refresh, stop, and select, to guide the next reflection iteration. Our experiments demonstrate that IoRT achieves an average improvement of 10.1% over established baselines in mathematical and commonsense reasoning tasks, highlighting its efficacy and applicability. Our code is available at https://github.com/llp635/IoRT. @@ -6359,8 +6359,8 @@ JunhwaChoiSamsung SDS SeonghoJoeSamsung TaeheeLeeSamsung SDS - YoungjuneGwonSamsung SDS - SungrohYoonSeoul National University + YoungjuneGwonSamsung SDS + SungrohYoonSeoul National University 9979-10001 A binary decision task, like yes-no questions or answer verification, reflects a significant real-world scenario in which users look for confirmation about the correctness of their decisions on specific issues. In this work, we observe that language models exhibit a negative bias in the binary decisions of complex reasoning tasks. Based on our observations and the rationale about attention-based model dynamics, we propose a negative attention score (NAS) to systematically and quantitatively formulate negative bias. Based on NAS, we identify attention heads that attend to negative tokens provided in the instructions as answer candidates for binary decisions, regardless of the question in the prompt, and validate their association with the negative bias. Additionally, we propose the negative attention score alignment (NASA) method, which is a parameter-efficient fine-tuning technique to address the extracted negatively biased attention heads. Experimental results from various domains of reasoning tasks and large model search space demonstrate that NASA significantly reduces the gap between precision and recall caused by negative bias while preserving their generalization abilities.
2025.naacl-long.503 @@ -6373,8 +6373,8 @@ LanyuChen JingyuLi HaojingChenUniversity of Electronic Science and Technology of China - VictorGutierrez BasultoCardiff University - Jeff Z.PanUniversity of Edinburgh, University of Edinburgh + VictorGutierrez BasultoCardiff University + Jeff Z.PanUniversity of Edinburgh, University of Edinburgh HanjieChenRice University 10002-10039 **Multimodal Chain of Thought (MCoT)** is a popular prompting strategy for improving the performance of multimodal large language models (MLLMs) across a range of complex reasoning tasks. Despite its popularity, there is a notable absence of automated methods for evaluating the quality of reasoning steps in MCoT. To address this gap, we propose **Multimodal Chain-of-Thought Evaluation (MiCEval)**, a framework designed to assess the correctness of reasoning chains by evaluating the quality of both the description and each reasoning step. The evaluation of the description component focuses on the accuracy of the image descriptions, while the reasoning step evaluation assesses the quality of each step as it is conditionally generated based on the preceding steps. MiCEval is built upon a fine-grained dataset with annotations that rate each step according to correctness, relevance, and informativeness. Extensive experiments on four state-of-the-art MLLMs show that step-wise evaluations using MiCEval align more closely with human judgments compared to existing methods based on cosine similarity or fine-tuning approaches. MiCEval datasets and code can be found at: [https://anonymous_github/MicEval](https://anonymous.4open.science/r/MiCEval-847F/README.md). @@ -6385,13 +6385,13 @@ <fixed-case>C</fixed-case>artesian<fixed-case>M</fixed-case>o<fixed-case>E</fixed-case>: Boosting Knowledge Sharing among Experts via <fixed-case>C</fixed-case>artesian Product Routing in Mixture-of-Experts ZhenpengSu XingW - ZijiaLinKuaishou Technology - YizheXiongSchool of Software, Tsinghua University + ZijiaLinKuaishou Technology + YizheXiongSchool of Software, Tsinghua University MinxuanLv - GuangyuanMa - HuiChenTsinghua University, Tsinghua University + GuangyuanMa + HuiChenTsinghua University, Tsinghua University SonglinHu - GuiguangDingTsinghua University + GuiguangDingTsinghua University 10040-10055 Large language models (LLMs) have been attracting much attention from the community recently, due to their remarkable performance in all kinds of downstream tasks. According to the well-known scaling law, scaling up a dense LLM enhances its capabilities, but also significantly increases the computational complexity. Mixture-of-Experts (MoE) models address that by allowing the model size to grow without substantially raising training or inference costs. Yet MoE models face challenges regarding knowledge sharing among experts, making their performance somewhat sensitive to routing accuracy. To tackle that, previous works introduced shared experts and combined their outputs with those of the top K routed experts in an additive manner. In this paper, inspired by collective matrix factorization to learn shared knowledge among data, we propose CartesianMoE, which implements more effective knowledge sharing among experts in a more multiplicative manner. Extensive experimental results indicate that CartesianMoE outperforms previous MoE models for building LLMs, in terms of both perplexity and downstream task performance. We also find that CartesianMoE achieves better expert routing robustness.
2025.naacl-long.505 @@ -6400,8 +6400,8 @@ Measuring and Benchmarking Large Language Models’ Capabilities to Generate Persuasive Language Amalie BrogaardPauliAarhus University - IsabelleAugensteinUniversity of Copenhagen - IraAssentAarhus University + IsabelleAugensteinUniversity of Copenhagen + IraAssentAarhus University 10056-10075 We are exposed to much information trying to influence us, such as teaser messages, debates, politically framed news, and propaganda — all of which use persuasive language. With the recent interest in Large Language Models (LLMs), we study the ability of LLMs to produce persuasive text. As opposed to prior work which focuses on particular domains or types of persuasion, we conduct a general study across various domains to measure and benchmark to what degree LLMs produce persuasive language - both when explicitly instructed to rewrite text to be more or less persuasive and when only instructed to paraphrase. We construct the new dataset Persuasive-Pairs of pairs of a short text and its rewrite by an LLM to amplify or diminish persuasive language. We multi-annotate the pairs on a relative scale for persuasive language: a valuable resource in itself, and for training a regression model to score and benchmark persuasive language, including for new LLMs across domains. In our analysis, we find that different ‘personas’ in LLaMA3’s system prompt change persuasive language substantially, even when only instructed to paraphrase. 2025.naacl-long.506 @@ -6412,7 +6412,7 @@ SshubamVerma Mohammed Safi Ur RahmanKhanIndian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology and Indian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology VishwajeetKumarInternational Business Machines - RudraMurthyIBM India Ltd + RudraMurthyIBM India Ltd JaydeepSen 10076-10132 Evaluating Large Language Models (LLMs) in low-resource and linguistically diverse languages remains a significant challenge in NLP, particularly for languages using non-Latin scripts like those spoken in India. Existing benchmarks predominantly focus on English, leaving substantial gaps in assessing LLM capabilities in these languages. We introduce MILU, a Multi-task Indic Language Understanding Benchmark, a comprehensive evaluation benchmark designed to address this gap. MILU spans 8 domains and 41 subjects across 11 Indic languages, reflecting general and culturally specific knowledge. With an India-centric design, MILU incorporates material from regional and state-level examinations, covering topics such as local history, arts, festivals, and laws, alongside standard subjects like science and mathematics. We evaluate over 42 LLMs, and find that current LLMs struggle with MILU, with GPT-4o achieving the highest average accuracy at 74 percent. Open multilingual models outperform language-specific fine-tuned models, which perform only slightly better than random baselines. Models also perform better in high-resource languages as compared to low-resource ones. Domain-wise analysis indicates that models perform poorly in culturally relevant areas like Arts and Humanities, Law and Governance compared to general fields like STEM. To the best of our knowledge, MILU is the first benchmark of its kind focused on Indic languages, serving as a crucial step towards comprehensive cultural evaluation. All code, benchmarks, and artifacts are publicly available to foster open research.
@@ -6421,10 +6421,10 @@ <fixed-case>A</fixed-case>uto<fixed-case>E</fixed-case>val-<fixed-case>T</fixed-case>o<fixed-case>D</fixed-case>: Automated Evaluation of Task-oriented Dialog Systems - ArihantJainAmazon + ArihantJainAmazon PuravAggarwalAmazon RishavSahayAmazon - ChaoshengDongAmazon + ChaoshengDongAmazon AnoopSaladiAmazon 10133-10148 Task-oriented Dialog systems (ToD) are essential in automating user interactions, but their complex design and dynamic nature make evaluation particularly challenging. Current evaluation methodologies heavily depend on human annotators, which can be inefficient, subjective, and expensive to scale. To advance the field, there is a pressing need for a reliable, scalable, and systematic evaluation framework that can provide comprehensive insights into ToD system performance. In this paper, we propose AutoEval-TOD, an automated end-to-end evaluation framework using large language models (LLMs). Our framework first interacts with the ToD system and then assesses its performance across key dimensions by analyzing both the ToD’s responses and internal states. We validate our approach by applying it to multiple ToD systems, highlighting its adaptability and potential for widespread use in both research and industrial settings. @@ -6443,14 +6443,14 @@ Logic-of-Thought: Injecting Logic into Contexts for Full Reasoning in Large Language Models - TongxuanLiuJD.com + TongxuanLiuJD.com WenjiangXuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences WeizheHuangUniversity of Science and Technology of China YutingZengUniversity of Science and Technology of China JiaxingWangJD.com XingyuWangInstitute of automation, Chinese academy of science HailongYang - JingLi + JingLi 10168-10185 Large Language Models (LLMs) have demonstrated remarkable capabilities across various tasks, but their performance in complex logical reasoning tasks remains unsatisfactory. Although some prompting methods, such as Chain-of-Thought, can improve the reasoning ability of LLMs to some extent, they suffer from an unfaithfulness issue, where derived conclusions may not align with the generated reasoning chain. To address this issue, some studies employ the approach of propositional logic to further enhance the logical reasoning abilities of LLMs. However, potential omissions in the extraction of logical expressions in these methods can cause information loss in the logical reasoning process, thereby generating incorrect results. To this end, we propose Logic-of-Thought (LoT) prompting, which employs propositional logic to generate expanded logical information descriptions and utilizes them as an additional augmentation to the original contexts, thereby ensuring information completeness and enhancing logical reasoning ability. LoT is orthogonal to existing prompting methods and can be seamlessly integrated with them. Extensive experiments demonstrate that LoT boosts the performance of various prompting methods by a striking margin across five logical reasoning tasks. In particular, LoT enhances Chain-of-Thought’s performance on the ReClor dataset by +4.35%, improves Chain-of-Thought with Self-Consistency’s performance on the RuleTaker dataset by +3.52%, and boosts the performance of Tree-of-Thoughts on the ProofWriter dataset by +8%.
2025.naacl-long.510 @@ -6460,7 +6460,7 @@ <fixed-case>IFIR</fixed-case>: A Comprehensive Benchmark for Evaluating Instruction-Following in Expert-Domain Information Retrieval TingyuSongUniversity of the Chinese Academy of Sciences GuoGan - MingshengShangCIGIT + MingshengShangCIGIT YilunZhaoYale University 10186-10204 We introduce IFIR, the first comprehensive benchmark designed to evaluate instruction-following information retrieval (IR) in expert domains. IFIR includes 2,426 high-quality examples and covers eight subsets across four specialized domains: finance, law, healthcare, and science literature. Each subset addresses one or more domain-specific retrieval tasks, replicating real-world scenarios where customized instructions are critical. IFIR enables a detailed analysis of instruction-following retrieval capabilities by incorporating instructions at different levels of complexity. We also propose a novel LLM-based evaluation method to provide a more precise and reliable assessment of model performance in following instructions. Through extensive experiments on 15 frontier retrieval models, including those based on LLMs, our results reveal that current models face significant challenges in effectively following complex, domain-specific instructions. We further provide in-depth analyses to highlight these limitations, offering valuable insights to guide future advancements in retriever development. @@ -6469,12 +6469,12 @@ <fixed-case>QAVA</fixed-case>: Query-Agnostic Visual Attack to Large Vision-Language Models - YudongZhang - RuobingXie + YudongZhang + RuobingXie JianshengChenUniversity of Science and Technology Beijing - XingwuSunTencent AI Platform - ZhanhuiKang - YuWangTsinghua University, Tsinghua University + XingwuSunTencent AI Platform + ZhanhuiKang + YuWangTsinghua University, Tsinghua University 10205-10218 In typical multimodal tasks, such as Visual Question Answering (VQA), adversarial attacks targeting a specific image and question can lead large vision-language models (LVLMs) to provide incorrect answers. However, it is common for a single image to be associated with multiple questions, and LVLMs may still answer other questions correctly even for an adversarial image attacked by a specific question. To address this, we introduce the query-agnostic visual attack (QAVA), which aims to create robust adversarial examples that generate incorrect responses to unspecified and unknown questions. Compared to traditional adversarial attacks focused on specific images and questions, QAVA significantly enhances the effectiveness and efficiency of attacks on images when the question is unknown, achieving performance comparable to attacks on known target questions. Our research broadens the scope of visual adversarial attacks on LVLMs in practical settings, uncovering previously overlooked vulnerabilities, particularly in the context of visual adversarial threats. The code is available at https://github.com/btzyd/qava. 2025.naacl-long.512 @@ -6485,9 +6485,9 @@ JieHe YijunYangEdinburgh University, University of Edinburgh WanqiuLong - DeyiXiongTianjin University - VictorGutierrez BasultoCardiff University - Jeff Z.PanUniversity of Edinburgh, University of Edinburgh + DeyiXiongTianjin University + VictorGutierrez BasultoCardiff University + Jeff Z.PanUniversity of Edinburgh, University of Edinburgh 10219-10244 Large language models (LLMs) have demonstrated immense potential across various tasks. 
However, research on exploring and improving the capabilities of LLMs in interpreting graph structures remains limited. To address this gap, we conduct a comprehensive evaluation of prompting current open-source LLMs on graph-to-text generation tasks. Although we explored the optimal prompting strategies and proposed a novel and effective diversity-difficulty-based few-shot sample selection method, we found that the improvements from tuning-free approaches were incremental, as LLMs struggle with planning on complex graphs, particularly those with a larger number of triples. To further improve LLMs in planning with graph sequences and grounding in truth, we introduce a new graph-to-text dataset, PlanGTG, annotated with two sub-tasks: reordering and attribution. Through extensive automatic and human evaluations, we demonstrate significant improvements in the quality of generated text from both few-shot learning and fine-tuning perspectives using the PlanGTG dataset. Our study paves the way for new research directions in graph-to-text generation. 2025.naacl-long.513 The Plagiarism Singularity Conjecture - SriramRanga - RuiMao - ErikCambriaNanyang Technological University + SriramRanga + RuiMao + ErikCambriaNanyang Technological University AnupamChattopadhyayNanyang Technological University 10245-10255 2025.naacl-long.514 Ensembling Large Language Models with Process Reward-Guided Tree Search for Better Complex Reasoning SungjinPark - XiaoLiuMicrosoft Research Asia + XiaoLiuMicrosoft Research Asia YeyunGong EdwardChoiKorea Advanced Institute of Science and Technology 10256-10277 @@ -6526,9 +6526,9 @@ Soft Language Prompts for Language Transfer - IvanVykopalKempelen Institute of Intelligent Technologies and Brno University of Technology - SimonOstermannGerman Research Center for AI - MarianSimkoKempelen Institute of Intelligent Technologies + IvanVykopalKempelen Institute of Intelligent Technologies and Brno University of Technology + SimonOstermannGerman Research Center for AI + MarianSimkoKempelen Institute of Intelligent Technologies 10294-10313 Cross-lingual knowledge transfer, especially between high- and low-resource languages, remains challenging in natural language processing (NLP). This study offers insights for improving cross-lingual NLP applications through the combination of parameter-efficient fine-tuning methods. We systematically explore strategies for enhancing cross-lingual transfer through the incorporation of language-specific and task-specific adapters and soft prompts. We present a detailed investigation of various combinations of these methods, exploring their efficiency across 16 languages, focusing on 10 mid- and low-resource languages. We further present, to our knowledge, the first use of soft prompts for language transfer, a technique we call soft language prompts. Our findings demonstrate that in contrast to claims of previous work, a combination of language and task adapters does not always work best; instead, combining a soft language prompt with a task adapter outperforms most configurations in many cases.
2025.naacl-long.517 @@ -6538,7 +6538,7 @@ <fixed-case>PICL</fixed-case>e: Pseudo-annotations for In-Context Learning in Low-Resource Named Entity Detection SepidehMamoolerSchool of Computer and Communication Sciences, EPFL - EPF Lausanne SyrielleMontariolEPFL - EPF Lausanne - AlexanderMathisEPFL - EPF Lausanne + AlexanderMathisEPFL - EPF Lausanne AntoineBosselutSwiss Federal Institute of Technology Lausanne 10314-10331 In-context learning (ICL) enables Large Language Models (LLMs) to perform tasks using few demonstrations, facilitating task adaptation when labeled examples are hard to come by. However, ICL is sensitive to the choice of demonstrations, and it remains unclear which demonstration attributes enable in-context generalization. In this work, we conduct a perturbation study of in-context demonstrations for low-resource Named Entity Detection (NED). Our surprising finding is that in-context demonstrations with partially-correct annotated entity mentions can be as effective for task transfer as fully correct demonstrations. Based on our findings, we propose Pseudo-annotated In-Context Learning (PICLe), a framework for in-context learning with noisy, pseudo-annotated demonstrations. PICLe leverages LLMs to annotate large quantities of demonstrations in a zero-shot first pass. We then cluster these synthetic demonstrations, sample specific sets of in-context demonstrations from each cluster, and predict entity mentions using each set independently. Finally, we use self-verification to select the final set of entity mentions. We extensively evaluate PICLe on five biomedical NED datasets and show that, with zero human annotation, PICLe outperforms ICL in low-resource settings where few gold examples can be used as in-context demonstrations. @@ -6547,9 +6547,9 @@ Can Large Language Models Invent Algorithms to Improve Themselves? - YoichiIshibashiNEC + YoichiIshibashiNEC TaroYanoNEC - MasafumiOyamadaNEC + MasafumiOyamadaNEC 10332-10363 Large Language Models (LLMs) have shown remarkable performance improvements and are rapidly gaining adoption in industry. However, the methods for improving LLMs are still designed by humans, which restricts the invention of new model-improving algorithms to human expertise and imagination. To address this, we propose the Self-Developing framework, which enables LLMs to autonomously generate and learn model-improvement algorithms. In this framework, the seed model generates, applies, and learns model-improving algorithms, continuously improving both the seed model and the algorithms themselves. Among model-improving strategies, we focus on model merging algorithms. In mathematical reasoning tasks, Self-Developing discovers novel merging strategies and outperforms human-designed methods. On GSM8k, the discovered algorithms improve the seed model by 6% and surpass human-designed methods by 4.3%. Moreover, they exhibit strong transferability, achieving a 7.4% performance gain on out-of-domain models. These results suggest that LLMs can autonomously develop effective model-improvement techniques beyond human intuition.
2025.naacl-long.519 @@ -6557,18 +6557,18 @@ Simulating Classroom Education with <fixed-case>LLM</fixed-case>-Empowered Agents - ZheyuanZhang - DanielZhang-Li - JifanYu - LinluGong + ZheyuanZhang + DanielZhang-Li + JifanYu + LinluGong JinchangZhouTsinghua University, Tsinghua University - ZhanxinHao - JianxiaoJiangTsinghua University - JieCao - HuiqinLiuTsinghua University, Tsinghua University - ZhiyuanLiuTsinghua University - LeiHouTsinghua University, Tsinghua University - JuanziLi + ZhanxinHao + JianxiaoJiangTsinghua University + JieCao + HuiqinLiuTsinghua University, Tsinghua University + ZhiyuanLiuTsinghua University + LeiHouTsinghua University, Tsinghua University + JuanziLi 10364-10379 Large language models (LLMs) have been applied across various intelligent educational tasks to assist teaching. While preliminary studies have focused on task-specific, independent LLM-empowered agents, the potential of LLMs within a multi-agent collaborative framework for classroom simulation with real user participation remains unexplored. In this work, we propose SimClass, a multi-agent classroom simulation teaching framework. We recognize representative class roles and introduce a novel class control mechanism for automatic classroom teaching, and conduct user experiments in two real-world courses. Using the Flanders Interactive Analysis System and Community of Inquiry theoretical frameworks from educational analysis, we demonstrate that LLMs can simulate a dynamic learning environment for users with active teacher-student and student-student interactions. We also observe group behaviors among agents in SimClass, where agents collaborate to create enlivening interactions in classrooms to improve the user learning process. We hope this work pioneers the application of LLM-empowered multi-agent systems in virtual classroom teaching. Our implementation and service can be found at https://github.com/THU-MAIC/SimClass. 2025.naacl-long.520 @@ -6576,9 +6576,9 @@ A Grounded Typology of Word Classes - ColemanHaleyUniversity of Edinburgh + ColemanHaleyUniversity of Edinburgh SharonGoldwaterUniversity of Edinburgh - EdoardoPontiUniversity of Edinburgh + EdoardoPontiUniversity of Edinburgh 10380-10399 In this work, we propose a grounded approach to meaning in language typology. Using images captioned across languages, we can treat the images as an empirical, language-agnostic representation of meaning, allowing the quantification of language function and semantics. Using principles from information theory, we define “groundedness”, an empirical measure of contextual semantic contentfulness which can be computed using multilingual (vision-and-)language models. As an initial application, we apply this measure to the typology of word classes. We find our measure captures the contentfulness asymmetry between functional (grammatical) and lexical (content) classes across languages, but contradicts the view that functional classes do not convey content. We release a dataset of groundedness scores for 30 languages. Our results suggest that the grounded typology approach can provide quantitative evidence about semantic function in language.
2025.naacl-long.521 @@ -6587,10 +6587,10 @@ <fixed-case>SSH</fixed-case>: Sparse Spectrum Adaptation via Discrete Hartley Transformation YixianShen - QiBiUniversity of Amsterdam - Jia-hongHuangUniversity of Amsterdam, King Abdullah University of Science and Technology and National Taiwan University + QiBiUniversity of Amsterdam + Jia-hongHuangUniversity of Amsterdam, King Abdullah University of Science and Technology and National Taiwan University HongyiZhu - Andy D.PimentelUniversity of Amsterdam + Andy D.PimentelUniversity of Amsterdam AnujPathaniaUniversity of Amsterdam 10400-10415 Low-rank adaptation (LoRA) has been demonstrated to be effective in reducing the number of trainable parameters when fine-tuning a large foundation model (LLM). However, it still encounters computational and memory challenges when scaling to larger models or addressing more complex task adaptation. In this work, we introduce **Sparse Spectrum Adaptation via Discrete Hartley Transformation (SSH)**, a novel approach that significantly reduces the number of trainable parameters while enhancing model performance. It selects the most informative spectral components across all layers, under the guidance of the initial weights after a discrete Hartley transformation (DHT). The lightweight inverse DHT then projects the spectrum back into the spatial domain for updates. Extensive experiments across both single-modality tasks—such as language understanding and generation—and multi-modality tasks—such as video-text understanding—demonstrate that SSH outperforms existing parameter-efficient fine-tuning (PEFT) methods while achieving substantial reductions in computational cost and memory requirements. For instance, during instruction tuning on the LLaMA3.1 8B model, SSH achieves higher accuracy with only 0.048M trainable parameters compared to LoRA’s 33.5M, while reducing computational intensity up to 55% compared to FourierFT. @@ -6599,11 +6599,11 @@ <fixed-case>LLM</fixed-case>-guided Plan and Retrieval: A Strategic Alignment for Interpretable User Satisfaction Estimation in Dialogue - SangyeopKimCoxwave and Seoul National University + SangyeopKimCoxwave and Seoul National University SohhyungParkSeoul National University - JaewonJungSeoul National University + JaewonJungSeoul National University JinseokKimSeoul National University - SungzoonChoSeoul National University + SungzoonChoSeoul National University 10416-10430 Understanding user satisfaction with conversational systems, known as User Satisfaction Estimation (USE), is essential for assessing dialogue quality and enhancing user experiences. However, existing methods for USE face challenges due to limited understanding of the underlying reasons for user dissatisfaction and the high costs of annotating user intentions. To address these challenges, we propose PRAISE (Plan and Retrieval Alignment for Interpretable Satisfaction Estimation), an interpretable framework for effective user satisfaction prediction. PRAISE operates through three key modules. The Strategy Planner develops strategies, which are natural language criteria for classifying user satisfaction. The Feature Retriever then incorporates knowledge on user satisfaction from Large Language Models (LLMs) and retrieves relevant features from utterances. Finally, the Score Analyzer evaluates strategy predictions and classifies user satisfaction. Experimental results demonstrate that PRAISE achieves state-of-the-art performance on three benchmarks for the USE task.
Beyond its superior performance, PRAISE offers additional benefits. It enhances interpretability by providing instance-level explanations through effective alignment of utterances with strategies. Moreover, PRAISE operates more efficiently than existing approaches by eliminating the need for LLMs during the inference phase. 2025.naacl-long.523 @@ -6614,7 +6614,7 @@ SuminAnKorea University JunyoungSungKorea University WonpyoParkGoogle - ChanjunParkKorea University + ChanjunParkKorea University Paul HongsuckSeoKorea University 10431-10442 While large language models (LLMs) excel in generating coherent and contextually rich outputs, their capacity to efficiently handle long-form contexts is limited by fixed-length position embeddings. Additionally, the computational cost of processing long sequences increases quadratically, making it challenging to extend context length. To address these challenges, we propose Long-form Context Injection with Recurrent Compression (LCIRC), a method that enables the efficient processing of long-form sequences beyond the model’s length limit through recurrent compression without retraining the entire model. We further introduce query dependent context modeling, which selectively compresses query-relevant information, ensuring that the model retains the most pertinent content. Our empirical results demonstrate that Query Dependent LCIRC (QD-LCIRC) significantly improves the LLM’s ability to manage extended contexts, making it well-suited for tasks that require both comprehensive context understanding and query relevance. @@ -6623,9 +6623,9 @@ A Template Is All You Meme - LukeBatesTechnische Universität Darmstadt + LukeBatesTechnische Universität Darmstadt Peter EbertChristensen - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + PreslavNakovMohamed bin Zayed University of Artificial Intelligence IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt 10443-10475 Templatic memes, characterized by a semantic structure adaptable to the creator’s intent, represent a significant yet underexplored area within the meme processing literature. With the goal of establishing a new direction for computational meme analysis, here we create a knowledge base composed of more than 5,200 meme templates, information about them, and 54,000 examples of template instances (templatic memes). To investigate the semantic signal of meme templates, we show that we can match memes in datasets to base templates contained in our knowledge base with a distance-based lookup. To demonstrate the power of meme templates, we create TSplit, a method to reorganize datasets, where a template or templatic instance can only appear in either the training or test split. Our re-split datasets enhance general meme knowledge and improve sample efficiency, leading to more robust models. Our examination of meme templates results in state-of-the-art performance for every dataset we consider, paving the way for analysis grounded in templateness. @@ -6635,8 +6635,8 @@ <fixed-case>LLM</fixed-case>s vs Established Text Augmentation Techniques for Classification: When do the Benefits Outweight the Costs?
JanCeginBrno University of Technology - JakubSimkoKempelen Institute of Intelligent Technologies - PeterBrusilovskyUniversity of Pittsburgh + JakubSimkoKempelen Institute of Intelligent Technologies + PeterBrusilovskyUniversity of Pittsburgh 10476-10496 Generative large language models (LLMs) are increasingly being used for data augmentation tasks, where text samples are LLM-paraphrased and then used for classifier fine-tuning. Previous studies have compared LLM-based augmentations with established augmentation techniques, but the results are contradictory: some report the superiority of LLM-based augmentations, while others report only marginal increases (and even decreases) in the performance of downstream classifiers. Research that would confirm a clear cost-benefit advantage of LLMs over more established augmentation methods is largely missing. To study if (and when) LLM-based augmentation is advantageous, we compared the effects of recent LLM augmentation methods with established ones on 6 datasets, 3 classifiers and 2 fine-tuning methods. We also varied the number of seeds and collected samples to better explore the downstream model accuracy space. Finally, we performed a cost-benefit analysis and show that LLM-based methods are worthy of deployment only when a very small number of seeds is used. Moreover, in many cases, established methods lead to similar or better model accuracies. 2025.naacl-long.526 @@ -6648,7 +6648,7 @@ AssafBen-KishMassachusetts Institute of Technology and Tel Aviv University YonatanBittonGoogle IdanSzpektorGoogle - RajaGiryesTel Aviv University + RajaGiryesTel Aviv University 10497-10518 Recent research increasingly focuses on training vision-language models (VLMs) with long, detailed image captions. However, small-scale VLMs often struggle to balance the richness of these captions with the risk of hallucinating content during fine-tuning. In this paper, we explore how well VLMs adapt to such captions. To quantify caption quality, we propose Decomposed NLI (DNLI), an evaluation framework that breaks down generated captions into individual propositions, assessing each in isolation. This fine-grained analysis reveals a critical balance between capturing descriptive details and preventing hallucinations. Our findings show that simply reducing caption complexity or employing standard data curation techniques does not effectively resolve this issue. To tackle this challenge, we introduce Knowledge Adapted (KnowAda) fine-tuning, a data-centric approach that automatically adapts training data with the model’s existing knowledge and visual understanding. KnowAda minimizes hallucinations while preserving high descriptiveness. We validate this approach across several small-scale VLMs (up to 7B parameters) and dense caption datasets, demonstrating that KnowAda effectively balances hallucination reduction and descriptiveness. Our results show that KnowAda outperforms various baselines in both automatic metrics and human evaluations. 2025.naacl-long.527 Self-Training Meets Consistency: Improving <fixed-case>LLM</fixed-case>s’ Reasoning with Consistency-Driven Rationale Evaluation - JaehyeokLeeSung Kyun Kwan University + JaehyeokLeeSung Kyun Kwan University KeisukeSakaguchiTohoku University - JinYeongBakSungKyunKwan University + JinYeongBakSungKyunKwan University 10519-10539 Self-training approaches for large language models (LLMs) improve reasoning abilities by training the models on their self-generated rationales.
Previous approaches have labeled rationales that produce correct answers for a given question as appropriate for training. However, a single measure risks misjudging rationale quality, leading the models to learn flawed reasoning patterns. To address this issue, we propose CREST (Consistency-driven Rationale Evaluation for Self-Training), a self-training framework that further evaluates each rationale through follow-up questions and leverages this evaluation to guide its training. Specifically, we introduce two methods: (1) filtering out rationales that frequently result in incorrect answers on follow-up questions and (2) preference learning based on mixed preferences from rationale evaluation results of both original and follow-up questions. Experiments on three question-answering datasets using open LLMs show that CREST not only improves the logical robustness and correctness of rationales but also improves reasoning abilities compared to previous self-training approaches. 2025.naacl-long.528 @@ -6674,9 +6674,9 @@ Evaluating Input Feature Explanations through a Unified Diagnostic Evaluation Framework - JingyiSunUniversity of Copenhagen - PepaAtanasovaUniversity of Copenhagen - IsabelleAugensteinUniversity of Copenhagen + JingyiSunUniversity of Copenhagen + PepaAtanasovaUniversity of Copenhagen + IsabelleAugensteinUniversity of Copenhagen 10559-10577 Explaining the decision-making process of machine learning models is crucial for ensuring their reliability and transparency for end users. One popular explanation form highlights key input features, such as i) tokens (e.g., Shapley Values and Integrated Gradients), ii) interactions between tokens (e.g., Bivariate Shapley and Attention-based methods), or iii) interactions between spans of the input (e.g., Louvain Span Interactions). However, these explanation types have only been studied in isolation, making it difficult to judge their respective applicability. To bridge this gap, we develop a unified framework that facilitates an automated and direct comparison between highlight and interactive explanations comprised of four diagnostic properties. We conduct an extensive analysis across these three types of input feature explanations – each utilizing three different explanation techniques–across two datasets and two models, and reveal that each explanation has distinct strengths across the different diagnostic properties. Nevertheless, interactive span explanations outperform other types of input feature explanations across most diagnostic properties. Despite being relatively understudied, our analysis underscores the need for further research to improve methods generating these explanation types. Additionally, integrating them with other explanation types that perform better in certain characteristics could further enhance their overall effectiveness. 2025.naacl-long.530 @@ -6703,16 +6703,16 @@ Mitigating Tail Narrowing in <fixed-case>LLM</fixed-case> Self-Improvement via Socratic-Guided Sampling - YiwenDing + YiwenDing ZhihengXi WeiHeFudan University LizhuoyuanLizhuoyuan YitaoZhaiMeituan - ShiXiaoweiMeituan + ShiXiaoweiMeituan XunliangCaiMeituan TaoGuiFudan University QiZhangFudan University - XuanjingHuangFudan University + XuanjingHuangFudan University 10627-10646 Self-improvement methods enable large language models (LLMs) to generate solutions themselves and iteratively train on filtered, high-quality rationales. 
This process proves effective and reduces the reliance on human supervision in LLMs’ reasoning, but the performance soon plateaus. We delve into the process and find that models tend to over-sample on easy queries and under-sample on queries they have yet to master. As iterations proceed, this imbalance in sampling is exacerbated, leading to a long-tail distribution where solutions to difficult queries almost diminish. This phenomenon limits the performance gain of self-improving models. A straightforward solution is brute-force sampling to balance the distribution, which significantly raises computational costs. In this paper, we introduce Guided Self-Improvement (GSI), a strategy aimed at improving the efficiency of sampling challenging heavy-tailed data. It leverages Socratic-style guidance signals to help LLM reasoning with complex queries, reducing the exploration effort and minimizing computational overhead. Experiments on four models across diverse mathematical tasks show that GSI strikes a balance between performance and efficiency, while also being effective on held-out tasks. 2025.naacl-long.533 @@ -6720,7 +6720,7 @@ <fixed-case>F</fixed-case>act<fixed-case>E</fixed-case>val: Evaluating the Robustness of Fact Verification Systems in the Era of Large Language Models - MamtaMamtaKing’s College London, University of London + MamtaMamtaKing’s College London, University of London OanaCocarascuKing’s College London 10647-10660 Whilst large language models (LLMs) have made significant advances in every natural language processing task, studies have shown that these models are vulnerable to small perturbations in the inputs, raising concerns about their robustness in the real-world. Given the rise of misinformation online and its significant impact on society, fact verification is one area in which assessing the robustness of models developed for this task is crucial. However, the robustness of LLMs in fact verification remains largely unexplored. In this paper, we introduce FactEval, a novel large-scale benchmark for extensive evaluation of LLMs in the fact verification domain covering 17 realistic word-level and character-level perturbations and 4 types of subpopulations. We investigate the robustness of several LLMs in zero-shot, few-shot, and chain-of-thought prompting. Our analysis using FEVER, one of the largest and most widely-used datasets for fact verification, reveals that LLMs are brittle to small input changes and also exhibit performance variations across different subpopulations. @@ -6730,7 +6730,7 @@ Analyzing Memorization in Large Language Models through the Lens of Model Attribution Tarun RamMentaAdobe Systems - SusmitAgrawal + SusmitAgrawal ChiragAgarwalUniversity of Virginia, Charlottesville 10661-10689 Large Language Models (LLMs) are prevalent in modern applications but often memorize training data, leading to privacy breaches and copyright issues. Existing research has mainly focused on post-hoc analyses—such as extracting memorized content or developing memorization metrics—without exploring the underlying architectural factors that contribute to memorization. In this work, we investigate memorization from an architectural lens by analyzing how attention modules at different layers impact its memorization and generalization performance. Using attribution techniques, we systematically intervene in the LLM’s architecture by bypassing attention modules at specific blocks while keeping other components like layer normalization and MLP transformations intact. 
We provide theorems analyzing our intervention mechanism from a mathematical view, bounding the difference in layer outputs with and without our attributions. Our theoretical and empirical analyses reveal that attention modules in deeper transformer blocks are primarily responsible for memorization, whereas earlier blocks are crucial for the model’s generalization and reasoning capabilities. We validate our findings through comprehensive experiments on different LLM families (Pythia and GPT-Neo) and five benchmark datasets. Our insights offer a practical approach to mitigate memorization in LLMs while preserving their performance, contributing to safer and more ethical deployment in real-world applications. @@ -6742,7 +6742,7 @@ BingfengChenGuangdong University of Technology ShaobinShi YongqiLuo - BoyanXu + BoyanXu RuichuCaiGuangdong University of Technology ZhifengHaoShantou University 10690-10708 @@ -6762,12 +6762,12 @@ <fixed-case>MCQG</fixed-case>-<fixed-case>SR</fixed-case>efine: Multiple Choice Question Generation and Evaluation with Iterative Self-Critique, Correction, and Comparison Feedback - ZonghaiYaoUniversity of Massachusetts at Amherst + ZonghaiYaoUniversity of Massachusetts at Amherst AdityaParashar HuixueZhou - Won SeokJangUniversity of Massachusetts at Lowell + Won SeokJangUniversity of Massachusetts at Lowell Feiyun Ouyang - ZhichaoYangOptum AI + ZhichaoYangOptum AI HongYuColumbia University 10728-10777 Automatic question generation (QG) is essential for AI and NLP, particularly in intelligent tutoring, dialogue systems, and fact verification. Generating multiple-choice questions (MCQG) for professional exams, like the United States Medical Licensing Examination (USMLE), is particularly challenging, requiring domain expertise and complex multi-hop reasoning for high-quality questions. However, current large language models (LLMs) like GPT-4 struggle with professional MCQG due to outdated knowledge, hallucination issues, and prompt sensitivity, resulting in unsatisfactory quality and difficulty. To address these challenges, we propose MCQG-SRefine, an LLM self-refine-based (Critique and Correction) framework for converting medical cases into high-quality USMLE-style questions. By integrating expert-driven prompt engineering with iterative self-critique and self-correction feedback, MCQG-SRefine significantly enhances human expert satisfaction regarding both the quality and difficulty of the questions. Furthermore, we introduce an LLM-as-Judge-based automatic metric to replace the complex and costly expert evaluation process, ensuring reliable and expert-aligned assessments. @@ -6788,7 +6788,7 @@ RuichuCaiGuangdong University of Technology JunhaoLu ZhongjieChenGuangdong University of Technology - BoyanXu + BoyanXu ZhifengHaoShantou University 10790-10802 Zero-shot Named Entity Recognition (ZS-NER) aims to recognize entities in unseen domains without specific annotated data. A key challenge is handling missing entities while ensuring accurate type recognition, hindered by: 1) the pre-training assumption that each entity has a single type, overlooking diversity, and 2) insufficient contextual knowledge for type reasoning. To address this, we propose IRRA (Integrated Recall and Retrieval Augmentation), a novel two-stage framework leveraging large language model techniques. In the Recall Augmented Entity Extracting stage, we built a perturbed dataset to induce the model to exhibit missing or erroneous extracted entities. 
Based on this, we trained an enhanced model to correct these errors. This approach can improve the ZS-NER’s recall rate. In the Retrieval Augmented Type Correcting stage, we employ Retrieval-Augmented Generation techniques to locate entity-related unannotated contexts, with the additional contextual information significantly improving the accuracy of type correcting. Extensive evaluations demonstrate the state-of-the-art performance of our IRRA, with significant improvements in zero-shot cross-domain settings validated through both auto-evaluated metrics and analysis. Our implementation will be open-sourced at https://github.com/DMIRLAB-Group/IRRA. @@ -6797,13 +6797,13 @@ <fixed-case>KMI</fixed-case>: A Dataset of <fixed-case>K</fixed-case>orean Motivational Interviewing Dialogues for Psychotherapy - HyunjongKim + HyunjongKim SuyeonLeeIndustrial Engineering YeongjaeChoSeoul National University EunseoRyu YohanJoSeoul National University SuranSeongKorea Counseling Graduate University - SungzoonChoSeoul National University + SungzoonChoSeoul National University 10803-10828 The increasing demand for mental health services has led to the rise of AI-driven mental health chatbots, though challenges related to privacy, data collection, and expertise persist. Motivational Interviewing (MI) is gaining attention as a theoretical basis for boosting expertise in the development of these chatbots. However, existing datasets are showing limitations for training chatbots, leading to a substantial demand for publicly available resources in the field of MI and psychotherapy. These challenges are even more pronounced in non-English languages, where they receive less attention. In this paper, we propose a novel framework that simulates MI sessions enriched with the expertise of professional therapists. We train an MI forecaster model that mimics the behavioral choices of professional therapists and employ Large Language Models (LLMs) to generate utterances through prompt engineering. Then, we present KMI, the first synthetic dataset theoretically grounded in MI, containing 1,000 high-quality Korean Motivational Interviewing dialogues. Through an extensive expert evaluation of the generated dataset and the dialogue model trained on it, we demonstrate the quality, expertise, and practicality of KMI. We also introduce novel metrics derived from MI theory in order to evaluate dialogues from the perspective of MI. 2025.naacl-long.541 @@ -6824,7 +6824,7 @@ AndreiPanferov IvanIlinKing Abdullah University of Science and Technology HanGuoMassachusetts Institute of Technology - PeterRichtárikKing Abdullah University of Science and Technology (KAUST) + PeterRichtárikKing Abdullah University of Science and Technology (KAUST) DanAlistarh 10857-10886 Quantizing large language models has become a standard way to reduce their memory and computational costs. Typically, existing methods focus on breaking down the problem into individual layer-wise sub-problems, and minimizing per-layer error, measured via various metrics. Yet, this approach currently lacks theoretical justification and the metrics employed may be sub-optimal. In this paper, we present a “linearity theorem” establishing a direct relationship between the layer-wise reconstruction error and the model perplexity increase due to quantization.
This insight enables two novel applications: (1) a simple data-free LLM quantization method using Hadamard rotations and MSE-optimal grids, dubbed HIGGS, which outperforms all prior data-free approaches such as the extremely popular NF4 quantized format, and (2) an optimal solution to the problem of finding non-uniform per-layer quantization levels which match a given compression constraint, obtained by reduction to dynamic programming. On the practical side, we demonstrate improved accuracy-compression trade-offs on Llama-family models, advancing both data-free and non-uniform quantization for large language models. 2025.naacl-long.543 @@ -6836,7 +6836,7 @@ BadrAlKhamissiEPFL - EPF Lausanne GretaTuckuteMassachusetts Institute of Technology AntoineBosselutSwiss Federal Institute of Technology Lausanne - MartinSchrimpfEPFL - EPF Lausanne + MartinSchrimpfEPFL - EPF Lausanne 10887-10911 Large language models (LLMs) exhibit remarkable capabilities on not just language tasks, but also various tasks that are not linguistic in nature, such as logical reasoning and social inference. In the human brain, neuroscience has identified a core language system that selectively and causally supports language processing. We here ask whether similar specialization for language emerges in LLMs. We identify language-selective units within 18 popular LLMs, using the same localization approach that is used in neuroscience. We then establish the causal role of these units by demonstrating that ablating LLM language-selective units – but not random units – leads to drastic deficits in language tasks. Correspondingly, language-selective LLM units are more aligned to brain recordings from the human language system than random units. Finally, we investigate whether our localization method extends to other cognitive domains: while we find specialized networks in some LLMs for reasoning and social capabilities, there are substantial differences among models. These findings provide functional and causal evidence for specialization in large language models, and highlight parallels with the functional organization in the brain. 2025.naacl-long.544 @@ -6844,13 +6844,13 @@ <fixed-case>M</fixed-case>ix<fixed-case>LLM</fixed-case>: Dynamic Routing in Mixed Large Language Models - XinyuanWang + XinyuanWang YanchiLiuNEC-Labs WeiChengNEC-Labs XujiangZhaoNEC Labs America - ZhengzhangChenNEC Labs America + ZhengzhangChenNEC Labs America WenchaoYuUniversity of California, Los Angeles - YanjieFuArizona State University + YanjieFuArizona State University HaifengChen 10912-10922 Large Language Models (LLMs) have recently exhibited potential for artificial general intelligence; however, their usage is costly, with high response latency. Given mixed LLMs with their own strengths and weaknesses, LLM routing aims to identify the most suitable model for each query in the stream to maximize response quality and minimize cost and latency. However, the challenges involve: (1) dynamic trade-offs among quality, cost, and latency; (2) enabling continual learning in deployed systems; and (3) navigating a varying (e.g., new LLM addition or old LLM removal) set of LLM candidates over time. To bridge these gaps, we develop MixLLM, a dynamic contextual-bandit-based routing system for query-LLM assignment. Specifically, we first leverage query tags to enhance query embeddings for the routing task. Next, we design lightweight prediction models to estimate the response qualities and costs of queries over LLMs.
We then devise a meta-decision maker to choose the query-LLM assignments to best trade off response quality, cost, and latency. Finally, the system benefits from continual training, allowing it to adapt to evolving queries and user feedback over time. Our extensive experiments show that MixLLM achieves the best trade-offs in response quality, cost, and latency (97.25% of GPT-4’s quality at 24.18% of the cost under the time constraint). 2025.naacl-long.545 @@ -6861,7 +6861,7 @@ Continual Learning in Multilingual Sign Language Translation ShakibYazdaniGerman Research Center for AI Josef VanGenabithGerman Research Center for AI and Universität des Saarlandes - CristinaEspaña-BonetBarcelona Supercomputing Center and German Research Center for AI + CristinaEspaña-BonetBarcelona Supercomputing Center and German Research Center for AI 10923-10938 The field of sign language translation (SLT) is still in its infancy, as evidenced by the low translation quality, even when using deep learning approaches. Probably because of this, many common approaches in other machine learning fields have not been explored in sign language. Here, we focus on continual learning for multilingual SLT. We experiment with three continual learning methods and compare them to four more naive baseline and fine-tuning approaches. We work with four sign languages (ASL, BSL, CSL and DGS) and three spoken languages (Chinese, English and German). Our results show that incremental fine-tuning is the best performing approach both in terms of translation quality and transfer capabilities, and that continual learning approaches are not yet fully competitive given the current SOTA in SLT. 2025.naacl-long.546 @@ -6888,8 +6888,8 @@ <fixed-case>PORT</fixed-case>: Preference Optimization on Reasoning Traces SalemLahlouMohamed bin Zayed University of Artificial Intelligence - AbdalgaderAbubakerTechnology Innovation Institute and University of Khartoum - HakimHacidTII + AbdalgaderAbubakerTechnology Innovation Institute and University of Khartoum + HakimHacidTII 10989-11005 Preference optimization methods have been successfully applied to improve not only the alignment of large language models (LLMs) with human values, but also specific natural language tasks such as summarization and stylistic continuations. This paper proposes using preference optimization methods on Chain-of-Thought steps in order to improve the mathematical reasoning performances of language models. While the chosen answers are obtained from datasets that include reasoning traces, we propose two complementary schemes for generating rejected answers: weak LLM prompting, and digit corruption. Our approach leads to increased accuracy on the GSM8K and AQuA-RAT mathematical reasoning benchmarks for Falcon2-11B and Mistral-7B. Additionally, the improved abilities transfer to non-mathematical tasks, including the ARC benchmark and symbolic reasoning challenges. For example, our method can lead to relative increases of up to 8.47 and 18.73 in accuracy on the GSM8K and AQuA benchmarks respectively, without any extra annotations. This work suggests that the path towards better language reasoning abilities goes through spending resources on creating high-quality datasets of reasoning traces. 2025.naacl-long.549 @@ -6897,7 +6897,7 @@ Guiding Through Complexity: What Makes Good Supervision for Hard Reasoning Tasks?
- XuanHe + XuanHe DaYin NanyunPengUniversity of California, Los Angeles 11006-11046 @@ -6917,7 +6917,7 @@ A Systematic Examination of Preference Learning through the Lens of Instruction-Following - JoongwonKimPaul G. Allen School of Computer Science and Engineering, University of Washington + JoongwonKimPaul G. Allen School of Computer Science and Engineering, University of Washington AnirudhGoyalGoogle DeepMind AstonZhangMeta BoXiong @@ -6933,12 +6933,12 @@ Lived Experience Not Found: <fixed-case>LLM</fixed-case>s Struggle to Align with Experts on Addressing Adverse Drug Reactions from Psychiatric Medication Use - MohitChandra + MohitChandra SiddharthSriramanGeorgia Institute of Technology - GauravVermaGeorgia Institute of Technology + GauravVermaGeorgia Institute of Technology Harneet SinghKhanujaGeorgia Institute of Technology Jose SuarezCampayo - ZihangLi + ZihangLi Michael L.Birnbaum MunmunDe Choudhury 11083-11113 @@ -6952,7 +6952,7 @@ TusharKhotAllen Institute for Artificial Intelligence BhavanaDalvi MishraAllen Institute for Artificial Intelligence HarshitSuranaAllen Institute for Artificial Intelligence - JulianMcAuleyUniversity of California, San Diego, University of California, San Diego + JulianMcAuleyUniversity of California, San Diego, University of California, San Diego PeterClarkAllen Institute for Artificial Intelligence Bodhisattwa PrasadMajumderAllen Institute for Artificial Intelligence 11114-11134 @@ -6963,9 +6963,9 @@ <fixed-case>LLM</fixed-case>-Supported Natural Language to Bash Translation FinnianWestenfelder - ErikHembergMassachusetts Institute of Technology + ErikHembergMassachusetts Institute of Technology StephenMoskal - Una-MayO’Reilly + Una-MayO’Reilly SilviuChiricescuCharles Stark Draper 11135-11147 The Bourne-Again Shell (Bash) command-line interface for Linux systems has complex syntax and requires extensive specialized knowledge. Using the natural language to Bash command (NL2SH) translation capabilities of large language models (LLMs) for command composition circumvents these issues. However, the NL2SH performance of LLMs is difficult to assess due to inaccurate test data and unreliable heuristics for determining the functional equivalence of Bash commands. We present a manually verified test dataset of 600 instruction-command pairs and a training dataset of 40,939 pairs, increasing the size of previous datasets by 441% and 135%, respectively. Further, we present a novel functional equivalence heuristic that combines command execution with LLM evaluation of command outputs. Our heuristic can determine the functional equivalence of two Bash commands with 95% confidence, a 16% increase over previous heuristics. Evaluation of popular LLMs using our test dataset and heuristic demonstrates that parsing, in-context learning, in-weight learning and constrained decoding can improve NL2SH accuracy by up to 32%. Our findings emphasize the importance of dataset quality, execution-based evaluation and translation method for advancing NL2SH translation. 
Our code is available at https://github.com/westenfelder/NL2SH @@ -6974,7 +6974,7 @@ <fixed-case>REL</fixed-case>-<fixed-case>A</fixed-case>.<fixed-case>I</fixed-case>.: An Interaction-Centered Approach To Measuring Human-<fixed-case>LM</fixed-case> Reliance - KaitlynZhouStanford University + KaitlynZhouStanford University Jena D.HwangAllen Institute for Artificial Intelligence XiangRenUniversity of Southern California NouhaDziri @@ -6987,7 +6987,7 @@ Eliciting Critical Reasoning in Retrieval-Augmented Generation via Contrastive Explanations - LeonardoRanaldi + LeonardoRanaldi MarcoValentinoUniversity of Sheffield AndreFreitasIdiap Research Institute and University of Manchester 11168-11183 @@ -7011,8 +7011,8 @@ LaurentDubreuilCornell University ImaneTerhmina YunciSunCornell University - MatthewWilkensCornell University - Marten VanSchijndelCornell University + MatthewWilkensCornell University + Marten VanSchijndelCornell University 11208-11222 This study presents a novel approach to analyzing historical language change, focusing on the evolving semantics of the French term “indigène(s)” (“indigenous”) between 1825 and 1950. While existing approaches to measuring semantic change with contextual word embeddings (CWE) rely primarily on similarity measures or clustering, these methods may not be suitable for highly imbalanced datasets, and pose challenges for interpretation. For this reason, we propose an interpretable, feature-level approach to analyzing language change, which we use to trace the semantic evolution of “indigène(s)” over a 125-year period. Following recent work on sequence embeddings (O’Neill et al., 2024), we use k-sparse autoencoders (k-SAE) (Makhzani and Frey, 2013) to interpret over 210,000 CWEs generated using sentences sourced from the French National Library. We demonstrate that k-SAEs can learn interpretable features from CWEs, as well as how differences in feature activations across time periods reveal highly specific aspects of language change. In addition, we show that diachronic change in feature activation frequency reflects the evolution of French colonial legal structures during the 19th and 20th centuries. 2025.naacl-long.559 @@ -7022,9 +7022,9 @@ Planetarium: A Rigorous Benchmark for Translating Text to Structured Planning Languages MaxZuoBrown University Francisco PiedrahitaVelezBrown University - XiaochenLi - MichaelLittmanBrown University, Brown University, Brown University and Georgia Institute of Technology - StephenBachComputer Science Department, Brown University and Snorkel AI + XiaochenLi + MichaelLittmanBrown University, Brown University, Brown University and Georgia Institute of Technology + StephenBachComputer Science Department, Brown University and Snorkel AI 11223-11240 Recent works have explored using language models for planning problems. One approach examines translating natural language descriptions of planning tasks into structured planning languages, such as the planning domain definition language (PDDL). Existing evaluation methods struggle to ensure semantic correctness and rely on simple or unrealistic datasets. To bridge this gap, we introduce Planetarium, a benchmark designed to evaluate language models’ ability to generate PDDL code from natural language descriptions of planning tasks. 
Planetarium features a novel PDDL equivalence algorithm that flexibly evaluates the correctness of generated PDDL against ground truth, along with a dataset of 145,918 text-to-PDDL pairs across 73 unique state combinations with varying levels of difficulty. Finally, we evaluate several API-access and open-weight language models that reveal this task’s complexity. For example, 96.1% of the PDDL problem descriptions generated by GPT-4o are syntactically parseable, 94.4% are solvable, but only 24.8% are semantically correct, highlighting the need for a more rigorous benchmark for this problem. 2025.naacl-long.560 @@ -7042,7 +7042,7 @@ Using Text-Based Causal Inference to Disentangle Factors Influencing Online Review Ratings - LinsenLi + LinsenLi AronCulottaTulane University NicholasMatteiTulane University 11259-11277 @@ -7056,7 +7056,7 @@ ZhiqiBuAmazon BhanukiranVinzamuriAmazon AnilRamakrishnaAmazon - Kai-WeiChangUniversity of California, Los Angeles and Amazon + Kai-WeiChangUniversity of California, Los Angeles and Amazon VolkanCevherEPFL - EPF Lausanne and Amazon Development Center Germany MingyiHongAmazon and University of Minnesota, Minneapolis 11278-11294 @@ -7066,8 +7066,8 @@ <fixed-case>REFFLY</fixed-case>: Melody-Constrained Lyrics Editing Model - SongyanZhao - BingxuanLi + SongyanZhao + BingxuanLi YufeiTian NanyunPengUniversity of California, Los Angeles 11295-11315 @@ -7087,11 +7087,11 @@ <fixed-case>M</fixed-case>ulti<fixed-case>C</fixed-case>hart<fixed-case>QA</fixed-case>: Benchmarking Vision-Language Models on Multi-Chart Problems - ZifengZhu - MengzhaoJia + ZifengZhu + MengzhaoJia ZhihanZhang LangLi - MengJiangUniversity of Notre Dame + MengJiangUniversity of Notre Dame 11341-11359 Multimodal Large Language Models (MLLMs) have demonstrated impressive abilities across various tasks, including visual question answering and chart comprehension, yet existing benchmarks for chart-related tasks fall short in capturing the complexity of real-world multi-chart scenarios. Current benchmarks primarily focus on single-chart tasks, neglecting the multi-hop reasoning required to extract and integrate information from multiple charts, which is essential in practical applications. To fill this gap, we introduce MultiChartQA, a benchmark that evaluates MLLMs’ capabilities in four key areas: direct question answering, parallel question answering, comparative reasoning, and sequential reasoning. Our evaluation of a wide range of MLLMs reveals significant performance gaps compared to humans. These results highlight the challenges in multi-chart comprehension and the potential of MultiChartQA to drive advancements in this field. Our code and data are available at https://github.com/Zivenzhu/Multi-chart-QA. 2025.naacl-long.566 @@ -7103,7 +7103,7 @@ Eva MariaVecchiUniversity of Stuttgart, Universität Stuttgart CarlottaQuensel NeeleFalk - GabriellaLapesaGESIS – Leibniz Institute for the Social Sciences and Heinrich-Heine University Düsseldorf + GabriellaLapesaGESIS – Leibniz Institute for the Social Sciences and Heinrich-Heine University Düsseldorf 11360-11395 Moderation is essential for maintaining and improving the quality of online discussions. This involves: (1) countering negativity, e.g. hate speech and toxicity, and (2) promoting positive discourse, e.g. broadening the discussion to involve other users and perspectives. 
While significant efforts have focused on addressing negativity, driven by an urgency to address such issues, this has left moderation that promotes positive discourse (henceforth Positive Moderation) under-studied. With the recent advancements in LLMs, Positive Moderation can potentially be scaled to vast conversations, fostering more thoughtful discussions and bridging the increasing divide in online interactions. We advance the understanding of Positive Moderation by annotating a dataset on 13 moderation properties, e.g. neutrality, clarity and curiosity. We extract instructions from professional moderation guidelines and use them to prompt LLaMA to generate such moderation. This is followed by extensive evaluation showing that (1) annotators rate generated moderation higher than professional moderation, but still slightly prefer professional moderation in pairwise comparison, and (2) LLMs can be used to estimate human evaluation as an efficient alternative. 2025.naacl-long.567 @@ -7111,12 +7111,12 @@ Social Norms in Cinema: A Cross-Cultural Analysis of Shame, Pride and Prejudice - SunnyRaiSchool of Engineering and Applied Science, University of Pennsylvania + SunnyRaiSchool of Engineering and Applied Science, University of Pennsylvania KhushangZaveri ShreyaHavaldarUniversity of Pennsylvania SoumnaNema LyleUngar - Sharath ChandraGuntukuUniversity of Pennsylvania + Sharath ChandraGuntukuUniversity of Pennsylvania 11396-11415 Shame and pride are social emotions expressed across cultures to motivate and regulate people’s thoughts, feelings, and behaviors. In this paper, we introduce the first cross-cultural dataset of over 10k shame/pride-related expressions with underlying social expectations from ~5.4K Bollywood and Hollywood movies. We examine *how* and *why* shame and pride are expressed across cultures using a blend of psychology-informed language analysis combined with large language models. We find significant cross-cultural differences in shame and pride expression aligning with known cultural tendencies of the USA and India – e.g., in Hollywood, shame-expressions predominantly discuss *self* whereas shame is expressed toward *others* in Bollywood. Women are more sanctioned across cultures and for violating similar social expectations. 2025.naacl-long.568 @@ -7126,12 +7126,12 @@ The Stochastic Parrot on <fixed-case>LLM</fixed-case>’s Shoulder: A Summative Assessment of Physical Concept Understanding MoYuWeChat AI, Tencent LemaoLiuTencent - JunjieWuHKUST + JunjieWuHKUST Tsz TingChung ShunchiZhangJohns Hopkins University - JiangnanLiWeChat, Tencent Inc. - Dit-YanYeungHong Kong University of Science and Technology - JieZhou + JiangnanLiWeChat, Tencent Inc. + Dit-YanYeungHong Kong University of Science and Technology + JieZhou 11416-11431 In a systematic way, we investigate a widely asked question: Do LLMs really understand what they say?, which relates to the more familiar term Stochastic Parrot. To this end, we propose a summative assessment over a carefully designed physical concept understanding task, PhysiCo. Our task alleviates the memorization issue via the usage of grid-format inputs that abstractly describe physical phenomena. The grids represent varying levels of understanding, from the core phenomenon, to application examples, to analogies to other abstract patterns in the grid world.
A comprehensive study on our task demonstrates: (1) state-of-the-art LLMs, including GPT-4o, o1 and Gemini 2.0 flash thinking, lag behind humans by ∼40%; (2) the stochastic parrot phenomenon is present in LLMs, as they fail on our grid task but can describe and recognize the same concepts well in natural language; (3) our task challenges the LLMs due to intrinsic difficulties rather than the unfamiliar grid format, as in-context learning and fine-tuning on same formatted data added little to their performance. 2025.naacl-long.569 @@ -7139,8 +7139,8 @@ m<fixed-case>H</fixed-case>uman<fixed-case>E</fixed-case>val - A Multilingual Benchmark to Evaluate Large Language Models for Code Generation - NishatRaihan - AntoniosAnastasopoulos + NishatRaihan + AntoniosAnastasopoulos MarcosZampieri 11432-11461 Recent advancements in large language models (LLMs) have significantly enhanced code generation from natural language prompts. The HumanEval Benchmark, developed by OpenAI, remains the most widely used code generation benchmark. However, this and other Code LLM benchmarks face critical limitations, particularly in task diversity, test coverage, and linguistic scope. Current evaluations primarily focus on English-to-Python conversion tasks with limited test cases, potentially overestimating model performance. While recent works have addressed test coverage and programming language (PL) diversity, code generation from low-resource language prompts remains largely unexplored. To address this gap, we introduce mHumanEval, an extended benchmark supporting prompts in over 200 natural languages. We employ established machine translation methods to compile the benchmark, coupled with a quality assurance process. Furthermore, we provide expert human translations for 15 diverse natural languages (NLs). We conclude by analyzing the multilingual code generation capabilities of state-of-the-art (SOTA) Code LLMs, offering insights into the current landscape of cross-lingual code generation. @@ -7149,11 +7149,11 @@ What Do <fixed-case>VLM</fixed-case>s <fixed-case>NOTICE</fixed-case>? A Mechanistic Interpretability Pipeline for <fixed-case>G</fixed-case>aussian-Noise-free Text-Image Corruption and Evaluation - MichalGolovanevskyBrown University + MichalGolovanevskyBrown University WilliamRudman VedantPalit - CarstenEickhoffEberhard-Karls-Universität Tübingen - RitambharaSinghBrown University + CarstenEickhoffEberhard-Karls-Universität Tübingen + RitambharaSinghBrown University 11462-11482 Vision-Language Models (VLMs) have gained prominence due to their success in solving complex cross-modal tasks. However, the internal mechanisms of VLMs, particularly the roles of cross-attention and self-attention in multimodal integration, are not fully understood. To address this gap, we introduce NOTICE, a Gaussian-Noise-free Text-Image Corruption and Evaluation pipeline for mechanistic interpretability in VLMs. NOTICE introduces Semantic Image Pairs (SIP) corruption, the first visual counterpart to Symmetric Token Replacement (STR) for text. Through NOTICE, we uncover a set of “universal attention heads” in BLIP and LLaVA that consistently contribute across different tasks and modalities. In BLIP, cross-attention heads implement object detection, object suppression, and outlier suppression, whereas important self-attention heads in LLaVA only perform outlier suppression. 
Notably, our findings reveal that cross-attention heads perform image-grounding, while self-attention in LLaVA heads do not, highlighting key differences in how VLM architectures handle multimodal learning. 2025.naacl-long.571 @@ -7161,8 +7161,8 @@ Are explicit belief representations necessary? A comparison between Large Language Models and <fixed-case>B</fixed-case>ayesian probabilistic models - DingyiPanUniversity of California, San Diego - BenBergenUniversity of California, San Diego + DingyiPanUniversity of California, San Diego + BenBergenUniversity of California, San Diego 11483-11498 Large language models (LLMs) have exhibited certain indirect pragmatic capabilities, including interpreting indirect requests and non-literal meanings. Yet, it is unclear whether the success of LLMs on pragmatic tasks generalizes to phenomena that directly probe inferences about the beliefs of others. Indeed, LLMs’ performance on Theory of Mind (ToM) tasks is mixed. To date, the most successful computationally explicit approach to making inferences about others’ beliefs is the Rational Speech Act (RSA) framework, a Bayesian probabilistic model that encodes explicit representations of beliefs. In the present study, we ask whether LLMs outperform RSA in predicting human belief inferences, even though they do not explicitly encode belief representations. We focus specifically on projection inferences, a type of inference that directly probes belief attribution. We find that some LLMs are sensitive to factors that affect the inference process similarly to humans, yet there remains variance in human behavior not fully captured by LLMs. The RSA model, on the other hand, outperforms LLMs in capturing the variances in human data, suggesting that explicit belief representation might be necessary to construct human-like projection inferences. 2025.naacl-long.572 @@ -7170,7 +7170,7 @@ Self-Generated Critiques Boost Reward Modeling for Language Models - YueYuMeta + YueYuMeta ZhengxingChenFacebook AstonZhangMeta LiangTanFacebook @@ -7179,7 +7179,7 @@ YundiQianFacebook XueweiWangFacebook SuchinGururanganFacebook and University of Washington, Seattle - ChaoZhangGeorgia Institute of Technology + ChaoZhangGeorgia Institute of Technology MelanieKambadurFacebook DhruvMahajanMeta AI RuiHouMeta Inc. @@ -7206,11 +7206,11 @@ ZhenweiDaiAmazon YaochenXieAmazon XianfengTangAmazon - ChenLuoAmazon + ChenLuoAmazon YangLi Joyce C.HoEmory University - CarlYangEmory University - QiHeAmazon + CarlYangEmory University + QiHeAmazon 11534-11550 Retrieval-augmented generation (RAG) enhances the question answering (QA) abilities of large language models (LLMs) by integrating external knowledge. However, adapting general-purpose RAG systems to specialized fields such as science and medicine poses unique challenges due to distribution shifts and limited access to domain-specific data. To tackle this, we propose SimRAG, a self-training approach that equips LLMs with joint capabilities of question answering and question generation for domain adaptation. Our method first fine-tunes LLMs on instruction-following, question-answering, and search-related data. Then, it prompts LLMs to generate diverse domain-relevant questions from unlabeled corpora, with an additional filtering strategy to retain high-quality synthetic examples. By leveraging these synthetic examples, the LLMs can improve their performance on domain-specific RAG tasks. 
Experiments on 11 datasets across three different domains verify the efficacy of SimRAG over baselines by 1.2%–8.6%. 2025.naacl-long.575 @@ -7219,7 +7219,7 @@ Learning to Substitute Words with Model-based Score Ranking HongyeLiu - RicardoHenaoDuke University and King Abdullah University of Science and Technology + RicardoHenaoDuke University and King Abdullah University of Science and Technology 11551-11565 Smart word substitution aims to enhance sentence quality by improving word choices; however, current benchmarks rely on human-labeled data, which suffers from subjectivity and lacks diversity due to limitations in the number of annotators. Since word choices are inherently subjective, ground-truth word substitutions generated by a small group of annotators are often incomplete and likely not generalizable. To circumvent this issue, we instead employ model-based scoring (BARTScore) to quantify sentence quality, thus forgoing the need for human annotations. Specifically, we use this score to define a distribution for each word substitution, allowing one to test whether a substitution is statistically superior relative to others. Further, we propose a loss function that directly optimizes the alignment between model predictions and sentence scores, while also enhancing the overall quality score of a substitution. Crucially, model learning no longer requires human labels, thus avoiding the cost of annotation while maintaining the quality of the text modified with substitutions. Experimental results show that the proposed approach outperforms both masked language models (BERT, BART) and large language models (GPT-4, LLaMA). 2025.naacl-long.576 @@ -7227,7 +7227,7 @@ Multilingual Reasoning via Self-training - LeonardoRanaldi + LeonardoRanaldi GiuliaPucci 11566-11582 Although reasoning is innately language-agnostic, multilingual capacities remain a significant challenge for large language models (LLMs). Their ability to generate structured, step-wise explanations is consistently restricted to dominant languages in pre-training data, making cross-lingual generalisation difficult and hindering broader global adoption. Recent works have introduced eclectic strategies to improve reasoning beyond English; however, these methods remain tied to specific languages that are not always optimal for reasoning. To improve LLMs’ multilingual reasoning abilities, we propose a modular approach that instructs the models to structure reasoning passages in a different problem space and then self-refine their capabilities to deliver step-wise reasoning passages that lead to the solution. Experiments show that our approach stably achieves significant improvements in the multilingual reasoning of various models and tasks, with improved reasoning consistency across languages.
@@ -7238,19 +7238,19 @@ x<fixed-case>LAM</fixed-case>: A Family of Large Action Models to Empower <fixed-case>AI</fixed-case> Agent Systems JianguoZhangSalesForce AI Research TianLanSalesForce - MingZhuSalesForce.com - ZuxinLiuSalesforce AI Research + MingZhuSalesForce.com + ZuxinLiuSalesforce AI Research Thai QuocHoangSalesforce Research ShirleyKokaneSalesForce.com WeiranYaoSalesForce.com JuntaoTanSalesForce.com and Rutgers University - AksharaPrabhakarSalesforce Research + AksharaPrabhakarSalesforce Research HaolinChenSalesForce.com - ZhiweiLiuSalesforce AI Research + ZhiweiLiuSalesforce AI Research YihaoFengApple AI/ML Tulika ManojAwalgaonkarSalesForce.com RitheshR NSalesForce.com - ZeyuanChenSalesforce Inc + ZeyuanChenSalesforce Inc RanXuSalesForce.com Juan CarlosNieblesSalesforce Research and Stanford University ShelbyHeineckeSalesforce Research @@ -7264,13 +7264,13 @@ <fixed-case>P</fixed-case>ro<fixed-case>MQA</fixed-case>: Question Answering Dataset for Multimodal Procedural Activity Understanding - KimihiroHasegawa + KimihiroHasegawa WiradeeImrattanatraiAIST, National Institute of Advanced Industrial Science and Technology - Zhi-QiChengUniversity of Washington + Zhi-QiChengUniversity of Washington MasakiAsada SusanHolmCarnegie Mellon University YuranWangCMU, Carnegie Mellon University - KenFukudaAIST, National Institute of Advanced Industrial Science and Technology + KenFukudaAIST, National Institute of Advanced Industrial Science and Technology TerukoMitamuraCarnegie Mellon University 11598-11617 Multimodal systems have great potential to assist humans in procedural activities, where people follow instructions to achieve their goals. Despite diverse application scenarios, systems are typically evaluated on traditional classification tasks, e.g., action recognition or temporal action localization. In this paper, we present a novel evaluation dataset, ProMQA, to measure the advancement of systems in application-oriented scenarios. ProMQA consists of 401 multimodal procedural QA pairs on user recording of procedural activities, i.e., cooking, coupled with their corresponding instruction. For QA annotation, we take a cost-effective human-LLM collaborative approach, where the existing annotation is augmented with LLM-generated QA pairs that are later verified by humans. We then provide the benchmark results to set the baseline performance on ProMQA. Our experiment reveals a significant gap between human performance and that of current systems, including competitive proprietary multimodal models. We hope our dataset sheds light on new aspects of models’ multimodal understanding capabilities. 
@@ -7279,10 +7279,10 @@ Ethical Concern Identification in <fixed-case>NLP</fixed-case>: A Corpus of <fixed-case>ACL</fixed-case> <fixed-case>A</fixed-case>nthology Ethics Statements - AntoniaKaramolegkou + AntoniaKaramolegkou Sandrine SchillerHansenCopenhagen University - AriadniChristopoulouVerita International School - FilipposStamatiouCopenhagen University and University of Stellenbosch + AriadniChristopoulouVerita International School + FilipposStamatiouCopenhagen University and University of Stellenbosch AnneLauscherUniversität Hamburg AndersSøgaardCopenhagen University 11618-11635 @@ -7294,7 +7294,7 @@ <fixed-case>A</fixed-case>da<fixed-case>CAD</fixed-case>: Adaptively Decoding to Balance Conflicts between Contextual and Parametric Knowledge HanWangUniversity of North Carolina at Chapel Hill ArchikiPrasad - EliasStengel-Eskin + EliasStengel-Eskin MohitBansalUniversity of North Carolina at Chapel Hill 11636-11652 Knowledge conflict arises from discrepancies between information in the context of a large language model (LLM) and the knowledge stored in its parameters. This can hurt performance when using standard decoding techniques, which tend to ignore the context. Existing test-time contrastive methods seek to address this by comparing the LLM’s output distribution with and without the context and adjust the model according to the contrast between them. However, we find that these methods frequently misjudge the degree of conflict and struggle to handle instances that vary in their amount of conflict, with static methods over-adjusting when conflict is absent. We propose a fine-grained, instance-level approach called AdaCAD, which dynamically infers the weight of adjustment based on the degree of conflict, as measured by the Jensen-Shannon divergence between distributions representing contextual and parametric knowledge. Across four LLMs, six question-answering (QA) and three summarization datasets, we demonstrate that AdaCAD consistently outperforms other decoding baselines with average QA accuracy gains of 14.21% (absolute) over a static contrastive baseline, and improves the factuality of summaries by 6.19 (AlignScore). Lastly, we show that while contrastive baselines hurt performance when conflict is absent, AdaCAD mitigates these losses, making it more applicable to real-world datasets in which some examples have conflict and others do not. @@ -7315,11 +7315,11 @@ <fixed-case>LBC</fixed-case>: Language-Based-Classifier for Out-Of-Variable Generalization KangjunNohYonsei University - BaekryunSeong - HoyoonByunYonsei University - YoungjunChoi + BaekryunSeong + HoyoonByunYonsei University + YoungjunChoi SungjinSong - KyungwooSongYonsei University + KyungwooSongYonsei University 11666-11678 Large Language Models (LLMs) have great success in natural language processing tasks such as response generation. However, their use in tabular data has been limited due to their inferior performance compared to traditional machine learning models (TMLs) such as XGBoost. We find that the pre-trained knowledge of LLMs enables them to interpret new variables that appear in a test without additional training, a capability central to the concept of Out-of-Variable (OOV). From the findings, we propose a Language-Based-Classifier (LBC), a classifier that maximizes the benefits of LLMs to outperform TMLs on OOV tasks. 
LBC employs three key methodological strategies: 1) Categorical changes to adjust data to better fit the model’s understanding, 2) Advanced order and indicator to enhance data representation to the model, and 3) Using verbalizer to map logit scores to classes during inference to generate model predictions. These strategies, combined with the pre-trained knowledge of LBC, emphasize the model’s ability to effectively handle OOV tasks. We empirically and theoretically validate the superiority of LBC. LBC is the first study to apply an LLM-based model to OOV tasks. The source code is at https://github.com/ASDASDanonymous/Language-Based-Classifier-forOOVtasks. 2025.naacl-long.583 @@ -7365,9 +7365,9 @@ KangyuZhu ZiyuanQin HuahuiYi - ZekunJiang + ZekunJiang QichengLaoBeijing University of Posts and Telecommunications - ShaotingZhangShanghai Artificial Intelligence Laboratory + ShaotingZhangShanghai Artificial Intelligence Laboratory KangLi 11726-11739 While mainstream vision-language models (VLMs) have advanced rapidly in understanding image-level information, they still lack the ability to focus on specific areas designated by humans. Rather, they typically rely on large volumes of high-quality image-text paired data to learn and generate posterior attention maps. To address this critical issue, we propose leveraging visual prompts—simple visual markers in various forms—to guide and enhance the formation of region-specific attention. Thus, we introduce **MedVP**, a pioneering framework that integrates medical entity extraction, visual prompt generation, and dataset adaptation for visual prompt-guided fine-tuning. We successfully outperform recent state-of-the-art large models across multiple medical VQA datasets. Extensive experiments and Human evaluation are conducted to analyze the impact of different visual prompt forms and how they contribute to performance improvement. The results demonstrate both the effectiveness and clinical significance of our approach. @@ -7378,7 +7378,7 @@ Analyzing and Improving Coherence of Large Language Models in Question Answering IvanoLauriolaAmazon StefanoCampese - AlessandroMoschittiAmazon AGI + AlessandroMoschittiAmazon AGI 11740-11755 Large language models (LLMs) have recently revolutionized natural language processing. These models, however, often suffer from instability or lack of coherence, that is the ability of the models to generate semantically equivalent outputs when receiving diverse yet semantically equivalent input variations. In this work, we analyze the behavior of multiple LLMs, including Mixtral-8x7B, Llama2-70b, Smaug-72b, and Phi-3, when dealing with multiple lexical variations of the same info-seeking questions. Our results suggest that various LLMs struggle to consistently answer diverse equivalent queries. To address this issue, we show how redundant information encoded as a prompt can increase the coherence of these models. In addition, we introduce a Retrieval-Augmented Generation (RAG) technique that supplements LLMs with the top-k most similar questions from a question retrieval engine. This knowledge-augmentation leads to 4-8 percentage point improvement in end-to-end performance in factual question answering tasks. These findings underscore the need to enhance LLM stability and coherence through semantic awareness. 
2025.naacl-long.588 @@ -7387,12 +7387,12 @@ <fixed-case>AL</fixed-case>in<fixed-case>F</fixed-case>i<fixed-case>K</fixed-case>: Learning to Approximate Linearized Future Influence Kernel for Scalable Third-Parity <fixed-case>LLM</fixed-case> Data Valuation YanzhouPanGoogle - HuaweiLinRochester Institute of Technology + HuaweiLinRochester Institute of Technology YideRan JiaminChen XiaodongYuStevens Institute of Technology WeijieZhaoRochester Institute of Technology - DenghuiZhangStevens Institute of Technology + DenghuiZhangStevens Institute of Technology ZhaozhuoXuStevens Institute of Technology 11756-11771 Large Language Models (LLMs) heavily rely on high-quality training data, making data valuation crucial for optimizing model performance, especially when working within a limited budget. In this work, we aim to offer a third-party data valuation approach that benefits both data providers and model developers. We introduce a linearized future influence kernel (LinFiK), which assesses the value of individual data samples in improving LLM performance during training. We further propose ALinFiK, a learning strategy to approximate LinFiK, enabling scalable data valuation. Our comprehensive evaluations demonstrate that this approach surpasses existing baselines in effectiveness and efficiency, demonstrating significant scalability advantages as LLM parameters increase. @@ -7404,7 +7404,7 @@ HongboZheng SuyuanWang NeerajGangwarUniversity of Illinois Urbana-Champaign - NickvashKaniUniversity of Illinois at Urbana-Champaign + NickvashKaniUniversity of Illinois at Urbana-Champaign 11772-11788 Vector representations have been pivotal in advancing natural language processing (NLP), with prior research focusing on embedding techniques for mathematical expressions using mathematically equivalent formulations. While effective, these approaches are constrained by the size and diversity of training data. In this work, we address these limitations by introducing E-Gen, a novel e-graph-based dataset generation scheme that synthesizes large and diverse mathematical expression datasets, surpassing prior methods in size and operator variety. Leveraging this dataset, we train embedding models using two strategies: (1) generating mathematically equivalent expressions, and (2) contrastive learning to explicitly group equivalent expressions. We evaluate these embeddings on both in-distribution and out-of-distribution mathematical language processing tasks, comparing them against prior methods. Finally, we demonstrate that our embedding-based approach outperforms state-of-the-art large language models (LLMs) on several tasks, underscoring the necessity of optimizing embedding methods for the mathematical data modality. The source code and datasets are available at https://github.com/MLPgroup/E-Gen. 2025.naacl-long.590 @@ -7417,7 +7417,7 @@ DaisyStantonResearch, Google SorooshMariooryadGoogle MattShannonGoogle - JulianSalazarGoogle DeepMind + JulianSalazarGoogle DeepMind David Teh-HwaKaoGoogle 11789-11806 Autoregressive (AR) Transformer-based sequence models are known to have difficulty generalizing to sequences longer than those seen during training. When applied to text-to-speech (TTS), these models tend to drop or repeat words or produce erratic output, especially for longer utterances. In this paper, we introduce enhancements aimed at AR Transformer-based encoder-decoder TTS systems that address these robustness and length generalization issues. 
Our approach uses an alignment mechanism to provide cross-attention operations with relative location information. The associated alignment position is learned as a latent property of the model via backpropagation and requires no external alignment information during training. While the approach is tailored to the monotonic nature of TTS input-output alignment, it is still able to benefit from the flexible modeling power of interleaved multi-head self- and cross-attention operations. A system incorporating these improvements, which we call Very Attentive Tacotron, matches the naturalness and expressiveness of a baseline T5-based TTS system, while eliminating problems with repeated or dropped words and enabling generalization to any practical utterance length. @@ -7436,13 +7436,13 @@ <fixed-case>A</fixed-case>uto<fixed-case>P</fixed-case>ar<fixed-case>LLM</fixed-case>: <fixed-case>GNN</fixed-case>-guided Context Generation for Zero-Shot Code Parallelization using <fixed-case>LLM</fixed-case>s Quazi IshtiaqueMahmudIowa State University - AliTehraniJamsaz + AliTehraniJamsaz Hung DPhan - LeChenArgonne National Laboratory - MihaiCapotăIntel Labs + LeChenArgonne National Laboratory + MihaiCapotăIntel Labs Theodore L.WillkeDataStax Nesreen K.AhmedIntel AI Research - AliJannesariIowa State University + AliJannesariIowa State University 11821-11841 In-Context Learning (ICL) has been shown to be a powerful technique to augment the capabilities of LLMs for a diverse range of tasks. This work proposes AutoParLLM, a novel way to generate context using guidance from graph neural networks (GNNs) to generate efficient parallel codes. We evaluate AutoParLLM on 12 applications from two well-known benchmark suites of parallel codes: NAS Parallel Benchmark and Rodinia Benchmark. Our results show that AutoParLLM improves the state-of-the-art LLMs (e.g., GPT-4) by 19.9% in NAS and 6.48% in Rodinia benchmark in terms of CodeBERTScore for the task of parallel code generation. Moreover, AutoParLLM improves the ability of the most powerful LLM to date, GPT-4, by achieving 17% (on NAS benchmark) and 16% (on Rodinia benchmark) better speedup. In addition, we propose OMPScore for evaluating the quality of the parallel code and show its effectiveness in evaluating parallel codes. 2025.naacl-long.593 @@ -7458,15 +7458,15 @@ MingqianZhengCMU, Carnegie Mellon University MichaelJiang LechenZhang - BowenYi + BowenYi KenanAlkiek - AbrahamIsraeliUniversity of Michigan - Ann Arbor + AbrahamIsraeliUniversity of Michigan - Ann Arbor BangzhaoShu - HuaShen + HuaShen JiaxinPeiStanford University HaotianZhang MiriamSchirmerNorthwestern University - DavidJurgensUniversity of Michigan - Ann Arbor + DavidJurgensUniversity of Michigan - Ann Arbor 11842-11866 Email is a vital conduit for human communication across businesses, organizations, and broader societal contexts. In this study, we aim to model the intents, expectations, and responsiveness in email exchanges. To this end, we release SIZZLER, a new dataset containing 1800 emails annotated with nuanced types of intents and expectations. We benchmark models ranging from feature-based logistic regression to zero-shot prompting of large language models. Leveraging the predictive model for intent, expectations, and 14 other features, we analyze 11.3M emails from GMANE to study how linguistic and social factors influence the conversational dynamics in email exchanges. 
Through our causal analysis, we find that the email response rates are influenced by social status, argumentation, and in certain limited contexts, the strength of social connection. 2025.naacl-long.594 @@ -7478,7 +7478,7 @@ XuhuiZhou SankethRangreji AnubhaKabraBloomberg - JuliaMendelsohn + JuliaMendelsohn FaezeBrahmanAllen Institute for Artificial Intelligence MaartenSapCarnegie Mellon University 11867-11894 @@ -7501,13 +7501,13 @@ <fixed-case>F</fixed-case>ollow<fixed-case>IR</fixed-case>: Evaluating and Teaching Information Retrieval Models to Follow Instructions OrionWeller - BenjaminChang - SeanMacAvaneyUniversity of Glasgow + BenjaminChang + SeanMacAvaneyUniversity of Glasgow KyleLoAllen Institute for Artificial Intelligence ArmanCohanYale University and Allen Institute for Artificial Intelligence BenjaminVan DurmeMicrosoft and Johns Hopkins University - DawnLawrieJohns Hopkins University - LucaSoldainiAllen Institute for Artificial Intelligence + DawnLawrieJohns Hopkins University + LucaSoldainiAllen Institute for Artificial Intelligence 11926-11942 Modern Language Models (LMs) are capable of following long and complex instructions that enable a large and diverse set of user requests. While Information Retrieval (IR) models use these LMs as the backbone of their architectures, virtually none of them allow users to provide detailed instructions alongside queries, thus limiting their ability to satisfy complex information needs. In this work, we study the use of instructions in IR systems. First, we introduce our dataset FollowIR, which contains a rigorous instruction evaluation benchmark as well as a training set for helping IR models learn to better follow real-world instructions. FollowIR repurposes detailed instructions – also known as narratives – developed for professional assessors to evaluate retrieval systems. In particular, we build our benchmark from three collections curated for shared tasks at the Text REtrieval Conference (TREC). These collections contains hundreds to thousands of labeled documents per query, making them suitable for our exploration. Through this process, we can measure how well IR models follow instructions, through a new pairwise evaluation framework. Our results indicate that existing retrieval models fail to correctly use instructions, using them for basic keywords and struggling to understand long-form information. However, we show that it is possible for IR models to learn to follow complex instructions: our new FollowIR-7B model has significant improvements after fine-tuning on our training set. 2025.naacl-long.597 @@ -7516,7 +7516,7 @@ Few-shot Personalization of <fixed-case>LLM</fixed-case>s with Mis-aligned Responses JaehyungKimYonsei University - YimingYangSchool of Computer Science, Carnegie Mellon University + YimingYangSchool of Computer Science, Carnegie Mellon University 11943-11974 As the diversity of users increases, the capability of providing personalized responses by large language models (LLMs) has become increasingly important. Existing approaches have only limited successes in LLM personalization, due to the absence of personalized learning or the reliance on shared personal data. This paper proposes a new approach for a few-shot personalization of LLMs with their mis-aligned responses (Fermi). Our key idea is to learn a set of personalized prompts for each user by progressively improving the prompts using LLMs, based on user profile (e.g., demographic information) and a few examples of previous opinions. 
During an iterative process of prompt improvement, we incorporate the contexts of mis-aligned responses by LLMs, which are especially crucial for the effective personalization of LLMs. In addition, we develop an effective inference method to further leverage the context of the test query and the personalized prompts. Our experimental results demonstrate that Fermi significantly improves performance across various benchmarks, compared to best-performing baselines. 2025.naacl-long.598 @@ -7527,9 +7527,9 @@ Hoang HNguyen KhyatiMahajanServiceNow Inc VikasYadav - JulianSalazarGoogle DeepMind - Philip S.YuUniversity of Illinois, Chicago - MasoudHashemiServiceNow Inc + JulianSalazarGoogle DeepMind + Philip S.YuUniversity of Illinois, Chicago + MasoudHashemiServiceNow Inc RishabhMaheshwaryServiceNow 11975-11994 Multilingual LLMs have achieved remarkable benchmark performance, but we find they continue to underperform on non-Latin script languages across contemporary LLM families. This discrepancy arises from the fact that LLMs are pretrained with orthographic scripts, which are dominated by Latin characters that obscure their shared phonology with non-Latin scripts. We propose leveraging phonemic transcriptions as complementary signals to induce script-invariant representations. Our study demonstrates that integrating phonemic signals improves performance across both non-Latin and Latin languages, with a particularly significant impact on closing the performance gap between the two. Through detailed experiments, we show that phonemic and orthographic scripts retrieve distinct examples for in-context learning (ICL). This motivates our proposed Mixed-ICL retrieval strategy, where further aggregation leads to our significant performance improvements for both Latin script languages (up to 12.6%) and non-Latin script languages (up to 15.1%) compared to randomized ICL retrieval. 
@@ -7542,53 +7542,53 @@ GiuseppeAttanasioInstituto de Telecomunicações IoanaBaldiniBloomberg MirunaClinciu - JordanCliveChattermill - PieterDelobelle + JordanCliveChattermill + PieterDelobelle MananDeySalesForce.com - SilHamilton + SilHamilton TimmDillUniversität Hamburg JadDoughman RitamDuttCarnegie Mellon University AvijitGhoshHugging Face and University of Connecticut - Jessica ZosaFordeBrown University - CarolinHoltermannUniversität Hamburg + Jessica ZosaFordeBrown University + CarolinHoltermannUniversität Hamburg Lucie-AiméeKaffeeHugging Face TanmayLaudHippocratic AI AnneLauscherUniversität Hamburg Roberto LLopez-Davila MaraimMasoud NikitaNangia - AnaeliaOvalleUniversity of California, Los Angeles - GiadaPistilliSorbonne University - DragomirRadevYale University - BeatriceSavoldi + AnaeliaOvalleUniversity of California, Los Angeles + GiadaPistilliSorbonne University + DragomirRadevYale University + BeatriceSavoldi VipulRahejaColumbia University, Grammarly and International Institute of Information Technology Hyderabad JeremyQinUniversité de Montréal - EstherPloeger - ArjunSubramonianUniversity of California, Los Angeles - KaustubhDholeEmory University + EstherPloeger + ArjunSubramonianUniversity of California, Los Angeles + KaustubhDholeEmory University KaiserSunDepartment of Computer Science, Whiting School of Engineering - AmirbekDjanibekov - JonibekMansurov - KayoYinUniversity of California, Berkeley + AmirbekDjanibekov + JonibekMansurov + KayoYinUniversity of California, Berkeley Emilio VillaCueva SagnikMukherjee - JerryHuangThe University of Tokyo and Université de Montréal & Mila - Quebec AI Institute - XudongShen + JerryHuangThe University of Tokyo and Université de Montréal & Mila - Quebec AI Institute + XudongShen JayGalaMohamed bin Zayed University of Artificial Intelligence HamdanAl-Ali Tair Djanibekov NurdauletMukhitulyMohamed bin Zayed University of Artificial Intelligence ShangruiNie ShanyaSharmaGoogle - KarolinaStanczakMila - Quebec Artificial Intelligence Institute and McGill University, McGill University + KarolinaStanczakMila - Quebec Artificial Intelligence Institute and McGill University, McGill University ElizaSzczechlaScott Tiger - TiagoTimponi TorrentFederal University of Juiz de Fora - DeepakTunuguntlaSaxion Universities - MarceloViridiano + TiagoTimponi TorrentFederal University of Juiz de Fora + DeepakTunuguntlaSaxion Universities + MarceloViridiano OskarVan Der Wal AdinaYakefu - AurélieNévéolLISN-CNRS / Université Paris Saclay + AurélieNévéolLISN-CNRS / Université Paris Saclay MikeZhang SydneyZinkKBR ZeerakTalatUniversity of Edinburgh, University of Edinburgh @@ -7615,7 +7615,7 @@ Allahsera AugusteTapo KevinAssogba Christopher MHoman - M. MustafaRafiqueRochester Institute of Technology + M. MustafaRafiqueRochester Institute of Technology MarcosZampieriGeorge Mason University 12060-12070 Data curation for under-resource languages enables the development of more accurate and culturally sensitive natural language processing models. However, the scarcity of well-structured multilingual datasets remains a challenge for advancing machine translation in these languages, especially for African languages. This paper focuses on creating high-quality parallel corpora that capture linguistic diversity to address this gap. We introduce Bayelemabaga, the most extensive curated multilingual dataset for machine translation in the Bambara language, the vehicular language of Mali. 
The dataset consists of 47K Bambara-French parallel sentences curated from 231 data sources, including short stories, formal documents, and religious literature, combining modern, historical, and indigenous languages. We present our data curation process and analyze its impact on neural machine translation by fine-tuning seven commonly used transformer-based language models, i.e., MBART, MT5, M2M-100, NLLB-200, Mistral-7B, Open-Llama-7B, and Meta-Llama3-8B on Bayelemabaga. Our evaluation on four Bambara-French language pair datasets (three existing datasets and the test set of Bayelemabaga) shows up to +4.5, +11.4, and +0.27 in gains, respectively, on BLEU, CHRF++, and AfriCOMET evaluation metrics. We also conducted machine and human evaluations of translations from studied models to compare the machine translation quality of encoder-decoder and decoder-only models. Our results indicate that encoder-decoder models remain the best, highlighting the importance of additional datasets to train decoder-only models.
@@ -7630,7 +7630,7 @@
SoheeYoonSamsung
EdwardChoiKorea Advanced Institute of Science and Technology
JaegulChooKorea Advanced Institute of Science and Technology
- Won IkChoSamsung Advanced Institute of Technology
+ Won IkChoSamsung Advanced Institute of Technology
12071-12096
Aspect-based sentiment analysis (ABSA) is a challenging task of extracting sentiments along with their corresponding aspects and opinion terms from the text. The inherent subjectivity of span annotation introduces variability in the surface forms of extracted terms, complicating the evaluation process. Traditional evaluation methods often constrain ground truths (GT) to a single term, potentially misrepresenting the accuracy of semantically valid predictions that differ in surface form. To address this limitation, we propose a novel and fully automated pipeline that expands existing evaluation sets by adding alternative valid terms for aspect and opinion. Our approach facilitates an equitable assessment of language models by accommodating multiple-answer candidates, resulting in enhanced human agreement compared to single-answer test sets (achieving up to a 10%p improvement in Kendall’s Tau score). Experimental results demonstrate that our expanded evaluation set helps uncover the capabilities of large language models (LLMs) in ABSA tasks, which are concealed by the single-answer GT sets. Consequently, our work contributes to the development of a flexible evaluation framework for ABSA by embracing diverse surface forms to span extraction tasks in a cost-effective and reproducible manner. Our code and dataset are open at https://github.com/dudrrm/zoom-in-n-out-absa.
2025.naacl-long.603


@@ -7641,12 +7641,12 @@
JianyuLiu
HangyuGuoAlibaba Group
RanjieDuan
- XingyuanBuAlibaba Group
+ XingyuanBuAlibaba Group
YanchengHeAlibaba Group
ShilongLi
HuiHuang
JiahengLiuNanjing University
- YuchengWang
+ YuchengWang
ChenchenJing
XingweiQuUniversity of Manchester
XiaoZhangAlibaba Group
@@ -7654,7 +7654,7 @@
YananWu
JihaoGu
YangguangLiVAST
- JiankeZhu
+ JiankeZhu
12097-12118
Multimodal Large Language Models (MLLMs) pose unique safety challenges due to their integration of visual and textual data, thereby introducing new dimensions of potential attacks and complex risk combinations. In this paper, we begin with a detailed analysis aimed at disentangling risks through step-by-step reasoning within multimodal inputs. We find that systematic multimodal risk disentanglement substantially enhances the risk awareness of MLLMs.
Via leveraging the strong discriminative abilities of multimodal risk disentanglement, we further introduce DREAM ( Disentangling Risks to Enhance Safety Alignment in MLLMs), a novel approach that enhances safety alignment in MLLMs through supervised fine-tuning and iterative Reinforcement Learning from AI Feedback (RLAIF). Experimental results show that DREAM significantly boosts safety during both inference and training phases without compromising performance on normal tasks (namely oversafety), achieving a 16.17% improvement in the SIUO safe&effective score compared to GPT-4V. @@ -7663,7 +7663,7 @@ In-Context Learning with Long-Context Models: An In-Depth Exploration - AmandaBertschCarnegie Mellon University + AmandaBertschCarnegie Mellon University MaorIvgiTel Aviv University EmilyXiao UriAlonGoogle DeepMind @@ -7677,11 +7677,11 @@ Preference Consistency Matters: Enhancing Preference Learning in Language Models with Automated Self-Curation of Training Corpora - JoonHoLeeSamsung SDS + JoonHoLeeSamsung SDS JuYounSonSamsung sds JureeSeok WooseokJang - Yeong-DaeKwonSamsung SDS + Yeong-DaeKwonSamsung SDS 12150-12169 Inconsistent annotations in training corpora, particularly within preference learning datasets, pose challenges in developing advanced language models. These inconsistencies often arise from variability among annotators and inherent multi-dimensional nature of the preferences. To address these issues, we introduce a self-curation method that preprocesses annotated datasets by leveraging proxy models trained directly on them. Our method enhances preference learning by automatically detecting and selecting consistent annotations. We validate the proposed approach through extensive instruction-following tasks, demonstrating performance improvements of up to 33% across various learning algorithms and proxy capabilities. This work offers a straightforward and reliable solution to address preference inconsistencies without relying on heuristics, serving as an initial step toward the development of more advanced preference learning methodologies. Code is available at https://github.com/Self-Curation/ . 2025.naacl-long.606 @@ -7691,7 +7691,7 @@ <fixed-case>T</fixed-case>urtle<fixed-case>B</fixed-case>ench: A Visual Programming Benchmark in Turtle Geometry SinaRismanchianUniversity of California, Irvine YasamanRazeghi - SameerSinghUniversity of California, Irvine and Allen Institute for Artificial Intelligence + SameerSinghUniversity of California, Irvine and Allen Institute for Artificial Intelligence ShayanDoroudiCarnegie Mellon University 12170-12188 Humans have the ability to reason about geometric patterns in images and scenes from a young age. However, developing large multimodal models (LMMs) capable of similar reasoning remains a challenge, highlighting the need for robust evaluation methods to assess these capabilities. We introduce TurtleBench, a benchmark designed to evaluate LMMs’ capacity to interpret geometric patterns—given visual examples, textual instructions, or both—and generate precise code outputs. Inspired by turtle geometry, a notion used to teach children foundational coding and geometric concepts, TurtleBench features tasks with patterned shapes that have underlying algorithmic logic. Our evaluation reveals that leading LMMs struggle significantly with these tasks, with GPT-4V achieving only 19% accuracy on the simplest tasks and few-shot prompting only marginally improves their performance (<2%). 
TurtleBench highlights the gap between human and AI performance in intuitive and visual geometrical understanding, setting the stage for future research in this area and stands as one of the few benchmarks to evaluate the integration of visual understanding and code generation capabilities in LMMs, setting the stage for future research. @@ -7700,7 +7700,7 @@ Automatically Discovering How Misogyny is Framed on Social Media - Rakshitha RaoAilneniUniversity of Texas at Dallas + Rakshitha RaoAilneniUniversity of Texas at Dallas Sanda M.HarabagiuUniversity of Texas at Dallas 12189-12208 Misogyny, which is widespread on social media, can be identified not only by recognizing its many forms but also by discovering how misogyny is framed. This paper considers the automatic discovery of misogyny problems and their frames through the Dis-MP&F method, which enables the generation of a data-driven, rich Taxonomy of Misogyny (ToM), offering new insights in the complexity of expressions of misogyny. Furthermore, the Dis-MP&F method, informed by the ToM, is capable of producing very promising results on a misogyny benchmark dataset. @@ -7710,11 +7710,11 @@ Faithful, Unfaithful or Ambiguous? Multi-Agent Debate with Initial Stance for Summary Evaluation MahnazKoupaee - Jake W.VincentAmazon + Jake W.VincentAmazon SaabMansourAmazon - IgorShalyminovAmazon + IgorShalyminovAmazon HanHeAmazon - HwanjunSongKorea Advanced Institute of Science & Technology + HwanjunSongKorea Advanced Institute of Science & Technology RaphaelShuAmazon JianfengHeAmazon YiNianUniversity of Southern California @@ -7743,7 +7743,7 @@ Language Models Predict Empathy Gaps Between Social In-groups and Out-groups - YuHouUniversity of Maryland, College Park + YuHouUniversity of Maryland, College Park Hal DauméIiiUniversity of Maryland, College Park RachelRudinger 12288-12304 @@ -7762,12 +7762,12 @@ <fixed-case>JAWAHER</fixed-case>: A Multidialectal Dataset of <fixed-case>A</fixed-case>rabic Proverbs for <fixed-case>LLM</fixed-case> Benchmarking - Samar MohamedMagdyMohamed bin Zayed University of Artificial Intelligence + Samar MohamedMagdyMohamed bin Zayed University of Artificial Intelligence Sang YunKwonUniversity of British Columbia FakhraddinAlwajih Safaa TaherAbdelfadil - ShadyShehataMohamed bin Zayed University of Artificial Intelligence - MuhammadAbdul-MageedUniversity of British Columbia + ShadyShehataMohamed bin Zayed University of Artificial Intelligence + MuhammadAbdul-MageedUniversity of British Columbia 12320-12341 Recent advancements in instruction fine-tuning, alignment methods such as reinforcement learning from human feedback (RLHF), and optimization techniques like direct preference optimization (DPO), have significantly enhanced the adaptability of large language models (LLMs) to user preferences. However, despite these innovations, many LLMs continue to exhibit biases toward Western, Anglo-centric, or American cultures, with performance on English data consistently surpassing that of other languages. This reveals a persistent cultural gap in LLMs, which complicates their ability to accurately process culturally rich and diverse figurative language, such as proverbs. To address this, we introduce *Jawaher*, a benchmark designed to assess LLMs’ capacity to comprehend and interpret Arabic proverbs. *Jawaher* includes proverbs from various Arabic dialects, along with idiomatic translations and explanations. 
Through extensive evaluations of both open- and closed-source models, we find that while LLMs can generate idiomatically accurate translations, they struggle with producing culturally nuanced and contextually relevant explanations. These findings highlight the need for ongoing model refinement and dataset expansion to bridge the cultural gap in figurative language processing. 2025.naacl-long.613 @@ -7778,9 +7778,9 @@ SamLinRutgers University, New Brunswick WenyueHua ZhentingWangRutgers University - MingyuJinRutgers University - LizhouFanBrigham and Women’s Hospital, Harvard University - YongfengZhangRutgers University + MingyuJinRutgers University + LizhouFanBrigham and Women’s Hospital, Harvard University + YongfengZhangRutgers University 12342-12361 Cloud-based Large Language Models (LLMs) such as ChatGPT have become increasingly integral to daily operations. Nevertheless, they also introduce privacy concerns: firstly, numerous studies underscore the risks to user privacy posed by jailbreaking cloud-based LLMs; secondly, the LLM service providers have access to all user data, which deters individuals from confidently utilizing such services. To address such concerns, we propose a simple yet effective paradigm, **EmojiPrompt**, to protect user privacy. At its core, EmojiPrompt performs generative transformation, obfuscating private data within prompts with linguistic and non-linguistic elements before submitting them to cloud-based LLMs. We evaluate EmojiPrompt’s performance across 8 datasets from various domains. We also propose simulated inference attacks to assess EmojiPrompt’s ability to preserve user privacy. The results demonstrate that EmojiPrompt effectively obfuscates user private data, while largely maintaining, or even enhancing, performances compared to the unobfuscated version. Furthermore, EmojiPrompt’s atomic-level obfuscation allows it to function exclusively with cloud-based LLMs. For source code, please refer to: https://github.com/agiresearch/EmojiCrypt. 2025.naacl-long.614 @@ -7789,11 +7789,11 @@ <fixed-case>MICE</fixed-case> for <fixed-case>CAT</fixed-case>s: Model-Internal Confidence Estimation for Calibrating Agents with Tools NishantSubramaniCarnegie Mellon University - JasonEisnerMicrosoft and Johns Hopkins University + JasonEisnerMicrosoft and Johns Hopkins University JustinSvegliatoUniversity of California, Berkeley and Microsoft BenjaminVan DurmeMicrosoft and Johns Hopkins University YuSuOhio State University - SamThomsonMicrosoft + SamThomsonMicrosoft 12362-12375 Tool-using agents that act in the world need to be both useful and safe. Well-calibrated model confidences can be used to weigh the risk versus reward of potential actions, but prior work shows that many models are poorly calibrated. Inspired by interpretability literature exploring the internals of models, we propose a novel class of model-internal confidence estimators (MICE) to better assess confidence when calling tools. MICE first decodes from each intermediate layer of the language model using logit lens and then computes similarity scores between each layer’s generation and the final output. These features are fed into a learned probabilistic classifier to assess confidence in the decoded output. On the simulated trial and error (STE) tool-calling dataset using Llama3 models, we find that MICE beats or matches the baselines on smoothed expected calibration error. 
Using MICE confidences to determine whether to call a tool significantly improves over strong baselines on a new metric, expected tool-calling utility. Further experiments show that MICE is sample-efficient, can generalize zero-shot to unseen APIs, and results in higher tool-calling utility in scenarios with varying risk levels. Our code is open source, available at https://github.com/microsoft/mice_for_cats. 2025.naacl-long.615 @@ -7801,11 +7801,11 @@ <fixed-case>PAT</fixed-case>: Parameter-Free Audio-Text Aligner to Boost Zero-Shot Audio Classification - AshishSeth + AshishSeth RamaneswaranSelvakumarUniversity of Maryland, College Park SonalKumar SreyanGhosh - DineshManochaUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 12376-12394 Audio-Language Models (ALMs) have demonstrated remarkable performance in zero-shot audio classification. In this paper, we introduce PAT (Parameter-free Audio-Text aligner), a simple and training-free method aimed at boosting zero-shot audio classification performance of CLAP-like ALMs. To achieve this, we propose to improve the cross-modal interaction between audio and language modalities by enhancing the representations for both modalities using mutual feedback. Precisely, to enhance textual representations, we propose a prompt ensemble algorithm that automatically selects and combines the most relevant prompts from a datastore with a large pool of handcrafted prompts and weighs them according to their relevance to the audio. On the other hand, to enhance audio representations, we reweigh the frame-level audio features based on the enhanced textual information. Our proposed method does not require any additional modules or parameters and can be used with any existing CLAP-like ALM to improve zero-shot audio classification performance. We experiment across 18 diverse benchmark datasets and 6 ALMs and show that the PAT outperforms vanilla zero-shot evaluation with significant margins of 0.42%-27.0%. Additionally, we demonstrate that PAT maintains robust performance even when input audio is degraded by varying levels of noise. We make our code publicly available. 2025.naacl-long.616 @@ -7814,8 +7814,8 @@ Language Model Council: Democratically Benchmarking Foundation Models on Highly Subjective Tasks JustinZhao - Flor MiriamPlaza-del-Arco - BenjaminGenchel + Flor MiriamPlaza-del-Arco + BenjaminGenchel Amanda CercasCurry 12395-12450 As Large Language Models (LLMs) continue to evolve, evaluating them remains a persistent challenge. Many recent evaluations use LLMs as judges to score outputs from other LLMs, often relying on a single large model like GPT-4o. However, using a single LLM judge is prone to intra-model bias, and many tasks – such as those related to emotional intelligence, creative writing, and persuasiveness – may be too subjective for a single model to judge fairly. We introduce the Language Model Council (LMC), where a group of LLMs collaborate to create tests, respond to them, and evaluate each other’s responses to produce a ranking in a democratic fashion. Unlike previous approaches that focus on reducing cost or bias by using a panel of smaller models, our work examines the benefits and nuances of a fully inclusive LLM evaluation system. In a detailed case study on emotional intelligence, we deploy a council of 20 recent LLMs to rank each other on open-ended responses to interpersonal conflicts. 
Our results show that the LMC produces rankings that are more separable and more robust, and through a user study, we show that they are more consistent with human evaluations than any individual LLM judge. Using all LLMs for judging can be costly, however, so we use Monte Carlo simulations and hand-curated sub-councils to study hypothetical council compositions and discuss the value of the incremental LLM judge. @@ -7838,10 +7838,10 @@ SonalKumar SreyanGhosh UtkarshTyagi - Anton JeranRatnarajah + Anton JeranRatnarajah Chandra Kiran ReddyEvuru - RamaniDuraiswamiUniversity of Maryland, College Park - DineshManochaUniversity of Maryland, College Park + RamaniDuraiswamiUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 12470-12483 Speech enhancement (SE) is the fundamental task of enhancing the clarity and quality of speech in the presence of non-stationary additive noise. While deterministic deep learning models have been commonly employed for SE, recent research indicates that generative models, such as denoising diffusion probabilistic models (DDPMs), have shown promise. However, different from speech generation, SE has a strong constraint to generate results in accordance with the underlying ground-truth signal. Additionally, for a wide variety of applications, SE systems need to be employed in real-time, and traditional diffusion models (DMs) requiring many iterations of a large model during inference are inefficient. To address these issues, we propose ProSE (diffusion-based Priors for SE), a novel methodology based on an alternative framework for applying diffusion models to SE. Specifically, we first apply DDPMs to generate priors in a latent space due to their powerful distribution mapping capabilities. The priors are then integrated into a transformer-based regression model for SE. The priors guide the regression model in the enhancement process. Since the diffusion process is applied to a compact latent space, the diffusion model takes fewer iterations than the traditional DM to obtain accurate estimations. Additionally, using a regression model for SE avoids the distortion issue caused by misaligned details generated by DMs. Comprehensive experiments show that ProSE achieves state-of-the-art performance on synthetic and real-world datasets using various metrics while consuming less computational costs. 2025.naacl-long.619 @@ -7849,9 +7849,9 @@ Mastering the Craft of Data Synthesis for <fixed-case>C</fixed-case>ode<fixed-case>LLM</fixed-case>s - MengChen + MengChen PhilipArthurOracle - QianyuFengOracle + QianyuFengOracle Cong Duy VuHoangOracle Corporation Yu-HengHong Mahdi KazemiMoghaddamOracle @@ -7884,12 +7884,12 @@ <fixed-case>C</fixed-case>ausal<fixed-case>E</fixed-case>val: Towards Better Causal Reasoning in Language Models LongxuanYu - DelinChen + DelinChen SihengXiongGeorgia Institute of Technology - QingyangWuUniversity of California, Los Angeles + QingyangWuUniversity of California, Los Angeles DaweiLi ZhikaiChen - XiaozeLiu + XiaozeLiu LiangmingPanUniversity of Arizona 12512-12540 Causal reasoning (CR) is a crucial aspect of intelligence, essential for problem-solving, decision-making, and understanding the world. While language models (LMs) can generate rationales for their outputs, their ability to reliably perform causal reasoning remains uncertain, often falling short in tasks requiring a deep understanding of causality. 
In this paper, we introduce CausalEval, a comprehensive review of research aimed at enhancing LMs for causal reasoning, coupled with an empirical evaluation of current models and methods. We categorize existing methods based on the role of LMs: either as reasoning engines or as helpers providing knowledge or data to traditional CR methods, followed by a detailed discussion of methodologies in each category. We then assess the performance of current LMs and various enhancement methods on a range of causal reasoning tasks, providing key findings and in-depth analysis. Finally, we present insights from current studies and highlight promising directions for future research. We aim for this work to serve as a comprehensive resource, fostering further advancements in causal reasoning with LMs.
2025.naacl-long.622


@@ -7905,8 +7905,8 @@
JiePengUniversity of Science and Technology of China
BhavyaKailkhuraLawrence Livermore National Laboratory
MeijunGaoMichigan State University
- TianlongChenUniversity of North Carolina at Chapel Hill
- KaixiongZhouNorth Carolina State University
+ TianlongChenUniversity of North Carolina at Chapel Hill
+ KaixiongZhouNorth Carolina State University
12541-12554
As large language models (LLMs) are increasingly deployed in diverse applications, including chatbot assistants and code generation, aligning their behavior with safety and ethical standards has become paramount. However, jailbreak attacks, which exploit vulnerabilities to elicit unintended or harmful outputs, threaten LLM safety significantly. In this paper, we introduce Layer-AdvPatcher, a novel methodology designed to defend against jailbreak attacks by utilizing an unlearning strategy to patch specific layers within LLMs through self-augmented datasets. Our insight is that certain layer(s) tend to produce affirmative tokens when faced with harmful prompts. By identifying these layers and adversarially exposing them to generate more harmful data, one can understand their inherent and diverse vulnerabilities to attacks. With these exposures, we then “unlearn” these issues, reducing the impact of affirmative tokens and hence minimizing jailbreak risks while keeping the model’s responses to safe queries intact. We conduct extensive experiments on two models, four benchmark datasets, and multiple state-of-the-art jailbreak attacks to demonstrate the efficacy of our approach. Results indicate that our framework reduces the harmfulness and attack success rate of jailbreak attacks without compromising utility for benign queries compared to recent defense methods. Our code is publicly available at: https://github.com/oyy2000/LayerAdvPatcher
2025.naacl-long.623


<fixed-case>D</fixed-case>e<fixed-case>CAP</fixed-case>: Context-Adaptive Prompt Generation for Debiasing Zero-shot Question Answering in Large Language Models
SuyoungBaeSung Kyun Kwan University
- YunSeokChoiSungKyunKwan University
- Jee-HyongLeeSungkyunkwan University
+ YunSeokChoiSungKyunKwan University
+ Jee-HyongLeeSungkyunkwan University
12555-12574
While Large Language Models (LLMs) excel in zero-shot Question Answering (QA), they tend to expose biases in their internal knowledge when faced with socially sensitive questions, leading to a degradation in performance. Existing zero-shot methods are efficient but fail to consider context and prevent bias propagation in the answers. To address this, we propose *DeCAP*, a method for debiasing LLMs using Context-Adaptive Prompt Generation. *DeCAP* leverages a *Question Ambiguity Detection* to take appropriate debiasing actions based on the context and a *Neutral Answer Guidance Generation* to help the LLMs make objective judgments about the context, minimizing the propagation of bias from their internal knowledge. Our various experiments across eight LLMs show that *DeCAP* achieves state-of-the-art zero-shot debiased QA performance. This demonstrates *DeCAP*’s efficacy in enhancing the fairness and accuracy of LLMs in diverse QA settings.
2025.naacl-long.624



@@ -7935,7 +7935,7 @@


Typographic Attacks in a Multi-Image Setting
- XiaomengWang
+ XiaomengWang
ZhengyuZhaoXi’an Jiaotong University
MarthaLarsonRadboud University
12594-12604



Tonguescape: Exploring Language Models Understanding of Vowel Articulation
HarukiSakajoNara Institute of Science and Technology, Japan
YusukeSakaiNara Institute of Science and Technology, Japan
- HidetakaKamigaitoNara Institute of Science and Technology
- TaroWatanabeNara Institute of Science and Technology, Japan
+ HidetakaKamigaitoNara Institute of Science and Technology
+ TaroWatanabeNara Institute of Science and Technology, Japan
12605-12619
Vowels are primarily characterized by tongue position. Humans have discovered these features of vowel articulation through their own experience and explicit objective observation such as using MRI. With this knowledge and our experience, we can explain and understand the relationship between tongue positions and vowels, and this knowledge is helpful for language learners to learn pronunciation. Since language models (LMs) are trained on a large amount of data that includes linguistic and medical fields, our preliminary studies indicate that an LM is able to explain the pronunciation mechanisms of vowels. However, it is unclear whether multi-modal LMs, such as vision LMs, align textual information with visual information. One question arises: do LMs associate real tongue positions with vowel articulation? In this study, we created video and image datasets from the existing real-time MRI dataset and investigated whether LMs can understand vowel articulation based on tongue positions using vision-based information. Our findings suggest that LMs exhibit potential for understanding vowels and tongue positions when reference examples are provided while they have difficulties without them. Our code for dataset building is available on GitHub.
2025.naacl-long.627


<fixed-case>C</fixed-case>o<fixed-case>RAC</fixed-case>: Integrating Selective <fixed-case>API</fixed-case> Document Retrieval with Question Semantic Intent for Code Question Answering
- YunSeokChoiSungKyunKwan University
CheolWonNaSungkyunkwan University
- Jee-HyongLeeSungkyunkwan University
+ YunSeokChoiSungKyunKwan University
+ Jee-HyongLeeSungkyunkwan University
12620-12635
Automatic code question answering aims to generate precise answers to questions about code by analyzing code snippets. To provide an appropriate answer, it is necessary to accurately understand the relevant part of the code and correctly interpret the intent of the question. However, in real-world scenarios, the questioner often provides only a portion of the code along with the question, making it challenging to find an answer. The responder should be capable of providing a suitable answer using such limited information.
We propose a knowledge-based framework, CoRAC, an automatic code question responder that enhances understanding through selective API document retrieval and question semantic intent clustering. We evaluate our method on three real-world benchmark datasets and demonstrate its effectiveness through various experiments. We also show that our method can generate high-quality answers compared to large language models, such as ChatGPT. 2025.naacl-long.628 @@ -7976,10 +7976,10 @@ How to Make <fixed-case>LLM</fixed-case>s Forget: On Reversing In-Context Knowledge Edits - PaulYoussef - ZhixueZhaoUniversity of Sheffield, University of Sheffield - JörgSchlöttererUniversität Mannheim and Phillips-Universität Marburg - ChristinSeifertPhillips-Universität Marburg and University of Twente + PaulYoussef + ZhixueZhaoUniversity of Sheffield, University of Sheffield + JörgSchlöttererUniversität Mannheim and Phillips-Universität Marburg + ChristinSeifertPhillips-Universität Marburg and University of Twente 12656-12669 In-context knowledge editing (IKE) enables efficient modification of large language model (LLM) outputs without parameter changes and at zero-cost. However, it can be misused to manipulate responses opaquely, e.g., insert misinformation or offensive content. Such malicious interventions could be incorporated into high-level wrapped APIs where the final input prompt is not shown to end-users. To address this issue, we investigate the detection and reversal of IKE-edits. First, we demonstrate that IKE-edits can be detected with high accuracy (F1 > 80%) using only the top-10 output probabilities of the next token, even in a black-box setting, e.g. proprietary LLMs with limited output information. Further, we introduce the novel task of reversing IKE-edits using specially tuned reversal tokens. We explore using both continuous and discrete reversal tokens, achieving over 80% accuracy in recovering original, unedited outputs across multiple LLMs. Our continuous reversal tokens prove particularly effective, with minimal impact on unedited prompts. Through analysis of output distributions, attention patterns, and token rankings, we provide insights into IKE’s effects on LLMs and how reversal tokens mitigate them. This work represents a significant step towards enhancing LLM resilience against potential misuse of in-context editing, improving their transparency and trustworthiness. 
2025.naacl-long.630 @@ -7988,8 +7988,8 @@ <fixed-case>P</fixed-case>er<fixed-case>C</fixed-case>ul: A Story-Driven Cultural Evaluation of <fixed-case>LLM</fixed-case>s in <fixed-case>P</fixed-case>ersian ErfanMoosavi Monazzah - VahidRahimzadeh - YadollahYaghoobzadeh + VahidRahimzadeh + YadollahYaghoobzadeh AzadehShakeryUniversity of Tehran, University of Tehran Mohammad TaherPilehvarCardiff University and TeIAS 12670-12687 @@ -7999,7 +7999,7 @@ Towards Sustainable <fixed-case>NLP</fixed-case>: Insights from Benchmarking Inference Energy in Large Language Models - SohamPoddar + SohamPoddar ParamitaKoleyIndian Institute of Technology Kharagpur, Dhirubhai Ambani Institute Of Information and Communication Technology JanardanMisra NiloyGangulyIndian Institute of Technology Kharagpur, @@ -8012,10 +8012,10 @@ <fixed-case>CSR</fixed-case>-Bench: Benchmarking <fixed-case>LLM</fixed-case> Agents in Deployment of Computer Science Research Repositories YijiaXiao - RunhuiWang + RunhuiWang LuyangKongAmazon DavorGolac - WeiWangUniversity of California, Los Angeles + WeiWangUniversity of California, Los Angeles 12705-12723 The increasing complexity of computer science research projects demands more effective tools for deploying code repositories. Large Language Models (LLMs), such as Anthropic Claude and Meta Llama, have demonstrated significant advancements across various fields of computer science research, including the automation of diverse software engineering tasks. To evaluate the effectiveness of LLMs in handling complex code development tasks of research projects, particularly for NLP/CV/AI/ML/DM topics, we introduce CSR-Bench, a benchmark for Computer Science Research projects. This benchmark assesses LLMs from various aspects including accuracy, efficiency, and deployment script quality, aiming to explore their potential in conducting computer science research autonomously. We also introduce a novel framework, CSR-Agents, that utilizes multiple LLM agents to automate the deployment of GitHub code repositories of computer science research projects. Specifically, by checking instructions from markdown files and interpreting repository structures, the model generates and iteratively improves bash commands that set up the experimental environments and deploy the code to conduct research tasks. Preliminary results from CSR-Bench indicate that LLM agents can significantly enhance the workflow of repository deployment, thereby boosting developer productivity and improving the management of developmental workflows. 
2025.naacl-long.633


@@ -8024,9 +8024,9 @@
<fixed-case>SALAD</fixed-case>: Improving Robustness and Generalization through Contrastive Learning with Structure-Aware and <fixed-case>LLM</fixed-case>-Driven Augmented Data
SuyoungBaeSung Kyun Kwan University
- YunSeokChoiSungKyunKwan University
- HyojunKimSungkyunkwan University
- Jee-HyongLeeSungkyunkwan University
+ YunSeokChoiSungKyunKwan University
+ HyojunKimSungkyunkwan University
+ Jee-HyongLeeSungkyunkwan University
12724-12738
In various natural language processing (NLP) tasks, fine-tuning Pre-trained Language Models (PLMs) often leads to the issue of spurious correlations, which negatively impacts performance, particularly when dealing with out-of-distribution data. To address this problem, we propose **SALAD** (**S**tructure **A**ware and **L**LM-driven **A**ugmented **D**ata), a novel approach designed to enhance model robustness and generalization by generating structure-aware and counterfactually augmented data for contrastive learning. Our method leverages a tagging-based approach to generate structure-aware positive samples and utilizes large language models (LLMs) to generate counterfactual negative samples with diverse sentence patterns. By applying contrastive learning, *SALAD* enables the model to focus on learning the structural relationships between key sentence components while minimizing reliance on spurious correlations. We validate our approach through experiments on three tasks: Sentiment Classification, Sexism Detection, and Natural Language Inference. The results demonstrate that *SALAD* not only improves model robustness and performance across different environments but also enhances generalization to out-of-distribution datasets and cross-domain scenarios.
2025.naacl-long.634


@@ -8040,7 +8040,7 @@
SihyeonParkKorea University
HyeonHwangKorea University
MujeenSungKyung Hee University
- HyunjaeKimYale University
+ HyunjaeKimYale University
JaewooKangKorea University
12739-12753
Large language models (LLMs) hold significant potential for applications in biomedicine, but they struggle with hallucinations and outdated knowledge. While retrieval-augmented generation (RAG) is generally employed to address these issues, it also has its own set of challenges: (1) LLMs are vulnerable to irrelevant or unhelpful context, (2) medical queries are often not well-targeted for helpful information, and (3) retrievers are prone to bias toward the specific source corpus they were trained on. In this study, we present RAG^2 (RAtionale-Guided RAG), a new framework for enhancing the reliability of RAG in biomedical contexts. RAG^2 incorporates three key innovations: a small filtering model trained on perplexity-based labels of rationales, which selectively augments informative snippets of documents while filtering out distractors; LLM-generated rationales as queries to improve the utility of retrieved snippets; a structure designed to retrieve snippets evenly from a comprehensive set of four biomedical corpora, effectively mitigating retriever bias. Our experiments demonstrate that RAG^2 improves the state-of-the-art LLMs of varying sizes, with improvements of up to 6.1%, and it outperforms the previous best medical RAG model by up to 5.6% across three medical question-answering benchmarks. Our code is available at https://github.com/dmis-lab/RAG2
2025.naacl-long.635


Prototype Conditioned Generative Replay for Continual Learning in <fixed-case>NLP</fixed-case>
- XiChen
- MinZeng
+ XiChen
+ MinZeng
12754-12770
Generative replay has proven effective in addressing the catastrophic forgetting issue of continual learning (CL) in natural language processing (NLP). However, relying on a single task-specific token or prompt often falls short in generating pseudo-samples that accurately reflect the true data distribution. This leads to issues of semantic inconsistency and scale inconsistency. To tackle these challenges, we propose a Prototype Conditioned Generative Replay (PCGR) method, which enhances generative replay by incorporating task-level statistics through a Prototype Conditioned Variational Autoencoder (PCVAE). Specifically, task-level embedding statistics are stored as prototypes for each old task. When a new task is introduced, PCVAE draws samples from task-specific prototype-based distributions to generate pseudo-samples. By incorporating the prototype, the generated pseudo-samples are both more representative and sufficiently diverse to reflect the real data distribution. Furthermore, as previously stored prototypes may become outdated due to evolving model parameters, we propose a Prototype Shift Estimation (PSE) to adjust for these changes. Experiments on NLP tasks across two different scenarios show that PCGR outperforms previous state-of-the-art (SOTA) methods.
2025.naacl-long.636



SushritaRakshit
KushalChawlaCapitalOne
Jeanne MBrettNorthwestern University
- JonathanGratchUniversity of Southern California
+ JonathanGratchUniversity of Southern California
12771-12785

2025.naacl-long.637
hale-etal-2025-kodis
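For reference, a minimal standalone sketch (not part of the patch series) of the per-author transformation the script performs at this point, using the papers.yml field names from ingest_orcids.py and, as the example value, the ORCID already recorded for Matt Post in name_variants.yaml:

    import lxml.etree as etree

    # Hypothetical author record, with the field names used by aclpub2's papers.yml
    author_yaml = {"first_name": "Matt", "last_name": "Post",
                   "orcid": "https://orcid.org/0000-0002-1297-6794"}

    author_node = etree.Element("author")
    etree.SubElement(author_node, "first").text = author_yaml["first_name"]
    etree.SubElement(author_node, "last").text = author_yaml["last_name"]
    # As in the script above: keep only the bare identifier, not the URL prefix
    author_node.attrib["orcid"] = author_yaml["orcid"].split("/")[-1]

    print(etree.tostring(author_node).decode())
    # <author orcid="0000-0002-1297-6794"><first>Matt</first><last>Post</last></author>
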
From 9a2ea3665b0e1ca95cc43655066107a094be9f2b Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Thu, 29 May 2025 22:39:41 -0400
Subject: [PATCH 04/18] Add orcid to schema, fix pattern

---
 bin/ingest_orcids.py    | 29 +++++++++++++++++++++--------
 data/xml/2025.naacl.xml |  4 ++--
 data/xml/schema.rnc     |  5 +++--
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/bin/ingest_orcids.py b/bin/ingest_orcids.py
index 8f3f0338f5..8d4e7ae481 100755
--- a/bin/ingest_orcids.py
+++ b/bin/ingest_orcids.py
@@ -31,6 +31,7 @@
 import yaml
 import sys
 import os
+import re
 from pathlib import Path
 import lxml.etree as etree
 from typing import Dict, List
@@ -114,9 +115,9 @@ def main(
         )
         sys.exit(1)
 
-    assert len(papers) == len(volume_node.findall('./paper')), (
-        f"Number of papers in YAML ({len(papers)}) does not match number in XML ({len(volume_node.findall('./paper'))})"
-    )
+    assert len(papers) == len(
+        volume_node.findall('./paper')
+    ), f"Number of papers in YAML ({len(papers)}) does not match number in XML ({len(volume_node.findall('./paper'))})"
 
     for paper, paper_node in zip(papers, volume_node.findall('./paper')):
         # paper_num = int(paper["id"])
@@ -137,12 +138,24 @@ def get_author_xml(author_xml):
             name += last.text or ""
             return name
 
-        for author_yaml, author_node in zip(paper['authors'], paper_node.findall('./author')):
-            print(f"- Author YAML={author_yaml['first_name']} {author_yaml['last_name']} XML={get_author_xml(author_node)}", file=sys.stderr)
+        for author_yaml, author_node in zip(
+            paper['authors'], paper_node.findall('./author')
+        ):
+            print(
+                f"- Author YAML={author_yaml['first_name']} {author_yaml['last_name']} XML={get_author_xml(author_node)}",
+                file=sys.stderr,
+            )
             if orcid := author_yaml.get('orcid'):
-                orcid = orcid.split('/')[-1]  # Extract the ORCID from the URL if it's a full URL
-                # Check if the ORCID is already set in the XML
-                # Set the ORCID attribute
+                # grab ORCID pattern from orcid: \d{4}-\d{4}-\d{4}-\d{3}[0-9X]
+                orcid_pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[0-9X]'
+                match = re.match(orcid_pattern, orcid)
+                if match:
+                    # If the ORCID is in the expected format, use it directly
+                    orcid = match.group(0)
+                else:
+                    print(f"Invalid ORCID format: {orcid}", file=sys.stderr)
+                    continue
+
                 author_node.attrib['orcid'] = orcid
 
     indent(root_node)
diff --git a/data/xml/2025.naacl.xml b/data/xml/2025.naacl.xml
index b3afbe524c..b38c55c258 100644
--- a/data/xml/2025.naacl.xml
+++ b/data/xml/2025.naacl.xml
@@ -1162,7 +1162,7 @@
 MeriemBeloucifUppsala University
 OumaimaHourraneAl Akhawayn University
 RooweitherMabuyaNorth-West University
 - SalomeyOsei
 + SalomeyOsei
 SamuelRutunda
 Tadesse DestawBelay
 Tadesse KebedeGugeHaramaya University
@@ -1760,7 +1760,7 @@
 LolwethuNdolela
 NkirukaOduAfrican University of Science and Technology
 RooweitherMabuyaNorth-West University
 - SalomeyOsei
 + SalomeyOsei
 Shamsuddeen HassanMuhammadImperial College London and Bayero University, Kano-Nigeria
 SokharSamb
 Tadesse KebedeGugeHaramaya University
diff --git a/data/xml/schema.rnc b/data/xml/schema.rnc
index dabcbd8bed..0f4ad85d88 100644
--- a/data/xml/schema.rnc
+++ b/data/xml/schema.rnc
@@ -12,11 +12,12 @@ MarkupText = (text | b | i | url | fixed-case | tex-math )+
 first = element first { xsd:string { pattern="(\S(.*\S)?)?" } }
 last = element last { xsd:string { pattern="\S(.*\S)?" } }
 affiliation = element affiliation { text }
-orcid = element orcid { text }
 google-scholar = element google-scholar{ text }
 semantic-scholar = element semantic-scholar { text }
 Variant = element variant { attribute script { xsd:string }, (first? & last) }
-Person = attribute id { xsd:NCName }?, (first? & last & Variant? & affiliation? & orcid? & google-scholar? & semantic-scholar? )
+Person = attribute id { xsd:NCName }?,
+    attribute orcid { xsd:string { pattern = "\d{4}-\d{4}-\d{4}-\d{3}[0-9X]" } }?,
+    (first? & last & Variant? & affiliation? & google-scholar? & semantic-scholar? )
 
 local-filename = xsd:string { pattern="[A-Za-z0-9._\-]+" }
 bibtex-key = xsd:string { pattern="[A-Za-z0-9\-]+" }
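A note on the hunk above: re.match only matches at the beginning of the string, so an ORCID recorded in papers.yml as a full URL (e.g., https://orcid.org/0000-0002-1297-6794) is reported as an invalid format and skipped, while re.search, which the next patch switches to, scans the whole string. A minimal standalone sketch of the difference:

    import re

    ORCID_PATTERN = r'\d{4}-\d{4}-\d{4}-\d{3}[0-9X]'

    for orcid in ('0000-0002-1297-6794',
                  'https://orcid.org/0000-0002-1297-6794'):
        # re.match anchors at position 0; re.search scans the string
        print(bool(re.match(ORCID_PATTERN, orcid)),   # True, then False
              bool(re.search(ORCID_PATTERN, orcid)))  # True, then True
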
From 280877461bebd42e2bc3259089d1cf3f46949008 Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Thu, 29 May 2025 22:41:01 -0400
Subject: [PATCH 05/18] Add orcids for NAACL 2025 short

---
 bin/ingest_orcids.py    |   2 +-
 data/xml/2025.naacl.xml | 244 ++++++++++++++++++++--------------------
 2 files changed, 123 insertions(+), 123 deletions(-)

diff --git a/bin/ingest_orcids.py b/bin/ingest_orcids.py
index 8d4e7ae481..07aef22d4c 100755
--- a/bin/ingest_orcids.py
+++ b/bin/ingest_orcids.py
@@ -148,7 +148,7 @@ def get_author_xml(author_xml):
             if orcid := author_yaml.get('orcid'):
                 # grab ORCID pattern from orcid: \d{4}-\d{4}-\d{4}-\d{3}[0-9X]
                 orcid_pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[0-9X]'
-                match = re.match(orcid_pattern, orcid)
+                match = re.search(orcid_pattern, orcid)
                 if match:
                     # If the ORCID is in the expected format, use it directly
                     orcid = match.group(0)
diff --git a/data/xml/2025.naacl.xml b/data/xml/2025.naacl.xml
index b38c55c258..bd0fcdc315 100644
--- a/data/xml/2025.naacl.xml
+++ b/data/xml/2025.naacl.xml
@@ -8088,8 +8088,8 @@

Complete Chess Games Enable <fixed-case>LLM</fixed-case> Become A Chess Master
- YinqiZhangXiaohongshu
- XintianHanByteDance Inc.
HaolongLi KediChen ShaohuiLin @@ -8100,7 +8100,7 @@ Predicting the Target Word of Game-playing Conversations using a Low-Rank Dialect Adapter for Decoder Models - DipankarSriragUniversity of New South Wales + DipankarSriragUniversity of New South Wales AdityaJoshiUNSW JacobEisensteinGoogle 8-17 @@ -8114,9 +8114,9 @@ OrenKalinskyAmazon TomerStavAmazon YuriRapoportAmazon - YaronFairsteinAmazon + YaronFairsteinAmazon RamYazdi - NachshonCohenAmazon + NachshonCohenAmazon AlexanderLibovAmazon GuyKushilevitzAmazon 18-32 @@ -8127,11 +8127,11 @@ Cross-Lingual Transfer Learning for Speech Translation RaoMa - MengjieQianUniversity of Cambridge + MengjieQianUniversity of Cambridge YassirFathullahUniversity of Cambridge SiyuanTangUniversity of Cambridge MarkGalesUniversity of Cambridge - KateKnillUniversity of Cambridge + KateKnillUniversity of Cambridge 33-43 There has been increasing interest in building multilingual foundation models for NLP and speech research. This paper examines how to expand the speech translation capability of these models with restricted data. Whisper, a speech foundation model with strong performance on speech recognition and English translation, is used as the example model. Using speech-to-speech retrieval to analyse the audio representations generated by the encoder, we show that utterances from different languages are mapped to a shared semantic space. This shared embedding space can then be leveraged for zero-shot cross-lingual transfer in speech translation. By fine-tuning the Whisper decoder with only English-to-Chinese speech translation data, improved performance for translation to Chinese can be obtained for multiple languages, in addition to English. Furthermore, for languages related to those seen in training it is possible to perform speech translation, despite the model never seeing the language in training, or being able to perform transcription. 2025.naacl-short.4 @@ -8143,7 +8143,7 @@ FengGu AbhilashaRavichanderUniversity of Washington and School of Computer Science, Carnegie Mellon University ShiFengGeorge Washington University - Jordan LeeBoyd-GraberUniversity of Maryland, College Park + Jordan LeeBoyd-GraberUniversity of Maryland, College Park RachelRudinger 44-64 Question answering (QA)—giving correct answers to questions—is a popular task, but we test **reverse question answering (RQA)**: for an input answer, give a question with that answer. Past work tests QA and RQA separately, but we test them jointly, comparing their difficulty, aiding benchmark design, and checking reasoning consistency. We run 16 LLMs on QA and RQA with trivia questions/answers, revealing: 1) Versus RQA, LLMs are much less accurate in RQA for numerical answers, but slightly more accurate in RQA for textual answers; 2) LLMs often answer their own invalid questions from RQA accurately in QA, so RQA errors are not just from knowledge gaps; 3) RQA errors correlate with question difficulty and inversely correlate with answer frequencies in the Dolma corpus; and 4) LLMs struggle to give valid multi-hop questions. By finding question and answer types that lead to RQA errors, we suggest improvements for LLM reasoning. 
@@ -8154,10 +8154,10 @@ Personalized Help for Optimizing Low-Skilled Users’ Strategy FengGu WichayapornWongkamjan - Jordan LeeBoyd-GraberUniversity of Maryland, College Park - Jonathan K.KummerfeldUniversity of Sydney + Jordan LeeBoyd-GraberUniversity of Maryland, College Park + Jonathan K.KummerfeldUniversity of Sydney DenisPeskoff - JonathanMayUniversity of Southern California and USC/ISI + JonathanMayUniversity of Southern California and USC/ISI 65-74 AIs can beat humans in game environments; however, how helpful those agents are to human remains understudied. We augment Cicero, a natural language agent that demonstrates superhuman performance in Diplomacy, to generate both move and message advice based on player intentions. A dozen Diplomacy games with novice and experienced players, with varying advice settings, show that some of the generated advice is beneficial. It helps novices compete with experienced players and in some instances even surpass them. The mere presence of advice can be advantageous, even if players do not follow it. 2025.naacl-short.6 @@ -8165,7 +8165,7 @@ Local Prompt Optimization - YashJainMicrosoft + YashJainMicrosoft VishalChowdhary 75-81 In recent years, the use of prompts to guide the output of Large Language Models have increased dramatically. However, even the best of experts struggle to choose the correct words to stitch up a prompt for the desired task. To solve this, LLM driven prompt optimization emerged as an important problem. Existing prompt optimization methods optimize a prompt globally, where in all the prompt tokens have to be optimized over a large vocabulary while solving a complex task. The large optimization space (tokens) leads to insufficient guidance for a better prompt. In this work, we introduce Local Prompt Optimization (LPO) that integrates with any general automatic prompt engineering method. We identify the optimization tokens in a prompt and nudge the LLM to focus only on those tokens in its optimization step. We observe remarkable performance improvements on Math Reasoning (GSM8k and MultiArith) and BIG-bench Hard benchmarks across various automatic prompt engineering methods. Further, we show that LPO converges to the optimal prompt faster than global methods. @@ -8174,9 +8174,9 @@ Cross-lingual Transfer of Reward Models in Multilingual Alignment - JiwooHongKorea Advanced Institute of Science & Technology + JiwooHongKorea Advanced Institute of Science & Technology NoahLee - RodrigoMartínez-CastañoUniversidad de Santiago de Compostela + RodrigoMartínez-CastañoUniversidad de Santiago de Compostela CésarRodríguez JamesThorneKAIST 82-94 @@ -8188,9 +8188,9 @@ Inference-Time Selective Debiasing to Enhance Fairness in Text Classification Models GlebKuzminArtificial Intelligence Research Institute and Institute for Systems Analysis of Russian Academy of Sciences NeemeshYadavSingapore Management University - IvanSmirnov - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne - ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence + IvanSmirnov + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence 95-107 We propose selective debiasing – an inference-time safety mechanism designed to enhance the overall model quality in terms of prediction performance and fairness, especially in scenarios where retraining the model is impractical. 
The method draws inspiration from selective classification, where at inference time, predictions with low quality, as indicated by their uncertainty scores, are discarded. In our approach, we identify the potentially biased model predictions and, instead of discarding them, we remove bias from these predictions using LEACE – a post-processing debiasing method. To select problematic predictions, we propose a bias quantification approach based on KL divergence, which achieves better results than standard uncertainty quantification methods. Experiments on text classification datasets with encoder-based classification models demonstrate that selective debiasing helps to reduce the performance gap between post-processing methods and debiasing techniques from the at-training and pre-processing categories. 2025.naacl-short.9 @@ -8198,16 +8198,16 @@ Automatic Evaluation of Healthcare <fixed-case>LLM</fixed-case>s Beyond Question-Answering - AnnaArias-Duart - Pablo AgustinMartin-TorresBarcelona Supercomputing Center - DanielHinjosBarcelona Supercomputing Center - PabloBernabeu-PerezBarcelona Supercomputing Center + AnnaArias-Duart + Pablo AgustinMartin-TorresBarcelona Supercomputing Center + DanielHinjosBarcelona Supercomputing Center + PabloBernabeu-PerezBarcelona Supercomputing Center Lucia UrcelayGanzabal - Marta GonzalezMalloBarcelona Supercomputing Center + Marta GonzalezMalloBarcelona Supercomputing Center Ashwin KumarGururajan - EnriqueLopez-CuenaBarcelona Supercomputing Center - SergioAlvarez-NapagaoUniversidad Politécnica de Cataluna - DarioGarcia-GasullaBarcelona Supercomputing Center + EnriqueLopez-CuenaBarcelona Supercomputing Center + SergioAlvarez-NapagaoUniversidad Politécnica de Cataluna + DarioGarcia-GasullaBarcelona Supercomputing Center 108-130 Current Large Language Models (LLMs) benchmarks are often based on open-ended or close-ended QA evaluations, avoiding the requirement of human labor. Close-ended measurements evaluate the factuality of responses but lack expressiveness. Open-ended capture the model’s capacity to produce discourse responses but are harder to assess for correctness. These two approaches are commonly used, either independently or together, though their relationship remains poorly understood. This work is focused on the healthcare domain, where both factuality and discourse matter greatly. It introduces a comprehensive, multi-axis suite for healthcare LLM evaluation, exploring correlations between open and close benchmarks and metrics. Findings include blind spots and overlaps in current methodologies. As an updated sanity check, we release a new medical benchmark–CareQA–, with both open and closed variants. Finally, we propose a novel metric for open-ended evaluations –Relaxed Perplexity– to mitigate the identified limitations. 
2025.naacl-short.10 @@ -8215,8 +8215,8 @@ <fixed-case>STRUX</fixed-case>: An <fixed-case>LLM</fixed-case> for Decision-Making with Structured Explanations - YimingLu - YebowenHuUniversity of Central Florida + YimingLu + YebowenHuUniversity of Central Florida HassanForooshUniversity of Central Florida WeiJinEmory University FeiLiuEmory University @@ -8227,7 +8227,7 @@ Improving <fixed-case>V</fixed-case>ietnamese-<fixed-case>E</fixed-case>nglish Cross-Lingual Retrieval for Legal and General Domains - Toan NgocNguyen + Toan NgocNguyen Nam LeHaiHanoi University of Science and Technology Nguyen DoanHieuHanoi University of Science and Technology Dai AnNguyenHanoi University of Science and Technology @@ -8243,7 +8243,7 @@ Computational Discovery of Chiasmus in Ancient Religious Text HopeMcGovern HaleSirinJohns Hopkins University - TomLippincottDepartment of Computer Science, Whiting School of Engineering + TomLippincottDepartment of Computer Science, Whiting School of Engineering 154-160 Chiasmus, a debated literary device in Biblical texts, has captivated mystics while sparking ongoing scholarly discussion. In this paper, we introduce the first computational approach to systematically detect chiasmus within Biblical passages. Our method leverages neural embeddings to capture lexical and semantic patterns associated with chiasmus, applied at multiple levels of textual granularity (half-verses, verses). We also involve expert annotators to review a subset of the detected patterns. Despite its computational efficiency, our method achieves robust results, with high inter-annotator agreement and system accuracy of 0.80 at the verse level and 0.60 at the half-verse level. We further provide a qualitative analysis of the distribution of detected chiasmi, along with selected examples that highlight the effectiveness of our approach. 2025.naacl-short.13 @@ -8253,7 +8253,7 @@ Characterizing the Effects of Translation on Intertextuality using Multilingual Embedding Spaces HopeMcGovern HaleSirinJohns Hopkins University - TomLippincottDepartment of Computer Science, Whiting School of Engineering + TomLippincottDepartment of Computer Science, Whiting School of Engineering 161-167 Rhetorical devices are difficult to translate, but they are crucial to the translation of literary documents. We investigate the use of multilingual embedding spaces to characterize the preservation of intertextuality, one common rhetorical device, across human and machine translation. To do so, we use Biblical texts, which are both full of intertextual references and are highly translated works. We provide a metric to characterize intertextuality at the corpus level and provide a quantitative analysis of the preservation of this rhetorical device across extant human translations and machine-generated counterparts. We go on to provide qualitative analysis of cases wherein human translations over- or underemphasize the intertextuality present in the text, whereas machine translations provide a neutral baseline. This provides support for established scholarship proposing that human translators have a propensity to amplify certain literary characteristics of the original manuscripts. 
2025.naacl-short.14 @@ -8262,10 +8262,10 @@ <fixed-case>LLM</fixed-case>2: Let Large Language Models Harness System 2 Reasoning ChengYangTsinghua University, Tsinghua University - ChufanShi + ChufanShi SihengLi - BoShui - YujiuYangTsinghua University + BoShui + YujiuYangTsinghua University WaiLamThe Chinese University of Hong Kong 168-177 Large language models (LLMs) have exhibited impressive capabilities across a myriad of tasks, yet they occasionally yield undesirable outputs. We posit that these limitations are rooted in the foundational autoregressive architecture of LLMs, which inherently lacks mechanisms for differentiating between desirable and undesirable results. Drawing inspiration from the dual-process theory of human cognition, we introduce LLM2, a novel framework that combines an LLM (System 1) with a process-based verifier (System 2). Within LLM2, the LLM is responsible for generating plausible candidates, while the verifier provides timely process-based feedback to distinguish desirable and undesirable outputs. The verifier is trained with a pairwise comparison loss on synthetic process-supervision data generated through our token quality exploration strategy. Empirical results on mathematical reasoning benchmarks substantiate the efficacy of LLM2, exemplified by an accuracy enhancement from 50.3 to 57.8 (+7.5) for Llama3-1B on GSM8K. Furthermore, when combined with self-consistency, LLM2 achieves additional improvements, boosting major@20 accuracy from 56.2 to 70.2 (+14.0). @@ -8277,7 +8277,7 @@ YanhongLi DavidYunis DavidMcAllesterToyota Technological Institute at Chicago - JiaweiZhouState University of New York at Stony Brook + JiaweiZhouState University of New York at Stony Brook 178-194 There has recently been considerable interest in incorporating information retrieval into large language models (LLMs). Retrieval from a dynamically expanding external corpus of text allows a model to incorporate current events and can be viewed as a form of episodic memory. Here we demonstrate that pre-processing the external corpus into semi-structured “atomic facts” makes retrieval more efficient. More specifically, we demonstrate that our particular form of atomic facts improves performance on various question answering tasks when the amount of retrieved text is limited. Limiting the amount of retrieval reduces the size of the context and improves inference efficiency. 2025.naacl-short.16 @@ -8285,7 +8285,7 @@ Sports and Women’s Sports: Gender Bias in Text Generation with Olympic Data - LauraBiesterMiddlebury College + LauraBiesterMiddlebury College 195-205 Large Language Models (LLMs) have been shown to be biased in prior work, as they generate text that is in line with stereotypical views of the world or that is not representative of the viewpoints and values of historically marginalized demographic groups. In this work, we propose using data from parallel men’s and women’s events at the Olympic Games to investigate different forms of gender bias in language models. We define three metrics to measure bias, and find that models are consistently biased against women when the gender is ambiguous in the prompt. In this case, the model frequently retrieves only the results of the men’s event with or without acknowledging them as such, revealing pervasive gender bias in LLMs in the context of athletics. 
2025.naacl-short.17 @@ -8318,7 +8318,7 @@ Concept-Reversed <fixed-case>W</fixed-case>inograd Schema Challenge: Evaluating and Improving Robust Reasoning in Large Language Models via Abstraction KaiqiaoHan TianqingFangTencent AI Lab - ZhaoweiWangEdinburgh University, University of Edinburgh and Department of Computer Science and Engineering, Hong Kong University of Science and Technology + ZhaoweiWangEdinburgh University, University of Edinburgh and Department of Computer Science and Engineering, Hong Kong University of Science and Technology YangqiuSongHong Kong University of Science and Technology MarkSteedmanUniversity of Edinburgh 229-243 @@ -8330,7 +8330,7 @@ Defense against Prompt Injection Attacks via Mixture of Encodings RuiyiZhangUniversity of California, San Diego DavidSullivanMicrosoft - KyleJacksonMicrosoft + KyleJacksonMicrosoft PengtaoXieUniversity of California, San Diego and Carnegie Mellon University MeiChenMicrosoft 244-252 @@ -8341,7 +8341,7 @@ Watching the <fixed-case>AI</fixed-case> Watchdogs: A Fairness and Robustness Analysis of <fixed-case>AI</fixed-case> Safety Moderation Classifiers AkshitAchara - AnshumanChhabraUniversity of South Florida + AnshumanChhabraUniversity of South Florida 253-264 AI Safety Moderation (ASM) classifiers are designed to moderate content on social media platforms and to serve as guardrails that prevent Large Language Models (LLMs) from being fine-tuned on unsafe inputs. Owing to their potential for disparate impact, it is crucial to ensure that these classifiers: (1) do not unfairly classify content belonging to users from minority groups as unsafe compared to those from majority groups and (2) that their behavior remains robust and consistent across similar inputs. In this work, we thus examine the fairness and robustness of four widely-used, closed-source ASM classifiers: OpenAI Moderation API, Perspective API, Google Cloud Natural Language (GCNL) API, and Clarifai API. We assess fairness using metrics such as demographic parity and conditional statistical parity, comparing their performance against ASM models and a fair-only baseline. Additionally, we analyze robustness by testing the classifiers’ sensitivity to small and natural input perturbations. Our findings reveal potential fairness and robustness gaps, highlighting the need to mitigate these issues in future versions of these models. 2025.naacl-short.22 @@ -8360,9 +8360,9 @@ Is It <fixed-case>N</fixed-case>avajo? Accurate Language Detection for Endangered Athabaskan Languages IvoryYang - WeichengMaGeorgia Institute of Technology + WeichengMaGeorgia Institute of Technology ChunhuiZhangDartmouth College - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College 277-284 Endangered languages, such as Navajo—the most widely spoken Native American language—are significantly underrepresented in contemporary language technologies, exacerbating the challenges of their preservation and revitalization. This study evaluates Google’s Language Identification (LangID) tool, which does not currently support any Native American languages. To address this, we introduce a random forest classifier trained on Navajo and twenty erroneously suggested languages by LangID. Despite its simplicity, the classifier achieves near-perfect accuracy (97-100%). 
Additionally, the model demonstrates robustness across other Athabaskan languages—a family of Native American languages spoken primarily in Alaska, the Pacific Northwest, and parts of the Southwestern United States—suggesting its potential for broader application. Our findings underscore the pressing need for NLP systems that prioritize linguistic diversity and adaptability over centralized, one-size-fits-all solutions, especially in supporting underrepresented languages in a multicultural world. This work directly contributes to ongoing efforts to address cultural biases in language models and advocates for the development of culturally localized NLP tools that serve diverse linguistic communities. 2025.naacl-short.24 @@ -8370,8 +8370,8 @@ Don’t Touch My Diacritics - KyleGormanThe Graduate Center, City University of New York and Google - YuvalPinterBen-Gurion University of the Negev + KyleGormanThe Graduate Center, City University of New York and Google + YuvalPinterBen-Gurion University of the Negev 285-291 The common practice of preprocessing text before feeding it into NLP models introduces many decision points which have unintended consequences on model performance. In this opinion piece, we focus on the handling of diacritics in texts originating in many languages and scripts. We demonstrate, through several case studies, the adverse effects of inconsistent encoding of diacritized characters and of removing diacritics altogether. We call on the community to adopt simple but necessary steps across all models and toolkits in order to improve handling of diacritized text and, by extension, increase equity in multilingual NLP. 2025.naacl-short.25 @@ -8382,7 +8382,7 @@ ChunhuiZhangDartmouth College YirenJianOpenAI ZhongyuOuyang - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College 292-305 Developing video captioning models is computationally expensive. The dynamic nature of video also complicates the design of multimodal models that can effectively caption these sequences. However, we find that by using minimal computational resources and without complex modifications to address video dynamics, an image-based model can be repurposed to outperform several specialised video captioning systems. Our adapted model demonstrates top-tier performance on major benchmarks, ranking 2nd on MSR-VTT and MSVD, and 3rd on VATEX. We transform it into a competitive video captioner by post-training a typical image captioning model BLIP-2 with only 6,000 video-text pairs and simply concatenating frames—significantly fewer data than other methods, which use 2.5 to 144 million pairs. From a resource optimization perspective, this video captioning study focuses on three fundamental factors: optimizing model scale, maximizing data efficiency, and incorporating reinforcement learning. This extensive study demonstrates that a lightweight, image-based adaptation strategy can rival state-of-the-art video captioning systems, offering a practical solution for low-resource scenarios. 2025.naacl-short.26 @@ -8395,8 +8395,8 @@ CunxiaoDuSea AI LAB YanyingZhou MinghuiQiu - QianruSunSingapore Management University - HaoZhangAlibaba Group + QianruSunSingapore Management University + HaoZhangAlibaba Group JiaweiWuNational University of Singapore 306-320 Humans are accustomed to reading and writing in a forward manner, and this natural bias extends to text understanding in auto-regressive large language models (LLMs). 
This paper investigates whether LLMs, like humans, struggle with reverse modeling, specifically with reversed text inputs. We found that publicly available pre-trained LLMs cannot understand such inputs. However, LLMs trained from scratch with both forward and reverse texts can understand them equally well during inference across multiple languages.Our case study shows that different-content texts result in different losses if input (to LLMs) in different directions—some get lower losses for forward while some for reverse. This leads us to a simple and nice solution for data selection based on the loss differences between forward and reverse directions. Using our selected data in continued pretraining can boost LLMs’ performance by a large margin across different language understanding benchmarks. @@ -8405,7 +8405,7 @@ Preserving Multilingual Quality While Tuning Query Encoder on <fixed-case>E</fixed-case>nglish Only - OlegVasilyevPrimer Technologies + OlegVasilyevPrimer Technologies RandySawayaPrimer Technologies JohnBohannon 321-341 @@ -8416,11 +8416,11 @@ Using Contextually Aligned Online Reviews to Measure <fixed-case>LLM</fixed-case>s’ Performance Disparities Across Language Varieties ZixinTangThe Pennsylvania State University - Chieh-YangHuangMetaMetrics + Chieh-YangHuangMetaMetrics Tsung-cheLi, Academia Sinica Ho Yin SamNgPennsylvania State University - Hen-HsenHuangInstitute of Information Science, Academia Sinica - Ting-Hao KennethHuangPennsylvania State University + Hen-HsenHuangInstitute of Information Science, Academia Sinica + Ting-Hao KennethHuangPennsylvania State University 342-355 A language can have different varieties. These varieties can affect the performance of natural language processing (NLP) models, including large language models (LLMs), which are often trained on data from widely spoken varieties. This paper introduces a novel and cost-effective approach to benchmark model performance across language varieties. We argue that international online review platforms,such as Booking.com, can serve as effective data sources for constructing datasets that capture comments in different language varieties from similar real-world scenarios, like reviews for the same hotel with the same rating using the same language (e.g., Mandarin Chinese) but different language varieties (e.g., Taiwan Mandarin, Mainland Mandarin). To prove this concept, we constructed a contextually aligned dataset comprising reviews in Taiwan Mandarin and Mainland Mandarin and tested six LLMs in a sentiment analysis task. Our results show that LLMs consistently underperform in Taiwan Mandarin. 
2025.naacl-short.29 @@ -8441,9 +8441,9 @@ WenbinDuanPeople’s Public Security University of China ZhiyiYin, Chinese Academy of Sciences YinghanShen - ShaolingJing + ShaolingJing JieZhangInstitute of Computing Technology, Chinese Academy of Sciences - HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences + HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences XueqiChengInstitute of Computing Technology, Chinese Academy 363-373 2025.naacl-short.31 @@ -8452,8 +8452,8 @@ <fixed-case>STEP</fixed-case>: Staged Parameter-Efficient Pre-training for Large Language Models KazukiYano - TakumiItoLangsmith Inc., Tohoku University and Machine Learning Solutions - JunSuzukiTohoku University + TakumiItoLangsmith Inc., Tohoku University and Machine Learning Solutions + JunSuzukiTohoku University 374-384 Pre-training large language models (LLMs) faces significant memory challenges due to the large size of model weights. We introduce STaged parameter-Efficient Pre-training (STEP), which integrates parameter-efficient tuning techniques with model growth. We conduct experiments on pre-training LLMs of various sizes and demonstrate that STEP achieves up to a 53.9% reduction in maximum memory requirements compared to vanilla pre-training while maintaining equivalent performance. Furthermore, we show that the model by STEP performs comparably to vanilla pre-trained models on downstream tasks after instruction tuning. 2025.naacl-short.32 @@ -8481,7 +8481,7 @@ <fixed-case>AMPS</fixed-case>: <fixed-case>ASR</fixed-case> with Multimodal Paraphrase Supervision AbhishekGupta - AmrutaParulekar + AmrutaParulekar SameepChattopadhyay PreethiJyothiIndian Institute of Technology Bombay 404-413 @@ -8494,7 +8494,7 @@ ChunlanMa AyyoobImaniMicrosoft HaotianYeCenter for Information and Language Processing - RenhaoPei + RenhaoPei EhsaneddinAsgariQatar Computing Research Institute and University of California, Berkeley HinrichSchuetze 414-439 @@ -8504,11 +8504,11 @@ <fixed-case>G</fixed-case>ame<fixed-case>T</fixed-case>ox: A Comprehensive Dataset and Analysis for Enhanced Toxicity Detection in Online Gaming Communities - UsmanNaseemMacquarie University - ShuvamShiwakoti + UsmanNaseemMacquarie University + ShuvamShiwakoti Siddhant BikramShahNortheastern University - SurendrabikramThapaVirginia Polytechnic Institute and State University - QiZhangTongji University + SurendrabikramThapaVirginia Polytechnic Institute and State University + QiZhangTongji University 440-447 The prevalence of toxic behavior in online gaming communities necessitates robust detection methods to ensure user safety. We introduce GameTox, a novel dataset comprising 53K game chat utterances annotated for toxicity detection through intent classification and slot filling. This dataset captures the complex relationship between user intent and specific linguistic features that contribute to toxic interactions. We extensively analyze the dataset to uncover key insights into the nature of toxic speech in gaming environments. Furthermore, we establish baseline performance metrics using state-of-the-art natural language processing and large language models, demonstrating the dataset’s contribution towards enhancing the detection of toxic behavior and revealing the limitations of contemporary models. Our results indicate that leveraging both intent detection and slot filling provides a significantly more granular and context-aware understanding of harmful messages. 
This dataset serves as a valuable resource to train advanced models that can effectively mitigate toxicity in online gaming and foster healthier digital spaces. Our dataset is publicly available at: https://github.com/shucoll/GameTox. 2025.naacl-short.37 @@ -8517,17 +8517,17 @@ <fixed-case>F</fixed-case>aith<fixed-case>B</fixed-case>ench: A Diverse Hallucination Benchmark for Summarization by <fixed-case>M</fixed-case>odern <fixed-case>LLM</fixed-case>s Forrest ShengBaoVectara, Inc. - MiaoranLiIowa State University + MiaoranLiIowa State University RenyiQuVectara - GeLuoVectara Inc. + GeLuoVectara Inc. EranaWan YujiaTangAlibaba Group - WeisiFan + WeisiFan Manveer SinghTamberUniversity of Waterloo SulemanKaziVectara VivekSourabh MikeQiTextea Inc. - RuixuanTu + RuixuanTu ChenyuXu MatthewGonzalesVectara OferMendelevitchTel Aviv University @@ -8552,7 +8552,7 @@ Great Memory, Shallow Reasoning: Limits of <tex-math>k</tex-math><fixed-case>NN</fixed-case>-<fixed-case>LM</fixed-case>s ShangyiGeng WentingZhaoCornell University - Alexander MRushCornell University and School of Engineering and Applied Sciences, Harvard University + Alexander MRushCornell University and School of Engineering and Applied Sciences, Harvard University 471-482 K-nearest neighbor language models (kNN-LMs), which integrate retrieval with next-word prediction, have demonstrated strong performance in language modeling as well as some downstream NLP benchmarks. These results have led researchers to argue that models trained on poor quality or outdated data could perform well by employing a kNN extension that has access to a higher-quality datastore. In this work, we ask whether this improved ability to recall information really translates into downstream abilities. We extensively evaluate kNN-LMs on a diverse set of tasks, ranging from sentiment classification and commonsense reasoning to multi-hop reasoning. Results show that kNN-LMs excel at memory-intensive tasks, where utilizing the patterns in the input is sufficient for determining the output, but struggle with reasoning tasks that require integrating multiple pieces of information to derive new knowledge. We further demonstrate through oracle experiments and qualitative analysis that even with perfect retrieval, kNN-LMs still fail to determine the correct answers, placing an upper bound on their reasoning performance. 2025.naacl-short.40 @@ -8561,7 +8561,7 @@ Repetition Neurons: How Do Language Models Produce Repetitions? TatsuyaHiraokaMohamed bin Zayed University of Artificial Intelligence and RIKEN - KentaroInuiMBZUAI, RIKEN and Tohoku University + KentaroInuiMBZUAI, RIKEN and Tohoku University 483-495 This paper introduces repetition neurons, which can be regarded as “skill neurons” responsible for the repetition problem in text generation tasks. These neurons are progressively activated more strongly as repetition continues, indicating that they perceive repetition as a task to copy the previous context repeatedly, similar to in-context learning. We identify these repetition neurons by comparing activation values before and after the onset of repetition in texts generated by recent pre-trained language models. We analyze the repetition neurons in three English and one Japanese pre-trained language models and observe similar patterns across them. 
2025.naacl-short.41 @@ -8574,7 +8574,7 @@ TejaswiniPedapati I-HsinChungInternational Business Machines Mi-YenYehAcademia Sinica - Pin-YuChenInternational Business Machines + Pin-YuChenInternational Business Machines 496-505 Model merging is an efficient way of obtaining a multi-task model from several pretrained models without further fine-tuning, and it has gained attention in various domains, including natural language processing (NLP). Despite the efficiency, a key challenge in model merging is the seemingly inevitable decrease in task performance as the number of models increases. In this paper, we propose **S**pectral **T**runcation **A**nd **R**escale (STAR) that aims at mitigating “merging conflicts” by truncating small components in the respective spectral spaces, which is followed by an automatic parameter rescaling scheme to retain the nuclear norm of the original matrix. STAR requires no additional inference on original training data and is robust to hyperparamater choice. We demonstrate the effectiveness of STAR through extensive model merging cases on diverse NLP tasks. Specifically, STAR works robustly across varying model sizes, and can outperform baselines by 4.2% when merging 12 models on Flan-T5. Our code is publicly available at https://github.com/IBM/STAR. 2025.naacl-short.42 @@ -8583,7 +8583,7 @@ Task-driven Layerwise Additive Activation Intervention Hieu TrungNguyenVinai Research - BaoNguyen + BaoNguyen BinhNguyenNational University of Singapore Viet AnhNguyenThe Chinese University of Hong Kong 506-513 @@ -8602,7 +8602,7 @@ Black-Box Visual Prompt Engineering for Mitigating Object Hallucination in Large Vision Language Models - SangminWoo + SangminWoo KangZhouAmazon YunZhouAmazon ShuaiWangAmazon @@ -8617,9 +8617,9 @@ A Layered Debating Multi-Agent System for Similar Disease Diagnosis YutianZhaoTencent AI Lab - HuiminWangJarvis Research Center, Tencent YouTu Lab - YefengZhengWestlake University - XianWuTencent + HuiminWangJarvis Research Center, Tencent YouTu Lab + YefengZhengWestlake University + XianWuTencent 539-549 Distinguishing between extremely similar diseases is a critical and challenging aspect of clinical decision-making. Traditional classification, contrastive learning, and Large Language Models (LLMs) based methods fail to detect the subtle clues necessary for differentiation. This task demands complex reasoning and a variety of tools to identify minor differences and make informed decisions. This paper probes a novel framework that leverages LLMs and a multi-agent system to achieve accurate disease diagnosis through a process of repeated debate and reassessment. The approach aims to identify subtle differences between similar disease candidates. We structure patient information and integrate extensive medical knowledge to guide the analysis towards discerning these differences for precise diagnosis. Comprehensive experiments were conducted on two public datasets and two newly introduced datasets, JarvisD2-Chinese and JarvisD2-English, to validate the effectiveness of our method. The results confirm the efficacy of our approach, demonstrating its potential to enhance diagnostic precision in healthcare. 
2025.naacl-short.46 @@ -8631,7 +8631,7 @@ TatsuyaHiraokaMohamed bin Zayed University of Artificial Intelligence and RIKEN HilalAlQuabehMohamed bin Zayed University of Artificial Intelligence BenjaminHeinzerlingTohoku University and RIKEN - KentaroInuiMBZUAI, RIKEN and Tohoku University + KentaroInuiMBZUAI, RIKEN and Tohoku University 550-561 This paper investigates whether large language models (LLMs) utilize numerical attributes encoded in a low-dimensional subspace of theembedding space when answering questions involving numeric comparisons, e.g., Was Cristiano born before Messi? We first identified,using partial least squares regression, these subspaces, which effectively encode the numerical attributes associated with the entities in comparison prompts. Further, we demonstrate causality, by intervening in these subspaces to manipulate hidden states, thereby altering the LLM’s comparison outcomes. Experiments conducted on three different LLMs showed that our results hold across different numerical attributes, indicating that LLMs utilize the linearly encoded information for numerical reasoning. 2025.naacl-short.47 @@ -8640,11 +8640,11 @@ <fixed-case>A</fixed-case>lign<fixed-case>F</fixed-case>reeze: Navigating the Impact of Realignment on the Layers of Multilingual Models Across Diverse Languages SteveBakos - DavidGuzmán + DavidGuzmán RiddhiMore Kelly ChutongLi FélixGaschiPosos - En-Shiun AnnieLee + En-Shiun AnnieLee 562-586 Realignment techniques are often employed to enhance cross-lingual transfer in multilingual language models, still, they can sometimes degrade performance in languages that differ significantly from the fine-tuned source language. This paper introduces AlignFreeze, a method that freezes either the layers’ lower half or upper half during realignment. Through controlled experiments on 4 tasks, 3 models, and in 35 languages, we find that realignment affects all the layers but can be the most detrimental to the lower ones. Freezing the lower layers can prevent performance degradation. Particularly, AlignFreeze improves Part-of-Speech (PoS) tagging performances in languages where full realignment fails: with XLM-R, it provides improvements of more than one standard deviation in accuracy in seven more languages than full realignment. 2025.naacl-short.48 @@ -8654,9 +8654,9 @@ <fixed-case>FLIQA</fixed-case>-<fixed-case>AD</fixed-case>: a Fusion Model with Large Language Model for Better Diagnose and <fixed-case>MMSE</fixed-case> Prediction of <fixed-case>A</fixed-case>lzheimer’s Disease JunhaoChen ZhiyuanDing - YanLiu + YanLiu XiangzhuZengPeking University - LingWangUniversity of Electronic Science and Technology of China + LingWangUniversity of Electronic Science and Technology of China 587-594 Tracking a patient’s cognitive status early in the onset of the disease provides an opportunity to diagnose and intervene in Alzheimer’s disease (AD). However, relying solely on magnetic resonance imaging (MRI) images with traditional classification and regression models may not fully extract finer-grained information. This study proposes a multi-task Fusion Language Image Question Answering model (FLIQA-AD) to perform AD identification and Mini Mental State Examination (MMSE) prediction. Specifically, a 3D Adapter is introduced in Vision Transformer (ViT) model for image feature extraction. The patient electronic health records (EHR) information and questions related to the disease work as text prompts to be encoded. 
Then, an ADFormer model, which combines self-attention and cross-attention mechanisms, is used to capture the correlation between EHR information and structure features. After that, the extracted brain structural information and textual content are combined as input sequences for the large language model (LLM) to identify AD and predict the corresponding MMSE score. Experimental results demonstrate the strong discrimination and MMSE prediction performance of the model, as well as question-answer capabilities. 2025.naacl-short.49 @@ -8664,7 +8664,7 @@ Transform Retrieval for Textual Entailment in <fixed-case>RAG</fixed-case> - XinLiang + XinLiang QuanGuo 595-599 In this paper, we introduce Transform Retrieval, a novel approach aimed at improving Textual Entailment Retrieval within the framework of Retrieval-Augmented Generation (RAG). While RAG has shown promise in enhancing Large Language Models by retrieving relevant documents to extract specific knowledge or mitigate hallucination, current retrieval methods often prioritize relevance without ensuring the retrieved documents semantically support answering the queries. Transform Retrieval addresses this gap by transforming query embeddings to better align with semantic entailment without re-encoding the document corpus. We achieve this by using a transform model and employing a contrastive learning strategy to optimize the alignment between transformed query embeddings and document embeddings for better entailment.We evaluated the framework using BERT as frozen pre-trained encoder and compared it with a fully fine-tuned skyline model. Experimental results show that Transform Retrieval with simple MLP consistently approaches the skyline across multiple datasets, demonstrating the method’s effectiveness. The high performance on HotpotQA highlights its strength in many-to-many retrieval scenarios. @@ -8698,10 +8698,10 @@ Auto-Cypher: Improving <fixed-case>LLM</fixed-case>s on Cypher generation via <fixed-case>LLM</fixed-case>-supervised generation-verification framework - AmanTiwariServiceNow Inc + AmanTiwariServiceNow Inc Shiva Krishna ReddyMalayServiceNow Inc VikasYadav - MasoudHashemiServiceNow Inc + MasoudHashemiServiceNow Inc Sathwik TejaswiMadhusudhanServiceNow Inc 623-640 Graph databases like Neo4j are gaining popularity for handling complex, interconnected data, over traditional relational databases in modeling and querying relationships. While translating natural language into SQL queries is well-researched, generating Cypher queries for Neo4j remains relatively underexplored. In this work, we present an automated, LLM Supervised, pipeline to generate high quality synthetic data for Text2Cypher. Our Cypher data generation pipeline introduces LLM-As-Database-Filler, a novel strategy for ensuring Cypher query correctness, thus resulting in high quality generations. Using our pipeline, we generate high quality Text2Cypher data - SynthCypher containing 29.8k instances across various domains and queries with varying complexities. Training open-source LLMs like LLaMa-3.1-8B, Mistral-7B, and QWEN7B on SynthCypher results in performance gains of up to 40% on the Text2Cypher test split and 30% on the SPIDER benchmark, adapted for graph databases. @@ -8720,7 +8720,7 @@ A Fair Comparison without Translationese: <fixed-case>E</fixed-case>nglish vs. 
Target-language Instructions for Multilingual <fixed-case>LLM</fixed-case>s TaiseiEnomotoTokyo Metropolitan University HwichanKimTokyo Metropolitan University - ZhousiChen + ZhousiChen MamoruKomachiHitotsubashi University 649-670 Most large language models are multilingual instruction executors. Prior studies suggested that English instructions are more effective than target-language instructions even for non-English tasks; however, these studies often use datasets and instructions translated from English, which introduce biases known as translationese, hindering an unbiased comparison. To address this issue, we conduct a fair comparison between English and target-language instructions by eliminating translationese effects. Contrary to previous studies, our experiments across several tasks reveal that the advantage of adopting English instructions is not overwhelming. Additionally, we report on the features of generated texts and the instruction-following abilities when using respective instructions. @@ -8740,8 +8740,8 @@ <fixed-case>S</fixed-case>cratch<fixed-case>E</fixed-case>val: Are <fixed-case>GPT</fixed-case>-4o Smarter than My Child? Evaluating Large Multimodal Models with Visual Programming Challenges RaoFu ZiyangLuoSalesforce Research and Hong Kong Baptist University - HongzhanLinHong Kong Baptist University - ZhenYe + HongzhanLinHong Kong Baptist University + ZhenYe JingMaHong Kong Baptist University 689-699 Recent advancements in large multimodal models (LMMs) have showcased impressive code generation capabilities, primarily evaluated through image-to-code benchmarks. However, these benchmarks are limited to specific visual programming scenarios where the logic reasoning and the multimodal understanding capacities are split apart. To fill this gap, we propose ScratchEval, a novel benchmark designed to evaluate the visual programming reasoning ability of LMMs. ScratchEval is based on Scratch, a block-based visual programming language widely used in children’s programming education. By integrating visual elements and embedded programming logic, ScratchEval requires the model to process both visual information and code structure, thereby comprehensively evaluating its programming intent understanding ability. Our evaluation approach goes beyond the traditional image-to-code mapping and focuses on unified logical thinking and problem-solving abilities, providing a more comprehensive and challenging framework for evaluating the visual programming ability of LMMs. ScratchEval not only fills the gap in existing evaluation methods, but also provides new insights for the future development of LMMs in the field of visual programming. @@ -8761,8 +8761,8 @@ <fixed-case>DART</fixed-case>: An <fixed-case>AIGT</fixed-case> Detector using <fixed-case>AMR</fixed-case> of Rephrased Text HyeonchuPark - ByungjunKimChung-Ang University - BugeunKimChung-Ang University + ByungjunKimChung-Ang University + BugeunKimChung-Ang University 710-721 As large language models (LLMs) generate more human-like texts, concerns about the side effects of AI-generated texts (AIGT) have grown. So, researchers have developed methods for detecting AIGT. However, two challenges remain. First, the performance of detecting black-box LLMs is low because existing models focus on probabilistic features. Second, most AIGT detectors have been tested on a single-candidate setting, which assumes that we know the origin of an AIGT and which may deviate from the real-world scenario. 
To resolve these challenges, we propose DART, which consists of four steps: rephrasing, semantic parsing, scoring, and multiclass classification. We conducted three experiments to test the performance of DART. The experimental result shows that DART can discriminate multiple black-box LLMs without probabilistic features and the origin of AIGT. 2025.naacl-short.59 @@ -8781,11 +8781,11 @@ Language Models “Grok” to Copy - AngLv - RuobingXie - XingwuSunTencent AI Platform - ZhanhuiKang - RuiYanRenmin University of China + AngLv + RuobingXie + XingwuSunTencent AI Platform + ZhanhuiKang + RuiYanRenmin University of China 735-741 We examine the pre-training dynamics of language models, focusing on their ability to copy text from preceding context—a fundamental skill for various LLM applications, including in-context learning (ICL) and retrieval-augmented generation (RAG). We propose a novel perspective that Transformer-based language models develop copying abilities similarly to grokking, which refers to sudden generalization on test set long after the model fit to the training set. Our experiments yield three arguments: (1) The pre-training loss decreases rapidly, while the context copying ability of models initially lags and then abruptly saturates. (2) The speed of developing copying ability is independent of the number of tokens trained, similarly to how grokking speed is unaffected by dataset size as long as the data distribution is preserved. (3) Induction heads, the attention heads responsible for copying, form from shallow to deep layers during training, mirroring the development of circuits in deeper layers during grokking. We contend that the connection between grokking and context copying can provide valuable insights for more effective language model training, ultimately improving in-context performance. For example, we demonstrated that techniques that enhance grokking, such as regularization, either accelerate or enhance the development of context copying. 2025.naacl-short.61 @@ -8794,8 +8794,8 @@ Evaluating <fixed-case>LLM</fixed-case>s for Quotation Attribution in Literary Texts: A Case Study of <fixed-case>LL</fixed-case>a<fixed-case>M</fixed-case>a3 GaspardMichel - Elena V.EpureDeezer - RomainHennequin + Elena V.EpureDeezer + RomainHennequin ChristopheCerisaraUniversity of Lorraine 742-755 Large Language Models (LLMs) have shown promising results in a variety of literary tasks, often using complex memorized details of narration and fictional characters. In this work, we evaluate the ability of Llama-3 at attributing utterances of direct-speech to their speaker in novels. The LLM shows impressive results on a corpus of 28 novels, surpassing published results with ChatGPT and encoder-based baselines by a large margin. We then validate these results by assessing the impact of book memorization and annotation contamination.We found that these types of memorization do not explain the large performance gain, making Llama-3 the new state-of-the-art for quotation attribution in English literature. We release publicly our code and data. 
@@ -8805,8 +8805,8 @@ Beyond Literal Token Overlap: Token Alignability for Multilinguality KatharinaHämmerl - TomaszLimisiewiczMeta and University of Washington - JindřichLibovickýCharles University Prague + TomaszLimisiewiczMeta and University of Washington + JindřichLibovickýCharles University Prague AlexanderFraserTechnical University of Munich 756-767 Previous work has considered token overlap, or even similarity of token distributions, as predictors for multilinguality and cross-lingual knowledge transfer in language models. However, these very literal metrics assign large distances to language pairs with different scripts, which can nevertheless show good cross-linguality. This limits the explanatory strength of token overlap for knowledge transfer between language pairs that use distinct scripts or follow different orthographic conventions. In this paper, we propose subword token alignability as a new way to understand the impact and quality of multilingual tokenisation. In particular, this metric predicts multilinguality much better when scripts are disparate and the overlap of literal tokens is low. We analyse this metric in the context of both encoder and decoder models, look at data size as a potential distractor, and discuss how this insight may be applied to multilingual tokenisation in future work. We recommend our subword token alignability metric for identifying optimal language pairs for cross-lingual transfer, as well as to guide the construction of better multilingual tokenisers in the future. We publish our code and reproducibility details. @@ -8816,7 +8816,7 @@ <fixed-case>I</fixed-case>dentify<fixed-case>M</fixed-case>e: A Challenging Long-Context Mention Resolution Benchmark for <fixed-case>LLM</fixed-case>s KawshikManikantan - MakarandTapaswiInternational Institute of Information Technology Hyderabad and Wadhwani Institute for Artificial Intelligence + MakarandTapaswiInternational Institute of Information Technology Hyderabad and Wadhwani Institute for Artificial Intelligence VineetGandhiInternational Institute of Information Technology Hyderabad, Dhirubhai Ambani Institute Of Information and Communication Technology ShubhamToshniwalNVIDIA 768-777 @@ -8828,7 +8828,7 @@ k<fixed-case>NN</fixed-case> Retrieval for Simple and Effective Zero-Shot Multi-speaker Text-to-Speech Karl ElHajalIdiap Research Institute AjinkyaKulkarni - EnnoHermannIdiap Research Institute + EnnoHermannIdiap Research Institute MathewMagimai Doss 778-786 While recent zero-shot multi-speaker text-to-speech (TTS) models achieve impressive results, they typically rely on extensive transcribed speech datasets from numerous speakers and intricate training pipelines. Meanwhile, self-supervised learning (SSL) speech features have emerged as effective intermediate representations for TTS. Further, SSL features from different speakers that are linearly close share phonetic information while maintaining individual speaker identity. In this study, we introduce kNN-TTS, a simple and effective framework for zero-shot multi-speaker TTS using retrieval methods which leverage the linear relationships between SSL features. Objective and subjective evaluations show that our models, trained on transcribed speech from a single speaker only, achieve performance comparable to state-of-the-art models that are trained on significantly larger training datasets. The low training data requirements mean that kNN-TTS is well suited for the development of multi-speaker TTS systems for low-resource domains and languages. 
We also introduce an interpolation parameter which enables fine-grained voice morphing. Demo samples are available at https://idiap.github.io/knn-tts . @@ -8839,8 +8839,8 @@ <fixed-case>CORD</fixed-case>: Balancing <fixed-case>CO</fixed-case>nsistency and Rank Distillation for Robust Retrieval-Augmented Generation YoungwonLeeSeoul National University Seung-wonHwangSeoul National University - Daniel FCamposSnowflake - FilipGralińskiSnowflake and Adam Mickiewicz University + Daniel FCamposSnowflake + FilipGralińskiSnowflake and Adam Mickiewicz University ZheweiYaoSnowflake YuxiongHeMicrosoft 787-796 @@ -8850,9 +8850,9 @@ <fixed-case>G</fixed-case>raph<fixed-case>LSS</fixed-case>: Integrating Lexical, Structural, and Semantic Features for Long Document Extractive Summarization - MargaritaBugueñoHasso Plattner Institute + MargaritaBugueñoHasso Plattner Institute Hazem AbouHamdan - GerardDe MeloHasso Plattner Institute and University of Potsdam + GerardDe MeloHasso Plattner Institute and University of Potsdam 797-804 Heterogeneous graph neural networks have recently gained attention for long document summarization, modeling the extraction as a node classification task. Although effective, these models often require external tools or additional machine learning models to define graph components, producing highly complex and less intuitive structures. We present GraphLSS, a heterogeneous graph construction for long document extractive summarization, incorporating Lexical, Structural, and Semantic features. It defines two levels of information (words and sentences) and four types of edges (sentence semantic similarity, sentence occurrence order, word in sentence, and word semantic similarity) without any need for auxiliary learning models. Experiments on two benchmark datasets show that GraphLSS is competitive with top-performing graph-based methods, outperforming recent non-graph models. We release our code on GitHub. 2025.naacl-short.67 @@ -8862,7 +8862,7 @@ Step-by-Step Fact Verification System for Medical Claims with Explainable Reasoning JurajVladikaTechnische Universität München IvanaHacajova - FlorianMatthesTechnische Universität München + FlorianMatthesTechnische Universität München 805-816 Fact verification (FV) aims to assess the veracity of a claim based on relevant evidence. The traditional approach for automated FV includes a three-part pipeline relying on short evidence snippets and encoder-only inference models. More recent approaches leverage the multi-turn nature of LLMs to address FV as a step-by-step problem where questions inquiring additional context are generated and answered until there is enough information to make a decision. This iterative method makes the verification process rational and explainable. While these methods have been tested for encyclopedic claims, exploration on domain-specific and realistic claims is missing. In this work, we apply an iterative FV system on three medical fact-checking datasets and evaluate it with multiple settings, including different LLMs, external web search, and structured reasoning using logic predicates. We demonstrate improvements in the final performance over traditional approaches and the high potential of step-by-step FV systems for domain-specific claims. 2025.naacl-short.68 @@ -8883,10 +8883,10 @@ Bottom-Up Synthesis of Knowledge-Grounded Task-Oriented Dialogues with Iteratively Self-Refined Prompts - KunQian + KunQian MaximillianChen SiyanLi - ArpitSharmaWalmart Inc. + ArpitSharmaWalmart Inc. 
ZhouYuColumbia University 827-844 Training conversational question-answering (QA) systems demands a substantial amount of in-domain data, which is often scarce in practice. A common solution to this challenge is to generate synthetic data. Traditional methods typically follow a top-down approach, where a large language model (LLM) generates multi-turn dialogues from a broad prompt. While this method produces coherent conversations, it offers limited fine-grained control over the content and is susceptible to hallucinations. We introduce a bottom-up conversation synthesis approach, where QA pairs are generated first and then combined into a coherent dialogue. This method offers greater control and precision by dividing the process into two distinct steps, enabling refined instructions and validations to be handled separately. Additionally, this structure allows the use of non-local models in stages that do not involve proprietary knowledge, enhancing the overall quality of the generated data. Both human and automated evaluations demonstrate that our approach produces more realistic and higher-quality dialogues compared to top-down methods. @@ -8898,7 +8898,7 @@ HuamanSunUniversity of Toronto JiaxinPeiStanford University MinjeChoiGeorgia Institute of Technology - DavidJurgensUniversity of Michigan - Ann Arbor + DavidJurgensUniversity of Michigan - Ann Arbor 845-854 Human judgments are inherently subjective and are actively affected by personal traits such as gender and ethnicity. While Large LanguageModels (LLMs) are widely used to simulate human responses across diverse contexts, their ability to account for demographic differencesin subjective tasks remains uncertain. In this study, leveraging the POPQUORN dataset, we evaluate nine popular LLMs on their abilityto understand demographic differences in two subjective judgment tasks: politeness and offensiveness. We find that in zero-shot settings, most models’ predictions for both tasks align more closely with labels from White participants than those from Asian or Black participants, while only a minor gender bias favoring women appears in the politeness task. Furthermore, sociodemographic prompting does not consistently improve and, in some cases, worsens LLMs’ ability to perceive language from specific sub-populations. These findings highlight potential demographic biases in LLMs when performing subjective judgment tasks and underscore the limitations of sociodemographic prompting as a strategy to achieve pluralistic alignment. Code and data are available at: https://github.com/Jiaxin-Pei/LLM-as-Subjective-Judge. 2025.naacl-short.71 @@ -8918,9 +8918,9 @@ Examining <fixed-case>S</fixed-case>panish Counseling with <fixed-case>MIDAS</fixed-case>: a Motivational Interviewing Dataset in <fixed-case>S</fixed-case>panish Aylin EceGunal - BowenYi + BowenYi John D.Piette - RadaMihalceaUniversity of Michigan + RadaMihalceaUniversity of Michigan VeronicaPerez-RosasUniversity of Michigan - Ann Arbor 866-872 Cultural and language factors significantly influence counseling, but Natural Language Processing research has not yet examined whether the findings of conversational analysis for counseling conducted in English apply to other languages. This paper presents a first step towards this direction. We introduce MIDAS (Motivational Interviewing Dataset in Spanish), a counseling dataset created from public video sources that contains expert annotations for counseling reflections and questions. 
Using this dataset, we explore language-based differences in counselor behavior in English and Spanish and develop classifiers in monolingual and multilingual settings, demonstrating its applications in counselor behavioral coding tasks. @@ -8929,17 +8929,17 @@ Self-Debiasing Large Language Models: Zero-Shot Recognition and Reduction of Stereotypes - Isabel O.GallegosStanford University - RyanAponte - Ryan A.RossiAdobe Research + Isabel O.GallegosStanford University + RyanAponte + Ryan A.RossiAdobe Research JoeBarrowPattern Data MehrabTanjimAdobe Research - TongYuAdobe Research + TongYuAdobe Research HaniehDeilamsalehyAdobe Systems RuiyiZhangAdobe Systems - SungchulKimAdobe Systems - FranckDernoncourt - NedimLipkaAdobe Systems + SungchulKimAdobe Systems + FranckDernoncourt + NedimLipkaAdobe Systems DeonnaOwens JiuxiangGuAdobe Systems 873-888 @@ -8961,10 +8961,10 @@ RamaneswaranSelvakumarUniversity of Maryland, College Park SonalKumar Hemant KumarGiri - NishitAnandUniversity of Maryland, College Park - AshishSeth + NishitAnandUniversity of Maryland, College Park + AshishSeth SreyanGhosh - DineshManochaUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 899-913 Open-vocabulary audio language models (ALMs), like Contrastive Language Audio Pretraining (CLAP), represent a promising new paradigm for audio-text retrieval using natural language queries. In this paper, for the first time, we perform controlled experiments on various benchmarks to show that existing ALMs struggle to generalize to linguistic variations in textual queries. To address this issue, we propose RobustCLAP, a novel and compute-efficient technique to learn audio-language representations agnostic to linguistic variations. Specifically, we reformulate the contrastive loss used in CLAP architectures by introducing a multi-view contrastive learning objective, where paraphrases are treated as different views of the same audio scene and use this for training. Our proposed approach improves the text-to-audio retrieval performance of CLAP by 0.8%-13% across benchmarks and enhances robustness to linguistic variation. We make our code publicly available 2025.naacl-short.76 @@ -8973,7 +8973,7 @@ Giving the Old a Fresh Spin: Quality Estimation-Assisted Constrained Decoding for Automatic Post-Editing SourabhDeoghare - DipteshKanojiaUniversity of Surrey + DipteshKanojiaUniversity of Surrey PushpakBhattacharyyaIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology 914-925 Automatic Post-Editing (APE) systems often struggle with over-correction, where unnecessary modifications are made to a translation, diverging from the principle of minimal editing. In this paper, we propose a novel technique to mitigate over-correction by incorporating word-level Quality Estimation (QE) information during the decoding process. This method is architecture-agnostic, making it adaptable to any APE system, regardless of the underlying model or training approach. Our experiments on English-German, English-Hindi, and English-Marathi language pairs show the proposed approach yields significant improvements over their corresponding baseline APE systems, with TER gains of 0.65, 1.86, and 1.44 points, respectively. These results underscore the complementary relationship between QE and APE tasks and highlight the effectiveness of integrating QE information to reduce over-correction in APE systems. 
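A note on the QE-assisted decoding idea in the abstract above: the core intuition (let word-level Quality Estimation decide where the post-editor may edit) can be sketched in a few lines of Python. This is a deliberately simplified illustration, not the authors' decoding algorithm; propose_edit is a hypothetical stand-in for the APE model's rewrite of a single token.

# Toy sketch of QE-gated post-editing: tokens the QE model tags "OK" are
# copied through unchanged (minimal editing); only "BAD" tokens may be
# rewritten by the (hypothetical) APE proposal function.
def qe_gated_post_edit(mt_tokens, qe_tags, propose_edit):
    assert len(mt_tokens) == len(qe_tags)
    return [tok if tag == "OK" else propose_edit(tok)
            for tok, tag in zip(mt_tokens, qe_tags)]

# Dummy "APE model" that upper-cases flagged tokens, for demonstration only:
print(qe_gated_post_edit(["das", "Haus", "ist", "rot"],
                         ["OK", "BAD", "OK", "OK"],
                         lambda t: t.upper()))  # ['das', 'HAUS', 'ist', 'rot']

The real systems apply this constraint inside beam search rather than as a post-hoc token filter, but the gating principle is the same.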
@@ -8982,12 +8982,12 @@

 <fixed-case>R</fixed-case>ule<fixed-case>R</fixed-case>: Improving <fixed-case>LLM</fixed-case> Controllability by Rule-based Data Recycling
- MingLi
+ MingLi
 HanChenShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
 ChenguangWangState University of New York at Stony Brook
 DangNguyenUniversity of Maryland, College Park
 DianqiLiCitadel Securities
- TianyiZhouUniversity of Maryland, College Park
+ TianyiZhouUniversity of Maryland, College Park
 926-943
 Large language models (LLMs) still lack delicate controllability over their responses, which is critical to enhancing their performance and the user experience. However, curating supervised fine-tuning (SFT) datasets to improve LLM controllability usually relies on human experts or proprietary LLMs, which requires additional costs. To bridge this gap, we propose Rule-based Data Recycling (RuleR), a data augmentation method incorporating multiple constraints into the original data samples according to predefined rules, which creates new training tasks to consolidate the controllability of LLMs. Instead of creating new data from scratch, RuleR “recycles” existing data by simply applying rule-based edits to their responses and appending the rule-instructions in their original instructions. Experimental results demonstrate RuleR’s effectiveness in improving LLM controllability while maintaining general instruction-following capabilities.
 2025.naacl-short.78


 SamarthGarg
 SagnikSengupta
 TirthankarGhosalOak Ridge National Laboratory
- AsifEkbalIndian Institute of Technology, Jodhpur
+ AsifEkbalIndian Institute of Technology, Jodhpur
 944-953
 The growing use of large language models (LLMs) in academic peer review poses significant challenges, particularly in distinguishing AI-generated content from human-written feedback. This research addresses the problem of identifying AI-generated peer review comments, which are crucial to maintaining the integrity of scholarly evaluation. Prior research has primarily focused on generic AI-generated text detection or on estimating the fraction of peer reviews that may be AI-generated, often treating reviews as monolithic units. However, these methods fail to detect finer-grained AI-generated points within mixed-authorship reviews. To address this gap, we propose MixRevDetect, a novel method to identify AI-generated points in peer reviews. Our approach achieved an F1 score of 88.86%, significantly outperforming existing AI text detection methods.
 2025.naacl-short.79


 Capturing Human Cognitive Styles with Language: Towards an Experimental Evaluation Paradigm
 VasudhaVaradarajan
- SyedaMahwish
+ SyedaMahwish
 XiaoranLiuIndependent Researcher
 JuliaBuffolino, State University of New York at Stony Brook
 ChristianLuhmannSUNY at Stony Brook
- Ryan L.BoydUniversity of Texas at Dallas
- H.SchwartzStony Brook University (SUNY)
+ Ryan L.BoydUniversity of Texas at Dallas
+ H.SchwartzStony Brook University (SUNY)
 966-979
 While NLP models often seek to capture cognitive states via language, the validity of predicted states is determined by comparing them to annotations created without access to the cognitive states of the authors. In behavioral sciences, cognitive states are instead measured via experiments. Here, we introduce an experiment-based framework for evaluating language-based cognitive style models against human behavior.
We explore the phenomenon of decision making and its relationship to the linguistic style of an individual talking about a recent decision they made. The participants then follow a classical decision-making experiment that captures their cognitive style, determined by how preferences change during a decision exercise. We find that language features, intended to capture cognitive style, can predict participants’ decision style with moderate-to-high accuracy (AUC 0.8), demonstrating that cognitive style can be partly captured and revealed by discourse patterns.
 2025.naacl-short.81


From 8b14f28fc02ed7aee46796debe674651b6814bc7 Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Thu, 29 May 2025 22:41:35 -0400
Subject: [PATCH 06/18] Industry

---
 data/xml/2025.naacl.xml | 198 ++++++++++++++++++++--------------------
 1 file changed, 99 insertions(+), 99 deletions(-)

diff --git a/data/xml/2025.naacl.xml b/data/xml/2025.naacl.xml
index bd0fcdc315..9dd567ccf6 100644
--- a/data/xml/2025.naacl.xml
+++ b/data/xml/2025.naacl.xml
@@ -9052,7 +9052,7 @@

 Understanding <fixed-case>LLM</fixed-case> Development Through Longitudinal Study: Insights from the Open <fixed-case>K</fixed-case>o-<fixed-case>LLM</fixed-case> Leaderboard
- ChanjunParkKorea University
+ ChanjunParkKorea University
 HyeonwooKim
 1-8
 This paper conducts a longitudinal study over eleven months to address the limitations of prior research on the Open Ko-LLM Leaderboard, which has relied on empirical studies with restricted observation periods of only five months. By extending the analysis duration, we aim to provide a more comprehensive understanding of the progression in developing Korean large language models (LLMs). Our study is guided by three primary research questions: (1) What are the specific challenges in improving LLM performance across diverse tasks on the Open Ko-LLM Leaderboard over time? (2) How does model size impact task performance correlations across various benchmarks? (3) How have the patterns in leaderboard rankings shifted over time on the Open Ko-LLM Leaderboard? By analyzing 1,769 models over this period, our research offers a comprehensive examination of the ongoing advancements in LLMs and the evolving nature of evaluation frameworks.
 2025.naacl-industry.1

 <fixed-case>RTSM</fixed-case>: Knowledge Distillation with Diverse Signals for Efficient Real-Time Semantic Matching in <fixed-case>E</fixed-case>-Commerce
 SanjayAgrawalAmazon
- VivekSembiumAmazon
+ VivekSembiumAmazon
 9-19
 Semantic matching plays a pivotal role in e-commerce by facilitating better product discovery and driving sales within online stores. Transformer models have proven exceptionally effective in mapping queries to an embedding space, positioning semantically related entities (queries or products) in close proximity. Despite their effectiveness, the high computational demands of large transformer models pose challenges for their deployment in real-time scenarios. This paper presents RTSM, an advanced knowledge distillation framework designed for Real-Time Semantic Matching. Our approach develops accurate, low-latency student models by leveraging both soft labels from a teacher model and ground truth generated from pairwise query-product and query-query signals. These signals are sourced from direct audits, synthetic examples created by LLMs, user interaction data, and taxonomy-based datasets, with custom loss functions enhancing learning efficiency.
Experimental evaluations on internal and external e-commerce datasets demonstrate a 2-2.5% increase in ROC-AUC compared to directly trained student models, outperforming both the teacher model and state-of-the-art knowledge distillation benchmarks.
 2025.naacl-industry.2


@@ -9082,14 +9082,14 @@

 How <fixed-case>LLM</fixed-case>s React to Industrial Spatio-Temporal Data? Assessing Hallucination with a Novel Traffic Incident Benchmark Dataset
- QiangLiAccenture
- MingkunTanUniversität Bielefeld
+ QiangLiAccenture
+ MingkunTanUniversität Bielefeld
 XunZhao
 DanZhang
- DaoanZhang
+ DaoanZhang
 ShengzhaoLeiEPFL - EPF Lausanne
- Anderson S.Chu
- LujunLi
+ Anderson S.Chu
+ LujunLi
 PorawitKamnoedboonUniversity of Zurich
 36-53
 Large language models (LLMs) hold revolutionary potential to digitize and enhance the Health & Public Services (H&PS) industry. Despite their advanced linguistic abilities, concerns about accuracy, stability, and traceability still persist, especially in high-stakes areas such as transportation systems. Moreover, the predominance of English in LLM development raises questions about how they perform in non-English contexts. This study, which originated from a real-world industrial GenAI application, introduces a novel cross-lingual benchmark dataset comprising nearly 99,869 real traffic incident records from Vienna (2013-2023) to assess the robustness of state-of-the-art LLMs (≥9) in the spatial vs. temporal domain for traffic incident classification. We then explored three hypotheses — sentence indexing, date-to-text conversion, and German-to-English translation — and incorporated Retrieval Augmented Generation (RAG) to further examine LLM hallucinations in both the spatial and temporal domains. Our experiments reveal significant performance disparities in the spatio-temporal domain and demonstrate which types of hallucination RAG can mitigate and how it achieves this. We also provide open access to our H&PS traffic incident dataset, with the project demo and code available at https://sites.google.com/view/llmhallucination/home


@@ -9123,11 +9123,11 @@

 Finding-Centric Structuring of <fixed-case>J</fixed-case>apanese Radiology Reports and Analysis of Performance Gaps for Multiple Facilities
 YukiTagawaFUJIFILM
- YoheiMomoki
+ YoheiMomoki
 NorihisaNakano富士フイルム株式会社
 RyotaOzakiFUJIFILM
 MotokiTaniguchiFujifilm Corporation
- MasatoshiHoriOsaka University Graduate School of Medicine
+ MasatoshiHoriOsaka University Graduate School of Medicine
 NoriyukiTomiyamaOsaka University Graduate School of Medicine
 70-85
 This study addresses two key challenges in structuring radiology reports: the lack of a practical structuring schema and datasets to evaluate model generalizability. To address these challenges, we propose “Finding-Centric Structuring,” which organizes reports around individual findings, facilitating secondary use. We also construct JRadFCS, a large-scale dataset with annotated named entities (NEs) and relations, comprising 8,428 Japanese Computed Tomography (CT) reports from seven facilities, providing a comprehensive resource for evaluating model generalizability. Our experiments reveal performance gaps when applying models trained on single-facility reports to those from other facilities. We further analyze factors contributing to these gaps and demonstrate that augmenting the training set based on these performance-correlated factors can efficiently enhance model generalizability.
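As an illustration of the per-facility gap analysis the radiology-report study above describes, a minimal sketch in Python (the tuple layout is an assumption, not the paper's data format):

from collections import defaultdict

# Micro-F1 per facility from (facility, n_gold, n_pred, n_correct) counts,
# to surface generalization gaps between the training facility and the others.
def f1_by_facility(rows):
    totals = defaultdict(lambda: [0, 0, 0])  # facility -> [gold, pred, correct]
    for facility, n_gold, n_pred, n_correct in rows:
        totals[facility][0] += n_gold
        totals[facility][1] += n_pred
        totals[facility][2] += n_correct
    scores = {}
    for facility, (g, p, c) in totals.items():
        precision = c / p if p else 0.0
        recall = c / g if g else 0.0
        scores[facility] = (2 * precision * recall / (precision + recall)
                            if precision + recall else 0.0)
    return scores

print(f1_by_facility([("A", 100, 95, 90), ("B", 100, 110, 70)]))  # B lags A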
@@ -9163,7 +9163,7 @@ Exploring Straightforward Methods for Automatic Conversational Red-Teaming - GeorgeKourInternational Business Machines + GeorgeKourInternational Business Machines NaamaZwerdling MarcelZalmanoviciInternational Business Machines AteretAnaby TavorInternational Business Machines @@ -9205,13 +9205,13 @@ <fixed-case>QS</fixed-case>pell 250<fixed-case>K</fixed-case>: A Large-Scale, Practical Dataset for <fixed-case>C</fixed-case>hinese Search Query Spell Correction DezhiYeTencent PCG - HaomeiJiaMacau University of Science and Technology + HaomeiJiaMacau University of Science and Technology JunweiHu TianBowen JieLiu HaijinLiangTencent JinMaTencent Search - WenminWangMacau University of Science and Technology + WenminWangMacau University of Science and Technology 148-155 Chinese Search Query Spell Correction is a task designed to autonomously identify and correct typographical errors within queries in the search engine. Despite the availability of comprehensive datasets like Microsoft Speller and Webis, their monolingual nature and limited scope pose significant challenges in evaluating modern pre-trained language models such as BERT and GPT. To address this, we introduce QSpell 250K, a large-scale benchmark specifically developed for Chinese Query Spelling Correction. QSpell 250K offers several advantages: 1) It contains over 250K samples, which is ten times more than previous datasets. 2) It covers a broad range of topics, from formal entities to everyday colloquialisms and idiomatic expressions. 3) It includes both Chinese and English, addressing the complexities of code-switching. Each query undergoes three rounds of high-fidelity annotation to ensure accuracy. Our extensive testing across three popular models demonstrates that QSpell 250K effectively evaluates the efficacy of representative spelling correctors. We believe that QSpell 250K will significantly advance spelling correction methodologies. The accompanying data and code will be made publicly available. 2025.naacl-industry.13 @@ -9219,7 +9219,7 @@ <fixed-case>CONSTRUCTA</fixed-case>: Automating Commercial Construction Schedules in Fabrication Facilities with Large Language Models - YifanZhang + YifanZhang XueYangIntel 156-172 Automating planning with LLMs presents transformative opportunities for traditional industries, yet remains underexplored. In commercial construction, the complexity of automated scheduling often requires manual intervention to ensure precision. We propose CONSTRUCTA, a novel framework leveraging LLMs to optimize construction schedules in complex projects like semiconductor fabrication. CONSTRUCTA addresses key challenges by: (1) integrating construction-specific knowledge through static RAG; (2) employing context-sampling techniques inspired by architectural expertise to provide relevant input; and (3) deploying Construction DPO to align schedules with expert preferences using RLHF. Experiments on proprietary data demonstrate performance improvements of +42.3% in missing value prediction, +79.1% in dependency analysis, and +28.9% in automated planning compared to baseline methods, showcasing its potential to revolutionize construction workflows and inspire domain-specific LLM advancements. 
@@ -9237,12 +9237,12 @@ Mitigating Bias in Item Retrieval for Enhancing Exam Assembly in Vocational Education Services - AlonsoPalominoUniversität Bielefeld + AlonsoPalominoUniversität Bielefeld AndreasFischer DavidBuschhüterGerman Research Center for AI RolandRollerGerman Research Center for AI NielsPinkwartGerman Research Center for AI - BenjaminPaassen + BenjaminPaassen 183-193 In education, high-quality exams must cover broad specifications across diverse difficulty levels during the assembly and calibration of test items to effectively measure examinees’ competence. However, balancing the trade-off of selecting relevant test items while fulfilling exam specifications without bias is challenging, particularly when manual item selection and exam assembly rely on a pre-validated item base. To address this limitation, we propose a new mixed-integer programming re-ranking approach to improve relevance, while mitigating bias on an industry-grade exam assembly platform. We evaluate our approach by comparing it against nine bias mitigation re-ranking methods in 225 experiments on a real-world benchmark data set from vocational education services. Experimental results demonstrate a 17% relevance improvement with a 9% bias reduction when integrating sequential optimization techniques with improved contextual relevance augmentation and scoring using a large language model. Our approach bridges information retrieval and exam assembly, enhancing the human-in-the-loop exam assembly process while promoting unbiased exam design 2025.naacl-industry.16 @@ -9254,7 +9254,7 @@ AvikHalder RajarshiMandalIndian Institute of Technology Kharagpur SayanLayek - IanSoboroffNational Institute of Standards and Technology + IanSoboroffNational Institute of Standards and Technology RimaHazraSingapore University of Technology and Design AnimeshMukherjeeIndian Institute of Technology Kharagpur 194-209 @@ -9274,7 +9274,7 @@ Zero-Shot <fixed-case>ATC</fixed-case> Coding with Large Language Models for Clinical Assessments ZijianChen - John-MichaelGamble + John-MichaelGamble MicaelaJantzi John P.Hirdes JimmyLin @@ -9285,11 +9285,11 @@ Navigating the Path of Writing: Outline-guided Text Generation with Large Language Models - YukyungLeeBoston University, Boston University + YukyungLeeBoston University, Boston University SoonwonKaNAVER BokyungSonNAVER PilsungKangSeoul National University - JaewookKangCoupang + JaewookKangCoupang 233-250 Large Language Models (LLMs) have impacted the writing process, enhancing productivity by collaborating with humans in content creation platforms. However, generating high-quality, user-aligned text to satisfy real-world content creation needs remains challenging. We propose WritingPath, a framework that uses explicit outlines to guide LLMs in generating goal-oriented, high-quality text. Our approach draws inspiration from structured writing planning and reasoning paths, focusing on reflecting user intentions throughout the writing process. To validate our approach in real-world scenarios, we construct a diverse dataset from unstructured blog posts to benchmark writing performance and introduce a comprehensive evaluation framework assessing the quality of outlines and generated texts. Our evaluations with various LLMs demonstrate that the WritingPath approach significantly enhances text quality according to evaluations by both LLMs and professional writers. 
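The outline-then-draft pattern that the WritingPath abstract above describes can be sketched as a two-call pipeline. Here llm is a hypothetical prompt-to-completion callable, and the prompt wording is illustrative rather than the paper's:

# Sketch of outline-guided generation: first elicit an explicit outline for
# the writing goal, then condition the draft on that outline (two LLM calls).
def outline_guided_write(llm, topic, keywords):
    outline = llm(
        f"Write a 4-6 point outline for a blog post on '{topic}' "
        f"that must cover: {', '.join(keywords)}."
    )
    draft = llm(
        "Write the full post, following this outline section by section "
        "and keeping the stated goal in view:\n" + outline
    )
    return outline, draft

Keeping the outline as an explicit intermediate artifact is what allows its quality to be evaluated separately from the final text, as the paper's evaluation framework does.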
2025.naacl-industry.20


 JenniferZhuUniversity of California, Berkeley and Amazon
 DmitriyBespalovAmazon
 LiwenYou
- NinadKulkarniAmazon
+ NinadKulkarniAmazon
 YanjunQiAmazon and University of Virginia
 251-265
 Toxicity text detectors can be vulnerable to adversarial examples - small perturbations to input text that fool the systems into wrong detection. Existing attack algorithms are time-consuming and often produce invalid or ambiguous adversarial examples, making them less useful for evaluating or improving real-world toxicity content moderators. This paper proposes an annotation pipeline for quality control of generated toxic adversarial examples (TAE). We design model-based automated annotation and human-based quality verification to assess the quality requirements of TAE. Successful TAE should fool a target toxicity model into making benign predictions, be grammatically reasonable, appear natural like human-generated text, and exhibit semantic toxicity. When applying these requirements to more than 20 state-of-the-art (SOTA) TAE attack recipes, we find many invalid samples from a total of 940k raw TAE attack generations. We then utilize the proposed pipeline to filter and curate a high-quality TAE dataset we call TaeBench (of size 264k). Empirically, we demonstrate that TaeBench can effectively transfer-attack SOTA toxicity content moderation models and services. Our experiments also show that TaeBench with adversarial training achieves significant improvements in the robustness of two toxicity detectors.

@@ -9314,7 +9314,7 @@
 JihooKim
 SukyungLee
 YungiKimUpstage
- ChanjunParkKorea University
+ ChanjunParkKorea University
 266-273
 The Open Ko-LLM Leaderboard has been instrumental in benchmarking Korean Large Language Models (LLMs), yet it has certain limitations. Notably, the disconnect between quantitative improvements on the overly academic leaderboard benchmarks and the qualitative impact of the models should be addressed. Furthermore, the benchmark suite is largely composed of translated versions of their English counterparts, which may not fully capture the intricacies of the Korean language. To address these issues, we propose Open Ko-LLM Leaderboard2, an improved version of the earlier Open Ko-LLM Leaderboard. The original benchmarks are entirely replaced with new tasks that are more closely aligned with real-world capabilities. Additionally, four new native Korean benchmarks are introduced to better reflect the distinct characteristics of the Korean language. Through these refinements, Open Ko-LLM Leaderboard2 seeks to provide a more meaningful evaluation for advancing Korean LLMs.
 2025.naacl-industry.22


 <fixed-case>C</fixed-case>urious<fixed-case>LLM</fixed-case>: Elevating Multi-Document Question Answering with <fixed-case>LLM</fixed-case>-Enhanced Knowledge Graph Reasoning
- ZukangYang
+ ZukangYang
 ZixuanZhu
 JenniferZhuUniversity of California, Berkeley and Amazon
 274-286


 <fixed-case>C</fixed-case>haracter<fixed-case>GPT</fixed-case>: A Persona Reconstruction Framework for Role-Playing Agents
 JeiyoonPark
- ChanjunParkKorea University
+ ChanjunParkKorea University
 HeuiseokLim
 287-303
 The recent introduction of the Assistants API highlights its potential for large language models (LLMs) in role-playing agents (RPA).
However, maintaining consistent character personas remains a significant challenge due to variability in information extraction, which frequently omits critical elements such as backstory or interpersonal relationships. To address this limitation, we introduce CharacterGPT, a framework designed to dynamically reconstruct character personas through Character Persona Training (CPT). This approach incrementally updates personas by extracting traits from chapter-wise novel summaries, reflecting the progression of the narrative. Our framework is evaluated through Big Five personality evaluations and creative tasks, in which characters generate original narratives, demonstrating the efficacy of CharacterGPT in preserving persona consistency. The code and results are available at https://github.com/Jeiyoon/charactergpt @@ -9359,7 +9359,7 @@ MochiGaoTencent YunzhiTanTencent BoHu - ZangLiTencent + ZangLiTencent 318-328 Current intent detection work experiments with minor intent categories. However, in real-world scenarios of data analysis dialogue systems, intents are composed of combinations of numerous metrics and dimensions, resulting in countless intents and posing challenges for the language model. The retrieval-augmented generation (RAG) method efficiently retrieves key intents. However, the single retrieval route sometimes fails to recall target intents and causes incorrect results. To alleviate the above challenges, we introduce the DSRAG framework combining query-to-query (Q2Q) and query-to-metadata (Q2M) double-stream RAG approaches. Specifically, we build a repository of query statements for Q2Q using the query templates with the key intents. When a user’s query comes, it rapidly matches repository statements. Once the relevant query is retrieved, the results can be quickly returned. In contrast, Q2M retrieves the relevant intents from the metadata and utilizes large language models to choose the answer. Experimental results show that DSRAG achieves significant improvements compared with merely using prompt engineering and a single retrieval route. 2025.naacl-industry.26 @@ -9399,8 +9399,8 @@ <fixed-case>Q</fixed-case>uery<fixed-case>S</fixed-case>hield: A Platform to Mitigate Enterprise Data Leakage in Queries to External <fixed-case>LLM</fixed-case>s NitinRamrakhiyaniInternational Institute of Information Technology Hyderabad and Tata Consultancy Services Limited, India - DeltonMyalilTata Consultancy Services Limited, India - SachinPawar + DeltonMyalilTata Consultancy Services Limited, India + SachinPawar ManojApte Rajan MATata Consultancy Services Limited, India DivyeshSaglaniTata Consultancy Services Limited, India @@ -9412,11 +9412,11 @@ <fixed-case>S</fixed-case>wiss<fixed-case>ADT</fixed-case>: An Audio Description Translation System for <fixed-case>S</fixed-case>wiss Languages - LukasFischerUniversity of Zurich + LukasFischerUniversity of Zurich YingqiangGaoUniversity of Zurich AlexaLintnerZHAW - Zürcher Hochschule für Angewandte Wissenschaften - AnnetteRiosUniversity of Zurich - SarahEblingUniversity of Zurich + AnnetteRiosUniversity of Zurich + SarahEblingUniversity of Zurich 370-379 Audio description (AD) is a crucial accessibility service provided to blind persons and persons with visual impairment, designed to convey visual information in acoustic form. 
Despite recent advancements in multilingual machine translation research, the lack of well-crafted and time-synchronized AD data impedes the development of audio description translation (ADT) systems that address the needs of multilingual countries such as Switzerland. Furthermore, most ADT systems rely on text alone, and it is unclear whether incorporating visual information from video clips improves the quality of ADT outputs. In this work, we introduce SwissADT, an emerging ADT system for three main Swiss languages and English, designed for future use by our industry partners. By collecting well-crafted AD data augmented with video clips in German, French, Italian, and English, and leveraging the power of Large Language Models (LLMs), we aim to enhance information accessibility for diverse language populations in Switzerland by automatically translating AD scripts to the desired Swiss language. Our extensive experimental ADT results, composed of both automatic and human evaluations of ADT quality, demonstrate the promising capability of SwissADT for the ADT task. We believe that combining human expertise with the generation power of LLMs can further enhance the performance of ADT systems, ultimately benefiting a larger multilingual target population.
 2025.naacl-industry.31


 <fixed-case>C</fixed-case>hinese Morph Resolution in <fixed-case>E</fixed-case>-commerce Live Streaming Scenarios
 JiahaoZhu
- JipengQiangYangzhou University
- RanBaiChina Academic of Electronics and Information Technology
- ChenyuLiu
- XiaoyeOuyang
+ JipengQiangYangzhou University
+ RanBaiChina Academic of Electronics and Information Technology
+ ChenyuLiu
+ XiaoyeOuyang
 380-389
 E-commerce live streaming in China, particularly on platforms like Douyin, has become a major sales channel, but hosts often use morphs to evade scrutiny and engage in false advertising. This study introduces the Live Auditory Morph Resolution (LiveAMR) task to detect such violations. Unlike previous morph research focused on text-based evasion in social media and underground industries, LiveAMR targets pronunciation-based evasion in health and medical live streams. We constructed the first LiveAMR dataset with 86,790 samples and developed a method to transform the task into a text-to-text generation problem. By leveraging large language models (LLMs) to generate additional training data, we improved performance and demonstrated that morph resolution significantly enhances live streaming regulation.
 2025.naacl-industry.32


 <fixed-case>M</fixed-case>ono<fixed-case>TOD</fixed-case>ia: Translating Monologue Requests to Task-Oriented Dialogues
- SebastianSteindlOstbayerische Technische Hochschule Amberg-Weiden
- UlrichSchäferOstbayerische Technische Hochschule Amberg-Weiden
- BerndLudwigUniversität Regensburg
+ SebastianSteindlOstbayerische Technische Hochschule Amberg-Weiden
+ UlrichSchäferOstbayerische Technische Hochschule Amberg-Weiden
+ BerndLudwigUniversität Regensburg
 390-403
 Data scarcity is one of the main problems when it comes to real-world applications of transformer-based models. This is especially evident for task-oriented dialogue (TOD) systems, which require specialized datasets that are usually not readily available.
This can hinder companies from adding TOD systems to their services. This study therefore investigates a novel approach to sourcing annotated dialogues from existing German monologue material. Focusing on a real-world example, we investigate whether these monologues can be transformed into dialogue formats suitable for training TOD systems. We show the approach with the concrete example of a company specializing in travel bookings via e-mail. We fine-tune state-of-the-art Large Language Models for the task of rewriting e-mails as dialogues and annotating them. To ensure the quality and validity of the generated data, we employ crowd workers to evaluate the dialogues across multiple criteria and to provide gold-standard annotations for the test dataset. We further evaluate the usefulness of the dialogues for training TOD systems. Our evaluation shows that the dialogues and annotations are of high quality and can serve as a valuable starting point for training TOD systems. Finally, we make the annotated dataset publicly available to foster future research.
 2025.naacl-industry.33


@@ -9459,8 +9459,8 @@
 Predicting <fixed-case>ICU</fixed-case> Length of Stay for Patients using Latent Categorization of Health Conditions
 TirthankarDasgupta
- ManjiraSinhaTata Consultancy Services Limited, India
- SudeshnaJanaTata Consultancy Services Limited, India
+ ManjiraSinhaTata Consultancy Services Limited, India
+ SudeshnaJanaTata Consultancy Services Limited, India
 422-430
 Predicting the duration of a patient’s stay in an Intensive Care Unit (ICU) is a critical challenge for healthcare administrators, as it impacts resource allocation, staffing, and patient care strategies. Traditional approaches often rely on structured clinical data, but recent developments in language models offer significant potential to utilize unstructured text data such as nursing notes, discharge summaries, and clinical reports for ICU length-of-stay (LoS) predictions. In this study, we introduce a method for analyzing nursing notes to predict the remaining ICU stay duration of patients. Our approach leverages a joint model of latent note categorization, which identifies key health-related patterns and disease severity factors from unstructured text data. This latent categorization enables the model to derive high-level insights that influence patient care planning. We evaluate our model on the widely used MIMIC-III dataset, and our preliminary findings show that it significantly outperforms existing baselines, suggesting promising industrial applications for resource optimization and operational efficiency in healthcare settings.
 2025.naacl-industry.35


 <fixed-case>R</fixed-case>evie<fixed-case>W</fixed-case>eaver: Weaving Together Review Insights by Leveraging <fixed-case>LLM</fixed-case>s and Semantic Similarity
- JibanAdhikaryMicrosoft
- MohammadAlqudahResearch, Microsoft
- Arun PalghatUdayashankarBest Buy
+ JibanAdhikaryMicrosoft
+ MohammadAlqudahResearch, Microsoft
+ Arun PalghatUdayashankarBest Buy
 431-448
 With the rise of online retail, customer reviews have become a critical factor in shaping purchasing decisions. The sheer volume of customer reviews being generated continuously presents a challenge for consumers who must sift through an overwhelming amount of feedback. To address this issue, we introduce RevieWeaver, a novel framework that extracts key product features and provides concise review summaries.
Our innovative approach not only scales efficiently to 30 million reviews but also ensures reproducibility and controllability. Moreover, it delivers unbiased and reliable assessments of products that accurately reflect the input reviews. 2025.naacl-industry.36 @@ -9486,9 +9486,9 @@ JaneCook Jack IScott NirmalaPudotaDeloitte - TimWeningerUniversity of Notre Dame - EdwardBowen - SanmitraBhattacharyaDeloitte Consulting + TimWeningerUniversity of Notre Dame + EdwardBowen + SanmitraBhattacharyaDeloitte Consulting 449-459 Medical coding standardizes clinical data but is both time-consuming and error-prone. Traditional Natural Language Processing (NLP) methods struggle with automating coding due to the large label space, lengthy text inputs, and the absence of supporting evidence annotations that justify code selection. Recent advancements in Generative Artificial Intelligence (AI) offer promising solutions to these challenges. In this work, we introduce MedCodER, an emerging Generative AI framework for automatic medical coding that leverages extraction, retrieval, and re-ranking techniques as core components. MedCodER achieves a micro-F1 score of 0.62 on International Classification of Diseases (ICD) code prediction, significantly outperforming state-of-the-art methods. Additionally, we present a new dataset containing medical records annotated with disease diagnoses, ICD codes, and supporting evidence texts (https://doi.org/10.5281/zenodo.13308316). Ablation tests confirm that MedCodER’s performance depends on the integration of each of its aforementioned components, as performance declines when these components are evaluated in isolation. 2025.naacl-industry.37 @@ -9501,7 +9501,7 @@ HongdaSheneBay Inc. Pierre-YvesVandenbussche JanetJenqeBay Inc. - HodaEldardiry, Virginia Polytechnic Institute and State University + HodaEldardiry, Virginia Polytechnic Institute and State University 460-469 Existing zero-shot product attribute value (aspect) extraction approaches in e-Commerce industry rely on uni-modal or multi-modal models, where the sellers are asked to provide detailed textual inputs (product descriptions) for the products. However, manually providing (typing) the product descriptions is time-consuming and frustrating for the sellers. Thus, we propose a cross-modal zero-shot attribute value generation framework (ViOC-AG) based on CLIP, which only requires product images as the inputs. ViOC-AG follows a text-only training process, where a task-customized text decoder is trained with the frozen CLIP text encoder to alleviate the modality gap and task disconnection. During the zero-shot inference, product aspects are generated by the frozen CLIP image encoder connected with the trained task-customized text decoder. OCR tokens and outputs from a frozen prompt-based LLM correct the decoded outputs for out-of-domain attribute values. Experiments show that ViOC-AG significantly outperforms other fine-tuned vision-language models for zero-shot attribute value extraction. 2025.naacl-industry.38 @@ -9520,14 +9520,14 @@ Evaluating Large Language Models with Enterprise Benchmarks BingZhang - MikioTakeuchi - RyoKawahara + MikioTakeuchi + RyoKawahara ShubhiAsthana - Md. MarufHossain + Md. MarufHossain Guang-JieRen KateSoule - YifanMai - YadaZhu + YifanMai + YadaZhu 485-505 The advancement of large language models (LLMs) has led to a greater challenge of having a rigorous and systematic evaluation of complex tasks performed, especially in enterprise applications. 
Therefore, LLMs need to be benchmarked with enterprise datasets for a variety of NLP tasks. This work explores benchmarking strategies focused on LLM evaluation, with a specific emphasis on both English and Japanese. The proposed evaluation framework encompasses 25 publicly available domain-specific English benchmarks from diverse enterprise domains like financial services, legal, climate, cyber security, and 2 public Japanese finance benchmarks. The diverse performance of 8 models across different enterprise tasks highlights the importance of selecting the right model based on the specific requirements of each task. Code and prompts are available on GitHub. 2025.naacl-industry.40 @@ -9551,7 +9551,7 @@ Kailash KarthikSaravanakumarElemental Cognition LoriMoonMoonWorks, Inc. NatnaelSeifu - AbrahamBautista-Castillo + AbrahamBautista-Castillo 515-522 Application of LLMs for complex causal question answering can be stymied by their opacity and propensity for hallucination. Although recent approaches such as Retrieval Augmented Generation and Chain of Thought prompting have improved reliability, we argue current approaches are insufficient and further fail to satisfy key criteria humans use to select and evaluate causal explanations. Inspired by findings from the social sciences, we present an implemented causal QA approach that combines iterative RAG with guidance from a formal model of causation. Our causal model is backed by the Cogent reasoning engine, allowing users to interactively perform counterfactual analysis and refine their answer. Our approach has been integrated into a deployed Collaborative Research Assistant (Cora) and we present a pilot evaluation in the life sciences domain. 2025.naacl-industry.42 @@ -9559,7 +9559,7 @@ <fixed-case>T</fixed-case>urbo<fixed-case>F</fixed-case>uzz<fixed-case>LLM</fixed-case>: Turbocharging Mutation-based Fuzzing for Effectively Jailbreaking Large Language Models in Practice - AmanGoelAmazon + AmanGoelAmazon XianWuAmazon ZheWangAmazon DmitriyBespalovAmazon @@ -9574,7 +9574,7 @@ MdKowsherUniversity of Central Florida Nusrat JahanProttasha Chun-NamYuNokia Bell Labs and Department of Computer Science - OzlemGaribayUniversity of Central Florida + OzlemGaribayUniversity of Central Florida NiloofarYousefiUniversity of Central Florida 535-543 Self-attention has revolutionized natural language processing by capturing long-range dependencies and improving context understanding. However, it comes with high computational costs and struggles with sequential data’s inherent directionality. This paper investigates and presents a simplified approach called “shared weight self-attention,” where a single weight matrix is used for Keys, Queries, and Values instead of separate matrices for each. This approach cuts training parameters by more than half and significantly reduces training time. Our method not only improves efficiency but also achieves strong performance on tasks from the GLUE benchmark, even outperforming the standard BERT baseline in handling noisy and out-of-domain data. Experimental results show a 66.53% reduction in parameter size within the attention block and competitive accuracy improvements of 3.55% and 0.89% over symmetric and pairwise attention-based BERT models, respectively. 
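The shared-weight idea in the abstract above is concrete enough to sketch. A minimal PyTorch rendering, assuming a standard scaled dot-product layout (the authors' exact formulation may differ):

import torch
import torch.nn as nn

class SharedWeightSelfAttention(nn.Module):
    """Self-attention in which one projection matrix replaces W_q, W_k, and W_v."""

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads, self.d_head = n_heads, d_model // n_heads
        self.shared = nn.Linear(d_model, d_model, bias=False)  # single shared weight
        self.out = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        b, t, d = x.shape
        h = self.shared(x)  # the same projection serves as Q, K, and V
        h = h.view(b, t, self.n_heads, self.d_head).transpose(1, 2)
        att = (h @ h.transpose(-2, -1)) / self.d_head ** 0.5
        y = att.softmax(dim=-1) @ h
        return self.out(y.transpose(1, 2).reshape(b, t, d))

x = torch.randn(2, 5, 64)
print(SharedWeightSelfAttention(64, 8)(x).shape)  # torch.Size([2, 5, 64])

Dropping two of the three projection matrices removes roughly two thirds of the attention block's projection parameters, consistent with the 66.53% reduction the abstract reports.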
@@ -9585,10 +9585,10 @@ <fixed-case>S</fixed-case>uper<fixed-case>RAG</fixed-case>: Beyond <fixed-case>RAG</fixed-case> with Layout-Aware Graph Modeling CheningYang Duy-KhanhVuCinnamon AI - Minh-TienNguyen + Minh-TienNguyen Xuan-QuangNguyenCinnamon AI - LinhNguyenCinnamon AI - HungLeDeakin University + LinhNguyenCinnamon AI + HungLeDeakin University 544-557 This paper introduces layout-aware graph modeling for multimodal RAG. Different from traditional RAG methods that only deal with flat text chunks, the proposed method takes into account the relationship of multimodalities by using a graph structure. To do that, a graph modeling structure is defined based on document layout parsing. The structure of an input document is retained with the connection of text chunks, tables, and figures. This representation allows the method to handle complex questions that require information from multimodalities. To confirm the efficiency of the graph modeling, a flexible RAG pipeline is developed using robust components. Experimental results on four benchmark test sets confirm the contribution of the layout-aware modeling for performance improvement of the RAG pipeline. 2025.naacl-industry.45 @@ -9600,7 +9600,7 @@ AmitAgarwalOracle ArionDas BhargavaKumarTD Securities - SrikantPandaOracle + SrikantPandaOracle PriyaranjanPattnayakOracle Taki HasanRafiHanyang University TejaswiniKumarColumbia University @@ -9612,8 +9612,8 @@ Natural Language Processing for Human Resources: A Survey - NaokiOtaniMegagon Labs - NikitaBhutaniMegagon Labs, Inc + NaokiOtaniMegagon Labs + NikitaBhutaniMegagon Labs, Inc EstevamHruschkaMegagon Labs, Megagon Labs and Carnegie Mellon University 583-597 Advances in Natural Language Processing (NLP) have the potential to transform HR processes, from recruitment to employee management. While recent breakthroughs in NLP have generated significant interest in its industrial applications, a comprehensive overview of how NLP can be applied across HR activities is still lacking. This paper discovers opportunities for researchers and practitioners to harness NLP’s transformative potential in this domain. We analyze key fundamental tasks such as information extraction and text classification, and their roles in downstream applications like recommendation and language generation, while also discussing ethical concerns. Additionally, we identify gaps in current research and encourage future work to explore holistic approaches for achieving broader objectives in this field. 
@@ -9622,7 +9622,7 @@ Implementing Retrieval Augmented Generation Technique on Unstructured and Structured Data Sources in a Call Center of a Large Financial Institution - Syed ShariyarMurtazaRyerson University + Syed ShariyarMurtazaRyerson University YifanNieManulife EliasAvanManulife UtkarshSoniManulife @@ -9640,26 +9640,26 @@ Granite Guardian: Comprehensive <fixed-case>LLM</fixed-case> Safeguarding InkitPadhi ManishNagireddyIBM Research - GiandomenicoCornacchiaInternational Business Machines + GiandomenicoCornacchiaInternational Business Machines SubhajitChaudhuryInternational Business Machines TejaswiniPedapati - PierreDogninInternational Business Machines - KeerthiramMurugesanInternational Business Machines - ErikMiehlingIBM Research + PierreDogninInternational Business Machines + KeerthiramMurugesanInternational Business Machines + ErikMiehlingIBM Research MartínSantillán Cooper - KieranFraserInternational Business Machines + KieranFraserInternational Business Machines GiulioZizzoInternational Business Machines Muhammad ZaidHameedInternational Business Machines MarkPurcellIBM TJ Watson Research Center MichaelDesmond - QianPanIBM, International Business Machines + QianPanIBM, International Business Machines IngeVejsbjergInternational Business Machines Elizabeth M.DalyIBM Research MichaelHindInternational Business Machines WernerGeyer - AmbrishRawatInternational Business Machines + AmbrishRawatInternational Business Machines Kush R.VarshneyInternational Business Machines - PrasannaSattigeriIBM Research + PrasannaSattigeriIBM Research 607-615 The deployment of language models in real-world applications exposes users to various risks, including hallucinations and harmful or unethical content. These challenges highlight the urgent need for robust safeguards to ensure safe and responsible AI. To address this, we introduce Granite Guardian, a suite of advanced models designed to detect and mitigate risks associated with prompts and responses, enabling seamless integration with any large language model (LLM). Unlike existing open-source solutions, our Granite Guardian models provide comprehensive coverage across a wide range of risk dimensions, including social bias, profanity, violence, sexual content, unethical behavior, jailbreaking, and hallucination-related issues such as context relevance, groundedness, and answer accuracy in retrieval-augmented generation (RAG) scenarios. Trained on a unique dataset combining diverse human annotations and synthetic data, Granite Guardian excels in identifying risks often overlooked by traditional detection systems, particularly jailbreak attempts and RAG-specific challenges. https://github.com/ibm-granite/granite-guardian 2025.naacl-industry.49 @@ -9672,7 +9672,7 @@ YuhaoWangFacebook LiangzhenLaiFacebook ErnieChangMeta AI - ChangshengZhaoMeta Inc. + ChangshengZhaoMeta Inc. 
YangyangShiMeta VikasChandraMeta 616-626 @@ -9693,9 +9693,9 @@ Concept Distillation from Strong to Weak Models via Hypotheses-to-Theories Prompting - Emmanuel AboahBoatengMicrosoft + Emmanuel AboahBoatengMicrosoft Cassiano OBecker - NabihaAsgharMicrosoft + NabihaAsgharMicrosoft KabirWaliaMicrosoft AshwinSrinivasanMicrosoft EhiNosakhareMicrosoft @@ -9708,8 +9708,8 @@ Towards Reliable Agents: Benchmarking Customized <fixed-case>LLM</fixed-case>-Based Retrieval-Augmented Generation Frameworks with Deployment Validation - Kevin ShukangWang - Karel JoshuaHarjono + Kevin ShukangWang + Karel JoshuaHarjono RamonLawrenceUniversity of British Columbia 655-661 The emergence of Large Language Models has created new opportunities for building agent applications across various domains. To address the lack of targeted open benchmarks for agent frameworks, we designed a benchmark that features domain-specific, small knowledge bases, and includes a diverse set of questions categorized by type, such as simple, multi-hop, aggregation, and reasoning questions. We evaluated OpenAI’s Assistants API versus a RAG assistant built with Langchain and deployed a RAG system based on benchmark insights as a course assistant over a two-year span in a computer science course. Our findings reveal how domain-specific retrieval impacts response accuracy and highlight key challenges in real-world deployment. Notably, in smaller agentic systems with constrained knowledge bases, the primary challenge shifts from retrieval accuracy to data availability in the knowledge bases. We present insights from both benchmark evaluation and real-world usage data to guide the development of more reliable and effective agentic applications. @@ -9731,9 +9731,9 @@ Evaluating Bias in <fixed-case>LLM</fixed-case>s for Job-Resume Matching: Gender, Race, and Education - HayateIsoMegagon Labs, US + HayateIsoMegagon Labs, US PouyaPezeshkpourMegagon Labs - NikitaBhutaniMegagon Labs, Inc + NikitaBhutaniMegagon Labs, Inc EstevamHruschkaMegagon Labs, Megagon Labs and Carnegie Mellon University 672-683 Large Language Models (LLMs) offer the potential to automate hiring by matching job descriptions with candidate resumes, streamlining recruitment processes, and reducing operational costs. However, biases inherent in these models may lead to unfair hiring practices, reinforcing societal prejudices and undermining workplace diversity. This study examines the performance and fairness of LLMs in job-resume matching tasks within the English language and U.S. context. It evaluates how factors such as gender, race, and educational background influence model decisions, providing critical insights into the fairness and reliability of LLMs in HR applications.Our findings indicate that while recent models have reduced biases related to explicit attributes like gender and race, implicit biases concerning educational background remain significant. These results highlight the need for ongoing evaluation and the development of advanced bias mitigation strategies to ensure equitable hiring practices when using LLMs in industry settings. 
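The counterfactual check behind the job-resume study above can be sketched simply: hold the resume fixed, vary only a demographic signal, and compare scores. Here score_match is a hypothetical LLM-backed scorer and the groups and names are placeholders, not the study's materials:

# Counterfactual probe: average match score per demographic group when only
# the candidate name changes on an otherwise identical resume.
def demographic_score_gap(score_match, job_text, resume_template, name_groups):
    means = {}
    for group, names in name_groups.items():
        scores = [score_match(job_text, resume_template.format(name=n))
                  for n in names]
        means[group] = sum(scores) / len(scores)
    return means  # large spreads across groups suggest bias

# Example call (illustrative):
# demographic_score_gap(score_match, job, "Name: {name}\nExperience: ...",
#                       {"group_a": ["Alice", "Bob"], "group_b": ["Ling", "Omar"]})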
@@ -9744,7 +9744,7 @@ Goal-Driven Data Story, Narrations and Explanations AniyaAggarwalIBM Research AnkushGuptaIBM India Research Lab - ShivangiBithel + ShivangiBithel ArvindAgarwalInternational Business Machines and University of Maryland, College Park 684-694 In this paper, we propose a system designed to process and interpret vague, open-ended, and multi-line complex natural language queries, transforming them into coherent, actionable data stories. Our system’s modular architecture comprises five components—Question Generation, Answer Generation, NLG/Chart Generation, Chart2Text, and Story Representation—each utilizing LLMs to transform data into human-readable narratives and visualizations. Unlike existing tools, our system uniquely addresses the ambiguity of vague, multi-line queries, setting a new benchmark in data storytelling by tackling complexities no existing system comprehensively handles. Our system is cost-effective, which uses open-source models without extra training and emphasizes transparency by showcasing end-to-end processing and intermediate outputs. This enhances explainability, builds user trust, and clarifies the data story generation process. @@ -9756,7 +9756,7 @@ VishnuPrabhakaranAmazon PuravAggarwalAmazon VishruitKulshreshthaAmazon - ArunitaDasAmazon + ArunitaDasAmazon Sahini Venkata SitaramSruti AnoopSaladiAmazon 695-707 @@ -9767,7 +9767,7 @@ <fixed-case>A</fixed-case>uto<fixed-case>KB</fixed-case>: Automated Creation of Structured Knowledge Bases for Domain-Specific Support RishavSahayAmazon - ArihantJainAmazon + ArihantJainAmazon PuravAggarwalAmazon AnoopSaladiAmazon 708-723 @@ -9778,12 +9778,12 @@ Medical Spoken Named Entity Recognition KhaiLe-Duc - DavidThulkeRWTH Aachen University and AppTek + DavidThulkeRWTH Aachen University and AppTek Hung-PhongTran LongVo-Dang Khai-NguyenNguyen - Truong-SonHyUniversity of Alabama at Birmingham - RalfSchlüterAppTek GmbH and Rheinisch Westfälische Technische Hochschule Aachen + Truong-SonHyUniversity of Alabama at Birmingham + RalfSchlüterAppTek GmbH and Rheinisch Westfälische Technische Hochschule Aachen 724-783 Spoken Named Entity Recognition (NER) aims to extract named entities from speech and categorise them into types like person, location, organization, etc. In this work, we present *VietMed-NER* - the first spoken NER dataset in the medical domain. To our knowledge, our Vietnamese real-world dataset is the largest spoken NER dataset in the world regarding the number of entity types, featuring 18 distinct types. Furthermore, we present baseline results using various state-of-the-art pre-trained models: encoder-only and sequence-to-sequence; and conduct quantitative and qualitative error analysis. We found that pre-trained multilingual models generally outperform monolingual models on reference text and ASR output and encoders outperform sequence-to-sequence models in NER tasks. By translating the transcripts, the dataset can also be utilised for text NER in the medical domain in other languages than Vietnamese. All code, data and models are publicly available. 
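A cascaded baseline for the spoken-NER setup described above is easy to sketch: transcribe first, then tag the transcript. asr_transcribe and tag_entities are hypothetical stand-ins for a pretrained ASR model and a token classifier over the dataset's 18 entity types:

from collections import Counter

# Cascade: speech -> transcript -> [(surface, entity_type), ...]
def spoken_ner(audio_path, asr_transcribe, tag_entities):
    transcript = asr_transcribe(audio_path)
    entities = tag_entities(transcript)
    return transcript, entities, Counter(t for _, t in entities)

ASR errors propagate into the NER step in such a cascade, which is why results are reported on both reference text and ASR output.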
2025.naacl-industry.59


@@ -9793,7 +9793,7 @@

 <fixed-case>PLEX</fixed-case>: Adaptive Parameter-Efficient Fine-Tuning for Code <fixed-case>LLM</fixed-case>s using Lottery-Tickets
 JaeseongLeeSeoul National University
 HojaeHan
- JongyoonKimSeoul National University
+ JongyoonKimSeoul National University
 Seung-wonHwangSeoul National University
 NaunKangSamsung
 KyungJunAnSamsung
@@ -9808,7 +9808,7 @@
 YuyangLi
 PjmKerbuschSchiphol
 RhrPruimRadboud University Medical Center
- TobiasKäferKarlsruher Institut für Technologie
+ TobiasKäferKarlsruher Institut für Technologie
 794-808
 Airports from the top 20 in terms of annual passengers are highly dynamic environments with thousands of flights daily, and they aim to increase the degree of automation. To contribute to this, we implemented a Conversational AI system that enables staff in an airport to communicate with flight information systems. This system not only answers standard airport queries but also resolves airport terminology, jargon, abbreviations, and dynamic questions involving reasoning. In this paper, we built three different Retrieval-Augmented Generation (RAG) methods, including traditional RAG, SQL RAG, and Knowledge Graph-based RAG (Graph RAG). Experiments showed that traditional RAG achieved 84.84% accuracy using BM25 + GPT-4 but occasionally produced hallucinations, which is risky for airport safety. In contrast, SQL RAG and Graph RAG achieved 80.85% and 91.49% accuracy respectively, with significantly fewer hallucinations. Moreover, Graph RAG was especially effective for questions that involved reasoning. Based on our observations, we thus recommend SQL RAG and Graph RAG for airport environments, as they produce fewer hallucinations and can handle dynamic questions.
 2025.naacl-industry.61


@@ -9859,7 +9859,7 @@
 e<fixed-case>C</fixed-case>-<fixed-case>T</fixed-case>ab2<fixed-case>T</fixed-case>ext: Aspect-Based Text Generation from e-Commerce Product Tables
 Luis Antonio GutierrezGuaniloUTEC - Universidad de Ingeniería y Tecnología and UTEC - Universidad de Ingeniería y Tecnología
 Mir TafseerNayeemUniversity of Alberta
- Cristian Jose Lopez DelAlamoUTEC - Universidad de Ingeniería y Tecnología
+ Cristian Jose Lopez DelAlamoUTEC - Universidad de Ingeniería y Tecnología
 DavoodRafieiUniversity of Alberta
 849-867
 Large Language Models (LLMs) have demonstrated exceptional versatility across diverse domains, yet their application in e-commerce remains underexplored due to a lack of domain-specific datasets. To address this gap, we introduce eC-Tab2Text, a novel dataset designed to capture the intricacies of e-commerce, including detailed product attributes and user-specific queries. Leveraging eC-Tab2Text, we focus on text generation from product tables, enabling LLMs to produce high-quality, attribute-specific product reviews from structured tabular data. Fine-tuned models were rigorously evaluated using standard Table2Text metrics, alongside correctness, faithfulness, and fluency assessments. Our results demonstrate substantial improvements in generating contextually accurate reviews, highlighting the transformative potential of tailored datasets and fine-tuning methodologies in optimizing e-commerce workflows. This work highlights the potential of LLMs in e-commerce workflows and the essential role of domain-specific datasets in tailoring them to industry-specific challenges.
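In the spirit of eC-Tab2Text's table-to-text setup, a minimal prompting sketch: serialize the product table and ask for an aspect-specific review. llm is a hypothetical callable, and the field names are illustrative rather than the dataset's schema:

import json

# Serialize a product-spec table to JSON and condition the generation on a
# user-chosen aspect (e.g., "battery"), as in aspect-based Table2Text.
def review_from_table(llm, product_specs, aspect):
    table = json.dumps(product_specs, ensure_ascii=False, indent=2)
    return llm(
        "Using only the product specifications below, write a short review "
        f"focused on the aspect '{aspect}'.\n{table}"
    )

# review_from_table(llm, {"battery": "5000 mAh", "display": "6.7-inch OLED"}, "battery")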
@@ -9871,7 +9871,7 @@ Tzu-LinKuo FengTingLiaoMediaTek Research Mu-WeiHsieh - Fu-ChiehChangNational Taiwan University and Mediatek Research + Fu-ChiehChangNational Taiwan University and Mediatek Research Po-ChunHsuMediaTek Research Da-shanShiu 868-902 @@ -9916,7 +9916,7 @@ <fixed-case>C</fixed-case>ode<fixed-case>G</fixed-case>en<fixed-case>W</fixed-case>rangler: Data Wrangling task automation using Code-Generating Models - AshleshaAkella + AshleshaAkella AbhijitManatkarInternational Business Machines KrishnasuriNarayanam SameepMehtaInternational Business Machines @@ -9927,11 +9927,11 @@ Dialogue Language Model with Large-Scale Persona Data Engineering - MengzeHong - Chen JasonZhang + MengzeHong + Chen JasonZhang ChaotaoChen RongzhongLian - DiJiang + DiJiang 961-970 Maintaining persona consistency is paramount in the application of open-domain dialogue systems, as exemplified by models like ChatGPT. Despite significant advancements, the limited scale and diversity of current persona dialogue datasets remain challenges to achieving robust persona-consistent dialogue models. In this study, drawing inspiration from the success of large-scale pre-training, we introduce PPDS, an open-domain persona dialogue system that employs extensive generative pre-training on a persona dialogue dataset to enhance persona consistency. Specifically, we present a persona extraction model designed to autonomously and precisely generate vast persona dialogue datasets. Additionally, we unveil a pioneering persona augmentation technique to address the invalid persona bias inherent in the constructed dataset. Both quantitative and human evaluations consistently highlight the superior response quality and persona consistency of our proposed model, underscoring its effectiveness. 2025.naacl-industry.71 @@ -9952,9 +9952,9 @@ Improved Near-Duplicate Detection for Aggregated and Paywalled News-Feeds - SiddharthTumreTata Consultancy Services Limited, India + SiddharthTumreTata Consultancy Services Limited, India SangameshwarPatilIndian Institute of Technology, Madras and Tata Consultancy Services Limited, India - AlokKumarTata Consultancy Services Limited, India + AlokKumarTata Consultancy Services Limited, India 979-987 News aggregators play a key role in the rapidly evolving digital landscape by providing comprehensive and timely news stories aggregated from diverse sources into one feed. As these articles are sourced from different outlets, they often end up covering the same underlying event but differ in phrasing, formatting or supplemented with additional details. It is crucial for the news aggregators to identify these near-duplicates, improving the content quality and user engagement by steering away from redundant information. The problem of near-duplicate news detection has become harder with increasing use of paywalls by the news websites resulting in restricted access to the content. It is now common to get only the headline and a short snippet from the article. Previous works have concentrated on full length versions of documents such as webpages. There is very little work that focuses on this variation of the near-duplicate detection problem in which only headline and a small text blurb is available for each news article. We propose Near-Duplicate Detection Using Metadata Augmented Communities (NDD-MAC) approach that combines embeddings from pretrained language model (PLM) and latent metadata of a news article followed by community detection to identify clusters of near-duplicates. 
We show the efficacy of the proposed approach using two different real-world datasets. By integrating metadata with community detection, NDD-MAC is able to detect nuanced similarities and differences in news snippets and offers an industrial-scale solution for near-duplicate detection in scenarios with restricted content availability.
 2025.naacl-industry.73


 Pisets: A Robust Speech Recognition System for Lectures and Interviews
- IvanBondarenkoNovosibirsk State University
- DaniilGrebenkin
+ IvanBondarenkoNovosibirsk State University
+ DaniilGrebenkin
 OlegSedukhinSiberian Neuronets LLC
 MikhailKlementev
- DerunetsRomanNovosibirsk State University
+ DerunetsRomanNovosibirsk State University
 LyudmilaBudnevaNovosibirsk State University
 988-997
 This work presents a speech-to-text system “Pisets” for scientists and journalists, which is based on a three-component architecture aimed at improving speech recognition accuracy while minimizing errors and hallucinations associated with the Whisper model. The architecture comprises primary recognition using Wav2Vec2, false positive filtering via the Audio Spectrogram Transformer (AST), and final speech recognition through Whisper. The implementation of curriculum learning methods and the utilization of diverse Russian-language speech corpora significantly enhanced the system’s effectiveness. Additionally, advanced uncertainty modeling techniques were introduced, contributing to further improvements in transcription quality. The proposed approaches ensure robust transcription of long audio data across various acoustic conditions compared to WhisperX and the standard Whisper model. The source code of the “Pisets” system is publicly available at GitHub: https://github.com/bond005/pisets.
@@ -9975,16 +9975,16 @@
 <fixed-case>CPRM</fixed-case>: A <fixed-case>LLM</fixed-case>-based Continual Pre-training Framework for Relevance Modeling in Commercial Search
- KaixinWuAnt Group
+ KaixinWuAnt Group
 YixinJiSoochow University
 ZeyuanChenAnt Group
 QiangWang
 CunxiangWang
 HongLiuAnt Group
 BaijunJiSoochow University
- XuJia
+ XuJia
 ZhongyiLiuAnt Group
- JinjieGu
+ JinjieGu
 YuanZhou
 LinjianMoAnt Group
 998-1008
@@ -10034,7 +10034,7 @@
 <fixed-case>H</fixed-case>y<fixed-case>PA</fixed-case>-<fixed-case>RAG</fixed-case>: A Hybrid Parameter Adaptive Retrieval-Augmented Generation System for <fixed-case>AI</fixed-case> Legal and Policy Applications
 RishiKalra
- ZekunWuDepartment of Computer Science, University College London, University of London and Holistic AI
+ ZekunWuDepartment of Computer Science, University College London, University of London and Holistic AI
 AyeshaGulley
 AirlieHilliard
 XinGuanHolistic AI
@@ -10047,7 +10047,7 @@
 An Efficient Context-Dependent Memory Framework for <fixed-case>LLM</fixed-case>-Centric Agents
- PengyuGao
+ PengyuGao
 JinmingZhaoQiyuan Lab
 XinyueChen
 LongYilin

From c0c305c89e7375d259ce3f637de7e628c6f43f59 Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Thu, 29 May 2025 22:42:08 -0400
Subject: [PATCH 07/18] Orcids for Findings:NAACL 2025

---
 data/xml/2025.findings.xml | 1682 ++++++++++++++++++------------------
 1 file changed, 841 insertions(+), 841 deletions(-)

diff --git a/data/xml/2025.findings.xml b/data/xml/2025.findings.xml
index f1faee9657..821579642b 100644
--- a/data/xml/2025.findings.xml
+++ b/data/xml/2025.findings.xml
@@ -49,7 +49,7 @@
 HongliSunEast China University of Science and Technology
 KuiXueShanghai Artificial Intelligence Laboratory
 XiaofanZhangShanghai Jiaotong University
- ShaotingZhangShanghai Artificial Intelligence Laboratory
+ ShaotingZhangShanghai Artificial Intelligence Laboratory
 TongRuan
 32-56
 Numerous advanced Large Language Models (LLMs) now support context lengths up to 128K, and some extend to 200K. Some benchmarks in the generic domain have also followed up on evaluating long-context capabilities. In the medical domain, tasks are distinctive due to the unique contexts and need for domain expertise, necessitating further evaluation. However, despite the frequent presence of long texts in medical scenarios, evaluation benchmarks of long-context capabilities for LLMs in this field are still rare. In this paper, we propose MedOdyssey, the first medical long-context benchmark with seven length levels ranging from 4K to 200K tokens. MedOdyssey consists of two primary components: the medical-context “needles in a haystack” task and a series of tasks specific to medical applications, together comprising 10 datasets. The first component includes challenges such as counter-intuitive reasoning and novel (unknown) facts injection to mitigate knowledge leakage and data contamination of LLMs. The second component confronts the challenge of requiring professional medical expertise. In particular, we design the “Maximum Identical Context” principle to improve fairness by guaranteeing that different LLMs observe as many identical contexts as possible. Our experiment evaluates advanced proprietary and open-source LLMs tailored for processing long contexts and presents detailed performance analyses. This highlights that LLMs still face challenges and need further research in this area. Our code and data are released in the repository: https://github.com/JOHNNY-fans/MedOdyssey.
@@ -59,7 +59,7 @@

 Can <fixed-case>LLM</fixed-case>s Learn Macroeconomic Narratives from Social Media?
 AlmogGuetaGoogle
- AmirFederColumbia University and Google
+ AmirFederColumbia University and Google
 ZorikGekhmanTechnion, Technion
 ArielGoldsteinHebrew University of Jerusalem
 RoiReichartTechnion, Israel Institute of Technology
@@ -73,7 +73,7 @@
 LeonidasGee
 MilanGritta
 GerasimosLampourasHuawei Technologies Ltd.
- IgnacioIacobacciElm Europe
+ IgnacioIacobacciElm Europe
 79-94
 Code Language Models have been trained to generate accurate solutions, typically with no regard for runtime. On the other hand, previous works that explored execution optimisation have observed corresponding drops in functional correctness. To that end, we introduce Code-Optimise, a framework that incorporates both correctness (passed, failed) and runtime (quick, slow) as learning signals via self-generated preference data. Our framework is both lightweight and robust as it dynamically selects solutions to reduce overfitting while avoiding a reliance on larger models for learning signals. Code-Optimise achieves significant improvements in pass@k while decreasing the competitive baseline runtimes by an additional 6% for in-domain data and up to 3% for out-of-domain data. As a by-product, the average length of the generated solutions is reduced by up to 48% on MBPP and 23% on HumanEval, resulting in faster and cheaper inference. The generated data and codebase are open-sourced at https://github.com/huawei-noah/HEBO/tree/Code_Optimise.
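<!--
  Hedged sketch of the learning signal described in the Code-Optimise entry above
  (an assumed reconstruction, not the authors' released code): each candidate
  solution is labelled by test outcome and measured runtime, and (preferred,
  rejected) pairs are kept as self-generated preference data.

    import time

    def preference_pairs(candidates, run_tests):
        # Label candidates as (code, passed?, runtime in seconds).
        labelled = []
        for code in candidates:
            start = time.perf_counter()
            passed = run_tests(code)  # user-supplied test harness
            labelled.append((code, passed, time.perf_counter() - start))
        passing = sorted((c for c in labelled if c[1]), key=lambda c: c[2])
        failing = [c for c in labelled if not c[1]]
        # Prefer passing over failing, and quick over slow among passing.
        pairs = [(p[0], f[0]) for p in passing for f in failing]
        if len(passing) >= 2:
            pairs.append((passing[0][0], passing[-1][0]))
        return pairs
-->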
2025.findings-naacl.5 @@ -83,7 +83,7 @@ People will agree what <fixed-case>I</fixed-case> think: Investigating <fixed-case>LLM</fixed-case>’s False Consensus Effect JunhyukChoiChung-Ang University YeseonHong - BugeunKimChung-Ang University + BugeunKimChung-Ang University 95-126 Large Language Models (LLMs) have been recently adopted in interactive systems requiring communication. As the false belief in a model can harm the usability of such systems, LLMs should not have cognitive biases that humans have. Psychologists especially focus on the False Consensus Effect (FCE), a cognitive bias where individuals overestimate the extent to which others share their beliefs or behaviors, because FCE can distract smooth communication by posing false beliefs. However, previous studies have less examined FCE in LLMs thoroughly, which needs more consideration of confounding biases, general situations, and prompt changes. Therefore, in this paper, we conduct two studies to examine the FCE phenomenon in LLMs. In Study 1, we investigate whether LLMs have FCE. In Study 2, we explore how various prompting styles affect the demonstration of FCE. As a result of these studies, we identified that popular LLMs have FCE. Also, the result specifies the conditions when FCE becomes more or less prevalent compared to normal usage. 2025.findings-naacl.6 @@ -91,16 +91,16 @@ <fixed-case>L</fixed-case>aw<fixed-case>I</fixed-case>nstruct: A Resource for Studying Language Model Adaptation to the Legal Domain - JoelNiklausHarvey + JoelNiklausHarvey LuciaZhengStanford University - Arya D.McCarthyScaled Cognition + Arya D.McCarthyScaled Cognition ChristopherHahnX, the moonshot factory Brian MRosenGoogle PeterHendersonPrinceton University Daniel E.HoStanford University GarrettHonke PercyLiangStanford University - Christopher DManningComputer Science Department, Stanford University + Christopher DManningComputer Science Department, Stanford University 127-152 Instruction tuning is an important step in making language models useful for direct user interaction. However, the legal domain is underrepresented in typical instruction datasets (e.g., only 10 out of 1600+ tasks in Super-NaturalInstructions). To study whether instruction tuning on legal datasets is necessary for strong legal reasoning, we aggregate 58 annotated legal datasets and write instructions for each, creating LawInstruct. LawInstruct covers 17 global jurisdictions, 24 languages and a total of 12M examples across diverse tasks such as legal QA, summarization of court cases, and legal argument mining. We evaluate our models on LegalBench, measuring legal reasoning across five categories in 162 challenging and realistic legal tasks, and MMLU, to measure potential drops in general reasoning capabilities. We find that legal-specific instruction tuning on Flan-T5 – yielding FLawN-T5 – improves performance on LegalBench across all model sizes, with an aggregate increase of 15 points or 50% over Flan-T5 for the base size. No model size shows performance drops in MMLU. We publish LawInstruct as a resource for further study of instruction tuning in the legal domain. 
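<!--
  Minimal sketch of the dataset-to-instruction conversion described in the
  LawInstruct entry above; the field names and the sample record are invented
  for illustration only.

    def to_instruction_examples(records, instruction):
        # Wrap an annotated legal dataset as instruction-tuning triples.
        return [
            {"instruction": instruction, "input": r["text"], "output": r["label"]}
            for r in records
        ]

    examples = to_instruction_examples(
        [{"text": "The appellate court reversed the ruling.", "label": "reversed"}],
        "Classify the outcome of the following court decision.",
    )
-->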
2025.findings-naacl.7 @@ -108,15 +108,15 @@ Stephanie: Step-by-Step Dialogues for Mimicking Human Interactions in Social Conversations - HaoYang + HaoYang HongyuanLuThe Chinese University of Hong Kong - XinhuaZeng - YangLiu + XinhuaZeng + YangLiu XiangZhangfacemind HaoranYang YumengZhang ShanHuang - YiranWei + YiranWei WaiLamThe Chinese University of Hong Kong 153-166 In the rapidly evolving field of natural language processing, dialogue systems primarily employ a single-step dialogue paradigm. Although this paradigm is commonly adopted, it lacks the depth and fluidity of human interactions and does not appear natural. We introduce a novel **Step**-by-Step Dialogue Paradigm (Stephanie), designed to mimic the ongoing dynamic nature of human conversations. By employing a dual learning strategy and a further-split post-editing method, we generated and utilized a high-quality step-by-step dialogue dataset to fine-tune existing large language models, enabling them to perform step-by-step dialogues. We thoroughly present Stephanie. Tailored automatic and human evaluations are conducted to assess its effectiveness compared to the traditional single-step dialogue paradigm. We will release code, Stephanie datasets, and Stephanie LLMs to facilitate the future of chatbot eras. @@ -127,7 +127,7 @@ <fixed-case>C</fixed-case>on<fixed-case>S</fixed-case>hift: Sense-based Language Variation Analysis using Flexible Alignment ClareArringtonRensselaer Polytechnic Institute MauricioGruppiVillanova University - SibelAdaliRensselaer Polytechnic Institute + SibelAdaliRensselaer Polytechnic Institute 167-181 We introduce ConShift, a family of alignment-based algorithms that enable semantic variation analysis at the sense-level. Using independent senses of words induced from the context of tokens in two corpora, sense-enriched word embeddings are aligned using self-supervision and a flexible matching mechanism. This approach makes it possible to test for multiple sense-level language variations such as sense gain/presence, loss/absence and broadening/narrowing, while providing explanation of the changes through visualization of related concepts. We illustrate the utility of the method with sense- and word-level semantic shift detection results for multiple evaluation datasets in diachronic settings and dialect variation in the synchronic setting. 2025.findings-naacl.9 @@ -137,9 +137,9 @@ Breaking the Stigma! Unobtrusively Probe Symptoms in Depression Disorder Diagnosis Dialogue JiemingCao ChenHuangSichuan University - YananZhang + YananZhang RuiboDeng - JinchengZhang + JinchengZhang WenqiangLeiSichuan University 182-200 2025.findings-naacl.10 @@ -149,7 +149,7 @@ <fixed-case>T</fixed-case>o<fixed-case>V</fixed-case>o: Toxicity Taxonomy via Voting Tinh SonLuong Thanh-ThienLe - Thang VietDoan + Thang VietDoan Linh NgoVanHanoi University of Science and Technology Thien HuuNguyenUniversity of Oregon Nguyen Thi NgocDiep @@ -173,7 +173,7 @@ Enhancing Adversarial Transferability in Visual-Language Pre-training Models via Local Shuffle and Sample-based Attack XinLiu AoyangZhou - KunHeHuazhong University of Sceince and Technology + KunHeHuazhong University of Sceince and Technology 231-245 Visual-Language Pre-training (VLP) models have achieved significant performance across various downstream tasks. However, they remain vulnerable to adversarial examples. 
While prior efforts focus on improving the adversarial transferability of multimodal adversarial examples through cross-modal interactions, these approaches suffer from overfitting issues, due to a lack of input diversity by relying excessively on information from adversarial examples in one modality when crafting attacks in another. To address this issue, we draw inspiration from strategies in some adversarial training methods and propose a novel attack called Local Shuffle and Sample-based Attack (LSSA). LSSA randomly shuffles one of the local image blocks, thus expanding the original image-text pairs, generating adversarial images, and sampling around them. Then, it utilizes both the original and sampled images to generate the adversarial texts. Extensive experiments on multiple models and datasets demonstrate that LSSA significantly enhances the transferability of multimodal adversarial examples across diverse VLP models and downstream tasks. Moreover, LSSA outperforms other advanced attacks on Large Vision-Language Models. 2025.findings-naacl.13 @@ -182,7 +182,7 @@ <fixed-case>D</fixed-case>is2<fixed-case>D</fixed-case>is: Explaining Ambiguity in Fact-Checking IevaStaliunaiteUniversity of Cambridge - AndreasVlachosUniversity of Cambridge + AndreasVlachosUniversity of Cambridge 246-267 Ambiguity is a linguistic tool for encoding information efficiently, yet it also causes misunderstandings and disagreements. It is particularly relevant to the domain of misinformation, as fact-checking ambiguous claims is difficult even for experts. In this paper we argue that instead of predicting a veracity label for which there is genuine disagreement, it would be more beneficial to explain the ambiguity. Thus, this work introduces claim disambiguation, a constrained generation task, for explaining ambiguous claims in fact-checking. This involves editing them to spell out an interpretation that can then be unequivocally supported by the given evidence. We collect a dataset of 1501 such claim revisions and conduct experiments with sequence-to-sequence models. The performance is compared to a simple copy baseline and a Large Language Model baseline. The best results are achieved by employing Minimum Bayes Decoding, with a BertScore F1 of 92.22. According to human evaluation, the model successfully disambiguates the claims 72% of the time. 2025.findings-naacl.14 @@ -196,10 +196,10 @@ YuhangZhou YiyangZhouUniversity of North Carolina at Chapel Hill HuaxiuYaoDepartment of Computer Science, University of North Carolina at Chapel Hill - TianyiZhouUniversity of Maryland, College Park + TianyiZhouUniversity of Maryland, College Park TomGoldsteinUniversity of Maryland, College Park ParminderBhatiaGEHC - TahaKass-HoutGE HealthCare + TahaKass-HoutGE HealthCare FurongHuangUniversity of Maryland CaoXiaoGE Healthcare 268-282 @@ -211,7 +211,7 @@ <fixed-case>R</fixed-case>e<fixed-case>PD</fixed-case>: Defending Jailbreak Attack through a Retrieval-based Prompt Decomposition Process PeiranWang XiaogengLiuUniversity of Wisconsin - Madison - ChaoweiXiaoUniversity of Wisconsin - Madison and NVIDIA + ChaoweiXiaoUniversity of Wisconsin - Madison and NVIDIA 283-294 In this study, we introduce RePD, an innovative attack Retrieval-based Prompt Decomposition framework designed to mitigate the risk of jailbreak attacks on large language models (LLMs). Despite rigorous pre-training and fine-tuning focused on ethical alignment, LLMs are still susceptible to jailbreak exploits. 
RePD operates on a one-shot learning model, wherein it accesses a database of pre-collected jailbreak prompt templates to identify and decompose harmful inquiries embedded within user prompts. This process involves integrating the decomposition of the jailbreak prompt into the user’s original query into a one-shot learning example to effectively teach the LLM to discern and separate malicious components. Consequently, the LLM is equipped to first neutralize any potentially harmful elements before addressing the user’s prompt in a manner that aligns with its ethical guidelines. RePD is versatile and compatible with a variety of open-source LLMs acting as agents. Through comprehensive experimentation with both harmful and benign prompts, we have demonstrated the efficacy of our proposed RePD in enhancing the resilience of LLMs against jailbreak attacks, without compromising their performance in responding to typical user requests. 2025.findings-naacl.16 @@ -223,7 +223,7 @@ YangDengSingapore Management University HengchangHuNational University of Singapore Min-YenKanNational University of Singapore - HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore + HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore 295-312 This paper aims to efficiently enable large language models (LLMs) to use external knowledge and goal guidance in conversational recommender system (CRS) tasks. Advanced LLMs (e.g., ChatGPT) are limited in domain-specific CRS tasks for 1) generating grounded responses with recommendation-oriented knowledge, or 2) proactively leading the conversations through different dialogue goals. In this work, we first analyze those limitations through a comprehensive evaluation, showing the necessity of external knowledge and goal guidance which contribute significantly to the recommendation accuracy and language quality. In light of this finding, we propose a novel ChatCRS framework to decompose the complex CRS task into several sub-tasks through the implementation of 1) a knowledge retrieval agent using a tool-augmented approach to reason over external Knowledge Bases and 2) a goal-planning agent for dialogue goal prediction. Experimental results on two multi-goal CRS datasets reveal that ChatCRS sets new state-of-the-art benchmarks, improving language quality of informativeness by 17% and proactivity by 27%, and achieving a tenfold enhancement in recommendation accuracy. 2025.findings-naacl.17 @@ -231,12 +231,12 @@ Data-Efficiently Learn Large Language Model for Universal 3<fixed-case>D</fixed-case> Scene Perception - ZehanWang + ZehanWang HaifengHuang YangZhaoByteDance Inc. ZiangZhang - TaoJinZhejiang University - ZhouZhaoZhejiang University and Zhejiang University + TaoJinZhejiang University + ZhouZhaoZhejiang University and Zhejiang University 313-333 3D scene understanding has gained significant attention due to its wide range of applications. However, existing methods for 3D scene understanding are limited to specific downstream tasks, which hinders their practicality in real-world applications. This paper presents Chat-3D, which combines the 3D visual perceptual ability of pre-trained 3D representations and the impressive reasoning and conversation capabilities of advanced LLMs to achieve the first universal dialogue systems for 3D scenes. 
Specifically, we align 3D representations into the feature space of LLMs, thus enabling LLMs to perceive the 3D world. Given the scarcity of 3D scene-text data, we propose a three-stage training strategy to efficiently utilize the available data for better alignment. To enhance the reasoning ability and develop a user-friendly interaction scheme, we further construct a high-quality object-centric 3D instruction dataset and design an associated object-centric prompt. With limited data, Chat-3D achieves a 82.2% relative score compared with GPT-4 on the constructed instruction dataset, and comparable performance to state-of-the-art LLM-based methods. 2025.findings-naacl.18 @@ -245,8 +245,8 @@ <fixed-case>U</fixed-case>nified<fixed-case>MLLM</fixed-case>: Enabling Unified Representation for Multi-modal Multi-tasks With Large Language Model ZhaoweiLiFudan University - WeiWang - YiQingCai + WeiWang + YiQingCai QiXu PengyuWang DongZhang @@ -272,10 +272,10 @@ <fixed-case>D</fixed-case>iscover<fixed-case>GPT</fixed-case>: Multi-task Fine-tuning Large Language Model for Related Table Discovery - XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology + XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology XiaoQinAmazon - ChuanLei - AsteriosKatsifodimosDelft University of Technology + ChuanLei + AsteriosKatsifodimosDelft University of Technology ZhengyuanShenAmazon BalasubramaniamSrinivasanAmazon HuzefaRangwalaAmazon and Computer Science, George Mason University @@ -287,9 +287,9 @@ Can <fixed-case>GPT</fixed-case>-4 Sway Experts’ Investment Decisions? TakehiroTakayanagi - HiroyaTakamuraAIST, National Institute of Advanced Industrial Science and Technology + HiroyaTakamuraAIST, National Institute of Advanced Industrial Science and Technology KiyoshiIzumiThe University of Tokyo, The University of Tokyo - Chung-ChiChenAIST, National Institute of Advanced Industrial Science and Technology + Chung-ChiChenAIST, National Institute of Advanced Industrial Science and Technology 374-383 In the post-Turing era, evaluating large language models (LLMs) involves assessing generated text based on readers’ decisions rather than merely its indistinguishability from human-produced content. This paper explores how LLM-generated text impacts readers’ decisions, focusing on both amateur and expert audiences. Our findings indicate that GPT-4 can generate persuasive analyses affecting the decisions of both amateurs and professionals. Furthermore, we evaluate the generated text from the aspects of grammar, convincingness, logical coherence, and usefulness. The results highlight a high correlation between real-world evaluation through audience decisions and the current multi-dimensional evaluators commonly used for generative models. Overall, this paper shows the potential and risk of using generated text to sway human decisions and also points out a new direction for evaluating generated text, i.e., leveraging the decisions of readers. We release our dataset to assist future research. 
2025.findings-naacl.22 @@ -297,11 +297,11 @@ <fixed-case>P</fixed-case>oly<fixed-case>J</fixed-case>oin: Semantic Multi-key Joinable Table Search in Data Lakes - XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology - ChuanLei + XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology + ChuanLei XiaoQinAmazon - AsteriosKatsifodimosDelft University of Technology - ChristosFaloutsosAmazon and Carnegie Mellon University + AsteriosKatsifodimosDelft University of Technology + ChristosFaloutsosAmazon and Carnegie Mellon University HuzefaRangwalaAmazon and Computer Science, George Mason University 384-395 Given a query table, how can we effectively discover multi-key joinable tables on the web? This can be seen as a retrieval task, where users can lookup on the web for tables related to an existing one. Searching and discovering such joinable tables is critical to data analysts and data scientists for reporting, establishing correlations and training machine learning models. Existing joinable table search methods have mostly focused on single key (unary) joins, where a single column is the join key. However, these methods are ineffective when dealing with join keys composed of multiple columns (n-ary joins), which are prevalent on web table corpora. In this paper, we introduce PolyJoin, which finds multi-key semantically-joinable tables on the web, given a query table. PolyJoin employs a multi-key encoder and a novel self-supervised training method to generate the representations of multiple join keys, preserving the alignment across multiple columns. In particular, PolyJoin is equipped with a hierarchical contrastive learning technique to further enhance the model’s semantic understanding of multi-key joinable tables. PolyJoin outperforms the state-of-the-art methods by 2.89% and 3.67% with respect to MAP@30 and R@30 on two real-world web table benchmarks, respectively. @@ -311,7 +311,7 @@ Marrying <fixed-case>LLM</fixed-case>s with Dynamic Forecasting: A Graph Mixture-of-expert Perspective DapengJiang - XiaoLuoUniversity of California, Los Angeles + XiaoLuoUniversity of California, Los Angeles 396-410 Dynamical system modeling is a crucial area of research in machine learning with extensive applications in physics and social science. Recent data-driven approaches often employ graph neural networks (GNNs) to learn relationships in dynamical systems using message passing mechanisms. Despite their advancements, these methods often suffer from performance degradation when it comes to potential environmental change with distribution shifts in real-world applications. In this work, we propose a new perspective which leverages large language models (LLMs) to enhance the generalization capabilities of dynamical system modeling. In particular, we develop a novel framework named LLM Judge with Graph Mixture-of-expert LEGO which incorporates multiple graph experts to learn diverse dynamics within the systems. More importantly, LEGO utilizes LLMs with hierarchical prompts at object, edge, and system levels as a context-aware routing function to determine which experts carry the most relevant information to different environments. The whole framework is optimized by updating the weights and expert parameters in an alternative fashion. Extensive experiments across various datasets demonstrate the effectiveness of our proposed LEGO in comparison to extensive baselines. 
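<!--
  Hedged sketch of the LLM-as-router idea in the LEGO entry above (an assumed
  reconstruction, not the paper's implementation): an LLM scores how well each
  graph expert matches the current environment, and the best-scoring expert
  produces the forecast.

    def route_and_forecast(system_context, experts, llm_score, state):
        # experts: mapping from expert name to a forecasting callable.
        scores = {name: llm_score(system_context, name) for name in experts}
        best = max(scores, key=scores.get)
        return experts[best](state)
-->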
2025.findings-naacl.24 @@ -323,11 +323,11 @@ YanxinLongTencent Data Platform XinchiDeng RuihangChuThe Chinese University of Hong Kong - JiangfengXiongTencent Data Platform + JiangfengXiongTencent Data Platform XiaodanLiangSUN YAT-SEN UNIVERSITY - HongChengThe Chinese University of Hong Kong - QinglinLu - WeiLiuTencent + HongChengThe Chinese University of Hong Kong + QinglinLu + WeiLiuTencent 411-426 Text-to-image (T2I) generation models have significantly advanced in recent years. However, effective interaction with these models is challenging for average users due to the need for specialized prompt engineering knowledge and the inability to perform multi-turn image generation, hindering a dynamic and iterative creation process. Recent attempts have tried to equip Multi-modal Large Language Models (MLLMs) with T2I models to bring the user’s natural language instructions into reality. Hence, the output modality of MLLMs is extended, and the multi-turn generation quality of T2I models is enhanced thanks to the strong multi-modal comprehension ability of MLLMs. However, many of these works face challenges in identifying correct output modalities and generating coherent images accordingly as the number of output modalities increases and the conversations go deeper. Therefore, we propose DialogGen, an effective pipeline to align off-the-shelf MLLMs and T2I models to build a Multi-modal Interactive Dialogue System (MIDS) for multi-turn Text-to-Image generation. It is composed of drawing prompt alignment, careful training data curation, and error correction. Moreover, as the field of MIDS flourishes, comprehensive benchmarks are urgently needed to evaluate MIDS fairly in terms of output modality correctness and multi-modal output coherence. To address this issue, we introduce the Multi-modal Dialogue Benchmark (DialogBen), a comprehensive bilingual benchmark designed to assess the ability of MLLMs to generate accurate and coherent multi-modal content that supports image editing. It contains two evaluation metrics to measure the model’s ability to switch modalities and the coherence of the output images. Our extensive experiments on DialogBen and user study demonstrate the effectiveness of DialogGen in producing correct output modalities and coherent multi-modal outputs compared with other State-of-the-Art models. We hope that DialogBen can contribute to the community for building more powerful MIDS. 2025.findings-naacl.25 @@ -337,7 +337,7 @@ <fixed-case>REL</fixed-case>ex<fixed-case>ED</fixed-case>: Retrieval-Enhanced Legal Summarization with Exemplar Diversity SantoshT.y.s.s ChenJiaTechnische Universität München - PatrickGoroncyDepartment of Informatics, Technische Universität München + PatrickGoroncyDepartment of Informatics, Technische Universität München MatthiasGrabmairTechnische Universität München 427-434 This paper addresses the task of legal summarization, which involves distilling complex legal documents into concise, coherent summaries. Current approaches often struggle with content theme deviation and inconsistent writing styles due to their reliance solely on source documents. We propose RELexED, a retrieval-augmented framework that utilizes exemplar summaries along with the source document to guide the model. RELexED employs a two-stage exemplar selection strategy, leveraging a determinantal point process to balance the trade-off between similarity of exemplars to the query and diversity among exemplars, with scores computed via influence functions. 
Experimental results on two legal summarization datasets demonstrate that RELexED significantly outperforms models that do not utilize exemplars and those that rely solely on similarity-based exemplar selection. @@ -350,15 +350,15 @@ YashanWangCentral Conservatory of Music RuibinYuan GuoZhancheng - XuTan + XuTan GeZhangByteDance Inc. - MonanZhou + MonanZhou JingChen XuefengMu YuejieGao YuanliangDong JiafengLiuCentral Conservatory of Music - XiaobingLiCentral Conservatory of Music + XiaobingLiCentral Conservatory of Music FengYuCentral Conservatory of Music MaosongSunTsinghua University 435-451 @@ -368,8 +368,8 @@ <fixed-case>L</fixed-case>og<fixed-case>R</fixed-case>ules: Enhancing Log Analysis Capability of Large Language Models through Rules - XinHuang - TingZhang + XinHuang + TingZhang WenZhaoPeking University 452-470 Currently, large language models (LLMs) have achieved impressive performance in natural language processing tasks. However, LLMs still exhibit many hallucinations when analyzing system logs, which is due to the implicit knowledge and rules in logs that LLMs cannot capture. Based on this, we propose LogRules, a lightweight log analysis framework that generates and utilizes rules through LLMs. LogRules consists of three stages: an induction stage, an alignment stage, and a reasoning stage. Firstly, in the induction stage, an strong LLM (e.g., GPT-4o-mini) is tasked with generating a series of rules related to logs, which are then validated on the training set. When the rules are confirmed to produce correct reasoning results, they are added to a rule repository. Secondly, considering that the LLMs with small size (\approx8B parameters) still face challenges in utilizing rules, we design an alignment method based on rule-case contrastive preference optimization (CPO) to effectively enhance the rule reasoning capabilities of these LLMs. Finally, in the reasoning stage, the LLM constructs prompt using the rule repository and performs log analysis on the test set. Experiments show that LogRules outperforms LLM-based methods in log parsing and anomaly detection tasks, and achieves better performance compared to case-based methods. @@ -379,9 +379,9 @@ Audio Description Generation in the Era of <fixed-case>LLM</fixed-case>s and <fixed-case>VLM</fixed-case>s: A Review of Transferable Generative <fixed-case>AI</fixed-case> Technologies YingqiangGaoUniversity of Zurich - LukasFischerUniversity of Zurich + LukasFischerUniversity of Zurich AlexaLintnerZHAW - Zürcher Hochschule für Angewandte Wissenschaften - SarahEblingUniversity of Zurich + SarahEblingUniversity of Zurich 471-490 Audio descriptions (ADs) function as acoustic commentaries designed to assist blind persons and persons with visual impairments in accessing digital media content on television and in movies, among other settings. As an accessibility service typically provided by trained AD professionals, the generation of ADs demands significant human effort, making the process both time-consuming and costly. Recent advancements in natural language processing (NLP) and computer vision (CV), particularly in large language models (LLMs) and vision-language models (VLMs), have allowed for getting a step closer to automatic AD generation. This paper reviews the technologies pertinent to AD generation in the era of LLMs and VLMs: we discuss how state-of-the-art NLP and CV technologies can be applied to generate ADs and identify essential research directions for the future. 
2025.findings-naacl.29 @@ -389,9 +389,9 @@ Adaptive Retrieval-Augmented Generation for Conversational Systems - XiWangUniversity of Sheffield + XiWangUniversity of Sheffield ProchetaSenUniversity of Liverpool - RuizheLiUniversity of Aberdeen + RuizheLiUniversity of Aberdeen EmineYilmaz 491-503 With the success of integrating large language models into the development of conversational systems, many studies have shown the effectiveness of retrieving and augmenting external knowledge for informative responses. While many existing studies agree on the necessity of Retrieval Augmented Generation (RAG), further investigation into the necessity and value of applying RAG to every turn of the conversation is needed. In this study, we propose to investigate the need for each turn of system response to be augmented with external knowledge. In particular, by leveraging human judgements on the binary choice of adaptive augmentation, we develop RAGate, a gating model, which models conversation context and relevant inputs to predict if a conversational system requires RAG for improved responses. We conduct extensive experiments on devising and applying RAGate to conversational models, joined with well-rounded analyses of various conversational scenarios. Our experimental results and analysis indicate the effective application of RAGate in RAG-based conversational systems in identifying if system responses require RAG to generate high-quality responses with high confidence. This study also identifies and shows the correlation between the generation’s confidence level and the relevance of the augmented knowledge. We have also released the implementation code and resources in https://github.com/wangxieric/RAGate. @@ -401,8 +401,8 @@ Multimodal Generation with Consistency Transferring JunxiangQiu - JindaLu - ShuoWangUniversity of Science and Technology of China + JindaLu + ShuoWangUniversity of Science and Technology of China 504-513 Multimodal content generation has become an area of considerable interest. However, existing methods are hindered by limitations related to model constraints and training strategies: (1) Most current approaches rely on training models from scratch, resulting in inefficient training processes when extending these models; (2) There is a lack of constraints on adjacent steps within the models, leading to slow sampling and poor generation stability across various sampling methods. To address the issues, we introduce Multimodal Generation with Consistency Transferring (MGCT). The method introduces two key improvements: (1) A Model Consistency Transferring (MCT) strategy to acquire low-cost prior knowledge, increasing training efficiency and avoiding error accumulation; (2) A Layer Consistency Transferring (LCT) between adjacent steps, enhancing denoising capabilities at each step and improving model stability across various generation methods. These strategies ensure the consistency of jointly generated multimodal content and improving training efficiency. Experiments show that the algorithm enhances the model’s ability to capture actions and depict backgrounds more effectively. In both the AIST++ and Landscape datasets, it improves video generation speed by approximately 40% and quality by about 39.3%, while also achieving a slight 3% improvement in audio quality over the baseline. 
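<!--
  Minimal sketch of the binary retrieval gate described in the RAGate entry
  above, under assumed interfaces: gate_score, retrieve, and generate are
  placeholders, not RAGate's trained components.

    def respond(context, gate_score, retrieve, generate, threshold=0.5):
        # Retrieve external knowledge only when the gate predicts it is needed.
        evidence = retrieve(context) if gate_score(context) >= threshold else None
        return generate(context, evidence)
-->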
2025.findings-naacl.31 @@ -410,9 +410,9 @@ On the Impact of Noise in Differentially Private Text Rewriting - StephenMeisenbacher + StephenMeisenbacher MaulikChevliTechnische Universität München - FlorianMatthesTechnische Universität München + FlorianMatthesTechnische Universität München 514-532 The field of text privatization often leverages the notion of *Differential Privacy* (DP) to provide formal guarantees in the rewriting or obfuscation of sensitive textual data. A common and nearly ubiquitous form of DP application necessitates the addition of calibrated noise to vector representations of text, either at the data- or model-level, which is governed by the privacy parameter \varepsilon. However, noise addition almost undoubtedly leads to considerable utility loss, thereby highlighting one major drawback of DP in NLP. In this work, we introduce a new sentence infilling privatization technique, and we use this method to explore the effect of noise in DP text rewriting. We empirically demonstrate that non-DP privatization techniques excel in utility preservation and can find an acceptable empirical privacy-utility trade-off, yet cannot outperform DP methods in empirical privacy protections. Our results highlight the significant impact of noise in current DP rewriting mechanisms, leading to a discussion of the merits and challenges of DP in NLP as well as the opportunities that non-DP methods present. 2025.findings-naacl.32 @@ -421,9 +421,9 @@ Teaching Large Language Models Number-Focused Headline Generation With Key Element Rationales ZhenQian - XiuzhenZhangRoyal Melbourne Institute of Technology - XiaofeiXu - FengXiaRoyal Melbourne Institute of Technology + XiuzhenZhangRoyal Melbourne Institute of Technology + XiaofeiXu + FengXiaRoyal Melbourne Institute of Technology 533-550 Number-focused headline generation is a summarization task requiring both high textual quality and precise numerical accuracy, which poses a unique challenge for Large Language Models (LLMs). Existing studies in the literature focus only on either textual quality or numerical reasoning and thus are inadequate to address this challenge. In this paper, we propose a novel chain-of-thought framework for using rationales comprising key elements of the Topic, Entities, and Numerical reasoning (TEN) in news articles to enhance the capability for LLMs to generate topic-aligned high-quality texts with precise numerical accuracy. Specifically, a teacher LLM is employed to generate TEN rationales as supervision data, which are then used to teach and fine-tune a student LLM. Our approach teaches the student LLM automatic generation of rationales with enhanced capability for numerical reasoning and topic-aligned numerical headline generation. Experiments show that our approach achieves superior performance in both textual quality and numerical accuracy. 2025.findings-naacl.33 @@ -440,11 +440,11 @@ <fixed-case>SIMPLOT</fixed-case>: Enhancing Chart Question Answering by Distilling Essentials - WonjoongKim + WonjoongKim SangwuPark - YeonjunIn + YeonjunIn SeokwonHanKorea Advanced Institute of Science & Technology - ChanyoungParkKorea Advanced Institute of Science and Technology + ChanyoungParkKorea Advanced Institute of Science and Technology 573-593 Recently, interpreting complex charts with logical reasoning has emerged as challenges due to the development of vision-language models. 
A prior state-of-the-art (SOTA) model has presented an end-to-end method that leverages the vision-language model to convert charts into table format utilizing Large Language Model (LLM) for reasoning. However, unlike natural images, charts contain a mix of essential and irrelevant information required for chart reasoning, and we discover that this characteristic can lower the performance of chart-to-table extraction. In this paper, we introduce SIMPLOT, a method designed to extract only the elements necessary for chart reasoning. The proposed method involves two steps: 1) training to mimic a simple plot that contains only the essential information from a complex chart for table extraction, followed by 2) performing reasoning based on the table. Our model enables accurate chart reasoning without the need for additional annotations or datasets, and its effectiveness is demonstrated through various experiments. 2025.findings-naacl.35 @@ -463,10 +463,10 @@ Lost in Overlap: Exploring Logit-based Watermark Collision in <fixed-case>LLM</fixed-case>s YiyangLuo - KeLinTsinghua University, Tsinghua University + KeLinTsinghua University, Tsinghua University ChaoGu JiahuiHouUniversity of Science and Technology of China - LijieWenTsinghua University + LijieWenTsinghua University LuoPing 620-637 The proliferation of large language models (LLMs) in generating content raises concerns about text copyright. Watermarking methods, particularly logit-based approaches, embed imperceptible identifiers into text to address these challenges. However, the widespread usage of watermarking across diverse LLMs has led to an inevitable issue known as watermark collision during common tasks, such as paraphrasing or translation.In this paper, we introduce watermark collision as a novel and general philosophy for watermark attacks, aimed at enhancing attack performance on top of any other attacking methods. We also provide a comprehensive demonstration that watermark collision poses a threat to all logit-based watermark algorithms, impacting not only specific attack scenarios but also downstream applications. @@ -486,13 +486,13 @@ Identifying and Mitigating Social Bias Knowledge in Language Models - RuizheChen + RuizheChen YichenLi - JianfeiYangNanyang Technological University + JianfeiYangNanyang Technological University YangFeng - Joey TianyiZhouA*STAR Centre for Frontier AI Research + Joey TianyiZhouA*STAR Centre for Frontier AI Research JianWuZhejiang University - ZuozhuLiuZhejiang University + ZuozhuLiuZhejiang University 651-672 Generating fair and accurate predictions plays a pivotal role in deploying pre-trained language models (PLMs) in the real world. However, existing debiasing methods may inevitably generate incorrect or nonsensical predictions as they are designed and evaluated to achieve parity across different social groups but leave aside individual commonsense facts, resulting in modified knowledge that elicits unreasonable or undesired predictions. This paper introduces a novel debiasing framework that first identifies the encoding locations of biases within language models and then applies the Fairness-Stamp (FAST). FAST focuses on fine-grained, individual bias mitigation and integrates a lightweight network into PLMs, specifically targeting identified biases while preserving essential knowledge and maintaining factual integrity. 
We also present BiaScope, a new benchmark comprising datasets and metrics designed to evaluate the retention of commonsense knowledge and the generalization across paraphrased social biases. Our extensive experiments across multiple datasets demonstrate that FAST surpasses state-of-the-art baselines with superior debiasing performance while not compromising the overall model capability for knowledge retention and downstream predictions. This highlights the potential of fine-grained debiasing strategies to achieve fairness in PLMs. Code will be publicly available. 2025.findings-naacl.39 @@ -500,8 +500,8 @@ <fixed-case>D</fixed-case>ia<fixed-case>S</fixed-case>ynth: Synthetic Dialogue Generation Framework for Low Resource Dialogue Applications - Sathya KrishnanSuresh - WuMengjun + Sathya KrishnanSuresh + WuMengjun TusharPranavSingapore Institute of Technology EngSiongChngNanyang Technological University 673-690 @@ -510,12 +510,12 @@ Do Not Design, Learn: A Trainable Scoring Function for Uncertainty Estimation in Generative <fixed-case>LLM</fixed-case>s - Duygu NurYaldiz + Duygu NurYaldiz Yavuz FarukBakmanUniversity of Southern California BaturalpBuyukatesUniversity of Birmingham ChenyangTaoAmazon AnilRamakrishnaAmazon - DimitriosDimitriadisAmazon + DimitriosDimitriadisAmazon JieyuZhaoUniversity of Southern California SalmanAvestimehrUniversity of Southern California 691-713 @@ -539,7 +539,7 @@ Synonym-unaware Fast Adversarial Training against Textual Adversarial Attacks YichenYang XinLiu - KunHeHuazhong University of Sceince and Technology + KunHeHuazhong University of Sceince and Technology 727-739 Numerous adversarial defense methods have been proposed to strengthen the robustness of Natural Language Processing (NLP) models against adversarial attacks. However, many of these methods rely on predetermined linguistic knowledge and assume that attackers’ synonym candidates are known, which is often unrealistic. In this work, we investigate adversarial training in the embedding space and introduce a Fast Adversarial Training (FAT) method to improve the model robustness without requiring synonym awareness. FAT leverages single-step perturbation generation and effective perturbation initialization based on two key insights: (1) adversarial perturbations generated by single-step and multi-step gradient ascent are similar, and (2) perturbations generated on the same training sample across successive epochs exhibit resemblance. By employing single-step gradient ascent and leveraging historical perturbation information, FAT not only expedites the training process but also efficiently initializes perturbations. Extensive experiments demonstrate that FAT significantly enhances the robustness of popular NLP models under scenarios where synonyms are unknown, outperforming other defense baselines under various character-level and word-level attacks. 2025.findings-naacl.43 @@ -547,9 +547,9 @@ Tethering Broken Themes: Aligning Neural Topic Models with Labels and Authors - MayankNagda + MayankNagda PhilOstheimerRPTU Kaiserslautern-Landau - SophieFellenzUniversität Kaiserslautern + SophieFellenzUniversität Kaiserslautern 740-760 Topic models are a popular approach for extracting semantic information from large document collections. However, recent studies suggest that the topics generated by these models often do not align well with human intentions. Although metadata such as labels and authorship information are available, it has not yet been effectively incorporated into neural topic models. 
To address this gap, we introduce FANToM, a novel method to align neural topic models with both labels and authorship information. FANToM allows for the inclusion of this metadata when available, producing interpretable topics and author distributions for each topic. Our approach demonstrates greater expressiveness than conventional topic models by learning the alignment between labels, topics, and authors. Experimental results show that FANToM improves existing models in terms of both topic quality and alignment. Additionally, it identifies author interests and similarities. 2025.findings-naacl.44 @@ -559,8 +559,8 @@ Towards Zero-Shot Multimodal Machine Translation MatthieuFuteral CordeliaSchmidGoogle, INRIA and Inria - BenoîtSagotInria - RachelBawdenInria + BenoîtSagotInria + RachelBawdenInria 761-778 Current multimodal machine translation (MMT) systems rely on fully supervised data (i.e sentences with their translations and accompanying images), which is costly to collect and prevents the extension of MMT to language pairs with no such data. We propose a method to bypass the need for fully supervised data to train MMT systems, using multimodal English data only. Our method ( ZeroMMT) consists in adapting a strong text-only machine translation (MT) model by training it jointly on two objectives: visually conditioned masked language modelling and the Kullback-Leibler divergence between the original MT and new MMT outputs. We evaluate on standard MMT benchmarks and on CoMMuTE, a contrastive test set designed to evaluate how well models use images to disambiguate translations. ZeroMMT obtains disambiguation results close to state-of-the-art MMT models trained on fully supervised examples. To prove that ZeroMMT generalizes to languages with no fully supervised training data, we extend CoMMuTE to three new languages: Arabic, Russian and Chinese. We also show that we can control the trade-off between disambiguation capabilities and translation fidelity at inference time using classifier-free guidance and without any additional data. Our code, data and trained models are publicly accessible. 2025.findings-naacl.45 @@ -568,12 +568,12 @@ Large-Scale Corpus Construction and Retrieval-Augmented Generation for <fixed-case>A</fixed-case>ncient <fixed-case>C</fixed-case>hinese Poetry: New Method and Data Insights - YangLiuSouth China University of Technology + YangLiuSouth China University of Technology LanLan - JiahuanCao - HiuyiCheng - KaiDingINTSIG Information - LianwenJin + JiahuanCao + HiuyiCheng + KaiDingINTSIG Information + LianwenJin 779-817 Ancient Chinese Poetry (ACP), a critical aspect of Chinese cultural heritage, presents unique challenges for Large Language Models (LLMs). One of the most pressing challenges is the significant hallucination issues faced by LLMs due to data scarcity and limited ability of general LLMs when dealing with ACP. To address these challenges, this paper constructs the ACP-Corpus, which encompasses 1.1 million ancient poems and 990K related texts, designed to enhance the training and performance of LLMs. Alongside this, we develop the ACP-QA dataset, comprising over 12 million question-answer pairs across 24 task categories, and the ACP-Eval dataset for rigorous evaluation purposes, containing 7,050 entries. Building on this resources, we propose the ACP-RAG framework, a specialized Retrieval-Augmented Generation (RAG) approach that significantly improves the performance of LLMs in the domain of ancient poetry from 49.2% to 89.0%. 
The ACP-RAG contains five modules of semantic coarse-grained retrieval, semantic fine-grained retrieval, keyword retrieval, keyword matching, and context filtering. Experiments show that ACP-RAG achieves a promising response accuracy of 89.0%, surpassing existing LLMs by a remarkable margin. We believe this work not only advances the capabilities of LLMs in processing ancient Chinese poetry but also contributes to the preservation and innovative development within this rich literary tradition. The datasets and code are available at https://github.com/SCUT-DLVCLab/ACP-RAG. 2025.findings-naacl.46 @@ -581,11 +581,11 @@ <fixed-case>O</fixed-case>pen<fixed-case>B</fixed-case>io<fixed-case>NER</fixed-case>: Lightweight Open-Domain Biomedical Named Entity Recognition Through Entity Type Description - AlessioCocchieriUniversity of Bologna - GiacomoFrisoni - MarcosMartínez GalindoInternational Business Machines + AlessioCocchieriUniversity of Bologna + GiacomoFrisoni + MarcosMartínez GalindoInternational Business Machines GianlucaMoroDISI - University of Bologna - GiuseppeTagliaviniUniversity of Bologna + GiuseppeTagliaviniUniversity of Bologna FrancescoCandoliUniversity of Bologna 818-837 Biomedical Named Entity Recognition (BioNER) faces significant challenges in real-world applications due to limited annotated data and the constant emergence of new entity types, making zero-shot learning capabilities crucial. While Large Language Models (LLMs) possess extensive domain knowledge necessary for specialized fields like biomedicine, their computational costs often make them impractical. To address these challenges, we introduce OpenBioNER, a lightweight BERT-based cross-encoder architecture that can identify any biomedical entity using only its description, eliminating the need for retraining on new, unseen entity types. Through comprehensive evaluation on established biomedical benchmarks, we demonstrate that OpenBioNER surpasses state-of-the-art baselines, including specialized 7B NER LLMs and GPT-4o, achieving up to 10% higher F1 scores while using 110M parameters only. Moreover, OpenBioNER outperforms existing small-scale models that match textual spans with entity types rather than descriptions, both in terms of accuracy and computational efficiency. @@ -604,7 +604,7 @@ Linguistically Grounded Analysis of Language Models using Shapley Head Values MarcellFekete - JohannesBjervaAalborg University + JohannesBjervaAalborg University 850-865 Understanding how linguistic knowledge is encoded in language models is crucial for improving their generalisation capabilities. In this paper, we investigate the processing of morphosyntactic phenomena, by leveraging a recently proposed method for probing language models via Shapley Head Values (SHVs). Using the English language BLiMP dataset, we test our approach on two widely used models, BERT and RoBERTa, and compare how linguistic constructions such as anaphor agreement and filler-gap dependencies are handled. Through quantitative pruning and qualitative clustering analysis, we demonstrate that attention heads responsible for processing related linguistic phenomena cluster together. Our results show that SHV-based attributions reveal distinct patterns across both models, providing insights into how language models organize and process linguistic information. 
These findings support the hypothesis that language models learn subnetworks corresponding to linguistic theory, with potential implications for cross-linguistic model analysis and interpretability in Natural Language Processing (NLP). 2025.findings-naacl.49 @@ -612,11 +612,11 @@ How Do Large Language Models Perform in Dynamical System Modeling - XiaoLuoUniversity of California, Los Angeles + XiaoLuoUniversity of California, Los Angeles BinqiChen - HaixinWangUCLA Computer Science Department, University of California, Los Angeles - ZhipingXiaoUniversity of Washington - MingZhangPeking University + HaixinWangUCLA Computer Science Department, University of California, Los Angeles + ZhipingXiaoUniversity of Washington + MingZhangPeking University YizhouSunUniversity of California, Los Angeles 866-880 This paper studies the problem of dynamical system modeling, which involves the evolution of multiple interacting objects. Recent data-driven methods often utilize graph neural networks (GNNs) to learn these interactions by optimizing the neural network in an end-to-end fashion. While large language models (LLMs) have shown exceptional zero-shot performance across various applications, their potential for modeling dynamical systems has not been extensively explored. In this work, we design prompting techniques for dynamical system modeling and systematically evaluate the capabilities of LLMs on two tasks, including dynamic forecasting and relational reasoning. An extensive benchmark LLM4DS across nine datasets is built for performance comparison. Our extensive experiments yield several key findings: (1) LLMs demonstrate competitive performance without training compared to state-of-the-art methods in dynamical system modeling. (2) LLMs effectively infer complex interactions among objects to capture system evolution. (3) Prompt engineering plays a crucial role in enabling LLMs to accurately understand and predict the evolution of systems. @@ -627,8 +627,8 @@ <fixed-case>LMM</fixed-case>s-Eval: Reality Check on the Evaluation of Large Multimodal Models KaichenZhang BoLi - PeiyuanZhangUniversity of California, San Diego - FanyiPu + PeiyuanZhangUniversity of California, San Diego + FanyiPu Joshua AdrianCahyono KairuiHuNanyang Technological University ShuaiLiu @@ -643,10 +643,10 @@ Pairwise Prompt-Based Tuning with Parameter Efficient Fast Adaptation for Generalized Zero-Shot Intent Detection - XiaotongZhang + XiaotongZhang QianruZhou - HanLiuDalian University of Technology - HongYu + HanLiuDalian University of Technology + HongYu 917-929 Generalized zero-shot intent detection (GZID) aims to recognize the labels of utterances from both seen and unseen intents by utilizing the knowledge learned from seen intents. Enhancing the generalization ability from seen intents to unseen intents is a key challenge in the GZID setting. Existing methods attempt to tackle this challenge by distinguishing unseen intents from seen intents or focusing on enhancing the model discriminability. However, the challenge is not solved substantially as they ignore to promote the representation learning ability of the model itself and neglect to strengthen the model adaptability to new tasks, resulting in overfitting on the seen intents. In this paper, we propose a pairwise prompt-based tuning model with parameter efficient fast adaptation which involves two training steps. 
In the first step, we leverage hybrid contrastive learning in discriminant space and masked language modeling to make predictions at both sentence and token levels, which enhance the model’s discriminability and representation learning ability, respectively. In the second step, we design a pipeline for generating and filtering unseen data by providing only unseen intent labels, and utilize parameter-efficient fine-tuning to quickly adapt to unseen intents. Experiments on four intent detection datasets demonstrate that our two-step training method has better comprehension and generalization capabilities.
2025.findings-naacl.52
@@ -656,7 +656,7 @@
<fixed-case>F</fixed-case>aithful<fixed-case>P</fixed-case>ersona: Balancing Faithfulness and Personalization in Code Explanations through Self-Critique
ZhuangLuo
YichuanLi
- ZexingXu
+ ZexingXu
KyuminLeeWorcester Polytechnic Institute
S. RasoulEtesamiUniversity of Illinois, Urbana Champaign
930-944
@@ -668,7 +668,7 @@
Efficient Multi-Agent Collaboration with Tool Use for Online Planning in Complex Table Question Answering
WeiZhouRobert Bosch GmbH, Bosch
MohsenMesgarBosch
- AnnemarieFriedrichUniversity of Augsburg
+ AnnemarieFriedrichUniversity of Augsburg
HeikeAdelHochschule der Medien (University of Applied Sciences)
945-968
Complex table question answering (TQA) aims to answer questions that require complex reasoning, such as multi-step or multi-category reasoning, over data represented in tabular form. Previous approaches demonstrate notable performance by leveraging either closed-source large language models (LLMs) or fine-tuned open-weight LLMs. However, fine-tuning LLMs requires high-quality training data, which is costly to obtain. The use of closed-source LLMs poses accessibility challenges and leads to reproducibility issues. In this paper, we propose Multi Agent Collaboration with Tool use (MACT), a framework that requires neither fine-tuning nor closed-source models. In MACT, a planning agent and a coding agent that also make use of tools collaborate for TQA. MACT outperforms previous SoTA systems on three out of four benchmarks and performs comparably to the larger and more expensive closed-source model GPT-4 on two benchmarks, even when using only open-weight models without any fine-tuning. Our extensive analyses prove the effectiveness of MACT’s multi-agent collaboration in TQA. We release our code publicly.
@@ -679,12 +679,12 @@
Ground Every Sentence: Improving Retrieval-Augmented <fixed-case>LLM</fixed-case>s with Interleaved Reference-Claim Generation
SiruiXia
XintaoWang
- JiaqingLiangFudan University
+ JiaqingLiangFudan University
YifeiZhang
WeikangZhou
JiajiDeng
FeiYuAnt Group
- YanghuaXiaoFudan University
+ YanghuaXiaoFudan University
969-988
Retrieval-Augmented Generation (RAG) has been widely adopted to enhance Large Language Models (LLMs) in knowledge-intensive tasks. To enhance credibility and verifiability in RAG systems, Attributed Text Generation (ATG) has been proposed, which provides citations to retrieved knowledge in LLM-generated responses. Prior methods mainly adopt coarse-grained attributions, with passage-level or paragraph-level references or citations, which fall short in verifiability. This paper proposes ReClaim (Refer & Claim), a fine-grained ATG method that alternates the generation of references and answers step by step. Unlike previous coarse-grained attribution methods, ReClaim provides sentence-level citations in long-form question-answering tasks.
Extensive experiments verify the effectiveness of ReClaim in a wide range of settings, achieving a citation accuracy rate of 90%.
2025.findings-naacl.55
@@ -692,8 +692,8 @@
Understanding the Role of Mental Models in User Interaction with an Adaptive Dialog Agent
- Lindsey MorganVanderlyn
- DirkVäth
+ Lindsey MorganVanderlyn
+ DirkVäth
ThangVuUniversity of Stuttgart, University of Stuttgart
989-1015
Mental models play an important role in whether user interactions with intelligent systems, such as dialog agents, are successful. Adaptive dialog systems present the opportunity to align a dialog agent’s behavior with heterogeneous user expectations. However, there has been little research into what mental models users form when interacting with a task-oriented dialog system, how these models affect users’ interactions, or what role system adaptation can play in this process. This can make it challenging to avoid damage to the human-AI partnership. In this work, we collect a new publicly available dataset for exploring user mental models of information-seeking dialog systems. We demonstrate that users have a variety of conflicting mental models about such systems, the validity of which directly impacts the success and perception of their interactions. Furthermore, we show that adapting a dialog agent’s behavior to better align with users’ mental models, even when done implicitly, can improve dialog efficiency, success, and user perception of the interaction. This shows that implicit adaptation can be beneficial for task-oriented dialog systems, so long as developers understand the mental models of their users.
@@ -717,7 +717,7 @@
JinchaoZhang
LixiangfangLixiangfang
LichuanrongLichuanrong
- BoLiInstitute of Information Engineering, Chinese Academy of Sciences
+ BoLiInstitute of Information Engineering, Chinese Academy of Sciences
1033-1044
Large language models (LLMs) exhibit exceptional performance across a wide range of natural language processing tasks, often relying on lengthy prompts to harness their full capabilities. However, extended prompts can lead to substantial computational overhead and increased hardware demands, limiting the scalability and efficiency of such models. In this paper, we propose DisComp, a two-stage prompt compression framework based on knowledge distillation that combines task-agnostic and task-aware strategies, designed to efficiently compress prompt length without compromising performance. In the first stage, task-agnostic compression is achieved through knowledge distillation, transferring the summarization capabilities of an LLM to a smaller, more efficient model. The distillation process combines cross-entropy loss and keyword matching loss to ensure the smaller model generates concise and informative summaries. In the second stage, sentence-level pruning is applied, where sentences are ranked by relevance to the query, and irrelevant sentences are pruned to retain only task-critical information. We evaluate our method on three benchmark datasets: LongBench, ZeroSCROLLS, and NaturalQuestions. The results show that DisComp significantly outperforms previous task-agnostic and task-specific compression approaches, and it is up to 6.56× faster at inference compared to the best token-level compression method.
2025.findings-naacl.58 @@ -726,7 +726,7 @@ A Large-Scale Benchmark for <fixed-case>V</fixed-case>ietnamese Sentence Paraphrases Sang QuangNguyen - Kiet VanNguyenUniversity of Information Technology, VNU-HCM + Kiet VanNguyenUniversity of Information Technology, VNU-HCM 1045-1060 This paper presents ViSP, a high-quality Vietnamese dataset for sentence paraphrasing, consisting of 1.2M original–paraphrase pairs collected from various domains. The dataset was constructed using a hybrid approach that combines automatic paraphrase generation with manual evaluation to ensure high quality. We conducted experiments using methods such as back-translation, EDA, and baseline models like BART and T5, as well as large language models (LLMs), including GPT-4o, Gemini-1.5, Aya, Qwen-2.5, and Meta-Llama-3.1 variants. To the best of our knowledge, this is the first large-scale study on Vietnamese paraphrasing. We hope that our dataset and findings will serve as a valuable foundation for future research and applications in Vietnamese paraphrase tasks. The dataset is available for research purposes at https://github.com/ngwgsang/ViSP. 2025.findings-naacl.59 @@ -734,8 +734,8 @@ <fixed-case>RAMQA</fixed-case>: A Unified Framework for Retrieval-Augmented Multi-Modal Question Answering - YangBaiFacebook - ChristanGrantUniversity of Florida + YangBaiFacebook + ChristanGrantUniversity of Florida Daisy ZheWangUniversity of Florida 1061-1076 Multi-modal retrieval-augmented Question Answering (MRAQA), integrating text and images, has gained significant attention in information retrieval (IR) and natural language processing (NLP). Traditional ranking methods rely on small encoder-based language models, which are incompatible with modern decoder-based generative large language models (LLMs) that have advanced various NLP tasks. To bridge this gap, we propose RAMQA, a unified framework combining learning-to-rank methods with generative permutation-enhanced ranking techniques. We first train a pointwise multi-modal ranker using LLaVA as the backbone. Then, we apply instruction tuning to train a LLaMA model for re-ranking the top-k documents using an innovative autoregressive multi-task learning approach. Our generative ranking model generates re-ranked document IDs and specific answers from document candidates in various permutations. Experiments on two MRAQA benchmarks, WebQA and MultiModalQA, show significant improvements over strong baselines, highlighting the effectiveness of our approach. Data and code will be made public once the paper is accepted. 
@@ -744,13 +744,13 @@ <fixed-case>M</fixed-case>ulti<fixed-case>CAT</fixed-case>: Multimodal Communication Annotations for Teams - AdarshPyarelalUniversity of Arizona - John MCulnanUS Department of Veterans Affairs + AdarshPyarelalUniversity of Arizona + John MCulnanUS Department of Veterans Affairs AyeshaQamarTexas A&M University - College Station - MeghavarshiniKrishnaswamy - YuweiWangUniversity of Arizona + MeghavarshiniKrishnaswamy + YuweiWangUniversity of Arizona CheonkamJeong - ChenChen + ChenChen Md Messal MonemMiahTexas A&M University - College Station ShahriarHormozi JonathanTongTexas A&M University - College Station @@ -763,12 +763,12 @@ Prototype Tuning: A Meta-Learning Approach for Few-Shot Document-Level Relation Extraction with Large Language Models DinghaoPanDalian University of Technology - YuanyuanSun - BoXuDalian University of Technology + YuanyuanSun + BoXuDalian University of Technology JiruLi - ZhihaoYangDalian University of Technology - LingLuoDalian University of Technology - HongfeiLin + ZhihaoYangDalian University of Technology + LingLuoDalian University of Technology + HongfeiLin JianWang 1112-1128 Few-Shot Document-Level Relation Extraction (FSDLRE) aims to develop models capable of generalizing to new categories with minimal support examples. Although Large Language Models (LLMs) demonstrate exceptional In-Context Learning (ICL) capabilities on many few-shot tasks, their performance on FSDLRE tasks remains suboptimal due to the significant gap between the task format and the intrinsic capabilities of language models, coupled with the complexity of ICL prompts for document-level text. To address these challenges, we introduce a novel meta-training approach for LLMs termed Prototype Tuning. We construct simulated episodes using data with relation types that do not overlap with the test corpus, fundamentally enhancing the ICL capabilities of LLMs in FSDLRE through meta-learning. To further enhance the effects of meta-learning, we innovatively integrate the concept of prototype into the fine-tuning process of LLMs. This involves aggregating entity pairs from support documents into prototypes within the prompts and altering the way of determining relation categories to identifying the closest prototype. Experimental results demonstrate that our LLMs trained with this approach outperform all baselines. Our proposed approach markedly improves the ICL capabilities of LLMs in FSDLRE and mitigates the impact of relation semantic discrepancies between the training corpus and the test corpus on model performance. @@ -777,12 +777,12 @@ <fixed-case>L</fixed-case>egal<fixed-case>S</fixed-case>eg: Unlocking the Structure of <fixed-case>I</fixed-case>ndian Legal Judgments Through Rhetorical Role Classification - Shubham KumarNigamIIT Kanpur + Shubham KumarNigamIIT Kanpur TanmayDubey GovindSharma NoelShallumSymbiosis Law School Pune KripabandhuGhoshIndian Institute of Science Education and Research Kolkata - ArnabBhattacharyaIIT Kanpur + ArnabBhattacharyaIIT Kanpur 1129-1144 In this paper, we address the task of semantic segmentation of legal documents through rhetorical role classification, with a focus on Indian legal judgments. We introduce **LegalSeg**, the largest annotated dataset for this task, comprising over 7,000 documents and 1.4 million sentences, labeled with 7 rhetorical roles. 
To benchmark performance, we evaluate multiple state-of-the-art models, including Hierarchical BiLSTM-CRF, TransformerOverInLegalBERT (ToInLegalBERT), Graph Neural Networks (GNNs), and Role-Aware Transformers, alongside an exploratory **RhetoricLLaMA**, an instruction-tuned large language model. Our results demonstrate that models incorporating broader context, structural relationships, and sequential sentence information outperform those relying solely on sentence-level features. Additionally, we conducted experiments using surrounding context and predicted or actual labels of neighboring sentences to assess their impact on classification accuracy. Despite these advancements, challenges persist in distinguishing between closely related roles and addressing class imbalance. Our work underscores the potential of advanced techniques for improving legal document understanding and sets a strong foundation for future research in legal NLP.
2025.findings-naacl.63
@@ -790,11 +790,11 @@
Claim-Guided Textual Backdoor Attack for Practical Applications
- MinkyooSong
+ MinkyooSong
HannaKim
- JaehanKimKorea Advanced Institute of Science & Technology
+ JaehanKimKorea Advanced Institute of Science & Technology
YoungjinJinKorea Advanced Institute of Science & Technology
- SeungwonShinKorea Advanced Institute of Science & Technology and Texas A&M University - College Station
+ SeungwonShinKorea Advanced Institute of Science & Technology and Texas A&M University - College Station
1145-1159
Recent advances in natural language processing and the increased use of large language models have exposed new security vulnerabilities, such as backdoor attacks. Previous backdoor attacks require input manipulation after model distribution to activate the backdoor, posing limitations in real-world applicability. Addressing this gap, we introduce a novel Claim-Guided Backdoor Attack (CGBA), which eliminates the need for such manipulations by utilizing inherent textual claims as triggers. CGBA leverages claim extraction, clustering, and targeted training to trick models into misbehaving on targeted claims without affecting their performance on clean data. CGBA demonstrates its effectiveness and stealthiness across various datasets and models, significantly enhancing the feasibility of practical backdoor attacks. Our code and data will be available at https://github.com/minkyoo9/CGBA.
2025.findings-naacl.64
@@ -821,14 +821,14 @@
<tex-math>SusGen-GPT</tex-math>: A Data-Centric <fixed-case>LLM</fixed-case> for Financial <fixed-case>NLP</fixed-case> and Sustainability Report Generation
- QilongWuShanghai Artificial Intelligence Laboratory
+ QilongWuShanghai Artificial Intelligence Laboratory
XiaonengXiangHuawei Singapore Research Center
HuangHejia
XuanWang
YeoWei JieSchool of Computer Science and Engineering, Nanyang Technological University
RanjanSatapathy
- Ricardo ShirotaFilhoInstitute of High Performance Computing, Singapore, A*STAR
- BharadwajVeeravalli
+ Ricardo ShirotaFilhoInstitute of High Performance Computing, Singapore, A*STAR
+ BharadwajVeeravalli
1184-1203
The rapid growth of the financial sector and the increasing focus on Environmental, Social, and Governance (ESG) considerations have created a pressing need for advanced natural language processing (NLP) tools. Despite recent advancements, there is still a notable absence of open-source Large Language Models (LLMs) that are proficient across both general finance and ESG domains, such as generating ESG reports.
To address this gap, we introduce SusGen-30k, a high-quality, category-balanced dataset comprising seven financial NLP tasks. In addition, we propose TCFD-Bench, a benchmark designed to improve the evaluation of sustainability report generation. Our data-centric approach led to the development of a suite of models, SusGen-GPT, trained on the curated dataset. These models were evaluated across six adapted tasks and two off-the-shelf tasks, showing state-of-the-art performance, surpassing all other models except GPT-4. Remarkably, SusGen-GPT achieved an average score only 0.02 below GPT-4, despite using models with only 7-8B parameters, compared to the much larger GPT-4. This demonstrates the efficiency of our approach in delivering high performance with significantly fewer resources, addressing existing challenges and fostering further advancements in the financial and ESG research community.
2025.findings-naacl.66
@@ -838,7 +838,7 @@
<fixed-case>G</fixed-case>r<fixed-case>E</fixed-case>m<fixed-case>LI</fixed-case>n: A Repository of Green Baseline Embeddings for 87 Low-Resource Languages Injected with Multilingual Graph Knowledge
DaniilGurgurov
RishuKumarGerman Research Center for AI
- SimonOstermannGerman Research Center for AI
+ SimonOstermannGerman Research Center for AI
1204-1221
2025.findings-naacl.67
gurgurov-etal-2025-gremlin

In-Context Example Selection via Similarity Search Improves Low-Resource Machine Translation
Armel RandyZebazeINRIA
- BenoîtSagotInria
- RachelBawdenInria
+ BenoîtSagotInria
+ RachelBawdenInria
1222-1252
The ability of generative large language models (LLMs) to perform in-context learning has given rise to a large body of research into how best to prompt models for various natural language processing tasks. In this paper, we focus on machine translation (MT), a task that has been shown to benefit from in-context translation examples. However, no systematic studies have been published on how best to select examples, and mixed results have been reported on the usefulness of similarity-based selection over random selection, although these results have mainly been shown for high-resource languages only. We provide a study covering multiple LLMs and in-context example retrieval strategies. Contrary to previously published results, we find that retrieval based on sentence embedding similarity can improve MT, especially for low-resource language directions, and we also discuss the balance between selection pool diversity and quality. Code and outputs will be made freely available.
2025.findings-naacl.68
@@ -856,11 +856,11 @@
Self-Training Large Language Models for Tool-Use Without Demonstrations
NeLuo
- Aryo PradiptaGemaAnthropic and University of Edinburgh, University of Edinburgh
+ Aryo PradiptaGemaAnthropic and University of Edinburgh, University of Edinburgh
XuanliHeUniversity College London, University of London
- EmileVan KriekenEdinburgh University, University of Edinburgh
+ EmileVan KriekenEdinburgh University, University of Edinburgh
PietroLesciUniversity of Cambridge
- PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh
+ PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh
1253-1271
Large language models (LLMs) remain prone to factual inaccuracies and computational errors, including hallucinations and mistakes in mathematical reasoning. Recent work has augmented LLMs with tools to mitigate these shortcomings, but this often requires curated gold tool-use demonstrations.
In this paper, we investigate whether LLMs can learn to use tools without demonstrations. First, we analyse zero-shot prompting strategies to guide LLMs in tool utilisation. Second, we propose a self-training method to synthesise tool-use traces using the LLM itself. We compare supervised fine-tuning and preference fine-tuning techniques for fine-tuning the model on datasets constructed using existing Question Answering (QA) datasets, i.e., TriviaQA and GSM8K. Experiments show that tool-use enhances performance on a long-tail knowledge task: 3.7% on PopQA, which is used solely for evaluation, but leads to mixed results on other datasets, i.e., TriviaQA, GSM8K, and NQ-Open. Our findings highlight the potential and challenges of integrating external tools into LLMs without demonstrations.
2025.findings-naacl.69
@@ -871,7 +871,7 @@
LekangJiang
CaiqiZhang
Pascal A.Scherz
- StefanGoetzUniversity of Cambridge and Duke University
+ StefanGoetzUniversity of Cambridge and Duke University
1272-1287
Large language models (LLMs) have shown exceptional performance across various text generation tasks, but remain under-explored in the patent domain, which offers highly structured and precise language. This paper constructs a dataset to investigate the performance of current LLMs in patent claim generation. Our results demonstrate that generating claims based on patent descriptions outperforms previous research relying on abstracts. Interestingly, current patent-specific LLMs perform much worse than state-of-the-art general LLMs, highlighting the necessity for future research on in-domain LLMs. We also find that LLMs can produce high-quality first independent claims, but their performance markedly decreases for subsequent dependent claims. Moreover, fine-tuning can enhance the completeness of inventions’ features, conceptual clarity, and feature linkage. Among the tested LLMs, GPT-4 demonstrates the best performance in comprehensive human evaluations by patent experts, with better feature coverage, conceptual clarity, and technical coherence. Despite these capabilities, comprehensive revision and modification are still necessary to pass rigorous patent scrutiny and ensure legal robustness.
2025.findings-naacl.70
@@ -879,10 +879,10 @@
Obliviate: Neutralizing Task-agnostic Backdoors within the Parameter-efficient Fine-tuning Paradigm
- JaehanKimKorea Advanced Institute of Science & Technology
- MinkyooSong
+ JaehanKimKorea Advanced Institute of Science & Technology
+ MinkyooSong
Seung HoNa
- SeungwonShinKorea Advanced Institute of Science & Technology and Texas A&M University - College Station
+ SeungwonShinKorea Advanced Institute of Science & Technology and Texas A&M University - College Station
1288-1307
Parameter-efficient fine-tuning (PEFT) has become a key training strategy for large language models. However, its reliance on fewer trainable parameters poses security risks, such as task-agnostic backdoors. Despite their severe impact on a wide range of tasks, there is no practical defense solution available that effectively counters task-agnostic backdoors within the context of PEFT. In this study, we introduce Obliviate, a PEFT-integrable backdoor defense. We develop two techniques aimed at amplifying benign neurons within PEFT layers and penalizing the influence of trigger tokens. Our evaluations across three major PEFT architectures show that our method can significantly reduce the attack success rate of the state-of-the-art task-agnostic backdoors (83.6%↓).
Furthermore, our method exhibits robust defense capabilities against both task-specific backdoors and adaptive attacks. Source code will be available at https://github.com/jaehanwork/Obliviate.
2025.findings-naacl.71
@@ -891,14 +891,14 @@
<fixed-case>CORAL</fixed-case>: Benchmarking Multi-turn Conversational Retrieval-Augmented Generation
YiruoCheng
- KelongMao
- ZiliangZhao
+ KelongMao
+ ZiliangZhao
GuantingDong
- HongjinQianBeijing Academy of Artificial Intelligence
+ HongjinQianBeijing Academy of Artificial Intelligence
YongkangWu
- TetsuyaSakaiNAVER and Waseda University
- Ji-RongWenRenmin University of China
- ZhichengDouRenmin University of China
+ TetsuyaSakaiNAVER and Waseda University
+ Ji-RongWenRenmin University of China
+ ZhichengDouRenmin University of China
1308-1330
Retrieval-Augmented Generation (RAG) has become a powerful paradigm for enhancing large language models (LLMs) through external knowledge retrieval. Despite its widespread attention, existing academic research predominantly focuses on single-turn RAG, leaving a significant gap in addressing the complexities of multi-turn conversations found in real-world applications. To bridge this gap, we introduce CORAL, a large-scale benchmark designed to assess RAG systems in realistic multi-turn conversational settings. CORAL includes diverse information-seeking conversations automatically derived from Wikipedia and tackles key challenges such as open-domain coverage, knowledge intensity, free-form responses, and topic shifts. It supports three core tasks of conversational RAG: passage retrieval, response generation, and citation labeling. We propose a unified framework to standardize various conversational RAG methods and conduct a comprehensive evaluation of these methods on CORAL, demonstrating substantial opportunities for improving existing approaches.
2025.findings-naacl.72
@@ -928,13 +928,13 @@
The Promises and Pitfalls of <fixed-case>LLM</fixed-case> Annotations in Dataset Labeling: a Case Study on Media Bias Detection
- TomášHorych
+ TomášHorych
ChristophMandlMedia Bias Group
- TerryRuasGeorg-August Universität Göttingen
- AndreGreiner-PetterGeorg-August Universität Göttingen
- BelaGippGeorg-August Universität Göttingen
+ TerryRuasGeorg-August Universität Göttingen
+ AndreGreiner-PetterGeorg-August Universität Göttingen
+ BelaGippGeorg-August Universität Göttingen
AkikoAizawaNational Institute of Informatics
- TimoSpinde
+ TimoSpinde
1370-1386
High annotation costs from hiring or crowdsourcing complicate the creation of large, high-quality datasets needed for training reliable text classifiers. Recent research suggests using Large Language Models (LLMs) to automate the annotation process, reducing these costs while maintaining data quality. LLMs have shown promising results in annotating downstream tasks like hate speech detection and political framing. Building on the success in these areas, this study investigates whether LLMs are viable for annotating the complex task of media bias detection and whether a downstream media bias classifier can be trained on such data. We create Annolexical, the first large-scale dataset for media bias classification with over 48k synthetically annotated examples. Our classifier fine-tuned on it surpasses all of the annotator LLMs by 5-9% in Matthews Correlation Coefficient (MCC) and performs close to, or outperforms, the model trained on human-labeled data when evaluated on two media bias benchmark datasets (BABE and BASIL).
This study demonstrates how our approach significantly reduces the cost of dataset creation in the media bias domain and, by extension, the development of classifiers, while our subsequent behavioral stress-testing reveals some of its current limitations and trade-offs.
2025.findings-naacl.75
@@ -944,7 +944,7 @@
Mechanistic Unveiling of Transformer Circuits: Self-Influence as a Key to Model Reasoning
LinZhangHarbin Institute of Technology
LijieHuKAUST
- DiWangKAUST
+ DiWangKAUST
1387-1404
Transformer-based language models have achieved significant success; however, their internal mechanisms remain largely opaque due to the complexity of non-linear interactions and high-dimensional operations. While previous studies have demonstrated that these models implicitly embed reasoning trees, humans typically employ various distinct logical reasoning mechanisms to complete the same task. It is still unclear which multi-step reasoning mechanisms are used by language models to solve such tasks. In this paper, we aim to address this question by investigating the mechanistic interpretability of language models, particularly in the context of multi-step reasoning tasks. Specifically, we employ circuit analysis and self-influence functions to evaluate the changing importance of each token throughout the reasoning process, allowing us to map the reasoning paths adopted by the model. We apply this methodology to the GPT-2 model on a prediction task (IOI) and demonstrate that the underlying circuits reveal a human-interpretable reasoning process used by the model.
2025.findings-naacl.76
@@ -954,7 +954,7 @@
Intrinsic Model Weaknesses: How Priming Attacks Unveil Vulnerabilities in Large Language Models
YuyiHuang
RunzheZhanUniversity of Macau
- Derek F.WongUniversity of Macau
+ Derek F.WongUniversity of Macau
Lidia S.ChaoUniversity of Macau
AilinTaoGuangzhou Medical University
1405-1425
@@ -964,11 +964,11 @@
<fixed-case>A</fixed-case>d<fixed-case>P</fixed-case>araphrase: Paraphrase Dataset for Analyzing Linguistic Features toward Generating Attractive Ad Texts
- SoichiroMurakamiCyberAgent, Inc.
- PeinanZhangCyberAgent AI Lab
- HidetakaKamigaitoNara Institute of Science and Technology
- HiroyaTakamuraAIST, National Institute of Advanced Industrial Science and Technology
- ManabuOkumuraInstitute of Science Tokyo and Tokyo Institute of Technology, Tokyo Institute of Technology
+ SoichiroMurakamiCyberAgent, Inc.
+ PeinanZhangCyberAgent AI Lab
+ HidetakaKamigaitoNara Institute of Science and Technology
+ HiroyaTakamuraAIST, National Institute of Advanced Industrial Science and Technology
+ ManabuOkumuraInstitute of Science Tokyo and Tokyo Institute of Technology, Tokyo Institute of Technology
1426-1439
Effective linguistic choices that attract potential customers play crucial roles in advertising success. This study aims to explore the linguistic features of ad texts that influence human preferences. Although the creation of attractive ad texts is an active area of research, progress in understanding the specific linguistic features that affect attractiveness is hindered by several obstacles. First, human preferences are complex and influenced by multiple factors, including their content, such as brand names, and their linguistic styles, making analysis challenging. Second, there is a lack of publicly available ad text datasets that include signals of human preference, such as ad performance metrics and human feedback reflecting people’s interests.
To address these problems, we present AdParaphrase, a paraphrase dataset that contains human preferences for pairs of ad texts that are semantically equivalent but differ in terms of wording and style. This dataset allows for preference analysis that focuses on the differences in linguistic features. Our analysis revealed that ad texts preferred by human judges have higher fluency, longer length, more nouns, and use of bracket symbols. Furthermore, we demonstrate that an ad text-generation model that considers these findings significantly improves the attractiveness of a given text. The dataset is publicly available at: https://github.com/CyberAgentAILab/AdParaphrase. 2025.findings-naacl.78 @@ -986,13 +986,13 @@ Learning to Explore and Select for Coverage-Conditioned Retrieval-Augmented Generation - TakyoungKimUniversity of Illinois at Urbana-Champaign - KyungjaeLeeUniversity of Seoul + TakyoungKimUniversity of Illinois at Urbana-Champaign + KyungjaeLeeUniversity of Seoul Young RokJang Ji YongChoLG Corporation GangwooKim MinseokChoLG Corporation - MoontaeLeeLG Corporation and University of Illinois, Chicago + MoontaeLeeLG Corporation and University of Illinois, Chicago 1460-1480 Interactions with large language models (LLMs) often yield long and detailed responses, leveraging both parametric knowledge and retrieval-augmented generation (RAG). While these responses can provide rich insights, they often include redundant or less engaging content not aligned with user interests. This issue becomes apparent when users specify particular subtopics to include or exclude – termed **coverage-conditioned (C^2)** queries – as LLMs often struggle to provide tailored responses. To address this challenge, we investigate the role of query outlines, sequences of subqueries designed to guide LLMs in generating responses that meet specific user requirements. To systematically create and evaluate these outlines, we introduce **QTree**, a dataset of 10K hierarchical sets of information-seeking subqueries that define structured boundaries for outline creation and evaluation in C^2 scenarios. Additionally, we develop **QPlanner**, a 7B language model trained to generate customized outlines within boundaries of QTree. We evaluate the effectiveness of the generated outlines through automatic and human judgements, focusing on their impact within retrieval-augmented generation (RAG) systems. Experimental results demonstrate that QPlanner, especially when trained with alignment techniques like DPO, generates higher-quality outlines that better fulfill diverse user needs. 2025.findings-naacl.80 @@ -1001,13 +1001,13 @@ <fixed-case>L</fixed-case>ay<fixed-case>A</fixed-case>lign: Enhancing Multilingual Reasoning in Large Language Models via Layer-Wise Adaptive Fusion and Alignment Strategy ZhiwenRuan - YixiaLi - HeZhu - LongyueWangAlibaba Group - WeihuaLuoAlibaba International Digital Commerce Group + YixiaLi + HeZhu + LongyueWangAlibaba Group + WeihuaLuoAlibaba International Digital Commerce Group KaifuZhangAlibaba Group - YunChenShanghai University of Finance and Economics - GuanhuaChenSouthern University of Science and Technology + YunChenShanghai University of Finance and Economics + GuanhuaChenSouthern University of Science and Technology 1481-1495 Despite being pretrained on multilingual corpora, large language models (LLMs) exhibit suboptimal performance on low-resource languages. Recent approaches have leveraged multilingual encoders alongside LLMs by introducing trainable parameters connecting the two models. 
However, these methods typically focus on the encoder’s output, overlooking valuable information from other layers. We propose Layer-Wise Adaptive Fusion and Alignment Strategy (LayAlign), a framework that integrates representations from all encoder layers, coupled with an adaptive fusion-enhanced attention mechanism, to enable layer-wise interaction between the LLM and the multilingual encoder. Extensive experiments on multilingual reasoning tasks, along with analyses of learned representations, show that our approach consistently outperforms existing baselines.
2025.findings-naacl.81
@@ -1026,10 +1026,10 @@
From Argumentation to Deliberation: Perspectivized Stance Vectors for Fine-grained (Dis)agreement Analysis
MoritzPlenzInstitute for Computational Linguistics, Heidelberg University, Ruprecht-Karls-Universität Heidelberg
- PhilippHeinisch
+ PhilippHeinisch
JanoschGehring
PhilippCimianoBielefeld University and Bielefeld University
- AnetteFrankRuprecht-Karls-Universität Heidelberg
+ AnetteFrankRuprecht-Karls-Universität Heidelberg
1525-1553
Debating over conflicting issues is a necessary first step towards resolving conflicts. However, intrinsic perspectives of an arguer are difficult to overcome by persuasive argumentation skills. Proceeding from a debate to a deliberative process, where we can identify actionable options for resolving a conflict, requires a deeper analysis of arguments and the perspectives they are grounded in, as it is only from there that one can derive mutually agreeable resolution steps. In this work, we develop a framework for a deliberative analysis of arguments in a computational argumentation setup. We conduct a fine-grained analysis of perspectivized stances expressed in the arguments of different arguers or stakeholders on a given issue, aiming to identify not only their opposing views but also shared perspectives arising from their attitudes, values, or needs. We formalize this analysis in Perspectivized Stance Vectors that characterize the individual perspectivized stances of all arguers on a given issue. We construct these vectors by determining issue- and argument-specific concepts, and predict an arguer’s stance relative to each of them. The vectors allow us to measure a modulated (dis)agreement between arguers, structured by perspectives, which allows us to identify actionable points for conflict resolution, as a first step towards deliberation.
2025.findings-naacl.83


<fixed-case>LVLM</fixed-case>-Compress-Bench: Benchmarking the Broader Impact of Large Vision-Language Model Compression
- SouvikKunduIntel
+ SouvikKunduIntel
AnahitaBhiwandiwalla
SungdukYu
PhillipHowardIntel
SharathNittur SridharIntel Labs
DavidCobbleyIntel Labs
HaoKang
- VasudevLalIntel
+ VasudevLalIntel
1554-1570
Despite recent efforts to understand the impact of compression on Large Language Models (LLMs) in terms of their downstream task performance and trustworthiness on relatively simpler uni-modal benchmarks (e.g., question answering, common sense reasoning), a detailed study of its impact on multi-modal Large Vision Language Models (LVLMs) is yet to be conducted. Towards mitigating this gap, we present LVLM-Compress-Bench, a framework for a first thorough study of the broad impact of compression on the generative performance of LVLMs on multi-modal input-driven tasks.
Specifically, we consider two major classes of compression for autoregressive models, namely KV cache and weight compression, for the dynamically growing intermediate cache and static weights, respectively. We use four LVLM variants of the popular LLaVA framework to present our analysis, integrating various state-of-the-art KV and weight compression methods, including uniform, outlier-reduced, and group quantization. With this framework, we demonstrate our analysis on ten different multi-modal datasets with varied capabilities, including recognition, knowledge, language generation, spatial awareness, visual reasoning, hallucination and visual illusion identification, toxicity, stereotypes, and bias. Specifically, our framework demonstrates the compression impact on both general and ethically critical metrics, leveraging a combination of real-world and synthetic datasets to encompass diverse societal intersectional attributes. Extensive experimental evaluations yield diverse and intriguing observations on the behavior of LVLMs at different quantization budgets of KV and weights, in both maintaining and losing performance as compared to the baseline model with FP16 data format. We believe LVLM-Compress-Bench would help the community gain deeper insight into the impact of compression and the societal impact the compressed models may pose. Code will be released soon.
2025.findings-naacl.84
@@ -1053,7 +1053,7 @@
Does Generative <fixed-case>AI</fixed-case> speak <fixed-case>N</fixed-case>igerian-<fixed-case>P</fixed-case>idgin?: Issues about Representativeness and Bias for Multilingualism in <fixed-case>LLM</fixed-case>s
- David IfeoluwaAdelaniMcGill University
+ David IfeoluwaAdelaniMcGill University
A. SezaDoğruözGhent University
IyanuoluwaShodeBloomberg
AnuoluwapoAremu
@@ -1084,9 +1084,9 @@
<fixed-case>MA</fixed-case>i<fixed-case>DE</fixed-case>-up: Multilingual Deception Detection of <fixed-case>AI</fixed-case>-generated Hotel Reviews
- OanaIgnatSanta Clara University
+ OanaIgnatSanta Clara University
XiaomengXu
- RadaMihalceaUniversity of Michigan
+ RadaMihalceaUniversity of Michigan
1636-1653
Deceptive reviews are becoming increasingly common, especially given the increase in performance and the prevalence of LLMs. While work to date has addressed the development of models to differentiate between truthful and deceptive human reviews, much less is known about the distinction between real reviews and AI-authored fake reviews. Moreover, most of the research so far has focused primarily on English, with very little work dedicated to other languages. In this paper, we compile and make publicly available the MAiDE-up dataset, consisting of 10,000 real and 10,000 AI-generated fake hotel reviews, balanced across ten languages. Using this dataset, we conduct extensive linguistic analyses to (1) compare the AI fake hotel reviews to real hotel reviews, and (2) identify the factors that influence the deception detection model performance. We explore the effectiveness of several models for deception detection in hotel reviews across three main dimensions: sentiment, location, and language. We find that these dimensions influence how well we can detect AI-generated fake reviews.
2025.findings-naacl.88
@@ -1095,7 +1095,7 @@
<fixed-case>L</fixed-case>e<fixed-case>C</fixed-case>o<fixed-case>PCR</fixed-case>: Legal Concept-guided Prior Case Retrieval for <fixed-case>E</fixed-case>uropean Court of Human Rights cases
SantoshT.y.s.s
- Isaac Misael OlguínNolasco
+ Isaac Misael OlguínNolasco
MatthiasGrabmairTechnische Universität München
1654-1661
Prior case retrieval (PCR) is crucial for legal practitioners to find relevant precedent cases given the facts of a query case. Existing approaches often overlook the underlying semantic intent in determining relevance with respect to the query case. In this work, we propose LeCoPCR, a novel approach that explicitly generates intents in the form of legal concepts from the facts of a given query case and then augments the query with these concepts to enhance the model’s understanding of the semantic intent that dictates relevance. To overcome the unavailability of annotated legal concepts, we employ a weak supervision approach to extract key legal concepts from the reasoning section using a Determinantal Point Process (DPP) to balance quality and diversity. Experimental results on the ECtHR-PCR dataset demonstrate the effectiveness of leveraging legal concepts and DPP-based key concept extraction.
@@ -1114,9 +1114,9 @@
Data Poisoning for In-context Learning
PengfeiHeMichigan State University
- HanXuUniversity of Arizona
+ HanXuUniversity of Arizona
YueXingMichigan State University
- HuiLiu
+ HuiLiu
MakotoYamadaOkinawa Institute of Science and Technology (OIST)
JiliangTangMichigan State University
1680-1700
@@ -1137,11 +1137,11 @@
<fixed-case>B</fixed-case>io<fixed-case>EL</fixed-case>: A Comprehensive Python Package for Biomedical Entity Linking
- PrasanthBathalaGeorgia Institute of Technology
+ PrasanthBathalaGeorgia Institute of Technology
ChristopheYeGeorgia Institute of Technology
BatuhanNursalGeorgia Institute of Technology
- ShubhamLohiya
- DavidKartchnerGeorgia Institute of Technology
+ ShubhamLohiya
+ DavidKartchnerGeorgia Institute of Technology
Cassie S.MitchellGeorgia Institute of Technology
1709-1721
2025.findings-naacl.93


Patrick Y.WuAmerican University
KristinaMilerUniversity of Maryland, College Park
Alexander MiserlisHoyleETHZ - ETH Zurich
- PhilipResnikUniversity of Maryland, College Park
+ PhilipResnikUniversity of Maryland, College Park
1722-1738
We introduce a text-based framework for measuring attitudes in communities toward issues of interest, going beyond the pro/con/neutral of conventional stance detection to characterize attitudes on a continuous scale using both implicit and explicit evidence in language. The framework exploits LLMs both to extract attitude-related evidence and to perform pairwise comparisons that yield unidimensional attitude scores via the classic Bradley-Terry model. We validate the LLM-based steps using human judgments, and illustrate the utility of the approach for social science by examining the evolution of attitudes on two high-profile issues in U.S. politics in two political communities on Reddit over the period spanning from the 2016 presidential campaign to the 2022 mid-term elections. WARNING: Potentially sensitive political content.
2025.findings-naacl.94 @@ -1162,9 +1162,9 @@ Semantic Consistency-Based Uncertainty Quantification for Factuality in Radiology Report Generation ChenyuWangBoston University, Boston University - WeichaoZhou + WeichaoZhou ShantanuGhosh - KayhanBatmanghelichBoston University, Boston University + KayhanBatmanghelichBoston University, Boston University WenchaoLiBoston University 1739-1754 Radiology report generation (RRG) has shown great potential in assisting radiologists by automating the labor-intensive task of report writing. While recent advancements have improved the quality and coherence of generated reports, ensuring their factual correctness remains a critical challenge. Although generative medical Vision Large Language Models (VLLMs) have been proposed to address this issue, these models are prone to hallucinations and can produce inaccurate diagnostic information. To address these concerns, we introduce a novel Semantic Consistency-Based Uncertainty Quantification framework that provides both report-level and sentence-level uncertainties. Unlike existing approaches, our method does not require modifications to the underlying model or access to its inner state, such as output token logits, thus serving as a plug-and-play module that can be seamlessly integrated with state-of-the-art models. Extensive experiments demonstrate the efficacy of our method in detecting hallucinations and enhancing the factual accuracy of automatically generated radiology reports. By abstaining from high-uncertainty reports, our approach improves factuality scores by 10%, achieved by rejecting 20% of reports on the MIMIC-CXR dataset. Furthermore, sentence-level uncertainty flags the lowest-precision sentence in each report with an 82.9% success rate. Our implementation is open-source and available at https://github.com/BU-DEPEND-Lab/SCUQ-RRG. @@ -1173,7 +1173,7 @@ <fixed-case>R</fixed-case>eward<fixed-case>B</fixed-case>ench: Evaluating Reward Models for Language Modeling - NathanLambert + NathanLambert ValentinaPyatkin JacobMorrison LJMiranda @@ -1183,7 +1183,7 @@ SachinKumar TomZick YejinChoi - Noah A.Smith + Noah A.Smith HannanehHajishirzi 1755-1797 Reward models (RMs) are at the crux of successfully using RLHF to align pretrained models to human preferences, yet there has been relatively little study that focuses on evaluation of those models. Evaluating reward models presents an opportunity to understand the opaque technologies used for alignment of language models and which values are embedded in them. Resources for reward model training and understanding are sparse in the nascent open-source community around them. To enhance scientific understanding of reward models, we present RewardBench, a benchmark dataset and code-base for evaluation. The RewardBench dataset is a collection of prompt-chosen-rejected trios spanning chat, reasoning, and safety, to benchmark how reward models perform on challenging, structured and out-of-distribution queries. We create specific comparison datasets for RMs that have subtle, but verifiable reasons (e.g. bugs, incorrect facts) why one answer should be preferred to another. On the RewardBench leaderboard, we evaluate RMs trained with a variety of methods, such as the direct MLE training of classifiers and the implicit reward modeling of Direct Preference Optimization (DPO). We present many findings on propensity for refusals, reasoning limitations, and instruction following shortcomings of various reward models towards a better understanding of the RLHF process. 
@@ -1201,11 +1201,11 @@ Tomato, Tomahto, Tomate: Do Multilingual Language Models Understand Based on Subword-Level Semantic Concepts? - CrystinaZhangUniversity of Waterloo + CrystinaZhangUniversity of Waterloo JingLuGoogle Vinh Q.TranGoogle DeepMind TalSchusterGoogle DeepMind and Google - DonaldMetzlerGoogle + DonaldMetzlerGoogle JimmyLinUniversity of Waterloo 1821-1837 Human understanding of text depends on general semantic concepts of words rather than their superficial forms. To what extent does our human intuition transfer to language models? In this work, we study the degree to which current multilingual language models (mLMs) understand based on subword-level semantic concepts. To this end, we form “semantic tokens” by merging the semantically similar subwords and their embeddings, and evaluate the updated mLMs on five heterogeneous multilingual downstream tasks. Results show that the general shared semantics could get the models a long way in making the predictions on mLMs with different tokenizers and model sizes. Inspections of the grouped subwords show that they exhibit a wide range of semantic similarities, including synonyms and translations across many languages and scripts. Lastly, we find that the zero-shot results with semantic tokens are on par with or even better than the original models on certain classification tasks, suggesting that the shared subword-level semantics may serve as the anchors for cross-lingual transfer. @@ -1218,7 +1218,7 @@ QiangNingJump Trading KishaloyHalderAmazon ZhengQiAmazon - WeiXiao + WeiXiao Phu MonHtutAWS AI Labs YiZhangAWS AI NehaAnna John @@ -1255,7 +1255,7 @@ KyleMacMillanUniversity of Chicago HongyuanMeiToyota Technological Institute at Chicago ChenhaoTanUniversity of Chicago - AnupMalaniUniversity of Chicago + AnupMalaniUniversity of Chicago 1917-1942 This paper introduces CaseSumm, a novel dataset for long-context summarization in the legal domain that addresses the need for longer and more complex datasets for summarization evaluation. We collect 25.6K U.S. Supreme Court (SCOTUS) opinions and their official summaries, known as “syllabuses.” Our dataset is the largest open legal case summarization dataset, and is the first to include summaries of SCOTUS decisions dating back to 1815.We also present a comprehensive evaluation of LLM-generated summaries using both automatic metrics and expert human evaluation, revealing discrepancies between these assessment methods. Our evaluation shows Mistral 7b, a smaller open-source model, outperforms larger models on most automatic metrics and successfully generates syllabus-like summaries. In contrast, human expert annotators indicate that Mistral summaries contain hallucinations. The annotators consistently rank GPT-4 summaries as clearer and exhibiting greater sensitivity and specificity. We find that LLM-based evaluations are not more correlated with human evaluations than traditional automatic metrics. Furthermore, our analysis identifies specific hallucinations in generated summaries, including precedent citation errors and misrepresentations of case facts. These findings demonstrate the limitations of current automatic evaluation methods for legal summarization and highlight the critical role of human evaluation in assessing summary quality, particularly in complex, high-stakes domains. 
2025.findings-naacl.102


Chasing Random: Instruction Selection Strategies Fail to Generalize
- HarshitaDiddeeCarnegie Mellon University
+ HarshitaDiddeeCarnegie Mellon University
DaphneIppolitoCarnegie Mellon University
1943-1957
2025.findings-naacl.103


Can’t Hide Behind the <fixed-case>API</fixed-case>: Stealing Black-Box Commercial Embedding Models
Manveer SinghTamberUniversity of Waterloo
- JasperXian
+ JasperXian
JimmyLinUniversity of Waterloo
1958-1969
Embedding models that generate dense vector representations of text are widely used and hold significant commercial value. Companies such as OpenAI and Cohere offer proprietary embedding models via paid APIs, but despite being “hidden” behind APIs, these models are not protected from theft. We present, to our knowledge, the first effort to “steal” these models for retrieval by training thief models on text–embedding pairs obtained from the APIs. Our experiments demonstrate that it is possible to replicate the retrieval effectiveness of commercial embedding models with a cost of under $300. Notably, our methods allow for distilling from multiple teachers into a single robust student model, and for distilling into presumably smaller models with lower-dimensional vectors while retaining competitive retrieval effectiveness. Our findings raise important considerations for deploying commercial embedding models and suggest measures to mitigate the risk of model theft.
@@ -1281,16 +1281,16 @@
<fixed-case>CAMEL</fixed-case>-Bench: A Comprehensive <fixed-case>A</fixed-case>rabic <fixed-case>LMM</fixed-case> Benchmark
- SaraGhabouraMohamed bin Zayed University of Artificial Intelligence
- AhmedHeakl
+ SaraGhabouraMohamed bin Zayed University of Artificial Intelligence
+ AhmedHeakl
OmkarThawakar
Ali Husain Salem AbdullaAlharthiMohamed bin Zayed University of Artificial Intelligence
InesRiahi
- AbduljalilRadman
- JormaLaaksonenAalto University
+ AbduljalilRadman
+ JormaLaaksonenAalto University
Fahad ShahbazKhanMohamed bin Zayed University of Artificial Intelligence and Linköping University
- SalmanKhanMohamed bin Zayed University of Artificial Intelligence and Australian National University
- Rao MuhammadAnwerMohamed bin Zayed University of Artificial Intelligence
+ SalmanKhanMohamed bin Zayed University of Artificial Intelligence and Australian National University
+ Rao MuhammadAnwerMohamed bin Zayed University of Artificial Intelligence
1970-1980
Recent years have witnessed significant interest in developing large multi-modal models (LMMs) capable of performing various visual reasoning and understanding tasks. This has led to the introduction of multiple LMM benchmarks to evaluate LMMs on different tasks. However, most existing LMM evaluation benchmarks are predominantly English-centric. In this work, we develop a comprehensive LMM evaluation benchmark for the Arabic language, representing a large population of over 400 million speakers. The proposed benchmark, named CAMEL-Bench, comprises eight diverse domains and 38 sub-domains, including multi-image understanding, complex visual perception, handwritten document understanding, video understanding, medical imaging, plant diseases, and remote sensing-based land use understanding, to evaluate broad scenario generalizability. Our CAMEL-Bench comprises around 29,036 questions that are filtered from a larger pool of samples, where the quality is manually verified by native speakers to ensure reliable model assessment.
We conduct evaluations of both closed-source LMMs, including the GPT-4 series, and open-source LMMs. Our analysis reveals the need for substantial improvement, especially among the best open-source models, with even the closed-source GPT-4o achieving an overall score of 62%. Our benchmark will be publicly released.
2025.findings-naacl.105
@@ -1301,8 +1301,8 @@
DavidAnugraha
Genta IndraWinataCapital One
ChenyueLiThe Hong Kong University of Science and Technology
- Patrick AmadeusIrawan
- En-Shiun AnnieLee
+ Patrick AmadeusIrawan
+ En-Shiun AnnieLee
1981-2011
Performance prediction is a method to estimate the performance of Language Models (LMs) on various Natural Language Processing (NLP) tasks, mitigating computational costs associated with model capacity and data for fine-tuning. Our paper presents ProxyLM, a scalable task- and language-agnostic framework designed to predict the performance of LMs using proxy models. These proxy models act as surrogates, approximating the performance of the LM of interest. By leveraging these proxy models, ProxyLM significantly reduces computational overhead in task evaluations, achieving up to a 37.08x speedup over traditional methods, even with our smallest proxy models. Our results across multiple multilingual NLP tasks and various robustness tests demonstrate that ProxyLM not only adapts well to previously unseen languages in pre-trained LMs, but also generalizes effectively across different datasets, outperforming the state-of-the-art by at least 1.78x in terms of root-mean-square error (RMSE).
2025.findings-naacl.106
@@ -1311,8 +1311,8 @@
<fixed-case>S</fixed-case>im<fixed-case>SM</fixed-case>o<fixed-case>E</fixed-case>: Toward Efficient Training Mixture of Experts via Solving Representational Collapse
GiangDo
- HungLeDeakin University
- TruyenTranDeakin University
+ HungLeDeakin University
+ TruyenTranDeakin University
2012-2025
Sparse mixture of experts (SMoE) has emerged as an effective approach for scaling large language models while keeping a constant computational cost. Despite several notable successes of SMoE, effectively training such architectures remains elusive due to the representation collapse problem, which in turn harms model performance and causes parameter redundancy. In this work, we present Similarity-based Sparse Mixture of Experts (SimSMoE), a novel neural network similarity algorithm that guarantees a solution to the representation collapse issue between experts given a fixed FLOPs budget. We conduct extensive empirical evaluations on three large language models for both pre-training and fine-tuning tasks to illustrate the efficacy, robustness, and scalability of our method. The results demonstrate that SimSMoE significantly enhances existing routing policies and outperforms other SMoE routing methods in performance for the tasks. Our implementation is publicly available at https://github.com/giangdip2410/SimSMoE.
2025.findings-naacl.107
@@ -1355,7 +1355,7 @@
XuandeFeng
JunzhangLiuColumbia University
XudongLinColumbia University
- ZhecanWangUniversity of California, Los Angeles
+ ZhecanWangUniversity of California, Los Angeles
Shih-FuChangColumbia University and Columbia University
2099-2116
The task of predicting time and location from images is challenging and requires complex human-like puzzle-solving ability over different clues. In this work, we formalize this ability into core skills and implement them using different modules in an expert pipeline called PuzzleGPT.
PuzzleGPT consists of a perceiver to identify visual clues, a reasoner to deduce prediction candidates, a combiner to combinatorially combine information from different clues, a web retriever to get external knowledge if the task can’t be solved locally, and a noise filter for robustness. This results in a zero-shot, interpretable, and robust approach that records state-of-the-art performance on two datasets – TARA and WikiTilo. PuzzleGPT outperforms large VLMs such as BLIP-2, InstructBLIP, LLaVA, and even GPT-4V, as well as automatically generated reasoning pipelines like VisProg, by at least 32% and 38%, respectively. It even rivals or surpasses finetuned models. @@ -1377,7 +1377,7 @@ YuyangJiang ChachaChen DangNguyenUniversity of Chicago - Benjamin M.MervakUniversity of Michigan - Ann Arbor + Benjamin M.MervakUniversity of Michigan - Ann Arbor ChenhaoTanUniversity of Chicago 2127-2154 GPT-4’s purported strong multimodal abilities raise interests in using it to automate radiology report writing, but there lacks thorough evaluations. In this work, we perform a systematic evaluation of GPT-4 (4o and vision-preview) in generating radiology reports across three chest X-ray report benchmarks: MIMIC-CXR, CheXpert Plus, and IU X-Ray. We attempt to directly generate reports with different prompting strategies and find that the models fail terribly in both lexical metrics and clinical efficacy metrics. To understand the low performance, we decompose the task into two steps: 1) the **medical image reasoning** step of predicting medical condition labels from images; and 2) the **report synthesis** step of generating reports from (groundtruth) conditions. We show that GPT-4’s performance in image reasoning is consistently low across different prompts. In fact, the distributions of model-predicted labels remain constant regardless of which groundtruth conditions are present on the image, suggesting that the model is not interpreting chest X-rays meaningfully. Even when given groundtruth conditions in report synthesis, its generated reports are less correct and less natural-sounding than a finetuned Llama. Altogether, our findings cast doubt on the viability of using GPT-4 in a radiology workflow. @@ -1387,7 +1387,7 @@ Is Semantic Chunking Worth the Computational Cost? RenyiQuVectara - RuixuanTu + RuixuanTu Forrest ShengBaoVectara, Inc. 2155-2177 Recent advances in Retrieval-Augmented Generation (RAG) systems have popularized semantic chunking, which aims to improve retrieval performance by dividing documents into semantically coherent segments. Despite its growing adoption, the actual benefits over simpler fixed-size chunking, where documents are split into consecutive, fixed-size segments, remain unclear. This study systematically evaluates the effectiveness of semantic chunking using three common retrieval-related tasks: document retrieval, evidence retrieval, and retrieval-based answer generation. The results show that the computational costs associated with semantic chunking are not justified by consistent performance gains. These findings challenge the previous assumptions about semantic chunking and highlight the need for more efficient chunking strategies in RAG systems. 
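The contrast the semantic-chunking abstract above draws is easy to make concrete. Below is a minimal Python sketch (not from the paper; embed() is a toy stand-in for a real sentence-embedding model, and the 0.3 threshold is an arbitrary assumption) of fixed-size chunking next to a simple similarity-based semantic chunker:

import math

def embed(sentence: str) -> list[float]:
    # Hypothetical embedding function: a real system would call a
    # sentence-embedding model here. This toy version builds a letter
    # histogram so the example runs end to end.
    vec = [0.0] * 26
    for ch in sentence.lower():
        if "a" <= ch <= "z":
            vec[ord(ch) - ord("a")] += 1.0
    return vec

def cosine(u: list[float], v: list[float]) -> float:
    dot = sum(a * b for a, b in zip(u, v))
    nu = math.sqrt(sum(a * a for a in u)) or 1.0
    nv = math.sqrt(sum(b * b for b in v)) or 1.0
    return dot / (nu * nv)

def fixed_size_chunks(words: list[str], size: int = 64) -> list[str]:
    # Fixed-size chunking: consecutive windows with a fixed word budget.
    return [" ".join(words[i : i + size]) for i in range(0, len(words), size)]

def semantic_chunks(sentences: list[str], threshold: float = 0.3) -> list[str]:
    # Semantic chunking: start a new chunk wherever two adjacent
    # sentences are insufficiently similar in embedding space.
    if not sentences:
        return []
    chunks, current = [], [sentences[0]]
    for prev, sent in zip(sentences, sentences[1:]):
        if cosine(embed(prev), embed(sent)) < threshold:
            chunks.append(" ".join(current))
            current = []
        current.append(sent)
    chunks.append(" ".join(current))
    return chunks

if __name__ == "__main__":
    doc = ["Dense retrieval maps text to vectors.",
           "Vector indexes support fast search.",
           "Pandas live in bamboo forests."]
    print(semantic_chunks(doc))
    print(fixed_size_chunks(" ".join(doc).split(), size=8))

The extra embedding calls per sentence pair are exactly the computational cost the paper argues is not repaid by consistent gains.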
@@ -1397,7 +1397,7 @@ On Using <fixed-case>A</fixed-case>rabic Language Dialects in Recommendation Systems AbdullaAlshabanah - MuraliAnnavaramUniversity of Southern California + MuraliAnnavaramUniversity of Southern California 2178-2186 While natural language processing (NLP) techniques have been applied to user reviews in recommendation systems, the potential of leveraging Arabic dialects in this context remains unexplored. Arabic is spoken by over 420 million people, with significant dialectal variation across regions. These dialects, often classified as low-resource languages, present both challenges and opportunities for machine learning applications. This paper represents the first attempt to incorporate Arabic dialects as a signal in recommendation systems. We explore both explicit and implicit approaches for integrating Arabic dialect information from user reviews, demonstrating its impact on improving recommendation performance. Our findings highlight the potential for leveraging dialectal diversity in Arabic to enhance recommendation systems and encourage further research at the intersection of NLP and recommendation systems within the Arab multicultural world. 2025.findings-naacl.115 @@ -1405,10 +1405,10 @@ Assessing <fixed-case>LLM</fixed-case>s for Zero-shot Abstractive Summarization Through the Lens of Relevance Paraphrasing - HadiAskari - AnshumanChhabraUniversity of South Florida - MuhaoChenUniversity of California, Davis and University of Southern California - PrasantMohapatra + HadiAskari + AnshumanChhabraUniversity of South Florida + MuhaoChenUniversity of California, Davis and University of Southern California + PrasantMohapatra 2187-2201 Large Language Models (LLMs) have achieved state-of-the-art performance at zero-shot generation of abstractive summaries for given articles. However, little is known about the robustness of such a process of zero-shot summarization.To bridge this gap, we propose *relevance paraphrasing*, a simple strategy that can be used to measure the robustness of LLMs as summarizers. The relevance paraphrasing approach identifies the most *relevant* sentences that contribute to generating an ideal summary, and then *paraphrases* these inputs to obtain a minimally perturbed dataset. Then, by evaluating model performance for summarization on both the original and perturbed datasets, we can assess the LLM’s one aspect of robustness. We conduct extensive experiments with relevance paraphrasing on 4 diverse datasets, as well as 4 LLMs of different sizes (GPT-3.5-Turbo, Llama-2-13B, Mistral-7B-v1, and Dolly-v2-7B). Our results indicate that LLMs are not consistent summarizers for the minimally perturbed articles, necessitating further improvements. 2025.findings-naacl.116 @@ -1419,7 +1419,7 @@ ZehuiWu ZiweiGongColumbia University LinAi - PengyuanShiColumbia University + PengyuanShiColumbia University KaanDonbekciColumbia University JuliaHirschbergColumbia University 2202-2218 @@ -1442,9 +1442,9 @@ QinLiuUniversity of California, Davis JiongxiaoWangUniversity of Wisconsin - Madison JunYanGoogle - HadiAskari - ChaoweiXiaoUniversity of Wisconsin - Madison and NVIDIA - MuhaoChenUniversity of California, Davis and University of Southern California + HadiAskari + ChaoweiXiaoUniversity of Wisconsin - Madison and NVIDIA + MuhaoChenUniversity of California, Davis and University of Southern California 2232-2249 Existing studies in backdoor defense have predominantly focused on the training phase, overlooking the critical aspect of testing time defense. 
This gap becomes pronounced in the context of Large Language Models (LLMs) deployed as Web Services, which typically offer only black-box access, rendering training-time defenses impractical. To bridge this gap, this study critically examines the use of demonstrations as a defense mechanism against backdoor attacks in black-box LLMs. With an identified task, we retrieve task-relevant demonstrations from a clean data pool and integrate them with user queries during testing. Importantly, this approach does not necessitate modifications or tuning of the model, nor does it require insight into the model’s internal architecture. The alignment properties inherent in in-context learning play a pivotal role in mitigating the impact of backdoor triggers, effectively recalibrating the behavior of compromised models. Our experimental analysis demonstrates that this method robustly defends against both instance-level and instruction-level backdoor attacks, outperforming existing defense baselines across most evaluation scenarios. 2025.findings-naacl.119 @@ -1461,7 +1461,7 @@ <fixed-case>K</fixed-case>wai<fixed-case>C</fixed-case>hat: A Large-Scale Video-Driven Multilingual Mixed-Type Dialogue Corpus XiaomingShiEast China Normal University - ZemingLiu + ZemingLiu YimingLeiBeijing University of Aeronautics and Astronautics ChenkaiZhang HaitaoLengKuaishou- 快手科技 @@ -1487,10 +1487,10 @@ Attention Tracker: Detecting Prompt Injection Attacks in <fixed-case>LLM</fixed-case>s Kuo-HanHung Ching-YunKoInternational Business Machines - AmbrishRawatInternational Business Machines + AmbrishRawatInternational Business Machines I-HsinChungInternational Business Machines - Winston H.HsuNational Taiwan University - Pin-YuChenInternational Business Machines + Winston H.HsuNational Taiwan University + Pin-YuChenInternational Business Machines 2309-2322 Large Language Models (LLMs) have revolutionized various domains but remain vulnerable to prompt injection attacks, where malicious inputs manipulate the model into ignoring original instructions and executing designated action. In this paper, we investigate the underlying mechanisms of these attacks by analyzing the attention patterns within LLMs. We introduce the concept of the distraction effect, where specific attention heads, termed important heads, shift focus from the original instruction to the injected instruction. Building on this discovery, we propose Attention Tracker, a training-free detection method that tracks attention patterns on instruction to detect prompt injection attacks without the need for additional LLM inference. Our method generalizes effectively across diverse models, datasets, and attack types, showing an AUROC improvement of up to 10.0% over existing methods, and performs well even on small LLMs. We demonstrate the robustness of our approach through extensive evaluations and provide insights into safeguarding LLM-integrated systems from prompt injection vulnerabilities. 
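As a rough illustration of the distraction effect described in the Attention Tracker abstract above, the sketch below (entirely hypothetical; the paper's head selection and scoring differ) measures how much attention mass a chosen set of heads places on the original-instruction span and flags a low score as possible injection:

def distraction_score(attn, heads, instruction_span):
    # attn[h][j]: attention weight from the last (generating) token to
    # input position j for head h. In practice this comes from the
    # model's attention outputs; here it is just a nested list.
    lo, hi = instruction_span
    scores = []
    for h in heads:
        mass_on_instruction = sum(attn[h][lo:hi])
        total = sum(attn[h]) or 1.0
        scores.append(mass_on_instruction / total)
    return sum(scores) / len(scores)

# Toy example: two heads over six input positions; positions 0-2 hold
# the original instruction, positions 3-5 hold (possibly injected) data.
attn = [
    [0.30, 0.25, 0.20, 0.10, 0.10, 0.05],  # head 0: focused on instruction
    [0.05, 0.05, 0.05, 0.30, 0.30, 0.25],  # head 1: "distracted" by the data
]
THRESHOLD = 0.5  # assumed value; would be calibrated on clean prompts
score = distraction_score(attn, heads=[0, 1], instruction_span=(0, 3))
print(f"distraction score = {score:.2f} -> {'flag' if score < THRESHOLD else 'ok'}")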
2025.findings-naacl.123 @@ -1498,9 +1498,9 @@ Unsupervised Speech-text word-level alignment with Dynamic Programming - TianshuYu + TianshuYu ZihanGongShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences - MinghuanTanShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences + MinghuanTanShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences GuhongChenBeijing Institute of Technology MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences 2323-2334 @@ -1511,26 +1511,26 @@ <fixed-case>S</fixed-case>ci<fixed-case>A</fixed-case>ssess: Benchmarking <fixed-case>LLM</fixed-case> Proficiency in Scientific Literature Analysis HengxingCaiSUN YAT-SEN UNIVERSITY XiaochenCaiDP Technology - JunhanChang - SihangLi - LinYao + JunhanChang + SihangLi + LinYao WangChangxin ZhifengGaoDP Technology - HongshuaiWangDP technology - LiYongge + HongshuaiWangDP technology + LiYongge MujieLin - ShuwenYangDP Technology + ShuwenYangDP Technology JiankunWang MingjunXu JinHuang XiFangDP Technology JiaxiZhuang YuqiYin - YaqiLi + YaqiLi ChanghongChenDP Technology ZhengCheng - ZifengZhaoAI for Science Institute - LinfengZhangDP Technology + ZifengZhaoAI for Science Institute + LinfengZhangDP Technology GuolinKeDP Technology 2335-2357 Recent breakthroughs in Large Language Models (LLMs) have revolutionized scientific literature analysis. However, existing benchmarks fail to adequately evaluate the proficiency of LLMs in this domain, particularly in scenarios requiring higher-level abilities beyond mere memorization and the handling of multimodal data.In response to this gap, we introduce SciAssess, a benchmark specifically designed for the comprehensive evaluation of LLMs in scientific literature analysis. It aims to thoroughly assess the efficacy of LLMs by evaluating their capabilities in Memorization (L1), Comprehension (L2), and Analysis & Reasoning (L3). It encompasses a variety of tasks drawn from diverse scientific fields, including biology, chemistry, material, and medicine.To ensure the reliability of SciAssess, rigorous quality control measures have been implemented, ensuring accuracy, anonymization, and compliance with copyright standards. SciAssess evaluates 11 LLMs, highlighting their strengths and areas for improvement. We hope this evaluation supports the ongoing development of LLM applications in scientific literature analysis.SciAssess and its resources are available at https://github.com/sci-assess/SciAssess. 
@@ -1539,11 +1539,11 @@
 Towards Understanding the Fragility of Multilingual <fixed-case>LLM</fixed-case>s against Fine-Tuning Attacks
- SamuelePoppiUniversity of Modena and Reggio Emilia and University of Pisa
+ SamuelePoppiUniversity of Modena and Reggio Emilia and University of Pisa
 Zheng XinYongBrown University
 YifeiHeUniversity of Illinois Urbana-Champaign
 BobbieChern
- HanZhaoUniversity of Illinois, Urbana Champaign
+ HanZhaoUniversity of Illinois, Urbana Champaign
 AoboYangFacebook
 JianfengChiMeta AI
 2358-2372
@@ -1553,17 +1553,17 @@
 <fixed-case>MASSW</fixed-case>: A New Dataset and Benchmark Tasks for <fixed-case>AI</fixed-case>-Assisted Scientific Workflows
- XingjianZhangUniversity of Michigan - Ann Arbor
- YutongXieUniversity of Michigan
+ XingjianZhangUniversity of Michigan - Ann Arbor
+ YutongXieUniversity of Michigan
 JinHuang
- JingeMa
- ZhaoyingPan
+ JingeMa
+ ZhaoyingPan
 QijiaLiu
 ZiyangXiong
 TolgaErgenLG AI Research
 DongsubShimLG AI Research
 HonglakLeeUniversity of Michigan - Ann Arbor and LG AI Research
- QiaozhuMeiGoogle and University of Michigan
+ QiaozhuMeiGoogle and University of Michigan
 2373-2394
 Scientific innovation relies on detailed workflows, which include critical steps such as contextualizing literature, generating ideas, validating ideas, interpreting results, and planning new research. Scientific publications that document these workflows are extensive and unstructured, making it difficult to effectively navigate and explore the space of scientific innovation. To meet this challenge, we introduce **MASSW**, a comprehensive dataset of **M**ulti-**A**spect **S**ummarization of **S**cientific **W**orkflows. MASSW includes more than 152,000 peer-reviewed publications from 17 leading computer science conferences spanning the past 50 years. Using Large Language Models (LLMs), we automatically extract five core aspects from these publications – *context, key idea, method, outcome*, and *projected impact* – which correspond to five key steps in a research workflow. We show that these LLM-extracted summaries have a comparable quality to human annotations, and they facilitate a variety of downstream tasks, corresponding to different types of predictions and recommendations along the scientific workflow. Overall, MASSW demonstrates decent utility as a pre-computed and trustworthy resource for the AI4Science community to create and benchmark a wide range of new AI methods for optimizing scientific workflows and fostering scientific innovation. Our code and datasets are made available anonymously: [link](https://osf.io/7ygrq/?view_only=3d8261a0ea09489fa67ece2c68235afa).
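For intuition, a minimal sketch of the kind of five-aspect LLM extraction MASSW describes might look as follows (the prompt wording and the call_llm() stub are placeholders, not the authors' pipeline):

import json

ASPECTS = ["context", "key idea", "method", "outcome", "projected impact"]

def build_prompt(title: str, abstract: str) -> str:
    # Placeholder prompt; MASSW's actual instructions are more detailed.
    return (
        "Summarize the following paper along five aspects "
        f"({', '.join(ASPECTS)}), answering in JSON with those keys.\n\n"
        f"Title: {title}\nAbstract: {abstract}"
    )

def call_llm(prompt: str) -> str:
    # Stub standing in for a real LLM API call.
    return json.dumps({a: "..." for a in ASPECTS})

def extract_aspects(title: str, abstract: str) -> dict[str, str]:
    reply = call_llm(build_prompt(title, abstract))
    record = json.loads(reply)
    # Keep only the expected keys so malformed replies fail loudly.
    return {a: record[a] for a in ASPECTS}

print(extract_aspects("An Example Paper", "We study ..."))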
2025.findings-naacl.127 @@ -1585,7 +1585,7 @@ NinarehMehrabiAmazon AnilRamakrishnaAmazon AnnaRumshiskyAmazon and University of Massachusetts, Lowell - Kai-WeiChangUniversity of California, Los Angeles and Amazon + Kai-WeiChangUniversity of California, Los Angeles and Amazon AramGalstyanInformation Sciences Institute, University of Southern California, University of Southern California, University of Southern California and Amazon Alexa MortezaZiyadiAmazon RahulGupta @@ -1596,8 +1596,8 @@ <fixed-case>D</fixed-case>i<fixed-case>VIS</fixed-case>e: Direct Visual-Input Speech Synthesis Preserving Speaker Characteristics And Intelligibility - YifanLiu - YuFang + YifanLiu + YuFang ZhouhanLinShanghai Jiao Tong University 2424-2439 Video-to-speech (V2S) synthesis, the task of generating speech directly from silent video input, is inherently more challenging than other speech synthesis tasks due to the need to accurately reconstruct both speech content and speaker characteristics from visual cues alone. Recently, audio-visual pretraining has eliminated the need for additional acoustic hints in V2S, which previous methods often relied on to ensure training convergence. However, even with pretraining, existing methods continue to face challenges in achieving a balance between acoustic intelligibility and the preservation of speaker-specific characteristics. We analyzed this limitation and were motivated to introduce DiVISe (Direct Vsual-Input Speech Synthesis), an end-to-end V2S model that predicts Mel-spectrograms directly from video frames alone. Despite not taking any acoustic hints, DiVISe effectively preserves speaker characteristics in the generated audio, and achieves superior performance on both objective and subjective metrics across the LRS2 and LRS3 datasets. Our results demonstrate that DiVISe not only outperforms existing V2S models in acoustic intelligibility but also scales more effectively with increased data and model parameters. Code and weights will be made publicly available after acceptance of this paper. @@ -1606,11 +1606,11 @@ <fixed-case>G</fixed-case>raph<fixed-case>ICL</fixed-case>: Unlocking Graph Learning Potential in <fixed-case>LLM</fixed-case>s through Structured Prompt Design - YuanfuSunNew York University + YuanfuSunNew York University ZhengnanMa YiFangNew York University JingMaCase Western Reserve University - QiaoyuTanNew York University Shanghai + QiaoyuTanNew York University Shanghai 2440-2459 The growing importance of textual and relational systems has driven interest in enhancing large language models (LLMs) for graph-structured data, particularly Text-Attributed Graphs (TAGs), where samples are represented by textual descriptions interconnected by edges. While research has largely focused on developing specialized graph LLMs through task-specific instruction tuning, a comprehensive benchmark for evaluating LLMs solely through prompt design remains surprisingly absent. Without such a carefully crafted evaluation benchmark, most if not all, tailored graph LLMs are compared against general LLMs using simplistic queries (e.g., zero-shot reasoning with LLaMA), which can potentially camouflage many advantages as well as unexpected predicaments of them. To achieve more general evaluations and unveil the true potential of LLMs for graph tasks, we introduce Graph In-context Learning (GraphICL) Benchmark, a comprehensive benchmark comprising novel prompt templates designed to capture graph structure and handle limited label knowledge. 
Our systematic evaluation shows that general-purpose LLMs equipped with our GraphICL outperform state-of-the-art specialized graph LLMs and graph neural network models in resource-constrained settings and out-of-domain tasks. These findings highlight the significant potential of prompt engineering to enhance LLM performance on graph learning tasks without training and offer a strong baseline for advancing research in graph LLMs. 2025.findings-naacl.131 @@ -1618,12 +1618,12 @@ <fixed-case>FIDELITY</fixed-case>: Fine-grained Interpretable Distillation for Effective Language Insights and Topic Yielding - DivyanshSingh - BrodieMatherThe Institute for Human & Machine Cognition + DivyanshSingh + BrodieMatherThe Institute for Human & Machine Cognition DemiZhang PatrickLehman JustinHo - Bonnie JDorrUniversity of Florida + Bonnie JDorrUniversity of Florida 2460-2472 The rapid expansion of text data has increased the need for effective methods to distill meaningful information from large datasets. Traditional and state-of-the-art approaches have made significant strides in topic modeling, yet they fall short in generating contextually specific and semantically intuitive topics, particularly in dynamic environments and low-resource languages. Additionally, multi-document summarization systems often struggle with issues like redundancy, scalability, and maintaining readability. We introduce FIDELITY (Fine-grained Interpretable Distillation for Effective Language Insights and Topic Yielding), a hybrid method that combines topic modeling and text summarization to produce fine-grained, semantically rich, and contextually relevant output. FIDELITY enhances dataset accessibility and interpretability, outperforming traditional models in topic diversity, similarity, and in the ability to process new, unseen documents. Additionally, it demonstrates robust multilingual capabilities, effectively handling low-resource languages like Tagalog. This makes FIDELITY a powerful tool for distilling and understanding complex textual data, providing detailed insights while maintaining the necessary granularity for practical applications. 2025.findings-naacl.132 @@ -1631,12 +1631,12 @@ <fixed-case>C</fixed-case>lassic4<fixed-case>C</fixed-case>hildren: Adapting <fixed-case>C</fixed-case>hinese Literary Classics for Children with Large Language Model - JialiChen + JialiChen XusenHei YuqiXue - ZihanWu + ZihanWu JiayuanXie - YiCaiSouth China University of Technology + YiCaiSouth China University of Technology 2473-2488 Chinese literary classics hold significant cultural and educational value, offering deep insights into morality, history, and human nature. These works often include classical Chinese and complex narratives, making them difficult for children to read. To bridge this gap, we introduce a child-friendly literary adaptation (CLA) task to adapt the Chinese literary classic into engaging and accessible text for children. However, recent large language models (LLMs) overlook children’s reading preferences (i.e., vivid character portrayals, concise narrative structures, and appropriate readability with simpler words and sentences), which poses challenges in CLA. In this paper, we propose a method called InstructChild, which augments the LLM with these preferences for adaptation. Specifically, we first obtain the characters’ personalities and narrative structure as additional information for fine-grained instruction tuning. 
Then, we devise a readability metric as the reward to align the LLM with the children’s reading level. Finally, a lookahead decoding strategy is applied to improve the readability of the generated text during inference. To support the evaluation of CLA task, we construct the Classic4Children dataset, which comprises both the original and child-friendly versions of the Four Great Classical Novels of Chinese literature. Experimental results show that our InstructChild significantly improves performance in automatic and human evaluation. 2025.findings-naacl.133 @@ -1646,9 +1646,9 @@ Considering Length Diversity in Retrieval-Augmented Summarization Juseon-Do JaesungHwang - JingunKwon - HidetakaKamigaito - ManabuOkumura + JingunKwon + HidetakaKamigaito + ManabuOkumura 2489-2500 This study investigates retrieval-augmented summarization by specifically examining the impact of exemplar summary lengths because previous methods have not considered length constraints. We propose a Diverse Length-aware Maximal Marginal Relevance (DL-MMR) algorithm to better control summary lengths. This algorithm combines the query relevance with diverse target lengths in retrieval-augmented summarization. Unlike previous methods that necessitate exhaustive exemplar-exemplar relevance comparisons using MMR, DL-MMR considers the exemplar target length as well and avoids comparing exemplars to each other, thereby reducing computational cost and conserving memory during the construction of an exemplar pool. Experimental results showed the effectiveness of DL-MMR, which considers length diversity, compared to the original MMR algorithm. DL-MMR additionally showed the effectiveness in memory saving of 781,513 times and computational cost reduction of 500,092 times, while maintaining the same level of informativeness. 2025.findings-naacl.134 @@ -1657,13 +1657,13 @@ <fixed-case>LMOD</fixed-case>: A Large Multimodal Ophthalmology Dataset and Benchmark for Large Vision-Language Models ZhenyueQinYale University - YuYin - DylanCampbellAustralian National University + YuYin + DylanCampbellAustralian National University XuanshengWu KeZouNational University of Singapore - NinghaoLiuUniversity of Georgia + NinghaoLiuUniversity of Georgia Yih ChungThamNational University of Singapore - XiuzhenZhangRoyal Melbourne Institute of Technology + XiuzhenZhangRoyal Melbourne Institute of Technology QingyuChenYale University 2501-2522 The prevalence of vision-threatening eye diseases is a significant global burden, with many cases remaining undiagnosed or diagnosed too late for effective treatment. Large vision-language models (LVLMs) have the potential to assist in understanding anatomical information, diagnosing eye diseases, and drafting interpretations and follow-up plans, thereby reducing the burden on clinicians and improving access to eye care. However, limited benchmarks are available to assess LVLMs’ performance in ophthalmology-specific applications. In this study, we introduce LMOD, a large-scale multimodal ophthalmology benchmark consisting of 21,993 instances across (1) five ophthalmic imaging modalities: optical coherence tomography, color fundus photographs, scanning laser ophthalmoscopy, lens photographs, and surgical scenes; (2) free-text, demographic, and disease biomarker information; and (3) primary ophthalmology-specific applications such as anatomical information understanding, disease diagnosis, and subgroup analysis. 
In addition, we benchmarked 13 state-of-the-art LVLM representatives from closed-source, open-source, and medical domains. The results demonstrate a significant performance drop for LVLMs in ophthalmology compared to other domains. Systematic error analysis further identified six major failure modes: misclassification, failure to abstain, inconsistent reasoning, hallucination, assertions without justification, and lack of domain-specific knowledge. In contrast, supervised neural networks specifically trained on these tasks as baselines demonstrated high accuracy. These findings underscore the pressing need for benchmarks in the development and validation of ophthalmology-specific LVLMs.
@@ -1672,8 +1672,8 @@
 Syntriever: How to Train Your Retriever with Synthetic Data from <fixed-case>LLM</fixed-case>s
- MinsangKimSK Telecom
- Seung JunBaekKorea University
+ MinsangKimSK Telecom
+ Seung JunBaekKorea University
 2523-2539
 LLMs have boosted progress in many AI applications. Recently, there were attempts to distill the vast knowledge of LLMs into information retrieval systems. Those distillation methods mostly use output probabilities of LLMs, which are unavailable in the latest black-box LLMs. We propose Syntriever, a training framework for retrievers using synthetic data from black-box LLMs. Syntriever consists of two stages. First, in the distillation stage, we synthesize relevant and plausibly irrelevant passages and augmented queries using chain-of-thoughts for the given queries. The LLM is asked to self-verify the synthetic data for possible hallucinations, after which retrievers are trained with a loss designed to cluster the embeddings of relevant passages. Second, in the alignment stage, we align the retriever with the preferences of LLMs. We propose a preference modeling called partial Plackett-Luce ranking to learn LLM preferences with regularization, which prevents the model from deviating excessively from that trained in the distillation stage. Experiments show that Syntriever achieves state-of-the-art performance on benchmark datasets from various domains in nDCG@K. The source code is available at https://github.com/kmswin1/Syntriever.
 2025.findings-naacl.136
@@ -1682,7 +1682,7 @@
 <fixed-case>D</fixed-case>yn<fixed-case>C</fixed-case>lean: Training Dynamics-based Label Cleaning for Distantly-Supervised Named Entity Recognition
 QiZhang
- HuitongPanTemple University
+ HuitongPanTemple University
 ZhijiaChenFacebook
 Longin JanLateckiTemple University
 CorneliaCarageaUniversity of Illinois at Chicago
@@ -1708,13 +1708,13 @@
 WeiqingYangNortheastern University
 HanbinWang
 ZhenghaoLiuNortheastern University
- XinzeLiNortheastern University
+ XinzeLiNortheastern University
 YukunYanTsinghua University
- ShuoWang
+ ShuoWang
 YuGu
- MingheYuNortheastern University
- ZhiyuanLiuTsinghua University
- GeYu
+ MingheYuNortheastern University
+ ZhiyuanLiuTsinghua University
+ GeYu
 2570-2585
 Code debugging is a vital stage of software development, essential for ensuring the reliability and performance of Large Language Models (LLMs) in the code generation task. Human debugging typically follows a multi-stage process, which includes Bug Localization, Bug Identification, Code Repair, and Code Recognition. However, existing code debugging benchmarks predominantly focus on the Code Repair stage, which offers only a limited perspective on evaluating the debugging capabilities of LLMs.
In this paper, we introduce DEBUGEVAL, a comprehensive benchmark for evaluating the debugging abilities of LLMs by emulating the multi-stage human debugging process. Through evaluating on DEBUGEVAL, we observe that 7B-scale models consistently underperform compared to their larger counterparts, highlighting their limitations in comprehending code semantics. In this case, we propose the COmmunicative Agent-based data SynThesis (COAST) framework, which employs a multi-agent system to generate high-quality training data for supervised fine-tuning (SFT). Experimental results demonstrate that COAST-generated data outperform human-curated and GPT-4-generated data, enabling 7B-scale LLMs to achieve debugging performance comparable to GPT-3.5. All data and codes are available at https://github.com/NEUIR/COAST. 2025.findings-naacl.139 @@ -1722,16 +1722,16 @@ Chain-of-Probe: Examining the Necessity and Accuracy of <fixed-case>C</fixed-case>o<fixed-case>T</fixed-case> Step-by-Step - ZezhongWang + ZezhongWang XingshanZengHuawei Technologies Ltd. - WeiwenLiuHuawei Technologies Ltd. + WeiwenLiuHuawei Technologies Ltd. YufeiWangHuawei Technologies Ltd. - LiangyouLiHuawei Noah’s Ark Lab + LiangyouLiHuawei Noah’s Ark Lab YashengWang LifengShangHuawei Technologies Ltd. - XinJiang - QunLiuHuawei Noah’s Ark Lab - Kam-FaiWongThe Chinese University of Hong Kong + XinJiang + QunLiuHuawei Noah’s Ark Lab + Kam-FaiWongThe Chinese University of Hong Kong 2586-2606 Current research found the issue of Early Answering in large language models (LLMs), where the models already have an answer before generating the Chain-of-Thought (CoT). This phenomenon suggests a potential lack of necessary dependency between the predicted answer and the reasoning process. Consequently, two important questions arise: (1) Is CoT still necessary if the model already has an answer? (2) Can the correctness of the answer serve as valid evidence for the correctness of CoT? To address these questions, we propose a method, namely Chain-of-Probe (CoP), to probe changes in confidence during the model’s reasoning. The probing results show that in a significant number of question-answer cases, CoT appears to be unnecessary, and this necessity correlates with the simplicity of the task, defined by the reasoning steps required. Furthermore, by analyzing patterns in confidence change, we examine the correctness of the model’s reasoning. Our validation reveals that many responses, although correct in their final answer, contain errors in their reasoning process. To this end, we propose a strategic approach based on CoP to prioritize answers with correct reasoning among multiple candidates, thereby bolstering the reliability of the model’s reasoning. 
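To make the early-answering idea from the Chain-of-Probe abstract concrete, here is a small illustrative sketch (probe_confidences() is a stub; a real probe would read out the model's probability of the answer after each reasoning-step prefix, as the abstract describes):

def probe_confidences(question, steps, answer):
    # Stub: a real implementation would, after each reasoning-step
    # prefix, measure the model's confidence in `answer` (e.g., from
    # the logits of a forced continuation). Here: a canned trace.
    return [0.82, 0.84, 0.85, 0.86]

def early_answering(confidences, margin=0.05):
    # Early answering: the model is already (nearly) as confident
    # before the chain-of-thought as after it, so the CoT added little.
    return confidences[-1] - confidences[0] < margin

trace = probe_confidences("Q", ["step 1", "step 2", "step 3"], "A")
print("early answering?", early_answering(trace))  # True for this trace

Under this reading, candidates whose confidence rises steadily across steps would be preferred over ones that start out already confident.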
2025.findings-naacl.140
@@ -1741,7 +1741,7 @@
 <fixed-case>INDIC</fixed-case> <fixed-case>QA</fixed-case> <fixed-case>BENCHMARK</fixed-case>: A Multilingual Benchmark to Evaluate Question Answering capability of <fixed-case>LLM</fixed-case>s for <fixed-case>I</fixed-case>ndic Languages
 Abhishek KumarSinghIndian Institute of Technology Bombay, Indian Institute of Technology, Bombay
 VishwajeetKumarInternational Business Machines
- RudraMurthyIBM India Ltd
+ RudraMurthyIBM India Ltd
 JaydeepSen
 AshishMittalIBM Research, Indian Institute of Technology, Bombay and IBM Research
 GaneshRamakrishnanIndian Institute of Technology Bombay, Indian Institute of Technology Bombay
@@ -1752,15 +1752,15 @@
 Learning with Less: Knowledge Distillation from Large Language Models via Unlabeled Data
- JuanhuiLiMichigan State University
+ JuanhuiLiMichigan State University
 SreyashiNagAmazon
 HuiLiuAmazon
 XianfengTangAmazon
 Sheikh MuhammadSarwar
- LimengCuiAmazon
+ LimengCuiAmazon
 HansuGuAmazon
- SuhangWangPennsylvania State University
- QiHeAmazon
+ SuhangWangPennsylvania State University
+ QiHeAmazon
 JiliangTangMichigan State University
 2627-2641
 In real-world NLP applications, Large Language Models (LLMs) offer promising solutions due to their extensive training on vast datasets. However, the large size and high computation demands of LLMs limit their practicality in many applications, especially when further fine-tuning is required. To address these limitations, smaller models are typically preferred for deployment. However, their training is hindered by the scarcity of labeled data. In contrast, unlabeled data is often readily available and can be leveraged by using LLMs to generate pseudo-labels for training smaller models. This enables the smaller models (student) to acquire knowledge from LLMs (teacher) while reducing computational costs. This process introduces challenges, such as potentially noisy pseudo-labels and the high computational expense of processing large unlabeled datasets. Selecting high-quality and informative data is therefore critical to enhance model performance while improving the efficiency of data utilization. To address this, we propose LLKD, which enables Learning with Less computational resources and less data for Knowledge Distillation from LLMs. LLKD is an adaptive sample selection method that incorporates signals from both the teacher and student. Specifically, it prioritizes samples where the teacher demonstrates high confidence in its labeling, indicating reliable labels, and where the student exhibits a high information need, identifying challenging samples that require further learning. Our comprehensive experiments show that LLKD achieves superior performance across various datasets with higher data efficiency.
@@ -1772,13 +1772,13 @@
 ZhaoguangLong
 YuhaoZhou
 ShangqingZhaoEast China Normal University
- YupeiRen
+ YupeiRen
 LiCaiGuizhou University
 ChenghaoJia
 ZheChen
 ZheFangEast China Normal University
 YuxiangSong
- ManLan
+ ManLan
 2642-2653
 With the scale of Large Language Models (LLMs) and the size of the training data continuing to expand, the computational costs required for training or tuning have significantly increased as well. In this work, we propose an efficient and effective Large-Scale Data Compression (LSDC) method to substantially reduce the size of training data and thus enhance the training efficiency without compromising the performance of LLMs, through a bifurcated quantization strategy.
Specifically, our method first segments the dataset into multiple clusters, significantly reducing the time and memory requirements for data compression. Then, during the second phase of coreset selection, the diversity of samples is ensured by maximizing the submodular gain in order to avoid performance degradation. The comparative experiments showed that the performance of LLMs fine-tuned on a 20% compressed subset of the Alpaca dataset using LSDC outperformed those on the full dataset. Moreover, on a domain-specific instruction dataset of millions of samples, the LLMs fine-tuned on a 10% compressed dataset using LSDC outperformed those on the entire dataset, which dramatically enhances the domain-adaptation capabilities of LLMs. This demonstrates the promising potential of LSDC for training bigger LLMs from scratch as well as for supervised fine-tuning.
 2025.findings-naacl.143
@@ -1796,7 +1796,7 @@
 Enhancing the Prototype Network with Local-to-Global Optimization for Few-Shot Relation Extraction
- HuiSun
+ HuiSun
 RongxinChenJimei University
 2668-2677
 Few-Shot Relation Extraction (FSRE) aims to achieve high classification performance by training relation classification models with a small amount of labeled data. Prototypical networks serve as a straightforward and efficient method for optimizing model performance by combining similarity evaluation and contrastive learning. However, directly integrating these methods can introduce unpredictable noise, such as information redundancy, which hinders classification performance and negatively affects embedding space learning. The technique presented in this paper applies Local-To-Global optimization to enhance prototypical networks in few-shot relation extraction. Specifically, this paper develops a local optimization strategy that indirectly optimizes the prototypes by optimizing the other information contained within the prototypes. It considers relation prototypes as global anchors and incorporates the techniques introduced in this paper, such as information alignment, local contrastive learning, and a local adaptive focal loss function, to address the issues of information redundancy. This approach enables the model to learn a unified and effective embedding space. We conduct extensive experiments on the FewRel 1.0 and FewRel 2.0 datasets to validate the effectiveness of the proposed model.
@@ -1809,7 +1809,7 @@
 QingningShen
 YanHuThe Chinese University of Hong Kong
 AnningzheGaoByteDance Inc.
- BenyouWangThe Chinese University of Hong Kong, Shenzhen
+ BenyouWangThe Chinese University of Hong Kong, Shenzhen
 2678-2710
 2025.findings-naacl.146
 huang-etal-2025-llms
@@ -1819,10 +1819,10 @@
 Sara BourbourHosseinbeigiTarbiat Modares University
 BehnamRohaniSharif University of Technology, Sharif University of Technology
 MostafaMasoudiUniversity of Tehran, University of Tehran
- MehrnoushShamsfardShahid Beheshti University
- ZahraSaaberiShahid Beheshti University
- Mostafa KarimiManeshShahid Beheshti University
- Mohammad AminAbbasiIran University of Science and Technology Tehran, University of Tehran
+ MehrnoushShamsfardShahid Beheshti University
+ ZahraSaaberiShahid Beheshti University
+ Mostafa KarimiManeshShahid Beheshti University
+ Mohammad AminAbbasiIran University of Science and Technology Tehran, University of Tehran
 2711-2727
 Evaluation of large language models (LLMs) in low-resource languages like Persian has received less attention than in high-resource languages like English.
Existing evaluation approaches for Persian LLMs generally lack comprehensive frameworks, limiting their ability to assess models’ performance over a wide range of tasks requiring considerable cultural and contextual knowledge, as well as a deeper understanding of Persian literature and style. This paper first aims to fill this gap by providing two new benchmarks, PeKA and PK-BETS, on topics such as history, literature, and cultural knowledge, as well as challenging the present state-of-the-art models’ abilities in a variety of Persian language comprehension tasks. These datasets are meant to reduce data contamination while providing an accurate assessment of Persian LLMs. The second aim of this paper is the general evaluation of LLMs across the current Persian benchmarks to provide a comprehensive performance overview. By offering a structured evaluation methodology, we hope to promote the examination of LLMs in the Persian language. 2025.findings-naacl.147 @@ -1833,9 +1833,9 @@ ZileQiaoAlibaba Group WeiYePeking University YongJiangTongyi Lab - TongMo + TongMo PengjunXie - WeipingLiPeking University + WeipingLiPeking University FeiHuangAlibaba Group US ShikunZhangPeking University 2728-2740 @@ -1846,7 +1846,7 @@ Evaluating Self-Generated Documents for Enhancing Retrieval-Augmented Generation with Large Language Models JiataoLi - XinyuHuPeking University + XinyuHuPeking University XunjianYin XiaojunWan 2741-2775 @@ -1866,12 +1866,12 @@ Semi-supervised Fine-tuning for Large Language Models - JunyuLuoPeking University - XiaoLuoUniversity of California, Los Angeles - XiusiChenUniversity of Illinois at Urbana-Champaign - ZhipingXiaoUniversity of Washington - WeiJuSichuan University - MingZhangPeking University + JunyuLuoPeking University + XiaoLuoUniversity of California, Los Angeles + XiusiChenUniversity of Illinois at Urbana-Champaign + ZhipingXiaoUniversity of Washington + WeiJuSichuan University + MingZhangPeking University 2795-2808 Supervised fine-tuning (SFT) is crucial in adapting large language models (LLMs) to a specific domain or task. However, only a limited amount of labeled data is available in practical applications, which poses a severe challenge for SFT in yielding satisfactory results. Therefore, a data-efficient framework that can fully exploit labeled and unlabeled data for LLM fine-tuning is highly anticipated.Towards this end, we introduce a **semi-supervised fine-tuning (SemiFT)** task and a framework named **SemiEvol** for LLM alignment from a propagate-and-select manner. For knowledge propagation, SemiEvol adopts a bi-level approach, propagating knowledge from labeled data to unlabeled data through both in-weight and in-context methods. For knowledge selection, SemiEvol incorporates a collaborative learning mechanism, selecting higher-quality pseudo-response samples. We conducted experiments using GPT-4o-mini and Llama-3.1 on seven general or domain-specific datasets, demonstrating significant improvements in model performance on target data. Furthermore, we compared SemiEvol with SFT and self-evolution methods, highlighting its practicality in hybrid data scenarios. Github Repository: [https://github.com/luo-junyu/SemiEvol](https://github.com/luo-junyu/SemiEvol). 
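One plausible reading of SemiEvol's "selecting higher-quality pseudo-response samples" step is agreement-based selection; the toy sketch below (my illustration under that assumption, not the paper's exact mechanism) keeps only pseudo-labels on which several model configurations agree:

from collections import Counter

def generate(model, prompt):
    # Stub for sampling a response from one model configuration
    # (e.g., different in-context examples or temperatures).
    return model(prompt)

def select_pseudo_labeled(prompts, models, min_agreement=2):
    # Keep a (prompt, response) pair only when at least `min_agreement`
    # of the configurations produce the same response.
    selected = []
    for prompt in prompts:
        votes = Counter(generate(m, prompt) for m in models)
        response, count = votes.most_common(1)[0]
        if count >= min_agreement:
            selected.append((prompt, response))
    return selected

# Toy run with three "configurations" that mostly agree.
models = [lambda p: p.upper(), lambda p: p.upper(), lambda p: p.lower()]
print(select_pseudo_labeled(["abc", "XYZ"], models))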
2025.findings-naacl.151 @@ -1881,7 +1881,7 @@ <fixed-case>CALM</fixed-case>: Unleashing the Cross-Lingual Self-Aligning Ability of Language Model Question Answering YumengWang ZhiyuanFanHong Kong University of Science and Technology - QingyunWangUniversity of Illinois, Urbana Champaign + QingyunWangUniversity of Illinois, Urbana Champaign Yi R.FungHong Kong University of Science and Technology HengJiUniversity of Illinois, Urbana-Champaign 2809-2817 @@ -1891,9 +1891,9 @@ Towards Prompt Generalization: Grammar-aware Cross-Prompt Automated Essay Scoring - HeejinDoPohang University of Science and Technology + HeejinDoPohang University of Science and Technology TaeheePark - SangwonRyuPohang University of Science and Technology + SangwonRyuPohang University of Science and Technology GaryLee 2818-2824 In automated essay scoring (AES), recent efforts have shifted toward cross-prompt settings that score essays on unseen prompts for practical applicability. However, prior methods trained with essay-score pairs of specific prompts pose challenges in obtaining prompt-generalized essay representation. In this work, we propose a grammar-aware cross-prompt trait scoring (GAPS), which internally captures prompt-independent syntactic aspects to learn generic essay representation. We acquire grammatical error-corrected information in essays via the grammar error correction technique and design the AES model to seamlessly integrate such information. By internally referring to both the corrected and the original essays, the model can focus on generic features during training. Empirical experiments validate our method’s generalizability, showing remarkable improvements in prompt-independent and grammar-related traits. Furthermore, GAPS achieves notable QWK gains in the most challenging cross-prompt scenario, highlighting its strength in evaluating unseen prompts. @@ -1905,7 +1905,7 @@ YongqiFan NanWang KuiXueShanghai Artificial Intelligence Laboratory - JingpingLiuEast China University of Science and Technology + JingpingLiuEast China University of Science and Technology TongRuan 2825-2851 Embedding-based retrieval (EBR), the mainstream approach in information retrieval (IR), aims to help users obtain relevant information and plays a crucial role in retrieval-augmented generation (RAG) techniques of large language models (LLMs). Numerous methods have been proposed to significantly improve the quality of retrieved content and many generic benchmarks are proposed to evaluate the retrieval abilities of embedding models. However, texts in the medical domain present unique contexts, structures, and language patterns, such as terminology, doctor-patient dialogue, and electronic health records (EHRs). Despite these unique features, specific benchmarks for medical context retrieval are still lacking. In this paper, we propose MedEureka, an enriched benchmark designed to evaluate medical-context retrieval capabilities of embedding models with multi-granularity and multi-data types. MedEureka includes four levels of granularity and six types of medical texts, encompassing 18 datasets, incorporating granularity and data type description to prompt instruction-fine-tuned text embedding models for embedding generation. We also provide the MedEureka Toolkit to support evaluation on the MedEureka test set. Our experiments evaluate state-of-the-art open-source and proprietary embedding models, and fine-tuned classical baselines, providing a detailed performance analysis. 
This underscores the challenges of using embedding models for medical domain retrieval and the need for further research. Our code and data are released in the repository: https://github.com/JOHNNY-fans/MedEureka. @@ -1915,10 +1915,10 @@ A Federated Framework for <fixed-case>LLM</fixed-case>-based Recommendation JujiaZhao - WenjieWangNational University of Singapore + WenjieWangNational University of Singapore ChenXu - See-KiongNgNational University of Singapore - Tat-SengChuaNational University of Singapore + See-KiongNgNational University of Singapore + Tat-SengChuaNational University of Singapore 2852-2865 Large Language Models (LLMs) have showcased their potential in building generative recommendation systems through fine-tuning user behavior data. However, utilizing the user behavior data may pose significant privacy risks like in the traditional recommender models, potentially leading to ethical dilemmas and violations of data protection regulations. To address the privacy concerns, Federated Learning for Recommendation (Fed4Rec) has been identified as a promising solution. However, directly applying Fed4Rec in the LLM context introduces two challenges: 1) exacerbated client performance imbalance, which ultimately impacts the system’s long-term effectiveness, and 2) substantial client resource costs, posing a high demand for clients’ both computational and storage capability to locally train and infer LLMs.To tackle these challenges, we propose a federated framework for LLM-based recommendation (shorted as FELLRec). Generally, FELLRec designs two key strategies. 1) Dynamic balance strategy, which designs dynamic parameter aggregation and learning speed for different clients during training, aiming to ensure relatively balanced performance across clients. 2) Flexible storage strategy, which selectively retains certain sensitive LLM layers on the client side, while offloading other layers to the server, aiming to preserve privacy while saving resources. Specifically, FELLRec flexibly maintains those input and output layers on the client side to ensure the protection of all sensitive information. Experiment results show that FELLRec can achieve a more balanced client performance and improved overall performance in a computational and storage-efficient way while safeguarding user privacy well. 2025.findings-naacl.155 @@ -1931,9 +1931,9 @@ YijianLu ZitianGaoUbiquant YichenDiTsinghua University, Tsinghua University - LijieWenTsinghua University - IrwinKing - Philip S.YuUniversity of Illinois, Chicago + LijieWenTsinghua University + IrwinKing + Philip S.YuUniversity of Illinois, Chicago 2866-2882 Watermarking algorithms for large language models (LLMs) have attained high accuracy in detecting LLM-generated text. However, existing methods primarily focus on distinguishing fully watermarked text from non-watermarked text, overlooking real-world scenarios where LLMs generate only small sections within large documents. In this scenario, balancing time complexity and detection performance poses significant challenges. This paper presents WaterSeeker, a novel approach to efficiently detect and locate watermarked segments amid extensive natural text. It first applies an efficient anomaly extraction method to preliminarily locate suspicious watermarked regions. Following this, it conducts a local traversal and performs full-text detection for more precise verification. 
Theoretical analysis and experimental results demonstrate that WaterSeeker achieves a superior balance between detection accuracy and computational efficiency. Moreover, its localization capability lays the foundation for building interpretable AI detection systems. Our code is available at https://github.com/THU-BPM/WaterSeeker. 2025.findings-naacl.156 @@ -1942,8 +1942,8 @@ <fixed-case>MIRAGE</fixed-case>: A Metric-Intensive Benchmark for Retrieval-Augmented Generation Evaluation ChanheeParkKorea University - HyeonseokMoonKorea University - ChanjunParkKorea University + HyeonseokMoonKorea University + ChanjunParkKorea University HeuiseokLim 2883-2900 Retrieval-Augmented Generation (RAG) has gained prominence as an effective method for enhancing the generative capabilities of Large Language Models (LLMs) through the incorporation of external knowledge. However, the evaluation of RAG systems remains a challenge, due to the intricate interplay between retrieval and generation components. This limitation has resulted in a scarcity of benchmarks that facilitate a detailed, component-specific assessment. In this work, we present MIRAGE, a Question Answering dataset specifically designed for RAG evaluation. MIRAGE consists of 7,560 curated instances mapped to a retrieval pool of 37,800 entries, enabling an efficient and precise evaluation of both retrieval and generation tasks. We also introduce novel evaluation metrics aimed at measuring RAG adaptability, encompassing dimensions such as noise vulnerability, context acceptability, context insensitivity, and context misinterpretation. Through comprehensive experiments across various retriever-LLM configurations, we provide new insights into the optimal alignment of model pairs and the nuanced dynamics within RAG systems. The dataset and evaluation code are publicly available, allowing for seamless integration and customization in diverse research settings. @@ -1953,13 +1953,13 @@ <fixed-case>FIRE</fixed-case>: Fact-checking with Iterative Retrieval and Verification ZhuohanXie - RuiXingMohamed bin Zayed University of Artificial Intelligence and University of Melbourne + RuiXingMohamed bin Zayed University of Artificial Intelligence and University of Melbourne YuxiaWang JiahuiGeng - HasanIqbalMohamed bin Zayed University of Artificial Intelligence + HasanIqbalMohamed bin Zayed University of Artificial Intelligence DhruvSahnan IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + PreslavNakovMohamed bin Zayed University of Artificial Intelligence 2901-2914 Fact-checking long-form text is challenging, and it is therefore common practice to break it down into multiple atomic claims. The typical approach to fact-checking these atomic claims involves retrieving a fixed number of pieces of evidence, followed by a verification step. However, this method is usually not cost-effective, as it underutilizes the verification model’s internal knowledge of the claim and fails to replicate the iterative reasoning process in human search strategies. To address these limitations, we propose FIRE, a novel agent-based framework that integrates evidence retrieval and claim verification in an iterative manner. 
Specifically, FIRE employs a unified mechanism to decide whether to provide a final answer or generate a subsequent search query, based on its confidence in the current judgment. We compare FIRE with other strong fact-checking frameworks and find that it achieves slightly better performance while reducing large language model (LLM) costs by an average of 7.6 times and search costs by 16.5 times. These results indicate that FIRE holds promise for application in large-scale fact-checking operations. 2025.findings-naacl.158 @@ -1967,9 +1967,9 @@ Lessons from a User Experience Evaluation of <fixed-case>NLP</fixed-case> Interfaces - EduardoCalòUtrecht University - LydiaPenkertIndependent Researcher - SaadMahamoodtrivago N.V. + EduardoCalòUtrecht University + LydiaPenkertIndependent Researcher + SaadMahamoodtrivago N.V. 2915-2929 Human evaluations lay at the heart of evaluations within the field of Natural Language Processing (NLP). Seen as the “golden standard” of evaluations, questions are being asked on whether these evaluations are both reproducible and repeatable. One overlooked aspect is the design choices made by researchers when designing user interfaces (UIs). In this paper, four UIs used in past NLP human evaluations are assessed by UX experts, based on standardized human-centered interaction principles. Building on these insights, we derive several recommendations that the NLP community should apply when designing UIs, to enable more consistent human evaluation responses. 2025.findings-naacl.159 @@ -1978,17 +1978,17 @@ <fixed-case>T</fixed-case>rend<fixed-case>S</fixed-case>im: Simulating Trending Topics in Social Media Under Poisoning Attacks with <fixed-case>LLM</fixed-case>-based Multi-agent System ZeyuZhang - JianxunLian - ChenMa + JianxunLian + ChenMa YaningQu YeLuo - LeiWang - RuiLi - XuChenRenmin University of China - YankaiLinRenmin University of China - LeWuHefei University of Technology - XingXieMicrosoft Research Asia - Ji-RongWenRenmin University of China + LeiWang + RuiLi + XuChenRenmin University of China + YankaiLinRenmin University of China + LeWuHefei University of Technology + XingXieMicrosoft Research Asia + Ji-RongWenRenmin University of China 2930-2949 Trending topics have become a significant part of modern social media, attracting users to participate in discussions of breaking events. However, they also bring in a new channel for poisoning attacks, resulting in negative impacts on society. Therefore, it is urgent to study this critical problem and develop effective strategies for defense. In this paper, we propose TrendSim, an LLM-based multi-agent system to simulate trending topics in social media under poisoning attacks. Specifically, we create a simulation environment for trending topics that incorporates a time-aware interaction mechanism, centralized message dissemination, and an interactive system. Moreover, we develop LLM-based humanoid agents to simulate users in social media, and propose prototype-based attackers to replicate poisoning attacks. Besides, we evaluate TrendSim from multiple aspects to validate its effectiveness. Based on TrendSim, we conduct simulation experiments to study four critical problems about poisoning attacks on trending topics. 
2025.findings-naacl.160 @@ -1996,9 +1996,9 @@ <fixed-case>ASR</fixed-case>ank: Zero-Shot Re-Ranking with Answer Scent for Document Retrieval - AbdelrahmanAbdallah - JamshidMozafariUniversität Innsbruck - BhawnaPiryaniUniversität Innsbruck + AbdelrahmanAbdallah + JamshidMozafariUniversität Innsbruck + BhawnaPiryaniUniversität Innsbruck AdamJatowtUniversität Innsbruck 2950-2970 Retrieval-Augmented Generation (RAG) models have drawn considerable attention in modern open-domain question answering. The effectiveness of RAG depends on the quality of the top retrieved documents. However, conventional retrieval methods sometimes fail to rank the most relevant documents at the top. In this paper, we introduce ASRANK, a new re-ranking method based on scoring retrieved documents using zero-shot answer scent which relies on a pre-trained large language model to compute the likelihood of the document-derived answers aligning with the answer scent. Our approach demonstrates marked improvements across several datasets, including NQ, TriviaQA, WebQA, ArchivalQA, HotpotQA, and Entity Questions. Notably, ASRANK increases Top-1 retrieval accuracy on NQ from 19.2% to 46.5% for MSS and 22.1% to 47.3% for BM25. It also shows strong retrieval performance on several datasets compared to state-of-the-art methods (47.3 Top-1 by ASRANK vs 35.4 by UPR by BM25). @@ -2028,7 +2028,7 @@ JiwanChung JungbinChoYonsei University JisooKimYonsei University - SungwoongKim + SungwoongKim GyeongboSimNCSOFT YoungjaeYuYonsei University 2990-3005 @@ -2039,13 +2039,13 @@ <fixed-case>P</fixed-case>lot2<fixed-case>C</fixed-case>ode: A Comprehensive Benchmark for Evaluating Multi-modal Large Language Models in Code Generation from Scientific Plots ChengyueWuThe University of Hong Kong - ZhixuanLiangThe University of Hong Kong + ZhixuanLiangThe University of Hong Kong YixiaoGe QiushanGuoByteDance Inc. - ZeyuLu + ZeyuLu JiahaoWang - YingShanTencent AI Lab Center of Visual Computing and Tencent PCG ARC Lab - PingLuoThe University of Hong Kong + YingShanTencent AI Lab Center of Visual Computing and Tencent PCG ARC Lab + PingLuoThe University of Hong Kong 3006-3028 Multi-modal Large Language Models have shown remarkable progress in visual contexts, yet their ability to convert visual figures into executable code remains underexplored. To address this, we introduce Plot2Code, a comprehensive benchmark designed to assess MLLMs’ visual coding capabilities. Plot2Code includes 132 high-quality matplotlib plots across six plot types, as well as an additional 150 and 86 plots from Python’s and R’s plotly libraries respectively, totaling 368 plots. Each plot is paired with its source code and a descriptive instruction generated by GPT-4, enabling thorough evaluation across diverse inputs. Furthermore, we propose three automatic evaluation metrics—code pass rate, text-match ratio, and GPT-4V rating judgement—to assess the quality of generated code and rendered images. Notably, the GPT-4V rating demonstrates strong reliability, as it correlates well with human evaluations, particularly for datasets of a certain size. Cross-validation across MLLMs (GPT-4V, Gemini-1.5-Pro, and Claude-3-Opus) also shows high consistency in ratings, which likely stems from the fact that ratings are based on rendered images rather than direct MLLM outputs, indicating minimal bias for this metric. 
Our evaluation of 14 MLLMs, including both proprietary and open-source models, highlights significant challenges in visual coding, particularly for text-dense plots, where MLLMs heavily rely on textual instructions. We believe these findings will advance future development of MLLMs. 2025.findings-naacl.164 @@ -2054,13 +2054,13 @@ <fixed-case>F</fixed-case>unnel<fixed-case>RAG</fixed-case>: A Coarse-to-Fine Progressive Retrieval Paradigm for <fixed-case>RAG</fixed-case> XinpingZhao - YanZhong + YanZhong ZetianSun XinshuoHu ZhenyuLiu DongfangLiHarbin Institute of Technology - BaotianHuHarbin Institute of Technology, Shenzhen - MinZhangHarbin Institute of Technology + BaotianHuHarbin Institute of Technology, Shenzhen + MinZhangHarbin Institute of Technology 3029-3046 Retrieval-Augmented Generation (RAG) prevails in Large Language Models. It mainly consists of retrieval and generation. The retrieval modules (a.k.a. retrievers) aim to find useful information used to facilitate the generation modules (a.k.a. generators). As such, generators’ performance largely depends on the effectiveness and efficiency of retrievers. However, the widely used retrieval paradigm remains flat. It treats retrieval procedures as a one-off deal with constant granularity. Despite effectiveness, we argue that they suffer from two limitations: (1) flat retrieval exerts a significant burden on one retriever; (2) constant granularity limits the ceiling of retrieval performance. In this work, we propose a progressive retrieval paradigm with coarse-to-fine granularity for RAG, termed FunnelRAG, so as to balance effectiveness and efficiency. Specifically, FunnelRAG establishes a progressive retrieval pipeline by collaborating coarse-to-fine granularity, large-to-small quantity, and low-to-high capacity, which can relieve the burden on one retriever and also promote the ceiling of retrieval performance. Extensive experiments manifest that FunnelRAG achieves comparable retrieval performance while the time overhead is reduced by nearly 40 percent. 2025.findings-naacl.165 @@ -2078,9 +2078,9 @@ Overcoming both Domain Shift and Label Shift for Referring Video Segmentation - HaiHuang + HaiHuang SashuaiZhouZhejiang University - YanXia + YanXia 3058-3069 Open-set domain generalization (OSDG) aims to enhance the robustness of the model when facing both domain shift and label shift, highlighting a wide range of potential in real-world applications. However, previous OSDG methods can only recognize seen objects and mark all unseen objects as “unknown” categories during inference, which is far from satisfactory. In this paper, we explore the scenario of referring video segmentation to study how to make the model maintain good segmentation ability for unknown objects under OSDG setting. To bridge the huge gap caused by label shift, we propose CLIP-based Reasoning Prompt (CRPrompt), which can combine text and visual prompts together to improve text-object matching ability of CLIP, transferring the segmentation ability to unseen classes based on the knowledge learned from seen classes and large-scale text-image pairs, i.e., color, shape, spatial relationships. Meanwhile, to improve the robustness of CRPrompt, we propose Retrieval-augmented Instance Normalization (RaIN), which can effectively enhance the robustness of the model by retrieving visual objects with similar semantic concepts through input query and performing Instance Norm among them. 
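On the FunnelRAG entry above: the abstract describes a retrieval funnel that moves from coarse to fine granularity, large to small candidate counts, and low to high retriever capacity. A minimal, hypothetical sketch under those assumptions; the lexical scorer stands in for the progressively stronger retrievers of a real pipeline.

def lexical_score(query, text):
    # Stub scorer; in a real funnel each stage would swap in a
    # higher-capacity ranker (e.g., sparse -> dense -> cross-encoder).
    return len(set(query.lower().split()) & set(text.lower().split()))

def funnel_retrieve(query, clusters, keep=(8, 4, 2)):
    # Stage 1: coarse units (document clusters), cheap scoring, large k.
    docs = sorted(clusters, key=lambda c: -lexical_score(query, c))[:keep[0]]
    # Stage 2: finer units (passages split out of surviving clusters).
    passages = [p for d in docs for p in d.split(". ") if p]
    passages = sorted(passages, key=lambda p: -lexical_score(query, p))[:keep[1]]
    # Stage 3: finest units, smallest k, handed to the generator.
    return sorted(passages, key=lambda p: -lexical_score(query, p))[:keep[2]]

corpus = [
    "graph retrieval methods. coarse passage ranking. funnel design",
    "pasta recipes. baking tips",
]
print(funnel_retrieve("coarse retrieval funnel", corpus))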
Extensive experiments on open-set and zero-shot domain generalization tasks demonstrate the effectiveness of our approach. 2025.findings-naacl.167 @@ -2101,7 +2101,7 @@ Beyond Excess and Deficiency: Adaptive Length Bias Mitigation in Reward Models for <fixed-case>RLHF</fixed-case> - YuyanBu + YuyanBu LiangyuHuo YiJingduxiaoman QingYang @@ -2112,9 +2112,9 @@ Neuroplasticity and Corruption in Model Mechanisms: A Case Study Of Indirect Object Identification - Vishnu KabirChhabra + Vishnu KabirChhabra DingZhu - Mohammad MahdiKhaliliOhio State University, Columbus and Yahoo! Research + Mohammad MahdiKhaliliOhio State University, Columbus and Yahoo! Research 3099-3122 Previous research has shown that fine-tuning language models on general tasks enhances their underlying mechanisms. However, the impact of fine-tuning on poisoned data and the resulting changes in these mechanisms are poorly understood. This study investigates the changes in a model’s mechanisms during toxic fine-tuning and identifies the primary corruption mechanisms. We also analyze the changes after retraining a corrupted model on the original dataset and observe neuroplasticity behaviors, where the model relearns original mechanisms after fine-tuning the corrupted model. Our findings indicate that: (i) Underlying mechanisms are amplified across task-specific fine-tuning, which can be generalized to longer epochs, (ii) Model corruption via toxic fine-tuning is localized to specific circuit components, (iii) Models exhibit neuroplasticity when retraining corrupted models on a clean dataset, reforming the original model mechanisms. 2025.findings-naacl.170 @@ -2123,10 +2123,10 @@ <fixed-case>VANE</fixed-case>-Bench: Video Anomaly Evaluation Benchmark for Conversational <fixed-case>LMM</fixed-case>s HananGaniMohamed bin Zayed University of Artificial Intelligence - RohitBharadwajUniversity of Edinburgh, University of Edinburgh - MuzammalNaseerKhalifa University of Science, Technology and Research + RohitBharadwajUniversity of Edinburgh, University of Edinburgh + MuzammalNaseerKhalifa University of Science, Technology and Research Fahad ShahbazKhanMohamed bin Zayed University of Artificial Intelligence and Linköping University - SalmanKhanMohamed bin Zayed University of Artificial Intelligence and Australian National University + SalmanKhanMohamed bin Zayed University of Artificial Intelligence and Australian National University 3123-3140 The recent advancements in Large Language Models (LLMs) have greatly influenced the development of Large Multi-modal Video Models (Video-LMMs), significantly enhancing our ability to interpret and analyze video data. Despite their impressive capabilities, current Video-LMMs have not been evaluated for anomaly detection tasks, which is critical to their deployment in practical scenarios, e.g., identifying deepfakes, manipulated video content, traffic accidents and crimes. In this paper, we introduce VANE-Bench, a benchmark designed to assess the proficiency of Video-LMMs in detecting and localizing anomalies and inconsistencies in videos. Our dataset comprises an array of videos synthetically generated using existing state-of-the-art text-to-video generation models, encompassing a variety of subtle anomalies and inconsistencies grouped into five categories: unnatural transformations, unnatural appearance, pass-through, disappearance and sudden appearance.
Additionally, our benchmark features real-world samples from existing anomaly detection datasets, focusing on crime-related irregularities, atypical pedestrian behavior, and unusual events. The task is structured as a visual question-answering challenge to gauge the models’ ability to accurately detect and localize the anomalies within the videos. We evaluate nine existing Video-LMMs, both open and closed sources, on this benchmarking task and find that most of the models encounter difficulties in effectively identifying the subtle anomalies. In conclusion, our research offers significant insights into the current capabilities of Video-LMMs in the realm of anomaly detection, highlighting the importance of our work in evaluating and improving these models for real-world applications. Our code and data are publicly available at https://github.com/rohit901/VANE-Bench. 2025.findings-naacl.171 @@ -2136,7 +2136,7 @@ Jailbreaking Prompt Attack: A Controllable Adversarial Attack against Diffusion Models JiachenMa YijiangLi - ZhiqingXiaoYale University and Zhejiang University + ZhiqingXiaoYale University and Zhejiang University AndaCaoZhejiang University JieZhang ChaoYe @@ -2164,10 +2164,10 @@ Task-wrapped Continual Learning in Task-Oriented Dialogue Systems - MinZeng + MinZeng HaiqinYangInternational Digital Economy Academy (IDEA) - XiChen - YikeGuoHong Kong University of Science and Technology and Imperial College London + XiChen + YikeGuoHong Kong University of Science and Technology and Imperial College London 3173-3183 Continual learning is vital for task-oriented dialogue systems (ToDs), and AdapterCL, equipped with residual adapters, has proven effective in this domain. However, its performance is limited by training separate adapters for each task, preventing global knowledge sharing. To address this, we propose **Task-wrapped Continual Learning (TCL)**, a novel framework that employs **Task-Wrapped Adapters (TWAs)** to simultaneously learn both global and task-specific information through parameter sharing. TCL leverages task-conditioned hypernetworks to transfer global knowledge across tasks, enabling TWAs to start from more informed initialization, efficiently learning task-specific details while reducing model parameters. Additionally, the simple, linear structure of both hypernetworks and TWAs ensures stable training, with task-free inference supported through effective loss utilization. Across 37 ToD domains, TCL consistently outperforms AdapterCL, significantly reducing forgetting. Remarkably, by setting the task embedding dimension to 1, TCL achieves a 4.76% improvement over AdapterCL while using only 46% of the parameters. These findings position TWA as a lightweight, powerful alternative to traditional adapters, offering a promising solution for continual learning in ToDs. The code is available at https://github.com/cloversjtu/TCL. 2025.findings-naacl.174 @@ -2175,10 +2175,10 @@ Untangling Hate Speech Definitions: A Semantic Componential Analysis Across Cultures and Domains - KaterinaKorreUniversity of Bologna + KaterinaKorreUniversity of Bologna AriannaMuti - FedericoRuggeriUniversity of Bologna - AlbertoBarrón-CedeñoUniversità di Bologna + FedericoRuggeriUniversity of Bologna + AlbertoBarrón-CedeñoUniversità di Bologna 3184-3198 Hate speech relies heavily on cultural influences, leading to varying individual interpretations. For that reason, we propose a Semantic Componential Analysis (SCA) framework for a cross-cultural and cross-domain analysis of hate speech definitions.
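On the Task-wrapped Continual Learning entry just ingested: its key mechanism is a task-conditioned hypernetwork that emits the weights of each Task-Wrapped Adapter, so global knowledge lives in the shared hypernetwork while task detail lives in a tiny task embedding. A minimal numpy sketch under stated assumptions; the sizes, init scale, and residual form are illustrative, not the paper's implementation.

import numpy as np

rng = np.random.default_rng(0)

class TaskWrappedAdapter:
    def __init__(self, hidden=16, task_dim=1):
        # Shared hypernetwork: one linear map from a task embedding to the
        # flattened adapter weights; its parameters are shared by all tasks.
        self.h_w = rng.normal(0.0, 0.02, (task_dim, hidden * hidden))
        self.h_b = np.zeros(hidden * hidden)
        self.hidden = hidden

    def forward(self, x, task_emb):
        w = (task_emb @ self.h_w + self.h_b).reshape(self.hidden, self.hidden)
        return x + x @ w  # residual adapter over the frozen backbone output

adapter = TaskWrappedAdapter()
x = rng.normal(size=(2, 16))    # activations from the frozen dialogue LM
task_emb = np.array([0.7])      # the abstract reports task_dim = 1 works well
print(adapter.forward(x, task_emb).shape)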
We create the first dataset of hate speech definitions encompassing 493 definitions from more than 100 cultures, drawn from five key domains: online dictionaries, academic research, Wikipedia, legal texts, and online platforms. By decomposing these definitions into semantic components, our analysis reveals significant variation across definitions, yet many domains borrow definitions from one another without taking into account the target culture. We conduct zero-shot model experiments using our proposed dataset, employing three popular open-source LLMs to understand the impact of different definitions on hate speech detection. Our findings indicate that LLMs are sensitive to definitions: responses for hate speech detection change according to the complexity of definitions used in the prompt. 2025.findings-naacl.175 @@ -2201,8 +2201,8 @@ Multi-Condition Guided Diffusion Network for Multimodal Emotion Recognition in Conversation WenjinTian - XianyingHuangChongqing University of Technology - ShihaoZouHuazhong University of Science and Technology + XianyingHuangChongqing University of Technology + ShihaoZouHuazhong University of Science and Technology 3215-3227 Emotion recognition in conversation (ERC) involves identifying emotional labels associated with utterances within a conversation, a task that is essential for developing empathetic robots. Current research emphasizes contextual factors, the speaker’s influence, and extracting complementary information across different modalities. However, it often overlooks the cross-modal noise at the semantic level and the redundant information brought by the features themselves. This study introduces a diffusion-based approach designed to effectively address the challenges posed by redundant information and unexpected noise while robustly capturing shared semantics, thus facilitating the learning of compact and representative features from multimodal data. Specifically, we present the Multi-Condition Guided Diffusion Network (McDiff). McDiff employs a modal prior knowledge extraction strategy to derive the prior distribution for each modality, thereby enhancing the regional attention of each modality and applying the generated prior distribution at each diffusion step. Furthermore, we propose a method to learn the mutual information of each modality through a specific objective constraints approach prior to the forward process, which aims to improve inter-modal interaction and mitigate the effects of noise and redundancy. Comprehensive experiments conducted on two multimodal datasets, IEMOCAP and MELD, demonstrate that McDiff significantly surpasses existing state-of-the-art methodologies, thereby affirming the generalizability and efficacy of the proposed model. 2025.findings-naacl.177 @@ -2210,12 +2210,12 @@ Thank You, Stingray: Multilingual Large Language Models Can Not (Yet) Disambiguate Cross-Lingual Word Senses - SamuelCahyawijayaCohere + SamuelCahyawijayaCohere RuochenZhangBrown University Jan Christian BlaiseCruzMohamed bin Zayed University of Artificial Intelligence - HolyLoveniaAI Singapore + HolyLoveniaAI Singapore ElisaGilbertUniversität Leipzig - HirokiNomotoTokyo University of Foreign Studies + HirokiNomotoTokyo University of Foreign Studies Alham FikriAjiMohamed bin Zayed University of Artificial Intelligence 3228-3250 Multilingual large language models (LLMs) have gained prominence, but concerns arise regarding their reliability beyond English.
This study addresses the gap in cross-lingual semantic evaluation by introducing a novel benchmark for cross-lingual sense disambiguation, StingrayBench. In this paper, we demonstrate using false friends—words that are orthographically similar but have completely different meanings in two languages—as a possible approach to pinpoint the limitation of cross-lingual sense disambiguation in LLMs. We collect false friends in four language pairs, namely Indonesian-Malay, Indonesian-Tagalog, Chinese-Japanese, and English-German, and challenge LLMs to distinguish their use in context. In our analysis of various models, we observe that they tend to be biased toward higher-resource languages. We also propose new metrics for quantifying the cross-lingual sense bias and comprehension based on our benchmark. Our work contributes to developing more diverse and inclusive language modeling, promoting fairer access for the wider multilingual community. @@ -2224,9 +2224,9 @@ Atoxia: Red-teaming Large Language Models with Target Toxic Answers - YuhaoDuThe Chinese University of Hong Kong, Shenzhen - ZhuoLiThe Chinese University of Hong Kong, Shenzhen - PengyuChengTencent + YuhaoDuThe Chinese University of Hong Kong, Shenzhen + ZhuoLiThe Chinese University of Hong Kong, Shenzhen + PengyuChengTencent XiangWanShenzhen Research Institute of Big Data AnningzheGaoByteDance Inc. 3251-3266 @@ -2247,7 +2247,7 @@ Probing-<fixed-case>RAG</fixed-case>: Self-Probing to Guide Language Models in Selective Document Retrieval - IngeolBaek + IngeolBaek HwanChang ByeongJeongKimChung-Ang University JiminLeeChung-Ang University @@ -2279,10 +2279,10 @@ Continuous Speech Tokenizer in Text To Speech YixingLi - RuobingXie - XingwuSunTencent AI Platform + RuobingXie + XingwuSunTencent AI Platform YuChengThe Chinese University of Hong Kong - ZhanhuiKang + ZhanhuiKang 3341-3347 The fusion of speech and language in the era of large language models has garnered significant attention. Discrete speech tokens are often utilized in text-to-speech tasks for speech compression and portability, which is convenient for joint training with text and has good compression efficiency. However, we found that the discrete speech tokenizer still suffers from information loss. Therefore, we propose a simple yet effective continuous speech tokenizer named Cont-SPT, and a text-to-speech model based on continuous speech tokens. Our results show that the speech language model based on the continuous speech tokenizer has better continuity and higher estimated Mean Opinion Scores (MoS). This enhancement is attributed to the better information preservation rate of the continuous speech tokenizer across both low and high frequencies in the frequency domain. The code and resources for Cont-SPT can be found at https://github.com/Yixing-Li/Continuous-Speech-Tokenizer. 2025.findings-naacl.184 @@ -2292,13 +2292,13 @@ Efficient Annotator Reliability Assessment and Sample Weighting for Knowledge-Based Misinformation Detection on Social Media OwenCook CharlieGrimshaw - Ben PengWuUniversity of Sheffield + Ben PengWuUniversity of Sheffield SophieDillon JackHicks LukeJones ThomasSmith MatyasSzert - XingyiSongUniversity of Sheffield + XingyiSongUniversity of Sheffield 3348-3358 Misinformation spreads rapidly on social media, confusing the truth and targeting potentially vulnerable people.
To effectively mitigate the negative impact of misinformation, it must first be accurately detected before applying a mitigation strategy, such as X’s community notes, which is currently a manual process. This study takes a knowledge-based approach to misinformation detection, modelling the problem similarly to one of natural language inference. The EffiARA annotation framework is introduced, aiming to utilise inter- and intra-annotator agreement to understand the reliability of each annotator and influence the training of large language models for classification based on annotator reliability. In assessing the EffiARA annotation framework, the Russo-Ukrainian Conflict Knowledge-Based Misinformation Classification Dataset (RUC-MCD) was developed and made publicly available. This study finds that sample weighting using annotator reliability performs the best, utilising both inter- and intra-annotator agreement and soft label training. The highest classification performance achieved using Llama-3.2-1B was a macro-F1 of 0.757 and 0.740 using TwHIN-BERT-large. 2025.findings-naacl.185 @@ -2307,7 +2307,7 @@ Challenges in Trustworthy Human Evaluation of Chatbots WentingZhaoCornell University - Alexander MRushCornell University and School of Engineering and Applied Sciences, Harvard University + Alexander MRushCornell University and School of Engineering and Applied Sciences, Harvard University TanyaGoyalCornell University 3359-3365 Recently, open community-driven platforms like Chatbot Arena that collect user preference data from site visitors have gained reputation as trustworthy publicly available benchmarks for LLM performance. While gold standard, it is often tricky to implement the required guardrails to collect high-quality annotations from humans. In this paper, we demonstrate that different source of bad annotations, both malicious and otherwise, can corrupt the reliability of open leaderboard rankings. In particular, we show that only 10% of poor quality votes by apathetic (site visitors not appropriately incentivized to give correct votes) or adversarial (bad actors seeking to inflate the ranking of a target model) annotators can change the rankings of models by up to 5 places on the leaderboard. Finally, we discuss open challenges in ensuring high quality human annotations. @@ -2316,9 +2316,9 @@ <fixed-case>RATSD</fixed-case>: Retrieval Augmented Truthfulness Stance Detection from Social Media Posts Toward Factual Claims - ZhengyuanZhu - ZeyuZhang - HaiqiZhangUniversity of Texas at Arlington + ZhengyuanZhu + ZeyuZhang + HaiqiZhangUniversity of Texas at Arlington ChengkaiLiUniversity of Texas at Arlington 3366-3381 Social media provides a valuable lens for assessing public perceptions and opinions. This paper focuses on the concept of truthfulness stance, which evaluates whether a textual utterance affirms, disputes, or remains neutral or indifferent toward a factual claim. Our systematic analysis fills a gap in the existing literature by offering the first in-depth conceptual framework encompassing various definitions of stance. We introduce RATSD (Retrieval Augmented Truthfulness Stance Detection), a novel method that leverages large language models (LLMs) with retrieval-augmented generation (RAG) to enhance the contextual understanding of tweets in relation to claims. RATSD is evaluated on TSD-CT, our newly developed dataset containing 3,105 claim-tweet pairs, along with existing benchmark datasets. 
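On the EffiARA entry above: the framework's core move is to score each annotator's reliability from inter- and intra-annotator agreement and then weight training samples accordingly. A minimal, hypothetical sketch; the pairwise-agreement statistic below is a simplified stand-in for EffiARA's actual reliability measures.

from collections import defaultdict

def annotator_reliability(votes):
    # votes: {annotator: {item: label}}. Reliability is approximated here
    # as mean pairwise agreement with other annotators on shared items.
    scores = {}
    for a, labels in votes.items():
        agree = total = 0
        for b, other in votes.items():
            if a == b:
                continue
            shared = labels.keys() & other.keys()
            agree += sum(labels[i] == other[i] for i in shared)
            total += len(shared)
        scores[a] = agree / total if total else 0.5
    return scores

votes = {
    "ann1": {"post1": "misinfo", "post2": "debunk"},
    "ann2": {"post1": "misinfo", "post2": "other"},
    "ann3": {"post1": "misinfo", "post2": "debunk"},
}
reliability = annotator_reliability(votes)

# Weight each item by the mean reliability of its annotators; these weights
# would then scale the per-example loss (or soften the labels) in training.
per_item = defaultdict(list)
for a, labels in votes.items():
    for item in labels:
        per_item[item].append(reliability[a])
sample_weights = {i: sum(r) / len(r) for i, r in per_item.items()}
print(reliability, sample_weights)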
Our experiment results demonstrate that RATSD outperforms state-of-the-art methods, achieving a significant increase in Macro-F1 score on TSD-CT. Our contributions establish a foundation for advancing research in misinformation analysis and provide valuable tools for understanding public perceptions in digital discourse. @@ -2332,7 +2332,7 @@ ZiwenXiaDeepwisdom SiruiHongDeepWisdom YunZhuGoogle - BangLiuUniversity of Montreal + BangLiuUniversity of Montreal ChenglinWuDeepWisdom 3382-3392 Large Language Models (LLMs) are proficient at retrieving single facts from extended contexts, yet they struggle with tasks requiring the simultaneous retrieval of multiple facts, especially during generation. This paper identifies a novel “lost-in-the-middle” phenomenon, where LLMs progressively lose track of critical information throughout the generation process, resulting in incomplete or inaccurate retrieval. To address this challenge, we introduce Find All Crucial Texts (FACT), an iterative retrieval method that refines context through successive rounds of rewriting. This approach enables models to capture essential facts incrementally, which are often overlooked in single-pass retrieval. Experiments demonstrate that FACT substantially enhances multi-fact retrieval performance across various tasks, though improvements are less notable in general-purpose QA scenarios. Our findings shed light on the limitations of LLMs in multi-fact retrieval and underscore the need for more resilient long-context retrieval strategies. @@ -2341,14 +2341,14 @@ Temporal Working Memory: Query-Guided Segment Refinement for Enhanced Multimodal Understanding - XingjianDiao + XingjianDiao ChunhuiZhangDartmouth College - WeiyiWu + WeiyiWu ZhongyuOuyang PeijunQingDartmouth College - MingCheng - SoroushVosoughiDartmouth College - JiangGuiDartmouth College + MingCheng + SoroushVosoughiDartmouth College + JiangGuiDartmouth College 3393-3409 Multimodal foundation models (MFMs) have demonstrated significant success in tasks such as visual captioning, question answering, and image-text retrieval. However, these models face inherent limitations due to their finite internal capacity, which restricts their ability to process extended temporal sequences—an essential requirement for comprehensive video and audio analysis. To overcome these challenges, we introduce a specialized cognitive module, temporal working memory (TWM), which aims to enhance the temporal modeling capabilities of MFMs. It selectively retains task-relevant information across temporal dimensions, ensuring that critical details are preserved throughout the processing of video and audio content. The TWM uses a query-guided attention approach to focus on the most informative multimodal segments within temporal sequences. By retaining only the most relevant content, TWM optimizes the use of the model’s limited capacity, enhancing its temporal modeling ability. This plug-and-play module can be easily integrated into existing MFMs. With our TWM, nine state-of-the-art models exhibit significant performance improvements across tasks such as video captioning, question answering, and video-text retrieval. By enhancing temporal modeling, TWM extends the capability of MFMs to handle complex, time-sensitive data effectively. Our code is available at https://github.com/xid32/NAACL_2025_TWM. 
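On the Temporal Working Memory entry above: the module keeps only the temporal segments most relevant to the query, so the model's limited capacity is spent on informative content. A minimal numpy sketch of query-guided segment retention; the dimensions and softmax scoring are illustrative assumptions.

import numpy as np

rng = np.random.default_rng(0)

def query_guided_memory(query_vec, segment_vecs, capacity=3):
    # Score each temporal segment against the query, softmax-normalise,
    # and keep only the top-`capacity` segments in working memory.
    sims = segment_vecs @ query_vec
    weights = np.exp(sims - sims.max())
    weights /= weights.sum()
    keep = np.argsort(-weights)[:capacity]
    return sorted(keep.tolist())  # retained segments, in temporal order

segments = rng.normal(size=(10, 8))  # e.g., embeddings of 10 video segments
query = rng.normal(size=8)           # embedding of the question
print(query_guided_memory(query, segments))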
2025.findings-naacl.189 @@ -2367,7 +2367,7 @@ Multilingual Blending: Large Language Model Safety Alignment Evaluation with Language Mixture - JiayangSong + JiayangSong YuhengHuang ZhehuaZhouUniversity of Alberta LeiMaThe University of Tokyo and University of Alberta @@ -2380,7 +2380,7 @@ Mitigating Hallucinations in Multimodal Spatial Relations through Constraint-Aware Prompting JiaruiWuUniversity of Rochester ZhuoLiuUniversity of Rochester - HangfengHeUniversity of Rochester + HangfengHeUniversity of Rochester 3450-3468 Spatial relation hallucinations pose a persistent challenge in large vision-language models (LVLMs), leading to generate incorrect predictions about object positions and spatial configurations within an image. To address this issue, we propose a constraint-aware prompting framework designed to reduce spatial relation hallucinations. Specifically, we introduce two types of constraints: (1) bidirectional constraint, which ensures consistency in pairwise object relations, and (2) transitivity constraint, which enforces relational dependence across multiple objects. By incorporating these constraints, LVLMs can produce more spatially coherent and consistent outputs. We evaluate our method on three widely-used spatial relation datasets, demonstrating performance improvements over existing approaches. Additionally, a systematic analysis of various bidirectional relation analysis choices and transitivity reference selections highlights greater possibilities of our methods in incorporating constraints to mitigate spatial relation hallucinations. 2025.findings-naacl.192 @@ -2391,8 +2391,8 @@ JunjieLiuZhejiang University ShaotianYanAlibaba Group ChenShenAlibaba Group - ZhengdongXiao - LiangXie + ZhengdongXiao + LiangXie WenxiaoWangZhejiang University JiepingYeAlibaba Group 3469-3498 @@ -2436,13 +2436,13 @@ <fixed-case>S</fixed-case>yn<fixed-case>G</fixed-case>host: Invisible and Universal Task-agnostic Backdoor Attack via Syntactic Transfer - PengzhouCheng - WeiDu - ZongruWu + PengzhouCheng + WeiDu + ZongruWu FengweiZhangSouthern University of Science and Technology - LiboChenShanghai Jiaotong University - ZhuoshengZhangShanghai Jiao Tong University - GongshenLiuShanghai Jiao Tong University + LiboChenShanghai Jiaotong University + ZhuoshengZhangShanghai Jiao Tong University + GongshenLiuShanghai Jiao Tong University 3530-3546 2025.findings-naacl.196 cheng-etal-2025-synghost @@ -2453,8 +2453,8 @@ ChenyuanYang ZhijieWang YuhengHuang - ZhaoyangChu - DaSong + ZhaoyangChu + DaSong LingmingZhangUniversity of Illinois Urbana-Champaign An RanChenUniversity of Alberta LeiMaThe University of Tokyo and University of Alberta @@ -2466,13 +2466,13 @@ Safe Inputs but Unsafe Output: Benchmarking Cross-modality Safety Alignment of Large Vision-Language Models SiyinWangFudan University - XingsongYe + XingsongYe QinyuanCheng JunwenDuan ShiminLi JinlanFu - XipengQiuFudan University - XuanjingHuangFudan University + XipengQiuFudan University + XuanjingHuangFudan University 3563-3605 As Artificial General Intelligence (AGI) becomes increasingly integrated into various facets of human life, ensuring the safety and ethical alignment of such systems is paramount. Previous studies primarily focus on single-modality threats, which may not suffice given the integrated and complex nature of cross-modality interactions. We introduce a novel safety alignment challenge called Safe Inputs but Unsafe Output (*SIUO*) to evaluate cross-modality safety alignment. 
Specifically, it considers cases where single modalities are safe independently but could potentially lead to unsafe or unethical outputs when combined. To empirically investigate this problem, we developed the *SIUO*, a cross-modality benchmark encompassing 9 critical safety domains, such as self-harm, illegal activities, and privacy violations. Our findings reveal substantial safety vulnerabilities in both closed- and open-source LVLMs, such as GPT-4V and LLaVA, underscoring the inadequacy of current models to reliably interpret and respond to complex, real-world scenarios. 2025.findings-naacl.198 @@ -2480,10 +2480,10 @@ <fixed-case>FLEX</fixed-case>: A Benchmark for Evaluating Robustness of Fairness in Large Language Models - DahyunJungKorea University - SeungyoonLeeKorea University - HyeonseokMoonKorea University - ChanjunParkKorea University + DahyunJungKorea University + SeungyoonLeeKorea University + HyeonseokMoonKorea University + ChanjunParkKorea University HeuiseokLim 3606-3620 Recent advancements in Large Language Models (LLMs) have significantly enhanced interactions between users and models. These advancements concurrently underscore the need for rigorous safety evaluations due to the manifestation of social biases, which can lead to harmful societal impacts. Despite these concerns, existing benchmarks may overlook the intrinsic weaknesses of LLMs, which can generate biased responses even with simple adversarial instructions. To address this critical gap, we introduce a new benchmark, Fairness Benchmark in LLM under Extreme Scenarios (FLEX), designed to test whether LLMs can sustain fairness even when exposed to prompts constructed to induce bias. To thoroughly evaluate the robustness of LLMs, we integrate prompts that amplify potential biases into the fairness assessment. Comparative experiments between FLEX and existing benchmarks demonstrate that traditional evaluations may underestimate the inherent risks in models. This highlights the need for more stringent LLM evaluation benchmarks to guarantee safety and fairness. @@ -2493,7 +2493,7 @@ When and How to Augment Your Input: Question Routing Helps Balance the Accuracy and Efficiency of Large Language Models ShufanChenUniversity of Science and Technology of China - HeZheng + HeZheng LeiCuiTsinghua University, Tsinghua University 3621-3634 Although large language models rely on parametric knowledge to achieve exceptional performance across various question-answering tasks, they still face challenges when addressing knowledge-based long-tail questions. Augmented generation techniques, such as chain-of-thought prompting and retrieval augmentation, can effectively enhance the ability of these models to answer long-tail questions. However, improving accuracy through augmented generation often results in significant latency within question-answering systems. This paper addresses the issue of “when and how to augment the input” by proposing an adaptive question routing framework. This framework employs a query router to select the most appropriate augmentation path at the right time, thereby enhancing both the accuracy and efficiency of question-answering systems. Extensive comparative experiments on benchmarks such as AmbigNQ, HotpotQA, MMLU-STEM, and PopQA demonstrate that our method surpasses existing approaches in both accuracy and efficiency. 
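On the question-routing entry above: the router's job is to pick the cheapest augmentation path that can still answer the question, before any expensive generation happens. A minimal, hypothetical sketch; the keyword rules are an illustrative stand-in for the paper's trained query router.

def route(question):
    q = question.lower()
    if any(w in q for w in ("why", "derive", "prove", "step by step")):
        return "chain_of_thought"      # reasoning-heavy: augment the prompt
    if any(w in q for w in ("who", "when", "where", "which year")):
        return "retrieval_augmented"   # long-tail factual: fetch evidence
    return "direct_answer"             # cheap path: no augmentation needed

for q in ("Who founded CERN?", "Why does ice float?", "Define entropy."):
    print(q, "->", route(q))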
Furthermore, this paper introduces two metrics for evaluating adaptive question augmentation methods and presents a new benchmark for adaptive question augmentation, aiming to advance the field. @@ -2512,9 +2512,9 @@ From Curiosity to Clarity : Exploring the Impact of Consecutive Why-Questions - GeonyeongSonHanyang University - JaeyoungLeeHanyang University - MisukKimHanyang University + GeonyeongSonHanyang University + JaeyoungLeeHanyang University + MisukKimHanyang University 3649-3664 Humans attempt to understand the real world by asking the fundamental question ”Why?” when faced with incomprehensible situations in everyday life. Such why-questions provide essential knowledge that can help in understanding these situations. In this study, we conducted an end-to-end process to verify the utility of consecutive why-questions, from constructing a large language model (LLM)-based dataset to performing quantitative evaluation and analysis. Firstly, we created a WHY-Chain dataset, consisting of answers generated by an LLM in response to chain-of-why-questions, including a validity check. We also incorporated objectives that effectively capture the ”consecutive” characteristic of the data. Using the WHY-Chain dataset and two types of self-supervised objectives, we trained the pre-trained model. As a result, the refined model demonstrated improved performance on downstream tasks that require commonsense reasoning. Additionally, we conducted various ablation studies to assess the impact of different factors, confirming the scalability of the proposed approach. Lastly, we confirmed the consistency of the logical information by reasoning chain analysis of the answers generated from consecutive why-questions. 2025.findings-naacl.202 @@ -2524,7 +2524,7 @@ <fixed-case>C</fixed-case>ollab<fixed-case>S</fixed-case>tory: Multi-<fixed-case>LLM</fixed-case> Collaborative Story Generation and Authorship Analysis SaranyaVenkatramanAmazon Nafis IrtizaTriptoPennsylvania State University - DongwonLeeThe Pennsylvania State University + DongwonLeeThe Pennsylvania State University 3665-3679 The rise of unifying frameworks that enable seamless interoperability of Large Language Models (LLMs) has made LLM-LLM collaboration for open-ended tasks a possibility. Despite this, there have not been efforts to explore such collaborative writing. We take the next step beyond human-LLM collaboration to explore this multi-LLM scenario by generating the first exclusively LLM-generated collaborative stories dataset called CollabStory. We focus on single-author to multi-author (up to 5 LLMs) scenarios, where multiple LLMs co-author stories. We generate over 32k stories using open-source instruction-tuned LLMs. Further, we take inspiration from the PAN tasks that have set the standard for human-human multi-author writing tasks and analysis. We extend their authorship-related tasks for multi-LLM settings and present baselines for LLM-LLM collaboration. We find that current baselines are not able to handle this emerging scenario. Thus, CollabStory is a resource that could help propel an understanding as well as the development of new techniques to discern the use of multiple LLMs. This is crucial to study in the context of writing tasks since LLM-LLM collaboration could potentially overwhelm ongoing challenges related to plagiarism detection, credit assignment, maintaining academic integrity in educational settings, and addressing copyright infringement concerns. 
We make our dataset and code available at https://github.com/saranya-venkatraman/CollabStory. 2025.findings-naacl.203 @@ -2545,16 +2545,16 @@ <fixed-case>K</fixed-case>now<fixed-case>A</fixed-case>gent: Knowledge-Augmented Planning for <fixed-case>LLM</fixed-case>-Based Agents - YuqiZhu + YuqiZhu ShuofeiQiao - YixinOu + YixinOu ShuminDeng - ShiweiLyu + ShiweiLyu YueShenantgroup - LeiLiang - JinjieGu + LeiLiang + JinjieGu HuajunChenZhejiang University - NingyuZhangZhejiang University + NingyuZhangZhejiang University 3709-3732 Large Language Models (LLMs) have demonstrated great potential in complex reasoning tasks, yet they fall short when tackling more sophisticated challenges, especially when interacting with environments through generating executable actions. This inadequacy primarily stems from the lack of built-in action knowledge in language agents, which fails to effectively guide the planning trajectories during task solving and results in planning hallucination. To address this issue, we introduce KnowAgent, a novel approach designed to enhance the planning capabilities of LLMs by incorporating explicit action knowledge. Specifically, KnowAgent employs an action knowledge base and a knowledgeable self-learning strategy to constrain the action path during planning, enabling more reasonable trajectory synthesis, and thereby enhancing the planning performance of language agents. Experimental results on HotpotQA and ALFWorld based on various backbone models demonstrate that KnowAgent can achieve comparable or superior performance to existing baselines. Further analysis indicates the effectiveness of KnowAgent in terms of planning hallucinations mitigation. 2025.findings-naacl.205 @@ -2578,7 +2578,7 @@ AbhijitMishraUniversity of Texas at Austin and Apple ShreyaShukla JoseTorres - JacekGwizdkaUniversity of Texas at Austin + JacekGwizdkaUniversity of Texas at Austin ShounakRoychowdhuryUniversity of Texas at Austin 3747-3759 Decoding and expressing brain activity in a comprehensible form is a challenging frontier in AI. This paper presents *Thought2Text*, which uses instruction-tuned Large Language Models (LLMs) fine-tuned with EEG data to achieve this goal. The approach involves three stages: (1) training an EEG encoder for visual feature extraction, (2) fine-tuning LLMs on image and text data, enabling multimodal description generation, and (3) further fine-tuning on EEG embeddings to generate text directly from EEG during inference. Experiments on a public EEG dataset collected for six subjects with image stimuli and text captions demonstrate the efficacy of multimodal LLMs (*LLaMA-v3*, *Mistral-v0.3*, *Qwen2.5*), validated using traditional language generation evaluation metrics, as well as *fluency* and *adequacy* measures. This approach marks a significant advancement towards portable, low-cost “thoughts-to-text” technology with potential applications in both neuroscience and natural language processing. 
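On the KnowAgent entry above: the planner constrains each step to actions permitted by an explicit action knowledge base, which is what suppresses planning hallucination. A minimal, hypothetical sketch; the action names and transition table are invented for illustration.

ACTION_KB = {
    # action -> actions that the knowledge base allows next
    "Start": {"Search"},
    "Search": {"Lookup", "Finish"},
    "Lookup": {"Search", "Finish"},
}

def propose_action(trajectory):
    # Stub: a real agent would ask the LLM for the next action given the
    # trajectory so far and the task description.
    return "Lookup" if trajectory[-1] == "Search" else "Search"

def plan(max_steps=6):
    trajectory = ["Start"]
    for _ in range(max_steps):
        action = propose_action(trajectory)
        # Constrain the path: proposals outside the knowledge base are
        # replaced instead of being executed blindly.
        if action not in ACTION_KB.get(trajectory[-1], set()):
            action = "Finish"
        trajectory.append(action)
        if action == "Finish":
            break
    return trajectory

print(plan())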
@@ -2597,11 +2597,11 @@ Towards Cross-Lingual Explanation of Artwork in Large-scale Vision Language Models ShintaroOzaki - KazukiHayashi + KazukiHayashi YusukeSakaiNara Institute of Science and Technology, Japan - HidetakaKamigaitoNara Institute of Science and Technology + HidetakaKamigaitoNara Institute of Science and Technology KatsuhikoHayashiThe University of Tokyo - TaroWatanabeNara Institute of Science and Technology, Japan + TaroWatanabeNara Institute of Science and Technology, Japan 3773-3809 As the performance of Large-scale Vision Language Models (LVLMs) improves, they are increasingly capable of responding in multiple languages, and there is an expectation that the demand for explanations generated by LVLMs will grow. However, pre-training of Vision Encoder and the integrated training of LLMs with Vision Encoder are mainly conducted using English training data, leaving it uncertain whether LVLMs can completely handle their potential when generating explanations in languages other than English. In addition, multilingual QA benchmarks that create datasets using machine translation have cultural differences and biases, remaining issues for use as evaluation tasks. To address these challenges, this study created an extended dataset in multiple languages without relying on machine translation. This dataset that takes into account nuances and country-specific phrases was then used to evaluate the generation explanation abilities of LVLMs. Furthermore, this study examined whether Instruction-Tuning in resource-rich English improves performance in other languages. Our findings indicate that LVLMs perform worse in languages other than English compared to English. In addition, it was observed that LVLMs struggle to effectively manage the knowledge learned from English data. 2025.findings-naacl.209 @@ -2612,7 +2612,7 @@ YiyiChen QiongxiuLiAalborg University RussaBiswasAalborg University, Aalborg University - JohannesBjervaAalborg University + JohannesBjervaAalborg University 3810-3827 Language Confusion is a phenomenon where Large Language Models (LLMs) generate text that is neither in the desired language, nor in a contextually appropriate language. This phenomenon presents a critical challenge in text generation by LLMs, often appearing as erratic and unpredictable behavior. We hypothesize that there are linguistic regularities to this inherent vulnerability in LLMs and shed light on patterns of language confusion across LLMs. We introduce a novel metric, Language Confusion Entropy, designed to directly measure and quantify this confusion, based on language distributions informed by linguistic typology and lexical variation. Comprehensive comparisons with the Language Confusion Benchmark (Marchisio et al., 2024) confirm the effectiveness of our metric, revealing patterns of language confusion across LLMs. We further link language confusion to LLM security, and find patterns in the case of multilingual embedding inversion attacks. Our analysis demonstrates that linguistic typology offers theoretically grounded interpretation, and valuable insights into leveraging language similarities as a prior for LLM alignment and security. 2025.findings-naacl.210 @@ -2628,10 +2628,10 @@ ZhiyiZhangUniversity of Southern California XiaolongXu JunyingChen - JieFuShanghai Artificial Intelligence Laboratory + JieFuShanghai Artificial Intelligence Laboratory XiangWanShenzhen Research Institute of Big Data AnningzheGaoByteDance Inc. 
- BenyouWangThe Chinese University of Hong Kong, Shenzhen + BenyouWangThe Chinese University of Hong Kong, Shenzhen 3828-3848 Large Language Models infuse newfound vigor into the advancement of the medical domain, yet the scarcity of data poses a significant bottleneck hindering community progress. In this paper, we release the largest ever medical Question Answering (QA) dataset with 26 Million QA pairs named Huatuo-26M. We benchmark many existing approaches in our dataset in terms of both retrieval and generation. We also experimentally show the benefit of the proposed dataset in many aspects: (i) it serves as a fine-tuning data for training medical Large Language Models (LLMs); (ii) it works as an external knowledge source for retrieval-augmented generation (RAG); (iii) it demonstrates transferability by enhancing zero-shot performance on other QA datasets; and (iv) it aids in training biomedical model as a pre-training corpus. Our empirical findings substantiate the dataset’s utility in these domains, thereby confirming its significance as a resource in the medical QA landscape. 2025.findings-naacl.211 @@ -2639,13 +2639,13 @@ <fixed-case>SEP</fixed-case>-<fixed-case>MLDC</fixed-case>: A Simple and Effective Paradigm for Multi-Label Document Classification - HanLiuDalian University of Technology + HanLiuDalian University of Technology ShuqinLi - XiaotongZhang + XiaotongZhang YuanyuanWang - FengZhangPeking University - HongyangChen - HongYu + FengZhangPeking University + HongyangChen + HongYu 3849-3859 Multi-label document classification (MLDC) aims to allocate more than one label to each document and attracts increasing attention in many practical applications. However, previous studies have failed to pay sufficient attention to the lack of semantic information on labels and the long-tail problem prevalent in the datasets. Additionally, most existing methods focus on optimizing document features, overlooking the potential of high-quality label features to enhance classification performance. In this paper, we propose a simple and effective paradigm for MLDC. Regarding the problem of insufficient label information and imbalance in the sample size of categories, we utilize large language models (LLMs) to semantically expand the label content and generate pseudo-samples for the tail categories. To optimize the features of both documents and labels, we design the contrastive learning boosted feature optimization module facilitated by the similarity matrices. Finally, we construct a label-guided feature selection module to incorporate the optimized label features into the input features to provide richer semantic information for the classifier. Extensive experiments have demonstrated that our proposed method significantly outperforms state-of-the-art baselines. 
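On the SEP-MLDC entry above: two of its ingredients translate directly into code, LLM-based semantic expansion of bare label names and pseudo-sample generation targeted at tail categories. A minimal, hypothetical sketch; the expansion template and the median-size top-up rule are illustrative assumptions.

from collections import Counter

def expand_label(label):
    # Stub for the LLM call that turns a bare label into a richer
    # natural-language description usable as a label feature.
    return f"{label}: documents whose main topic is {label.lower()}."

def pseudo_samples_needed(label_counts):
    # Top up each tail category toward the median class size; the missing
    # examples would be generated by the LLM conditioned on the label.
    median = sorted(label_counts.values())[len(label_counts) // 2]
    return {lab: max(0, median - n) for lab, n in label_counts.items()}

counts = Counter({"Sports": 900, "Politics": 850, "Philately": 12})
print(expand_label("Philately"))
print(pseudo_samples_needed(counts))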
2025.findings-naacl.212 @@ -2653,10 +2653,10 @@ Improving Pre-trained Language Models with Knowledge Enhancement and Filtering Framework - QiZhao + QiZhao QiSongUniversity of Science and Technology of China TianXie - HaiyueZhang + HaiyueZhang HongyuYang XiangyangLi 3860-3871 @@ -2667,8 +2667,8 @@ Using Review Combination and Pseudo-Tokens for Aspect Sentiment Quad Prediction JiazhouChen - XuJiaHebei Normal University - RuiQiangGuo + XuJiaHebei Normal University + RuiQiangGuo 3872-3883 Aspect Sentiment Quad Prediction (ASQP) aims to identify quadruples consisting of an aspect term, aspect category, opinion term, and sentiment polarity from a given sentence, which is the most representative and challenging task in aspect-based sentiment analysis. A major challenge arises when implicit sentiment is present, as existing models often confuse implicit and explicit sentiment, making it difficult to extract the quadruples effectively. To tackle this issue, we propose a framework that leverages distinct labeled features from diverse reviews and incorporates pseudo-token prompts to harness the semantic knowledge of pre-trained models, effectively capturing both implicit and explicit sentiment expressions. Our approach begins by categorizing reviews based on the presence of implicit sentiment elements. We then build new samples that combine those with implicit sentiment and those with explicit sentiment. Next, we employ prompts with pseudo-tokens to guide the model in distinguishing between implicit and explicit sentiment expressions. Extensive experimental results show that our proposed method enhances the model’s ability across four public datasets, averaging 1.99% F1 improvement, particularly in instances involving implicit sentiment. We release our code at https://github.com/chienarmor/absa-implicit. 2025.findings-naacl.214 @@ -2676,9 +2676,9 @@ <fixed-case>DDGIP</fixed-case>: Radiology Report Generation Through Disease Description Graph and Informed Prompting - ChentaoHuang + ChentaoHuang GuangliLiEast China Jiao Tong University - XinjiongZhou + XinjiongZhou YafengRen HongbinZhangEast China Jiao Tong University 3884-3894 @@ -2708,7 +2708,7 @@ YiShenChina Mobile Communications Group Co.,Ltd SijiaLiu YiTang - SenSong + SenSong XiaoyiWangBeijing Wispirit Technology LongjunCaiAlibaba Group 3912-3921 @@ -2718,16 +2718,16 @@ <fixed-case>TE</fixed-case>a<fixed-case>R</fixed-case>: Improving <fixed-case>LLM</fixed-case>-based Machine Translation with Systematic Self-Refinement - ZhaopengFengZhejiang University + ZhaopengFengZhejiang University YanZhangTencent HaoLi BeiWu JiayuLiaoTencent NLP Speech WenqiangLiuTencent - JunLang + JunLang YangFeng JianWuZhejiang University - ZuozhuLiuZhejiang University + ZuozhuLiuZhejiang University 3922-3938 Large Language Models (LLMs) have achieved impressive results in Machine Translation (MT). However, human evaluations reveal that LLM-generated translations still contain various errors. Notably, feeding the error information back into the LLMs can facilitate self-refinement, leading to enhanced translation quality. Motivated by these findings, we introduce TEaR (Translate, Estimate, and Refine), a systematic LLM-based self-refinement framework aimed at bootstrapping translation performance. 
Our key results show that: 1) TEaR framework enables LLMs to improve their translation quality relying solely on self-feedback, measured by both automatic metrics and Multidimensional Quality Metrics (MQM) scores; 2) TEaR autonomously selects improvements, ensuring a robust translation quality baseline while outperforming both internal refinement and external feedback methods. Error analysis and iterative refinement experiments show its ability to continuously reduce translation errors and enhance overall translation quality. Our code and data are publicly available at https://github.com/fzp0424/self_correct_mt. 2025.findings-naacl.218 @@ -2736,9 +2736,9 @@ Vulnerability of Large Language Models to Output Prefix Jailbreaks: Impact of Positions on Safety YiweiWangUniversity of California, Merced - MuhaoChenUniversity of California, Davis and University of Southern California + MuhaoChenUniversity of California, Davis and University of Southern California NanyunPengUniversity of California, Los Angeles - Kai-WeiChangUniversity of California, Los Angeles and Amazon + Kai-WeiChangUniversity of California, Los Angeles and Amazon 3939-3952 Previous research on jailbreak attacks has mainly focused on optimizing the adversarial snippet content injected into input prompts to expose LLM security vulnerabilities. A significant portion of this research focuses on developing more complex, less readable adversarial snippets that can achieve higher attack success rates. In contrast to this trend, our research investigates the impact of the adversarial snippet’s position on the effectiveness of jailbreak attacks. We find that placing a simple and readable adversarial snippet at the beginning of the output effectively exposes LLM safety vulnerabilities, leading to much higher attack success rates than the input suffix attack or prompt-based output jailbreaks. Precisely speaking, we discover that directly enforcing the user’s target embedded output prefix is an effective method to expose LLMs’ safety vulnerabilities. 2025.findings-naacl.219 @@ -2746,7 +2746,7 @@ <fixed-case>I</fixed-case>ma<fixed-case>RA</fixed-case>: An Imaginative Frame Augmented Method for Low-Resource Multimodal Metaphor Detection and Explanation - YuanTian + YuanTian MinzhengWang NanXu WenjiMaoInstitute of Automation, Chinese Academy of Sciences @@ -2757,7 +2757,7 @@ <fixed-case>XAMPLER</fixed-case>: Learning to Retrieve Cross-Lingual In-Context Examples - PeiqinLinInstitut für Informatik + PeiqinLinInstitut für Informatik AndreMartinsInstituto Superior Técnico and Unbabel HinrichSchuetze 3968-3977 @@ -2783,13 +2783,13 @@ <fixed-case>GRAIT</fixed-case>: Gradient-Driven Refusal-Aware Instruction Tuning for Effective Hallucination Mitigation RunchuanZhu XinkeJiang - JiangWuShanghai Artificial Intelligence Laboratory + JiangWuShanghai Artificial Intelligence Laboratory ZhipengMa JiaheSong FengshuoBai DahuaLinThe Chinese University of Hong Kong - LijunWuShanghai Artificial Intelligence Laboratory - ConghuiHeShanghai AI Lab + LijunWuShanghai Artificial Intelligence Laboratory + ConghuiHeShanghai AI Lab 4006-4021 Refusal-Aware Instruction Tuning (RAIT) aims to enhance Large Language Models (LLMs) by improving their ability to refuse responses to questions beyond their knowledge, thereby reducing hallucinations and improving reliability. 
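On the TEaR entry above: the framework is literally a Translate, Estimate, Refine loop in which the model critiques and repairs its own draft. A minimal, hypothetical sketch with stubbed LLM calls; the MQM-style error record is an illustrative assumption.

def translate(src):
    return "draft translation of: " + src            # stub LLM translate call

def estimate(src, hyp):
    # Stub self-estimation: a real system prompts the same LLM for an
    # MQM-style list of error spans in its own draft.
    return [{"span": "draft ", "issue": "extraneous wording"}] if "draft" in hyp else []

def refine(src, hyp, errors):
    for err in errors:                               # stub targeted rewrite
        hyp = hyp.replace(err["span"], "")
    return hyp

def tear(src, max_rounds=3):
    hyp = translate(src)
    for _ in range(max_rounds):
        errors = estimate(src, hyp)
        if not errors:                               # stop once estimation is clean
            break
        hyp = refine(src, hyp, errors)
    return hyp

print(tear("Bonjour le monde"))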
Effective RAIT must address two key challenges: first, effectively rejecting unknown questions to minimize hallucinations; and second, avoiding over-refusal so that questions that can be correctly answered are not rejected, thereby maintaining the helpfulness of LLM outputs. In this paper, we address the two challenges by deriving insightful observations from a gradient-based perspective, and proposing the Gradient-driven Refusal Aware Instruction Tuning Framework GRAIT, which: (1) employs gradient-driven sample selection to effectively minimize hallucinations and (2) introduces an adaptive weighting mechanism during fine-tuning to reduce the risk of over-refusal, achieving a balance between accurate refusals and maintaining useful responses. Experimental evaluations on open-ended and multiple-choice question answering tasks demonstrate that GRAIT significantly outperforms existing RAIT methods in overall performance. The source code and data will be available at https://github.com/opendatalab/GRAIT. 2025.findings-naacl.223 @@ -2797,9 +2797,9 @@ Entity Pair-guided Relation Summarization and Retrieval in <fixed-case>LLM</fixed-case>s for Document-level Relation Extraction - FuZhangNortheastern University - HongsenYu - JingweiChengNortheastern University, China + FuZhangNortheastern University + HongsenYu + JingweiChengNortheastern University, China HuangmingXu 4022-4037 Document-level relation extraction (DocRE) aims to extract relations between entities in a document. While previous research has primarily focused on traditional small models, recent studies have extended the scope to large language models (LLMs). Current LLM-based methods typically focus on filtering all potential relations (candidate relations) within a document at one time and then performing triplet fact extraction. However, most approaches to candidate relation filtering operate at the document level, which results in insufficient correlation between candidate relations and entity pairs. In addition, the data imbalance problem caused by a large amount of no-relation data (the NA problem) is another important reason for the suboptimal performance of LLM-based methods. To address these issues, we propose an entity pair-guided relation summarization and retrieval model (EP-RSR) for DocRE, which introduces an innovative LLM-based document-level relation extraction paradigm, EPRF (Entity Pair-Relation-Fact), along with an entity pair-level candidate relation filtering method. Our approach first selects entity pairs that potentially contain relations and uses them to guide relation summarization and retrieval for extracting relation facts. This enhances the relevance between candidate relations and entity pairs while alleviating the issue of imbalanced NA data. Benchmark testing on three datasets demonstrates that our approach achieves state-of-the-art (SOTA) performance for LLM-based models. Our code is available at https://github.com/LookingYu/EP-RSR.
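On the EP-RSR entry above: its EPRF paradigm orders extraction as entity pair, then relation, then fact, filtering candidate relations per pair rather than per document. A minimal, hypothetical skeleton of that pipeline; the scoring and the example entities are invented for illustration.

def candidate_entity_pairs(entities):
    # Stub: a real model scores pairs and keeps only those likely to hold
    # some relation, shrinking the no-relation (NA) search space early.
    return [(h, t) for h in entities for t in entities if h != t]

def summarize_relations(pair, document):
    # Stub for the LLM call that summarises and retrieves the candidate
    # relations for one specific entity pair.
    return ["founded_by"] if pair == ("AnthCorp", "Ada") else []

def extract_facts(document, entities):
    facts = []
    for pair in candidate_entity_pairs(entities):            # Entity Pair
        for relation in summarize_relations(pair, document): # -> Relation
            facts.append((pair[0], relation, pair[1]))       # -> Fact
    return facts

doc = "AnthCorp was founded by Ada."
print(extract_facts(doc, ["AnthCorp", "Ada"]))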
@@ -2808,7 +2808,7 @@ A Recipe of Parallel Corpora Exploitation for Multilingual Large Language Models - PeiqinLinInstitut für Informatik + PeiqinLinInstitut für Informatik AndreMartinsInstituto Superior Técnico and Unbabel HinrichSchuetze 4038-4050 @@ -2819,14 +2819,14 @@ Omni-Chart-600<fixed-case>K</fixed-case>: A Comprehensive Dataset of Chart Types for Chart Understanding ShuleiWang - ShuaiYang + ShuaiYang WangLin ZirunGuo - SihangCaiCollege of Computer Science and Technology, Zhejiang University - HaiHuang + SihangCaiCollege of Computer Science and Technology, Zhejiang University + HaiHuang YeWang - JingyuanChenZhejiang University - TaoJinZhejiang University + JingyuanChenZhejiang University + TaoJinZhejiang University 4051-4069 To address the deficiencies in chart types and the limited scope of chart tasks in existing datasets, we conducted a comprehensive review of current data collection methodologies. By integrating manual annotation with data generation leveraging GPT-4, we developed a dataset that includes 21 diverse chart types and a broad spectrum of tasks, such as data retrieval and mathematical reasoning. Our analysis of existing models revealed that capabilities in information extraction, mathematical reasoning, and understanding of multiple chart types are essential for performing a variety of chart tasks. To overcome the limitations in these areas, we devised a two-stage training strategy and a method for jointly training the vision encoder tailored for multi-type charts. In the first stage, we designed several tasks to enhance the model’s general understanding of charts, aligning multimodal large models pre-trained on natural images to chart tasks. To further improve the model’s capability to understand various chart tasks and enhance its reasoning abilities, we employed Chain-of-Thought data for training in the second stage. Through two-stage training on our proposed dataset, the pre-trained multimodal large language model achieved state-of-the-art performance across multiple chart understanding tasks, demonstrating the superiority of our data and methods. 2025.findings-naacl.226 @@ -2849,8 +2849,8 @@ IuliiaZaitova VitaliiHirak Badr M.Abdullah - DietrichKlakow - BerndMöbiusUniversität des Saarlandes + DietrichKlakow + BerndMöbiusUniversität des Saarlandes TaniaAvgustinova 4083-4092 This study analyzes the attention patterns of fine-tuned encoder-only models based on the BERT architecture (BERT-based models) towards two distinct types of Multiword Expressions (MWEs): idioms and microsyntactic units (MSUs). Idioms present challenges in semantic non-compositionality, whereas MSUs demonstrate unconventional syntactic behavior that does not conform to standard grammatical categorizations. We aim to understand whether fine-tuning BERT-based models on specific tasks influences their attention to MWEs, and how this attention differs between semantic and syntactic tasks. We examine attention scores to MWEs in both pre-trained and fine-tuned BERT-based models. We utilize monolingual models and datasets in six Indo-European languages — English, German, Dutch, Polish, Russian, and Ukrainian. Our results show that fine-tuning significantly influences how models allocate attention to MWEs. Specifically, models fine-tuned on semantic tasks tend to distribute attention to idiomatic expressions more evenly across layers. Models fine-tuned on syntactic tasks show an increase in attention to MSUs in the lower layers, corresponding with syntactic processing requirements. 
@@ -2862,10 +2862,10 @@ JiweiTang JinXuTsinghua University TingweiLu - ZhichengZhang + ZhichengZhang YimingZhaoYimingZhao - LinHaiLinHaiTsinghua University, Tsinghua University - Hai-TaoZhengTsinghua University, Tsinghua University + LinHaiLinHaiTsinghua University, Tsinghua University + Hai-TaoZhengTsinghua University, Tsinghua University 4093-4108 Large language models (LLMs) demonstrate exceptional capabilities in various scenarios. However, they suffer from much redundant information and are sensitive to the position of key information in long context scenarios. To address these challenges, we present Perception Compressor, a training-free prompt compression framework. It includes a perception retriever that leverages guiding questions and instruction to retrieve the most relevant demonstrations, a dual-slope ratio allocator to dynamically allocate compression ratios and open-book ratios, and a semi-guided iterative compression that retains key information at the token level while removing tokens that distract the LLM. We conduct extensive experiments on long context benchmarks, i.e., NaturalQuestions, LongBench, and MuSiQue. Experiment results show that Perception Compressor outperforms existing methods by a large margin, achieving state-of-the-art performance. 2025.findings-naacl.229 @@ -2873,8 +2873,8 @@ <fixed-case>M</fixed-case>ojo<fixed-case>B</fixed-case>ench: Language Modeling and Benchmarks for Mojo - NishatRaihan - Joanna C. S.Santos + NishatRaihan + Joanna C. S.Santos MarcosZampieri 4109-4128 The recently introduced Mojo programming language (PL) by Modular, has received significant attention in the scientific community due to its claimed significant speed boost over Python. Despite advancements in code Large Language Models (LLMs) across various PLs, Mojo remains unexplored in this context. To address this gap, we introduce MojoBench, the first framework for Mojo code generation. MojoBench includes HumanEval-Mojo, a benchmark dataset designed for evaluating code LLMs on Mojo, and Mojo-Coder, the first LLM pretrained and finetuned for Mojo code generation, which supports instructions in 5 natural languages (NLs). Our results show that Mojo-Coder achieves a 30-35% performance improvement over leading models like GPT-4o and Claude-3.5-Sonnet. Furthermore, we provide insights into LLM behavior with underrepresented and unseen PLs, offering potential strategies for enhancing model adaptability. MojoBench contributes to our understanding of LLM capabilities and limitations in emerging programming paradigms fostering more robust code generation systems. @@ -2885,7 +2885,7 @@ <fixed-case>VL</fixed-case>ind-Bench: Measuring Language Priors in Large Vision-Language Models Kang-ilLeeSeoul National University MinbeomKim - SeunghyunYoonAdobe Research + SeunghyunYoonAdobe Research MinsungKim DongryeolLeeSeoul National University HyukhunKohSeoul National University @@ -2897,12 +2897,12 @@ <fixed-case>GRAG</fixed-case>: Graph Retrieval-Augmented Generation - YuntongHuEmory University + YuntongHuEmory University ZhihanLeiEmory University ZhengZhang - BoPan - ChenLing - LiangZhaoEmory University + BoPan + ChenLing + LiangZhaoEmory University 4145-4157 Naive Retrieval-Augmented Generation (RAG) focuses on individual documents during retrieval and, as a result, falls short in handling networked documents which are very popular in many applications such as citation graphs, social media, and knowledge graphs. 
To overcome this limitation, we introduce Graph Retrieval-Augmented Generation (GRAG), which tackles the fundamental challenges in retrieving textual subgraphs and integrating the joint textual and topological information into Large Language Models (LLMs) to enhance their generation. To enable efficient textual subgraph retrieval, we propose a novel divide-and-conquer strategy that retrieves the optimal subgraph structure in linear time. To achieve graph context-aware generation, we incorporate textual graphs into LLMs through two complementary views—the text view and the graph view—enabling LLMs to more effectively comprehend and utilize the graph context. Extensive experiments on graph reasoning benchmarks demonstrate that in scenarios requiring multi-hop reasoning on textual graphs, our GRAG approach significantly outperforms current state-of-the-art RAG methods. Our datasets as well as codes of GRAG are available at https://github.com/HuieL/GRAG. 2025.findings-naacl.232 @@ -2924,9 +2924,9 @@ Scaling Up Membership Inference: When and How Attacks Succeed on Large Language Models HaritzPuertoTU Darmstadt - MartinGubriParameter Lab + MartinGubriParameter Lab SangdooYunNAVER - Seong JoonOhParameter Lab and Eberhard-Karls-Universität Tübingen + Seong JoonOhParameter Lab and Eberhard-Karls-Universität Tübingen 4165-4182 Membership inference attacks (MIA) attempt to verify the membership of a given data sample in the training set for a model. MIA has become relevant in recent years, following the rapid development of large language models (LLMs). Many are concerned about the usage of copyrighted materials for training them and call for methods for detecting such usage. However, recent research has largely concluded that current MIA methods do not work on LLMs. Even when they seem to work, it is usually because of the ill-designed experimental setup where other shortcut features enable “cheating.” In this work, we argue that MIA still works on LLMs, but only when multiple documents are presented for testing. We construct new benchmarks that measure the MIA performances at a continuous scale of data samples, from sentences (n-grams) to a collection of documents (multiple chunks of tokens). To validate the efficacy of current MIA approaches at greater scales, we adapt a recent work on Dataset Inference (DI) for the task of binary membership detection that aggregates paragraph-level MIA features to enable document- and dataset-level MIA. This baseline achieves the first successful MIA on pre-trained and fine-tuned LLMs. 2025.findings-naacl.234 @@ -2946,11 +2946,11 @@ Exploring Hybrid Sampling Inference for Aspect-based Sentiment Analysis - XiaoyiBao + XiaoyiBao MinjieQiang - JinghangGuHong Kong Polytechnic University + JinghangGuHong Kong Polytechnic University ZhongqingWangSoochow University, China - Chu-RenHuang + Chu-RenHuang 4199-4210 As the training of large language models (LLMs) incurs high computational costs, many works now focus on inference. Their methods can be generally summarised as re-sampling the target multiple times and performing a vote upon the outputs. Despite bringing significant performance improvements, it is a high-cost method that requires multiple sampling with a preset size. In this paper, we propose a simple yet efficient inference strategy named __Hybrid Sampling__ that combines multiple and single sampling to greatly reduce the cost of multiple sampling without sacrificing performance.
__Hybrid Sampling__ dynamically chooses the essential part of the generated sequence for multiple sampling and processes the rest with single sampling, achieving a performance-cost balance. Extensive experiments on several benchmarks underscore the robustness and effectiveness of our proposed Hybrid Sampling; more importantly, it is much faster. 2025.findings-naacl.236 @@ -2958,10 +2958,10 @@ <fixed-case>F</fixed-case>e<fixed-case>RG</fixed-case>-<fixed-case>LLM</fixed-case> : Feature Engineering by Reason Generation Large Language Models - JeonghyunKo - GyeongyunParkKorea University - DonghoonLeeKorea University - KyunamLeeSK Telecom + JeonghyunKo + GyeongyunParkKorea University + DonghoonLeeKorea University + KyunamLeeSK Telecom 4211-4228 2025.findings-naacl.237 ko-etal-2025-ferg @@ -2969,7 +2969,7 @@ Effective Self-Mining of In-Context Examples for Unsupervised Machine Translation with <fixed-case>LLM</fixed-case>s AbdellahEl MekkiMohamed bin Zayed University of Artificial Intelligence - MuhammadAbdul-MageedUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia 4229-4256 Large Language Models (LLMs) have demonstrated impressive performance on a wide range of natural language processing (NLP) tasks, primarily through in-context learning (ICL). In ICL, the LLM is provided with examples that represent a given task such that it learns to generate answers for test inputs. However, access to these in-context examples is not guaranteed, especially for low-resource or massively multilingual tasks. In this work, we propose an unsupervised approach to mine in-context examples for machine translation (MT), enabling unsupervised MT (UMT) across different languages. Our approach begins with word-level mining to acquire word translations that are then used to perform sentence-level mining. As the quality of mined parallel pairs may not be optimal due to noise or mistakes, we introduce a filtering criterion to select the optimal in-context examples from a pool of unsupervised parallel sentences. We evaluate our approach using two multilingual LLMs on 288 directions from the FLORES-200 dataset (CITATION) and analyze the impact of various linguistic features on performance. Our findings demonstrate the effectiveness of our unsupervised approach in mining in-context examples for MT, leading to translation performance better than or comparable to translation with regular in-context samples (extracted from human-annotated data), while also outperforming the other state-of-the-art UMT methods by an average of 7 BLEU points. 2025.findings-naacl.238 @@ -2995,9 +2995,9 @@ <fixed-case>QP</fixed-case>runer: Probabilistic Decision Quantization for Structured Pruning in Large Language Models ChanghaiZhou YuhuaZhou - YibinWang + YibinWang ShijieHan - QianQiaoFudan University + QianQiaoFudan University HongguangLiJF SmartInvest Holdings 4276-4286 The rise of large language models (LLMs) has significantly advanced various natural language processing (NLP) tasks. However, the resource demands of these models pose substantial challenges. Structured pruning is an effective approach to reducing model size, but it often results in significant accuracy degradation, necessitating parameter updates to adapt. Unfortunately, such fine-tuning requires substantial memory, which limits its applicability. To address these challenges, we introduce quantization into the structured pruning framework to reduce memory consumption during both fine-tuning and inference.
However, the combined errors from pruning and quantization increase the difficulty of fine-tuning, requiring a more refined quantization scheme. To this end, we propose QPruner, a novel framework that employs structured pruning to reduce model size, followed by a layer-wise mixed-precision quantization scheme. Quantization precisions are assigned to each layer based on their importance to the target task, and Bayesian optimization is employed to refine precision allocation strategies, ensuring a balance between model accuracy and memory efficiency. Extensive experiments on benchmark datasets demonstrate that QPruner significantly outperforms existing methods in memory savings while maintaining or improving model performance. @@ -3011,7 +3011,7 @@ JingTang HuiminChenIndependent researcher WenboZhouUniversity of Science and Technology of China - WeimingZhangUniversity of Science and Technology of China + WeimingZhangUniversity of Science and Technology of China NenghaiYuUniversity of Science and Technology of China 4287-4298 Retrieval-Augmented Generation (RAG) improves Large Language Models (LLMs) by using external knowledge, but it struggles with precise entity information retrieval. Our proposed **MES-RAG** framework enhances entity-specific query handling and provides accurate, secure, and consistent responses. MES-RAG introduces proactive security measures that ensure system integrity by applying protections prior to data access. Additionally, the system supports real-time multi-modal outputs, including text, images, audio, and video, seamlessly integrating into existing RAG architectures. Experimental results demonstrate that MES-RAG significantly improves both accuracy and recall, highlighting its effectiveness in advancing the security and utility of question-answering, increasing accuracy to **0.83 (+0.25)** on the targeted task. Our code and data are available at https://github.com/wpydcr/MES-RAG. @@ -3020,9 +3020,9 @@ <fixed-case>LVP</fixed-case>runing: An Effective yet Simple Language-Guided Vision Token Pruning Approach for Multi-modal Large Language Models - YizhengSun + YizhengSun YanzeXin - HaoLiMicrosoft Research + HaoLiMicrosoft Research JingyuanSunUniversity of Manchester ChenghuaLinUniversity of Manchester RizaBatista-NavarroUniversity of Manchester @@ -3035,9 +3035,9 @@ How Much Knowledge Can You Pack into a <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> Adapter without Harming <fixed-case>LLM</fixed-case>? SergeyPletenev MariaMarina - DaniilMoskovskiy + DaniilMoskovskiy VasilyKonovalovAIRI - PavelBraslavskiNazarbayev University + PavelBraslavskiNazarbayev University AlexanderPanchenkoSkoltech MikhailSalnikovSkolkovo Institute of Science and Technology 4309-4322 @@ -3050,7 +3050,7 @@ XinyuanLunational university of singaore, National University of Singapore LiangmingPanUniversity of Arizona YuboMaSchool of Computer Science and Engineering, Nanyang Technological University - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + PreslavNakovMohamed bin Zayed University of Artificial Intelligence Min-YenKanNational University of Singapore 4323-4339 Current Large Language Models (LLMs) exhibit limited ability to understand table structures and to apply precise numerical reasoning, which is crucial for tasks such as table question answering and table-based fact verification. To address these challenges, we introduce our Tool-Augmented Reasoning framework for Tables (TART), which integrates LLMs with specialized tools.
TART contains three key components: a table formatter to ensure accurate data representation, a tool maker to develop specific computational tools, and an explanation generator to maintain explainability. We also present the TOOLTAB dataset, a new benchmark designed specifically for training LLMs in table–tool integration. Our experiments indicate that TART achieves substantial improvements over existing methods (e.g., Chain-of-Thought) by improving both the precision of data processing and the clarity of the reasoning process. Notably, TART paired with CodeLlama achieves 90.0% of the accuracy of the closed-source LLM GPT-3.5-turbo, highlighting its robustness in diverse real-world scenarios. Both code and data are openly available at https://github.com/XinyuanLu00/TART. @@ -3062,7 +3062,7 @@ ZhihuiShao ShubinCaiShenzhen University RongshengLin - ZhongMingShenzhen University + ZhongMingShenzhen University 4340-4349 Large Language Models (LLMs) have recently demonstrated remarkable performance in Text-to-SQL tasks. However, existing research primarily focuses on the optimization of prompts and improvements in workflow, while few studies have explored the questions themselves. In this paper, we propose a Text-to-SQL framework based on question classification and multi-agent collaboration (QCMA-SQL). Specifically, we first employ multiple cross-attention mechanisms to train a schema selector to classify questions and select the most suitable database schema. Subsequently, we employ the appropriate agents based on the varying difficulty levels of the questions to generate preliminary SQL queries. Moreover, we implement syntax validation and execution optimization steps to generate final SQL queries. Experimental results on the Spider dataset show that the QCMA-SQL framework achieves an execution accuracy of 87.4%, outperforming state-of-the-art methods. Through ablation studies, we find that classifying the questions ultimately leads to a 2.8% increase in execution accuracy. 2025.findings-naacl.245 @@ -3071,8 +3071,8 @@ Efficient Nearest Neighbor based Uncertainty Estimation for Natural Language Processing Tasks WataruHashimotoNara Institute of Science and Technology, Japan - HidetakaKamigaitoNara Institute of Science and Technology - TaroWatanabeNara Institute of Science and Technology, Japan + HidetakaKamigaitoNara Institute of Science and Technology + TaroWatanabeNara Institute of Science and Technology, Japan 4350-4366 Trustworthiness in model predictions is crucial for safety-critical applications in the real world. However, deep neural networks often suffer from issues in uncertainty estimation, such as miscalibration. In this study, we propose k-Nearest Neighbor Uncertainty Estimation (kNN-UE), which is a new uncertainty estimation method that uses not only the distances from the neighbors, but also the ratio of labels in the neighbors. Experiments on sentiment analysis, natural language inference, and named entity recognition show that our proposed method outperforms the baselines and recent density-based methods in several calibration and uncertainty metrics. Moreover, our analyses indicate that approximate nearest neighbor search techniques reduce the inference overhead without significantly degrading the uncertainty estimation performance when they are appropriately combined.
2025.findings-naacl.246 @@ -3082,8 +3082,8 @@ <fixed-case>B</fixed-case>it<fixed-case>A</fixed-case>buse: A Dataset of Visually Perturbed Texts for Defending Phishing Attacks HanyongLee ChaelynLee - YongjaeLeeRetrvr Inc. - JaesungLeeChung-Ang University and Chung-Ang University + YongjaeLeeRetrvr Inc. + JaesungLeeChung-Ang University and Chung-Ang University 4367-4384 Phishing often targets victims through visually perturbed texts to bypass security systems. The noise contained in these texts functions as an adversarial attack, designed to deceive language models and hinder their ability to accurately interpret the content. However, since it is difficult to obtain sufficient phishing cases, previous studies have used synthetic datasets that do not contain real-world cases. In this study, we propose the BitAbuse dataset, which includes real-world phishing cases, to address the limitations of previous research. Our dataset comprises a total of 325,580 visually perturbed texts. The dataset inputs are drawn from the raw corpus, consisting of visually perturbed sentences and sentences generated through an artificial perturbation process. Each input sentence is labeled with its corresponding ground truth, representing the restored, non-perturbed version. Language models trained on our proposed dataset demonstrated significantly better performance compared to previous methods, achieving an accuracy of approximately 96%. Our analysis revealed a significant gap between real-world and synthetic examples, underscoring the value of our dataset for building reliable pre-trained models for restoration tasks. We release the BitAbuse dataset, which includes real-world phishing cases annotated with visual perturbations, to support future research in adversarial attack defense. 2025.findings-naacl.247 @@ -3091,7 +3091,7 @@ Unfolding the Headline: Iterative Self-Questioning for News Retrieval and Timeline Summarization - WeiqiWu + WeiqiWu ShenHuangAlibaba Group YongJiangTongyi Lab PengjunXie @@ -3105,7 +3105,7 @@ <fixed-case>R</fixed-case>etriever<fixed-case>G</fixed-case>uard: Empowering Information Retrieval to Combat <fixed-case>LLM</fixed-case>-Generated Misinformation ChuwenChen - ShuaiZhangAmazon + ShuaiZhangAmazon 4399-4411 Large language models (LLMs) have demonstrated impressive capabilities in generating human-like text and have been shown to store factual knowledge within their extensive parameters. However, models like ChatGPT can still actively or passively generate false or misleading information, increasing the challenge of distinguishing between human-created and machine-generated content. This poses significant risks to the authenticity and reliability of digital communication. This work aims to enhance retrieval models’ ability to identify the authenticity of texts generated by large language models, with the goal of improving the truthfulness of retrieved texts and reducing the harm of false information in the era of large models. Our contributions include: (1) we construct a diverse dataset of authentic human-authored texts and highly deceptive AI-generated texts from various domains; (2) we propose a self-supervised training method, RetrieverGuard, that enables the model to capture textual rules and styles of false information from the corpus without human-labelled data, achieving higher accuracy and robustness in identifying misleading and highly deceptive AI-generated content. 
2025.findings-naacl.249 @@ -3118,8 +3118,8 @@ ChangSuChoi HanGyeolYoo HyeonSeokLimSeoul National University of Science and Technology - KyungTaeLimKorea Advanced Institute of Science & Technology - JungyeulParkThe University of British Columbia + KyungTaeLimKorea Advanced Institute of Science & Technology + JungyeulParkThe University of British Columbia 4412-4426 This study explores the integration of automated writing evaluation (AWE) and grammatical error correction (GEC) through multitask learning, demonstrating how combining these distinct tasks can enhance performance in both areas. By leveraging a shared learning framework, we show that models trained jointly on AWE and GEC outperform those trained on each task individually. To support this effort, we introduce a dataset specifically designed for multitask learning using AWE and GEC. Our experiments reveal significant synergies between tasks, leading to improvements in both writing assessment accuracy and error correction precision. This research represents a novel approach for optimizing language learning tools by unifying writing evaluation and correction tasks, offering insights into the potential of multitask learning in educational applications. 2025.findings-naacl.250 @@ -3131,7 +3131,7 @@ ZeyuHuang ZihanQiuAlibaba Group ZiliWang - JieFuShanghai Artificial Intelligence Laboratory + JieFuShanghai Artificial Intelligence Laboratory 4427-4447 Mixture-of-experts (MoE) is gaining increasing attention due to its unique properties and remarkable performance, especially for language tasks. By sparsely activating a subset of parameters for each token, MoE architecture could increase the model size without sacrificing computational efficiency, achieving a better trade-off between performance and training costs. However, the underlying mechanism of MoE still lacks further exploration, and its modularization degree remains questionable. In this paper, we make an initial attempt to understand the inner workings of MoE-based large language models. Concretely, we comprehensively study the parametric and behavioral features of four popular MoE-based models and reveal some intriguing observations, including 1) Neurons act like fine-grained experts; 2) The router of MoE usually selects experts with larger output norms; 3) The expert diversity increases as the layer increases, while the last layer is an outlier, which is further validated by an initial experiment. Based on the observations, we also provide suggestions for a broad spectrum of MoE practitioners, such as router design and expert allocation. We hope this work could shed light on future research on the MoE framework and other modular architectures. Code is available at https://github.com/kamanphoebe/Look-into-MoEs. 
2025.findings-naacl.251 @@ -3139,10 +3139,10 @@ <fixed-case>CDB</fixed-case>: A Unified Framework for Hope Speech Detection Through Counterfactual, Desire and Belief - Tulio Ferreira Leite DaSilvaUniversidade de São Paulo + Tulio Ferreira Leite DaSilvaUniversidade de São Paulo Gonzalo FreijedoAdunaEcole Normale Supérieure – PSL FarahBenamaraInstitut de recherche en informatique de toulouse - AldaMariCNRS + AldaMariCNRS ZongminLi LiYueInstitute for Infocomm Research, A*STAR JianSuA*STAR @@ -3156,10 +3156,10 @@ JiyueJiang PenganChenShanghai Artificial Intelligence Laboratory, University of Hong Kong and University of Hong Kong LihengChen - ShengWang + ShengWang QinghangBao LingpengKongDepartment of Computer Science, The University of Hong Kong - YuLiDepartment of Computer Science and Engineering, The Chinese University of Hong Kong + YuLiDepartment of Computer Science and Engineering, The Chinese University of Hong Kong ChuanWuThe University of Hong Kong 4464-4505 The rapid evolution of large language models (LLMs) has transformed the competitive landscape in natural language processing (NLP), particularly for English and other data-rich languages. However, underrepresented languages like Cantonese, spoken by over 85 million people, face significant development gaps, which is particularly concerning given the economic significance of the Guangdong-Hong Kong-Macau Greater Bay Area and the substantial Cantonese-speaking populations in places like Singapore and North America. Despite its wide use, Cantonese has scant representation in NLP research, especially compared to other languages from similarly developed regions. To bridge these gaps, we outline current Cantonese NLP methods and introduce new benchmarks designed to evaluate LLM performance in factual generation, mathematical logic, complex reasoning, and general knowledge in Cantonese, which aim to advance open-source Cantonese LLM technology. We also propose future research directions and recommended models to enhance Cantonese LLM development. @@ -3170,7 +3170,7 @@ Improving Reward Models with Synthetic Critiques ZihuiwenYe Fraser DavidGreenleeCohere - MaxBartoloCohere and University College London + MaxBartoloCohere and University College London PhilBlunsomGoogle, Department of Computer Science, University of Oxford and DeepMind Jon AnderCamposCohere MatthiasGalléCohere @@ -3184,12 +3184,12 @@ YuanyiWang HanLi HaifengSunBeijing University of Posts and Telecommunications, Beijing University of Posts and Telecommunications and Beijing University of Posts and Telecommunications - LeiZhangChina Unicom Network Communications Co., Ltd. - BoHe - WeiTang - TianhaoYan + LeiZhangChina Unicom Network Communications Co., Ltd. + BoHe + WeiTang + TianhaoYan QiQiBeijing University of Posts and Telecommunications - JingyuWang + JingyuWang 4521-4535 Entity alignment (EA) is crucial for integrating multi-source knowledge graphs (KGs), aiming to identify equivalent entities across different graphs. However, most existing EA decoding methods rely on both entity and relation embeddings, limiting their generalizability and efficiency, especially in GNN-based models. To address these challenges, we propose Triple Feature Propagation (TFP), an adaptable and fast EA decoding framework that only utilizes entity embeddings. TFP reconstructs KG representation by maximizing the smoothness of entity embeddings. The discretized smoothness-maximization process yields the explicit Euler solution of TFP.
We also generalize multi-view matrices: entity-to-entity, entity-to-relation, relation-to-entity, and relation-to-triple, to capture structural diversity. Extensive experiments on public datasets demonstrate that TFP is fast and adaptable to various encoders, achieving comparable results to state-of-the-art methods in under 6 seconds, and surpassing them in many cases. 2025.findings-naacl.255 @@ -3199,7 +3199,7 @@ Lost in the Distance: Large Language Models Struggle to Capture Long-Distance Relational Knowledge MeiyunWang TakeshiKojimaThe University of Tokyo - YusukeIwasawaThe University of Tokyo, The University of Tokyo + YusukeIwasawaThe University of Tokyo, The University of Tokyo YutakaMatsuoThe University of Tokyo and The University of Tokyo 4536-4544 Large language models (LLMs) have demonstrated impressive capabilities in handling long contexts, but challenges remain in capturing relational knowledge spread far apart within text. Connecting long-distance knowledge is important for solving tasks as the context length increases: imagine reading a lengthy detective novel where seemingly trivial information introduced early on often becomes essential during the climactic reveal of the culprit. In this study, we expose the “Lost in the Distance” phenomenon, where LLM performance in capturing relational knowledge degrades significantly when the relational knowledge is separated by noise, i.e., sentences unrelated to the task. Specifically, we design an experiment in which we insert artificial noise between two related elements and observe model performance as the distance between them increases. Our findings show that while LLMs can handle edge noise with little impact, their ability to reason about distant relationships declines sharply as the intervening noise grows. These findings are consistent in both forward-looking prediction and backward-looking prediction settings. We validate this across various models (GPT-4, Gemini-1.5-pro, GPT-4o-mini, Gemini-1.5-flash, Claude-3.5-Sonnet) and tasks (causal reasoning and knowledge extraction). These results reveal a significant limitation in how LLMs process relational knowledge over long contexts. We release our code and data to support further research. @@ -3209,10 +3209,10 @@ <fixed-case>F</fixed-case>in<fixed-case>NLI</fixed-case>: Novel Dataset for Multi-Genre Financial Natural Language Inference Benchmarking JabezMagomere - ElenaKochkinaJ.P. Morgan Chase - SamuelMensahJ.P. Morgan Chase - SimerjotKaurJPMorgan Chase and Co - ChareseSmileyJ.P. Morgan Chase + ElenaKochkinaJ.P. Morgan Chase + SamuelMensahJ.P. Morgan Chase + SimerjotKaurJPMorgan Chase and Co + ChareseSmileyJ.P. Morgan Chase 4545-4568 We introduce FinNLI, a benchmark dataset for Financial Natural Language Inference (FinNLI) across diverse financial texts like SEC Filings, Annual Reports, and Earnings Call transcripts. Our dataset framework ensures diverse premise-hypothesis pairs while minimizing spurious correlations. FinNLI comprises 21,304 pairs, including a high-quality test set of 3,304 instances annotated by finance experts. Evaluations show that domain shift significantly degrades general-domain NLI performance. The highest Macro F1 scores for pre-trained language model (PLM) and large language model (LLM) baselines are 74.57% and 78.62%, respectively, highlighting the dataset’s difficulty. Surprisingly, instruction-tuned financial LLMs perform poorly, suggesting limited generalizability.
FinNLI exposes weaknesses in current LLMs for financial reasoning, indicating room for improvement. 2025.findings-naacl.257 @@ -3220,10 +3220,10 @@ Music for All: Representational Bias and Cross-Cultural Adaptability of Music Generation Models - AtharvaMehtaMohamed bin Zayed University of Artificial Intelligence - ShivamChauhanMohamed bin Zayed University of Artificial Intelligence - AmirbekDjanibekov - AtharvaKulkarni + AtharvaMehtaMohamed bin Zayed University of Artificial Intelligence + ShivamChauhanMohamed bin Zayed University of Artificial Intelligence + AmirbekDjanibekov + AtharvaKulkarni GusXiaNew York University MonojitChoudhuryMohamed bin Zayed University of Artificial Intelligence 4569-4585 @@ -3235,7 +3235,7 @@ <fixed-case>SFMSS</fixed-case>: Service Flow aware Medical Scenario Simulation for Conversational Data Generation ZhijieBao QingyunLiu - XuanjingHuangFudan University + XuanjingHuangFudan University ZhongyuWeiFudan University 4586-4604 Medical-specific Large Language Models (LLMs) have demonstrated impressive performance on medical-related exams and tasks. Despite their success in single-turn question answering, instruction-tuned LLMs often falter in real-world healthcare applications, highlighting a disconnect between existing instruction datasets and practical contexts. To address this issue, we propose Service Flow aware Medical Scenario Simulation (SFMSS), a simulation framework designed for medical conversational data generation. SFMSS employs three key strategies to ensure the quality of the data generation. The use of Authentic Seed Data ensures alignment with real-world distributions. Diverse Patient Simulation enables simulated patients to exhibit distinct communication styles and complex behavioral logic. Service Flow Control ensures that conversations progress in alignment with medical objectives. We construct a dataset targeting outpatient reception through SFMSS, named SFMSS-CD. Building on this dataset, we develop a model called SFMSS-Nurse. We conduct both automatic and human evaluations, involving 15 users and 15 clinical experts, to assess the effectiveness of SFMSS. The results demonstrate that SFMSS-Nurse outperforms all baselines, including the current state-of-the-art model GPT-4o, and aligns with human preferences and clinical demands.
@@ -3246,7 +3246,7 @@ Re-evaluating Automatic <fixed-case>LLM</fixed-case> System Ranking for Alignment with Human Preference MingqiGao YixinLiuYale University - XinyuHuPeking University + XinyuHuPeking University XiaojunWan JonathanBraggAllen Institute for Artificial Intelligence ArmanCohanYale University and Allen Institute for Artificial Intelligence @@ -3257,7 +3257,7 @@ <fixed-case>G</fixed-case>uide<fixed-case>Q</fixed-case>: Framework for Guided Questioning for progressive informational collection and classification - PriyaMishraIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology + PriyaMishraIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology SurajRachaIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology KaustubhPonkshe AditAkarsh @@ -3283,7 +3283,7 @@ El Moatez BillahNagoudiUniversity of British Columbia AbdellahEl MekkiMohamed bin Zayed University of Artificial Intelligence FakhraddinAlwajih - MuhammadAbdul-MageedUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia 4654-4670 In this paper, we introduce Swan, a family of embedding models centred around the Arabic language, addressing both small-scale and large-scale use cases. Swan includes two variants: Swan-Small, based on ARBERTv2, and Swan-Large, built on ArMistral, a pretrained Arabic large language model. To evaluate these models, we propose ArabicMTEB, a comprehensive benchmark suite that assesses cross-lingual, multi-dialectal, multi-domain, and multi-cultural Arabic text embedding performance, covering eight diverse tasks and spanning 94 datasets. Swan-Large achieves state-of-the-art results, outperforming Multilingual-E5-large in most Arabic tasks, while Swan-Small consistently surpasses Multilingual-E5-base. Our extensive evaluations demonstrate that Swan models are dialectally and culturally aware, excelling across various Arabic domains while offering significant monetary efficiency. This work significantly advances the field of Arabic language modelling and provides valuable resources for future research and applications in Arabic natural language processing. Our models and benchmarks will be made publicly accessible for research. 2025.findings-naacl.263 @@ -3294,9 +3294,9 @@ JipengZhang YaxuanQin RenjiePi - WeizhongZhangFudan University - RuiPanUniversity of Illinois at Urbana-Champaign - TongZhangUIUC + WeizhongZhangFudan University + RuiPanUniversity of Illinois at Urbana-Champaign + TongZhangUIUC 4671-4686 Instruction tuning has achieved unprecedented success in NLP, turning large language models into versatile chatbots. However, the increasing variety and volume of instruction datasets demand significant computational resources. To address this, it is essential to extract a small and highly informative subset (i.e., Coreset) that achieves comparable performance to the full dataset. Achieving this goal poses non-trivial challenges: 1) data selection requires accurate data representations that reflect the training samples’ quality, 2) the diverse nature of instruction datasets must be considered, and 3) the coreset selection algorithm must be efficient for large models. To address these challenges, we propose Task-Agnostic Gradient Clustered COreset Selection (TAGCOS).
Specifically, we leverage sample gradients as the data representations, perform clustering to group similar data, and apply an efficient greedy algorithm for coreset selection. Experimental results show that our algorithm, selecting only 5% of the data, surpasses other unsupervised methods and achieves performance close to that of the full dataset. 2025.findings-naacl.264 @@ -3304,8 +3304,8 @@ From Text to Emoji: How <fixed-case>PEFT</fixed-case>-Driven Personality Manipulation Unleashes the Emoji Potential in <fixed-case>LLM</fixed-case>s - NavyaJain - ZekunWuDepartment of Computer Science, University College London, University of London and Holistic AI + NavyaJain + ZekunWuDepartment of Computer Science, University College London, University of London and Holistic AI Cristian Enrique MunozVillalobos AirlieHilliard XinGuanHolistic AI @@ -3319,9 +3319,9 @@ Decoding Fatphobia: Examining Anti-Fat and Pro-Thin Bias in <fixed-case>AI</fixed-case>-Generated Images - JaneWarren - Gary M.WeissFordham University - FernandoMartinez + JaneWarren + Gary M.WeissFordham University + FernandoMartinez AnnikaGuo YijunZhaoFordham University 4724-4736 @@ -3335,7 +3335,7 @@ HaopingBaiApple ShuangMaApple FengNan - YanchaoSunApple AI/ML + YanchaoSunApple AI/ML ZhaoyangXuApple ShenMaApple JiaruiLuApple @@ -3349,7 +3349,7 @@ DominicWalshApple TobiasGindeleApple JuergenWiestApple - ZhengfengLaiApple + ZhengfengLaiApple Xiaoming SimonWangDidi Research US JiulongShanApple MengCaoApple @@ -3363,7 +3363,7 @@ Improving Consistency in <fixed-case>LLM</fixed-case> Inference using Probabilistic Tokenization AshutoshSathe - DivyanshuAggarwal + DivyanshuAggarwal SunayanaSitaramMicrosoft 4766-4778 Prior research has demonstrated noticeable performance gains through the use of probabilistic tokenizations, an approach that involves employing multiple tokenizations of the same input string during the training phase of a language model. Despite these promising findings, modern large language models (LLMs) have yet to be trained using probabilistic tokenizations. Interestingly, while the tokenizers of these contemporary LLMs have the capability to generate multiple tokenizations, this property remains underutilized. In this work, we propose a novel method to leverage the multiple tokenization capabilities of modern LLM tokenizers, aiming to enhance the self-consistency of LLMs in reasoning tasks. Our experiments indicate that when utilizing probabilistic tokenizations, LLMs generate logically diverse reasoning paths, moving beyond mere surface-level linguistic diversity. We carefully study probabilistic tokenization and offer insights to explain the self-consistency improvements it brings through extensive experimentation on 5 LLM families and 4 reasoning benchmarks.
@@ -3375,7 +3375,7 @@ TianrongZhangPennsylvania State University BochuanCao YuanpuCaoPennsylvania State University - LuLinPennsylvania State University + LuLinPennsylvania State University PrasenjitMitraCarnegie Mellon University JinghuiChenPennsylvania State University 4779-4807 @@ -3386,7 +3386,7 @@ Human and <fixed-case>LLM</fixed-case>-Based Resume Matching: An Observational Study SwanandVaishampayan - HunterLearyVirginia Polytechnic Institute and State University + HunterLearyVirginia Polytechnic Institute and State University Yoseph BerhanuAlebachew LouisHickmanVirginia Polytechnic Institute and State University Brent A.StevenorNREMT @@ -3401,7 +3401,7 @@ A Practical Examination of <fixed-case>AI</fixed-case>-Generated Text Detectors for Large Language Models BrianTufts XuandongZhaoUniversity of California, Berkeley - LeiLiSchool of Computer Science, Carnegie Mellon University + LeiLiSchool of Computer Science, Carnegie Mellon University 4824-4841 The proliferation of large language models has raised growing concerns about their misuse, particularly in cases where AI-generated text is falsely attributed to human authors. Machine-generated content detectors claim to effectively identify such text under various conditions and from any language model. This paper critically evaluates these claims by assessing several popular detectors (RADAR, Wild, T5Sentinel, Fast-DetectGPT, PHD, LogRank, Binoculars) on a range of domains, datasets, and models that these detectors have not previously encountered. We employ various prompting strategies to simulate practical adversarial attacks, demonstrating that even moderate efforts can significantly evade detection. We emphasize the importance of the true positive rate at a specific false positive rate (TPR@FPR) metric and demonstrate that these detectors perform poorly in certain settings, with TPR@.01 as low as 0%. Our findings suggest that both trained and zero-shot detectors struggle to maintain high sensitivity while achieving a reasonable true positive rate. 2025.findings-naacl.271 @@ -3410,8 +3410,8 @@ Robust Bias Detection in <fixed-case>MLM</fixed-case>s and its Application to Human Trait Ratings IngrojShrestha - LouisTayPurdue University - PadminiSrinivasanUniversity of Iowa + LouisTayPurdue University + PadminiSrinivasanUniversity of Iowa 4842-4858 There has been significant prior work using templates to study bias against demographic attributes in MLMs. However, these have limitations: they overlook random variability of templates and target concepts analyzed, assume equality amongst templates, and overlook bias quantification. Addressing these, we propose a systematic statistical approach to assess bias in MLMs, using mixed models to account for random effects and pseudo-perplexity weights for sentences derived from templates, and quantifying bias using statistical effect sizes. Replicating prior studies, we match on bias scores in magnitude and direction with small to medium effect sizes. Next, we explore the novel problem of gender bias in the context of *personality* and *character* traits, across seven MLMs (base and large). We find that MLMs vary; ALBERT is unbiased for binary gender but the most biased for non-binary *neo*, while RoBERTa-large is the most biased for binary gender but shows small to no bias for *neo*. There is some alignment of MLM bias and findings in psychology (human perspective) - in *agreeableness* with RoBERTa-large and *emotional stability* with BERT-large.
There is general agreement for the remaining 3 personality dimensions: both sides observe at most small differences across gender. For character traits, human studies on gender bias are limited, so comparisons are not feasible. 2025.findings-naacl.272 @@ -3420,8 +3420,8 @@ How Inclusively do <fixed-case>LM</fixed-case>s Perceive Social and Moral Norms? MichaelGalarnyk - AgamShah - DipanwitaGuhathakurta + AgamShah + DipanwitaGuhathakurta PoojithaNandigam SudheerChavaGeorgia Institute of Technology 4859-4869 @@ -3441,11 +3441,11 @@ Echoes of Discord: Forecasting Hater Reactions to Counterspeech - XiaoyingSong + XiaoyingSong Sharon LissethPerez - XinchenYuUniversity of Arizona + XinchenYuUniversity of Arizona EduardoBlancoUniversity of Arizona - LingziHongUniversity of North Texas + LingziHongUniversity of North Texas 4892-4905 Hate speech (HS) erodes the inclusiveness of online users and propagates negativity and division. Counterspeech has been recognized as a way to mitigate the harmful consequences. While some research has investigated the impact of user-generated counterspeech on social media platforms, few have examined and modeled haters’ reactions toward counterspeech, despite the immediate alteration of haters’ attitudes being an important aspect of counterspeech. This study fills the gap by analyzing the impact of counterspeech from the hater’s perspective, focusing on whether the counterspeech leads the hater to reenter the conversation and if the reentry is hateful. We compile the Reddit Echoes of Hate dataset (ReEco), which consists of triple-turn conversations featuring haters’ reactions, to assess the impact of counterspeech. To predict haters’ behaviors, we employ two strategies: a two-stage reaction predictor and a three-way classifier. The linguistic analysis sheds light on how the language of counterspeech to hate elicits different reactions from haters. Experimental results demonstrate that the three-way classification model outperforms the two-stage reaction predictor, which first predicts reentry and then determines the reentry type. We conclude the study with an assessment showing the most common errors identified by the best-performing model. 2025.findings-naacl.275 @@ -3454,7 +3454,7 @@ Contextual Metric Meta-Evaluation by Measuring Local Metric Accuracy AthiyaDeviyaniSchool of Computer Science, Carnegie Mellon University - FernandoDiazCarnegie Mellon University and Google + FernandoDiazCarnegie Mellon University and Google 4906-4925 Meta-evaluation of automatic evaluation metrics—assessing evaluation metrics themselves—is crucial for accurately benchmarking natural language processing systems and has implications for scientific inquiry, production model development, and policy enforcement. While existing approaches to metric meta-evaluation focus on general statements about the absolute and relative quality of metrics across arbitrary system outputs, in practice, metrics are applied in highly contextual settings, often measuring the performance for a highly constrained set of system outputs. For example, we may only be interested in evaluating a specific model or class of models. We introduce a method for contextual metric meta-evaluation by comparing the local metric accuracy of evaluation metrics. Across translation, speech recognition, and ranking tasks, we demonstrate that the local metric accuracies vary both in absolute value and relative effectiveness as we shift across evaluation contexts.
This observed variation highlights the importance of adopting context-specific metric evaluations over global ones. 2025.findings-naacl.276 @@ -3484,17 +3484,17 @@ <fixed-case>B</fixed-case>n<fixed-case>TTS</fixed-case>: Few-Shot Speaker Adaptation in Low-Resource Setting - Mohammad Jahid IbnaBasherChittagong University of Engineering and Technology + Mohammad Jahid IbnaBasherChittagong University of Engineering and Technology MdKowsherUniversity of Central Florida Md SaifulIslam Rabindra NathNandiHishab Singapure Pte. Ltd Nusrat JahanProttasha Mehadi HasanMenon Tareq AlMuntasir - Shammur AbsarChowdhuryQatar Computing Research Institute - FirojAlamQatar Computing Research Institute + Shammur AbsarChowdhuryQatar Computing Research Institute + FirojAlamQatar Computing Research Institute NiloofarYousefiUniversity of Central Florida - OzlemGaribayUniversity of Central Florida + OzlemGaribayUniversity of Central Florida 4956-4968 This paper introduces BnTTS (Bangla Text-To-Speech), the first framework for Bangla speaker adaptation-based TTS, designed to bridge the gap in Bangla speech synthesis using minimal training data. Building upon the XTTS architecture, our approach integrates Bangla into a multilingual TTS pipeline, with modifications to account for the phonetic and linguistic characteristics of the language. We pretrain BnTTS on a 3.85k-hour Bangla speech dataset with corresponding text labels and evaluate performance in both zero-shot and few-shot settings on our proposed test dataset. Empirical evaluations in few-shot settings show that BnTTS significantly improves the naturalness, intelligibility, and speaker fidelity of synthesized Bangla speech. Compared to state-of-the-art Bangla TTS systems, BnTTS exhibits superior performance in Subjective Mean Opinion Score (SMOS), Naturalness, and Clarity metrics. 2025.findings-naacl.279 @@ -3502,8 +3502,8 @@ Playing with Voices: Tabletop Role-Playing Game Recordings as a Diarization Challenge - LianRemme - KevinTangHeinrich Heine University Düsseldorf and University of Florida + LianRemme + KevinTangHeinrich Heine University Düsseldorf and University of Florida 4969-4983 This paper provides a proof of concept that audio of tabletop role-playing games (TTRPG) could serve as a challenge for diarization systems. TTRPGs are carried out mostly by conversation. Participants often alter their voices to indicate that they are talking as a fictional character. Audio processing systems are susceptible to voice conversion with or without technological assistance. TTRPGs present a conversational phenomenon in which voice conversion is an inherent characteristic for an immersive gaming experience. This could make it more challenging for diarizers to pick the real speaker and determine that impersonating is just that. We present the creation of a small TTRPG audio dataset and compare it against the AMI and the ICSI corpus. The performance of two diarizers, pyannote.audio and wespeaker, was evaluated. We observed that TTRPGs’ properties result in a higher confusion rate for both diarizers. Additionally, wespeaker strongly underestimates the number of speakers in the TTRPG audio files. We propose TTRPG audio as a promising challenge for diarization systems.
2025.findings-naacl.280 @@ -3514,8 +3514,8 @@ YuenChen Vethavikashini ChithrraRaghuramCCC Intelligent Solutions JustusMatternDepartment of Computer Science, ETHZ - ETH Zurich and Rheinisch Westfälische Technische Hochschule Aachen - RadaMihalceaUniversity of Michigan - ZhijingJinDepartment of Computer Science, University of Toronto + RadaMihalceaUniversity of Michigan + ZhijingJinDepartment of Computer Science, University of Toronto 4984-5004 Generated texts from large language models (LLMs) have been shown to exhibit a variety of harmful, human-like biases against various demographics. These findings motivate research efforts aiming to understand and measure such effects. This paper introduces a causal formulation for bias measurement in generative language models. Based on this theoretical foundation, we outline a list of desiderata for designing robust bias benchmarks. We then propose a benchmark called OccuGender, with a bias-measuring procedure to investigate occupational gender bias. We test several state-of-the-art open-source LLMs on OccuGender, including Llama, Mistral, and their instruction-tuned versions. The results show that these models exhibit substantial occupational gender bias. Lastly, we discuss prompting strategies for bias mitigation and an extension of our causal formulation to illustrate the generalizability of our framework. 2025.findings-naacl.281 @@ -3524,7 +3524,7 @@ <fixed-case>OLMES</fixed-case>: A Standard for Language Model Evaluations YulingGuAllen Institute for Artificial Intelligence - OyvindTafjordAllen Institute for Artificial Intelligence + OyvindTafjordAllen Institute for Artificial Intelligence BaileyKuehl DanyHaddad JesseDodgeAllen Institute for Artificial Intelligence @@ -3545,7 +3545,7 @@ <fixed-case>M</fixed-case>o<fixed-case>LA</fixed-case>: <fixed-case>M</fixed-case>o<fixed-case>E</fixed-case> <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> with Layer-wise Expert Allocation - ChongyangGao + ChongyangGao KezhenChenTogether AI JinmengRaoGoogle DeepMind RuiboLiuGoogle DeepMind @@ -3553,7 +3553,7 @@ YawenZhangGoogle X, Mineral.ai DaiyiPeng XiaoyuanGuoMineral.ai - VsSubrahmanianNorthwestern University + VsSubrahmanianNorthwestern University 5097-5112 Recent efforts to integrate low-rank adaptation (LoRA) with the Mixture-of-Experts (MoE) have managed to achieve performance comparable to full-parameter fine-tuning by tuning much fewer parameters. Despite promising results, research on improving the efficiency and expert analysis of LoRA with MoE is still in its early stages. Recent studies have shown that experts in the MoE architecture have different strengths and also exhibit some redundancy. Does this statement also apply to parameter-efficient MoE? In this paper, we introduce a novel parameter-efficient MoE method, MoE-LoRA with Layer-wise Expert Allocation (MoLA) for Transformer-based models, where each model layer uses a varying number of LoRA experts. We investigate several architectures with varying layer-wise expert configurations. Experiments on six well-known NLP and commonsense QA benchmarks demonstrate that MoLA achieves equal or superior performance compared to all baselines on top of LLAMA-2, Mistral, and Gemma. We find that allocating more LoRA experts to middle layers further enhances the effectiveness of models with a certain number of experts in total. The redundancy of the experts is more obvious in the lower layers.
With much fewer parameters, this allocation strategy outperforms the setting with the same number of experts in every layer. This work can be widely used as a plug-and-play parameter-efficient tuning approach for various applications. The code has been made available at https://github.com/GCYZSL/MoLA. @@ -3562,9 +3562,9 @@ <fixed-case>C</fixed-case>ode<fixed-case>S</fixed-case>im: Multi-Agent Code Generation and Problem Solving through Simulation-Driven Planning and Debugging - Md. AshrafulIslamBangladesh University of Engineering and Technology + Md. AshrafulIslamBangladesh University of Engineering and Technology Mohammed EunusAliBangladesh University of Engineering and Technology - Md RizwanParvezQatar Computing Research Institute + Md RizwanParvezQatar Computing Research Institute 5113-5139 2025.findings-naacl.285 islam-etal-2025-codesim @@ -3582,9 +3582,9 @@ Evaluation of Multilingual Image Captioning: How far can we get with <fixed-case>CLIP</fixed-case> models? - Goncalo Emanuel CavacoGomes - ChrysoulaZervaInstituto Superior Técnico - BrunoMartinsInstituto Superior Técnico + Goncalo Emanuel CavacoGomes + ChrysoulaZervaInstituto Superior Técnico + BrunoMartinsInstituto Superior Técnico 5156-5175 The evaluation of image captions, looking at both linguistic fluency and semantic correspondence to visual contents, has witnessed a significant effort. Still, despite advancements such as the CLIPScore metric, multilingual captioning evaluation has remained relatively unexplored. This work presents several strategies, and extensive experiments, related to evaluating CLIPScore variants in multilingual settings. To address the lack of multilingual test data, we consider two different strategies: (1) using quality aware machine-translated datasets with human judgements, and (2) re-purposing multilingual datasets that target semantic inference and reasoning. Our results highlight the potential of finetuned multilingual models to generalize across languages and to handle complex linguistic challenges. Tests with machine-translated data show that multilingual CLIPScore models can maintain a high correlation with human judgements across different languages, and additional tests with natively multilingual and multicultural data further attest to the high-quality assessments. 2025.findings-naacl.287 @@ -3593,7 +3593,7 @@ Avoiding Copyright Infringement via Large Language Model Unlearning GuangyaoDou - ZheyuanLiuUniversity of Notre Dame + ZheyuanLiuUniversity of Notre Dame QingLyuUniversity of Pennsylvania KaizeDingNorthwestern University EricWongUniversity of Pennsylvania @@ -3606,8 +3606,8 @@ A Context-Aware Contrastive Learning Framework for Hateful Meme Detection and Segmentation XuanyuSu YansongLi - DianaInkpenUniversity of Ottawa - NathalieJapkowiczAmerican University + DianaInkpenUniversity of Ottawa + NathalieJapkowiczAmerican University 5201-5215 Amidst the rise of Large Multimodal Models (LMMs) and their widespread application in generating and interpreting complex content, the risk of propagating biased and harmful memes remains significant. Current safety measures often fail to detect subtly integrated hateful content within “Confounder Memes”. To address this, we introduce HateSieve, a new framework designed to enhance the detection and segmentation of hateful elements in memes. 
HateSieve features a novel Contrastive Meme Generator that creates semantically correlated memes, a customized triplet dataset for contrastive learning, and an Image-Text Alignment module that produces context-aware embeddings for accurate meme segmentation. Empirical experiments show that HateSieve not only surpasses existing LMMs in performance with fewer trainable parameters but also offers a robust mechanism for precisely identifying and isolating hateful content. Caution: Contains academic discussions of hate speech; viewer discretion advised. 2025.findings-naacl.289 @@ -3627,7 +3627,7 @@ Does Data Contamination Detection Work (Well) for <fixed-case>LLM</fixed-case>s? A Survey and Evaluation on Detection Assumptions - YujuanFu + YujuanFu OzlemUzunerGeorge Mason University MelihaYetisgenUniversity of Washington FeiXiaUniversity of Washington, Seattle @@ -3640,7 +3640,7 @@ Representation-to-Creativity (<fixed-case>R</fixed-case>2<fixed-case>C</fixed-case>): Automated Holistic Scoring Model for Essay Creativity DeokgiKim JoonyoungJo - Byung-WonOnKunsan National University + Byung-WonOnKunsan National University IngyuLeeYeungnam University 5257-5275 Despite active research on Automated Essay Scoring (AES), there is a noticeable scarcity of studies focusing on predicting creativity scores for essays. In this study, we develop a new essay rubric specifically designed for assessing creativity in essays. Leveraging this rubric, we construct ground truth data consisting of 5,048 essays. Furthermore, we propose a novel self-supervised learning model that recognizes cluster patterns within the essay embedding space and leverages them for creativity scoring. This approach aims to automatically generate a high-quality training set, thereby facilitating the training of diverse language models. Our experimental findings indicated a substantial enhancement in the assessment of essay creativity, demonstrating an increase in F1-score of up to 58% compared to the primary state-of-the-art models across the ASAP and AIHUB datasets. @@ -3651,9 +3651,9 @@ From Single to Multi: How <fixed-case>LLM</fixed-case>s Hallucinate in Multi-Document Summarization Catarina GBelémUniversity of California, Irvine PouyaPezeshkpourMegagon Labs - HayateIsoMegagon Labs, US + HayateIsoMegagon Labs, US SeijiMaekawaMegagon Labs, US - NikitaBhutaniMegagon Labs, Inc + NikitaBhutaniMegagon Labs, Inc EstevamHruschkaMegagon Labs, Megagon Labs and Carnegie Mellon University 5276-5309 Although many studies have investigated and reduced hallucinations in large language models (LLMs) for single-document tasks, research on hallucination in multi-document summarization (MDS) tasks remains largely unexplored. Specifically, it is unclear how the challenges arising from handling multiple documents (e.g., repetition and diversity of information) affect models’ outputs. In this work, we investigate how hallucinations manifest in LLMs when summarizing topic-specific information from a set of documents. Since no benchmarks exist for investigating hallucinations in MDS, we leverage existing news and conversation datasets, annotated with topic-specific insights, to create two novel multi-document benchmarks. When evaluating 5 LLMs on our benchmarks, we observe that on average, up to 75% of the content in LLM-generated summaries is hallucinated, with hallucinations more likely to occur towards the end of the summaries.
Moreover, when summarizing non-existent topic-related information, GPT-3.5-turbo and GPT-4o still generate summaries about 79.45% and 44% of the time, raising concerns about their tendency to fabricate content. To better understand the characteristics of these hallucinations, we conduct a human evaluation of 700+ insights and discover that most errors stem from either failing to follow instructions or producing overly generic insights. Motivated by these observations, we investigate the efficacy of simple post-hoc baselines in mitigating hallucinations but find them only moderately effective. Our results underscore the need for more effective approaches that systematically mitigate hallucinations in MDS. @@ -3678,12 +3678,12 @@ Where is this coming from? Making groundedness count in the evaluation of Document <fixed-case>VQA</fixed-case> models - ArminehNourbakhshSchool of Computer Science, Carnegie Mellon University and J.P. Morgan Chase + ArminehNourbakhshSchool of Computer Science, Carnegie Mellon University and J.P. Morgan Chase SiddharthParekh - PranavShettyJ.P. Morgan Chase + PranavShettyJ.P. Morgan Chase ZhaoJin SameenaShahJ.P. Morgan Chase - CarolynRoseSchool of Computer Science, Carnegie Mellon University + CarolynRoseSchool of Computer Science, Carnegie Mellon University 5326-5346 Document Visual Question Answering (VQA) models have evolved at an impressive rate over the past few years, coming close to or matching human performance on some benchmarks. We argue that common evaluation metrics used by popular benchmarks do not account for the semantic and multimodal groundedness of a model’s outputs. As a result, hallucinations and major semantic errors are treated the same way as well-grounded outputs, and the evaluation scores do not reflect the reasoning capabilities of the model. In response, we propose a new evaluation methodology that accounts for the groundedness of predictions with regard to the semantic characteristics of the output as well as the multimodal placement of the output within the input document. Our proposed methodology is parameterized in such a way that users can configure the score according to their preferences. We validate our scoring methodology using human judgment and show its potential impact on existing popular leaderboards. Through extensive analyses, we demonstrate that our proposed method produces scores that are a better indicator of a model’s robustness and tends to give higher rewards to better-calibrated answers. 2025.findings-naacl.295 @@ -3692,7 +3692,7 @@ Transformer-based Causal Language Models Perform Clustering XinboWuUniversity of Illinois, Urbana Champaign - Lav R.VarshneyUniversity of Illinois at Urbana-Champaign + Lav R.VarshneyUniversity of Illinois at Urbana-Champaign 5347-5372 Even though large language models (LLMs) have demonstrated remarkable capability in solving various natural language tasks, the capability of an LLM to follow human instructions is still an area of active development. Recent works (Ouyang et al., 2022; Rafailov et al., 2023; Zhang et al., 2023) have shown great improvements in instruction-following capability through additional training for instruction-following tasks. However, the mechanisms responsible for effective instruction-following capabilities remain inadequately understood. Here, we introduce a simplified instruction-following task and use synthetic datasets to analyze a Transformer-based causal language model. 
Our findings suggest that the model learns task-specific information by clustering data within its hidden space, with this clustering process evolving dynamically during learning. We also demonstrate how this phenomenon assists the model in handling unseen instances, and validate our results in a more realistic setting. We further present applications in pre-training and alignment, inspired by clustering. 2025.findings-naacl.296 @@ -3701,7 +3701,7 @@ Towards Better Multi-task Learning: A Framework for Optimizing Dataset Combinations in Large Language Models ZaifuZhan - RuiZhangUniversity of Minnesota - Twin Cities + RuiZhangUniversity of Minnesota - Twin Cities 5373-5386 To efficiently select optimal dataset combinations for enhancing multi-task learning (MTL) performance in large language models, we proposed a novel framework that leverages a neural network to predict the best dataset combinations. The framework iteratively refines the selection, greatly improving efficiency, while being model-, dataset-, and domain-independent. Through experiments on 12 biomedical datasets across four tasks—named entity recognition, relation extraction, event extraction, and text classification—we demonstrate that our approach effectively identifies better combinations, even for tasks that may seem unpromising from a human perspective. This verifies that our framework provides a promising solution for maximizing MTL potential. 2025.findings-naacl.297 @@ -3723,7 +3723,7 @@ YuHe ZihanYao TianyuQi - JunLiuXi’an Jiaotong University + JunLiuXi’an Jiaotong University MingLi 5414-5428 Large language models (LLMs) have shown remarkable effectiveness across various domains, with data augmentation methods utilizing GPT for synthetic data generation becoming prevalent. However, the quality and utility of augmented data remain questionable, and current methods lack clear metrics for evaluating data characteristics. To address these challenges, we propose ResoFilter, a novel method that integrates models, data, and tasks to refine datasets. ResoFilter leverages the fine-tuning process to obtain Data-Parameter features for data selection, offering improved interpretability by representing data characteristics through model weights. Our experiments demonstrate that ResoFilter achieves comparable results to full-scale fine-tuning using only half the data in mathematical tasks and exhibits strong generalization across different models and domains. This method provides valuable insights for constructing synthetic datasets and evaluating high-quality data, offering a promising solution for enhancing data augmentation techniques and improving training dataset quality for LLMs. For reproducibility, we will release our code and data upon acceptance. 
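The Data-Parameter feature idea in the ResoFilter entry above can be made concrete: score each candidate training example by the weight change that a single fine-tuning step on it induces, then keep the examples whose change sits closest to the dataset-wide mean. The sketch below assumes PyTorch and substitutes a toy linear model for an LLM; the one-step SGD probe, the mean-centered scoring rule, and the 50% retention cutoff are illustrative assumptions, not the paper's actual procedure.

# Hedged sketch of ResoFilter-style selection: the "feature" of an example
# is the flattened parameter delta produced by one optimization step on it.
import copy
import torch

def data_parameter_feature(model, loss_fn, batch, lr=1e-3):
    """Return the flattened weight delta after one SGD step on `batch`."""
    probe = copy.deepcopy(model)              # leave the base model untouched
    opt = torch.optim.SGD(probe.parameters(), lr=lr)
    x, y = batch
    opt.zero_grad()
    loss_fn(probe(x), y).backward()
    opt.step()
    with torch.no_grad():
        deltas = [(p1 - p0).flatten()
                  for p0, p1 in zip(model.parameters(), probe.parameters())]
    return torch.cat(deltas)

model = torch.nn.Linear(16, 4)                # stand-in for an LLM
loss_fn = torch.nn.CrossEntropyLoss()
data = [(torch.randn(8, 16), torch.randint(0, 4, (8,))) for _ in range(20)]

feats = torch.stack([data_parameter_feature(model, loss_fn, b) for b in data])
center = feats.mean(dim=0)
scores = (feats - center).norm(dim=1)         # distance to the "typical" update
keep = scores.argsort()[: len(data) // 2]     # retain the closest half
print("selected example indices:", keep.tolist())

Because the feature lives in parameter space, the same filter can be reused across tasks by swapping in the relevant model and loss, which is the interpretability benefit the abstract claims.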
@@ -3732,19 +3732,19 @@ <fixed-case>UCFE</fixed-case>: A User-Centric Financial Expertise Benchmark for Large Language Models - YuzheYang + YuzheYang YifeiZhangNanjing university and The Chinese University of Hong Kong, Shenzhen YanHuThe Chinese University of Hong Kong - YilinGuo + YilinGuo RuoliGan YueruHe - MingcongLei + MingcongLei XiaoZhang HainingWang - QianqianXieWuhan University - JiminHuangThe Fin AI + QianqianXieWuhan University + JiminHuangThe Fin AI HonghaiYu - BenyouWangThe Chinese University of Hong Kong, Shenzhen + BenyouWangThe Chinese University of Hong Kong, Shenzhen 5429-5448 This paper introduces the UCFE: User-Centric Financial Expertise benchmark, an innovative framework designed to evaluate the ability of large language models (LLMs) to handle complex real-world financial tasks. UCFE benchmark adopts a hybrid approach that combines human expert evaluations with dynamic, task-specific interactions to simulate the complexities of evolving financial scenarios. Firstly, we conducted a user study involving 804 participants, collecting their feedback on financial tasks. Secondly, based on this feedback, we created our dataset that encompasses a wide range of user intents and interactions. This dataset serves as the foundation for benchmarking 11 LLMs services using the LLM-as-Judge methodology. Our results show a significant alignment between benchmark scores and human preferences, with a Pearson correlation coefficient of 0.78, confirming the effectiveness of the UCFE dataset and our evaluation approach. UCFE benchmark not only reveals the potential of LLMs in the financial domain but also provides a robust framework for assessing their performance and user satisfaction. 2025.findings-naacl.300 @@ -3752,10 +3752,10 @@ <fixed-case>BRIEF</fixed-case>: Bridging Retrieval and Inference for Multi-hop Reasoning via Compression - YuankaiLi + YuankaiLi Jia-ChenGuUniversity of California, Los Angeles DiWu - Kai-WeiChangUniversity of California, Los Angeles and Amazon + Kai-WeiChangUniversity of California, Los Angeles and Amazon NanyunPengUniversity of California, Los Angeles 5449-5470 Retrieval-augmented generation (RAG) can supplement large language models (LLMs) by integrating external knowledge. However, as the number of retrieved documents increases, the input length to LLMs grows linearly, causing a dramatic increase in latency and a degradation in long-context understanding. This is particularly serious for multi-hop questions that require a chain of reasoning across documents. To accelerate inference, reduce costs, and minimize distractions, this paper presents BRIEF (Bridging Retrieval and Inference through Evidence Fusion), a lightweight approach that performs query-aware multi-hop reasoning by compressing retrieved documents into highly dense textual summaries to integrate into in-context RAG. To enable learning compression for multi-hop reasoning, we curate synthetic data by extracting atomic propositions that encapsulate distinct factoids from the source documents to compose synthetic summaries. Based on our synthetic data built entirely by open-source models, BRIEF generates more concise summaries and enables a range of LLMs to achieve exceptional open-domain question answering (QA) performance. For example, on HotpotQA, BRIEF improves the compression rate by 2 times compared to the state-of-the-art baseline, while outperforming it by 3.00% EM and 4.16% F1 with Flan-UL2 as the reader model. 
It also generates more concise summaries than proprietary GPT-3.5, while demonstrating nearly identical QA performance. @@ -3764,12 +3764,12 @@ An Optimizable Suffix Is Worth A Thousand Templates: Efficient Black-box Jailbreaking without Affirmative Phrases via <fixed-case>LLM</fixed-case> as Optimizer - WeipengJiang + WeipengJiang ZhentingWangRutgers University JuanZhaiUniversity of Massachusetts at Amherst - ShiqingMaUniversity of Massachusetts at Amherst + ShiqingMaUniversity of Massachusetts at Amherst ZhengyuZhaoXi’an Jiaotong University - ChaoShenXi’an Jiaotong University + ChaoShenXi’an Jiaotong University 5471-5483 Despite prior safety alignment efforts, LLMs can still generate harmful and unethical content when subjected to jailbreaking attacks. Existing jailbreaking methods fall into two main categories: template-based and optimization-based methods. The former requires significant manual effort and domain knowledge, while the latter, exemplified by GCG, which seeks to maximize the likelihood of harmful LLM outputs through token-level optimization, also encounters several limitations: requiring white-box access, necessitating pre-constructed affirmative phrase, and suffering from low efficiency. This paper introduces ECLIPSE, a novel and efficient black-box jailbreaking method with optimizable suffixes. We employ task prompts to translate jailbreaking objectives into natural language instructions, guiding LLMs to generate adversarial suffixes for malicious queries. A harmfulness scorer provides continuous feedback, enabling LLM self-reflection and iterative optimization to autonomously produce effective suffixes. Experimental results demonstrate that ECLIPSE achieves an average attack success rate (ASR) of 0.92 across three open-source LLMs and GPT-3.5-Turbo, significantly outperforming GCG by 2.4 times. Moreover, ECLIPSE matches template-based methods in ASR while substantially reducing average attack overhead by 83%, offering superior attack efficiency. 2025.findings-naacl.302 @@ -3778,9 +3778,9 @@ Multi-Stage <fixed-case>LLM</fixed-case> Fine-Tuning with a Continual Learning Setting ChanghaoGuan - ChaoHuang - HongliangLi - YouLi + ChaoHuang + HongliangLi + YouLi NingCheng ZiheLiu YufengChen @@ -3796,7 +3796,7 @@ Hao-XiangXu Jun-YuMaUniversity of Science and Technology of China Zhen-HuaLingUniversity of Science and Technology of China - NingyuZhangZhejiang University + NingyuZhangZhejiang University Jia-ChenGuUniversity of California, Los Angeles 5499-5515 Large language models (LLMs) struggle with hallucinations due to false or outdated knowledge. Given the high resource demands of retraining these models, there is an increasing focus on developing model editing. However, the general abilities of LLMs across downstream tasks are prone to significant degradation during sequential editing. This paper statistically observes that the parameter matrix after editing exhibits a significant deviation compared to its previous state as the number of edits increases. This serious deviation affects the original knowledge associations within LLMs and leads to the degradation of their general abilities. To this end, a framework termed Editing Anchor Compression (EAC) is proposed to constrain the deviation of the parameter matrix during sequential editing. It compresses the editing information by selecting editing anchors that are important in encoding new relations without deviating too much from the original matrix, thereby preserving the general abilities. 
We conduct experiments applying EAC to two popular editing methods on three LLMs across four tasks. Evaluation results show that EAC effectively minimizes unreasonable deviations caused by model editing, preserving over 70% of the general abilities while better retaining the editing knowledge compared to the original counterpart methods.
@@ -3807,7 +3807,7 @@
 <fixed-case>MLKV</fixed-case>: Multi-Layer Key-Value Heads for Memory Efficient Transformer Decoding
 Zayd Muhammad KawakibiZuhriMohamed bin Zayed University of Artificial Intelligence
 Muhammad FaridAdilazuardaMohamed bin Zayed University of Artificial Intelligence
- AyuPurwariantiInstitut Teknologi Bandung
+ AyuPurwariantiInstitut Teknologi Bandung
 Alham FikriAjiMohamed bin Zayed University of Artificial Intelligence
 5516-5525
 Auto-regressive inference of transformers benefits greatly from Key-Value (KV) caching, but can lead to major memory bottlenecks as model size, batch size, and sequence length grow at scale. We introduce Multi-Layer Key-Value (MLKV) sharing, a novel approach extending KV sharing across transformer layers to reduce memory usage beyond what was possible with Multi-Query Attention (MQA) and Grouped-Query Attention (GQA). Evaluations on various NLP benchmarks and inference metrics using uptrained Pythia-160M variants demonstrate that MLKV significantly reduces memory usage with minimal performance loss, reducing KV cache size down to a factor of 6x compared to MQA. These results highlight MLKV’s potential for efficient deployment of transformer models at scale.
@@ -3817,7 +3817,7 @@
 Clarify When Necessary: Resolving Ambiguity Through Interaction with <fixed-case>LM</fixed-case>s
 Michael JQZhangNew York University
- EunsolChoiNew York University
+ EunsolChoiNew York University
 5526-5543
 In this work, we explore the challenges of developing interactive assistants that resolve ambiguity by asking their users clarifying questions. Specifically, we develop a task-agnostic framework for evaluating a system’s ability to determine when to ask for clarification. Determining when to ask for clarification is a challenging task that requires systems to consider the demands of the individual user (i.e., how much they prioritize speed and usability versus carefulness) and the distribution of interpretations for a given request (i.e., whether an ambiguous request has one dominant, inferable interpretation). Using this framework, we evaluate systems for determining when to clarify across three NLP applications: QA, MT, and NLI. Finally, we present a novel uncertainty estimation approach, IntentSim, that determines the utility of asking a clarifying question by estimating the entropy over user intents. Our method consistently outperforms existing uncertainty estimation approaches at identifying predictions that will benefit from clarification. Furthermore, we find that IntentSim is robust, demonstrating improvements across a wide range of NLP tasks and LMs. Together, our work lays the foundation for further studies on clarifying interactions with LM assistants.
 2025.findings-naacl.306


 <fixed-case>DOLFIN</fixed-case> - Document-Level Financial Test-Set for Machine Translation
- MariamNakhle
+ MariamNakhle
 MarcoDinarelliCNRS
 RaheelQaderLingua Custodia
 EmmanuelleEsperança-RodierUniversity of Grenoble-Alpes
@@ -3838,26 +3838,26 @@
 Are Large Language Models Effective in Clinical Trial Design?
A Study on Baseline Feature Generation NafisNeehal - BowenWangRensselaer Polytechnic Institute - ShayomDebopadhayaAlbany Medical College - CoreyCurranRensselaer Polytechnic Institute - KeerthiramMurugesanInternational Business Machines + BowenWangRensselaer Polytechnic Institute + ShayomDebopadhayaAlbany Medical College + CoreyCurranRensselaer Polytechnic Institute + KeerthiramMurugesanInternational Business Machines SohamDanMicrosoft - VibhaAnandInternational Business Machines - KristinBennettRensselaer Polytechnic Institute + VibhaAnandInternational Business Machines + KristinBennettRensselaer Polytechnic Institute 5557-5570 2025.findings-naacl.308 neehal-etal-2025-large Lightweight Contenders: Navigating Semi-Supervised Text Mining through Peer Collaboration and Self Transcendence - QianrenMaoZhongguancun Laboratory, Beijing, P.R.China. and Beihang University - WeifengJiangNanyang Technological University + QianrenMaoZhongguancun Laboratory, Beijing, P.R.China. and Beihang University + WeifengJiangNanyang Technological University JunnanLiu ChenghuaLinUniversity of Manchester - QianLiBeijing University of Posts and Telecommunications + QianLiBeijing University of Posts and Telecommunications XianqingWen - JianxinLiBeihang University + JianxinLiBeihang University JinhuLu 5571-5585 The semi-supervised learning (SSL) strategy in lightweight models requires reducing annotated samples and facilitating cost-effective inference. However, the constraint on model parameters, imposed by the scarcity of training labels, limits the SSL performance. In this paper, we introduce PS-NET, a novel framework tailored for semi-supervised text mining with lightweight models. PS-NET incorporates online distillation to train lightweight student models by imitating the Teacher model. It also integrates an ensemble of student peers that collaboratively instruct each other. Additionally, PS-NET implements a constant adversarial perturbation schema to further self-augmentation by progressive generalizing. Our PS-NET, equipped with a 2-layer distilled BERT, exhibits notable performance enhancements over SOTA lightweight SSL frameworks of FLiText and Disco in SSL text classification with extremely rare labelled data. @@ -3866,13 +3866,13 @@ Language-based Valence and Arousal Expressions between the <fixed-case>U</fixed-case>nited <fixed-case>S</fixed-case>tates and <fixed-case>C</fixed-case>hina: a Cross-Cultural Examination - Young MinChoUniversity of Pennsylvania + Young MinChoUniversity of Pennsylvania DandanPangBFH - Bern University of Applied Sciences StutiThapaUniversity of Tulsa GarrickSherman LyleUngar - LouisTayPurdue University - Sharath ChandraGuntukuUniversity of Pennsylvania + LouisTayPurdue University + Sharath ChandraGuntukuUniversity of Pennsylvania 5586-5600 While affective expressions on social media have been extensively studied, most research has focused on the Western context. This paper explores cultural differences in affective expressions by comparing valence and arousal on Twitter/X (geolocated to the US) and Sina Weibo (in Mainland China). Using the NRC-VAD lexicon to measure valence and arousal, we identify distinct patterns of emotional expression across both platforms. Our analysis reveals a functional representation between valence and arousal, showing a negative offset in contrast to traditional lab-based findings which suggest a positive offset. 
Furthermore, we uncover significant cross-cultural differences in arousal, with US users displaying higher emotional intensity than Chinese users, regardless of the valence of the content. Finally, we conduct a comprehensive language analysis correlating n-grams and LDA topics with affective dimensions to deepen our understanding of how language and culture shape emotional expression. These findings contribute to a more nuanced understanding of affective communication across cultural and linguistic contexts on social media. 2025.findings-naacl.310 @@ -3882,7 +3882,7 @@ Chain-of-Rank: Enhancing Large Language Models for Domain-Specific <fixed-case>RAG</fixed-case> in Edge Device JuntaeLeeQualcomm Inc, QualComm JihwanBangQualcomm Inc, QualComm - KyuhongShimSung Kyun Kwan University + KyuhongShimSung Kyun Kwan University SeunghanYangQualcomm AI Research SimyungChangQualComm AI Research 5601-5608 @@ -3894,9 +3894,9 @@ <fixed-case>MAL</fixed-case>o<fixed-case>RA</fixed-case>: Mixture of Asymmetric Low-Rank Adaptation for Enhanced Multi-Task Learning XujiaWang HaiyanZhaoTsinghua University - ShuoWang + ShuoWang HanqingWang - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University 5609-5626 Parameter-Efficient Fine-Tuning (PEFT) methods such as LoRA have significantly improved the adaptation of LLMs to downstream tasksin a resource-efficient manner. However, in multi-task scenarios, challenges such as training imbalance and the seesaw effect frequently emerge. Mixture-of-LoRA (MoLoRA), which combines LoRA with sparse Mixture-of-Experts, mitigates some of these issues by promoting task-specific learning among experts. Despite this, MoLoRA remains inefficient in terms of training speed, parameter utilization, and overall multi-task performance. In this paper, we propose Mixture of Asymmetric Low-Rank Adaptaion (MALoRA), a flexible fine-tuning framework that leverages asymmetric optimization among LoRA experts. MALoRA reduces the number of trainable parameters by 30% to 48%, increases training speed by 1.2x, and matches the computational efficiency of single-task LoRA models. Additionally, MALoRA addresses overfitting issues commonly seen in high-rank configurations, enhancing performance stability. Extensive experiments across diverse multi-task learning scenarios demonstrate that MALoRA consistently outperforms all baseline methods in both inter-domain and intra-domain tasks. 2025.findings-naacl.312 @@ -3904,12 +3904,12 @@ <fixed-case>L</fixed-case>lama<fixed-case>L</fixed-case>ens: Specialized Multilingual <fixed-case>LLM</fixed-case> for Analyzing News and Social Media Content - Mohamed BayanKmainasiUniversity of Qatar + Mohamed BayanKmainasiUniversity of Qatar Ali EzzatShahroor - MaramHasanainQatar Computing Research Institute - Sahinur RahmanLaskarUPES + MaramHasanainQatar Computing Research Institute + Sahinur RahmanLaskarUPES NaeemulHassanUniversity of Maryland, College Park - FirojAlamQatar Computing Research Institute + FirojAlamQatar Computing Research Institute 5627-5649 Large Language Models (LLMs) have demonstrated remarkable success as general-purpose task solvers across various fields. However, their capabilities remain limited when addressing domain-specific problems, particularly in downstream NLP tasks. Research has shown that models fine-tuned on instruction-based downstream NLP datasets outperform those that are not fine-tuned. 
While most efforts in this area have primarily focused on resource-rich languages like English and broad domains, little attention has been given to multilingual settings and specific domains. To address this gap, this study focuses on developing a specialized LLM, LlamaLens, for analyzing news and social media content in a multilingual context. To the best of our knowledge, this is the first attempt to tackle both domain specificity and multilinguality, with a particular focus on news and social media. Our experimental setup includes 18 tasks, represented by 52 datasets covering Arabic, English, and Hindi. We demonstrate that LlamaLens outperforms the current state-of-the-art (SOTA) on 23 testing sets, and achieves comparable performance on 8 sets. We make the models and resources publicly available for the research community (https://huggingface.co/QCRI). 2025.findings-naacl.313 @@ -3928,7 +3928,7 @@ Preserving Zero-shot Capability in Supervised Fine-tuning for Multi-label Text Classification - Si-AnChen + Si-AnChen Hsuan-TienLinNational Taiwan University Chih-JenLinNational Taiwan University 5699-5712 @@ -3940,10 +3940,10 @@ Data-centric <fixed-case>NLP</fixed-case> Backdoor Defense from the Lens of Memorization ZhentingWangRutgers University ZhizhiWang - MingyuJinRutgers University + MingyuJinRutgers University MengnanDuNew Jersey Institute of Technology JuanZhaiUniversity of Massachusetts at Amherst - ShiqingMaUniversity of Massachusetts at Amherst + ShiqingMaUniversity of Massachusetts at Amherst 5713-5731 Backdoor attack is a severe threat to the trustworthiness of DNN-based language models. In this paper, we first extend the definition of memorization of language models from sample-wise to more fine-grained sentence element-wise (e.g., word, phrase, structure, and style), and then point out that language model backdoors are a type of element-wise memorization. Through further analysis, we find that the strength of such memorization is positively correlated to the frequency of duplicated elements in the training dataset. In conclusion, duplicated sentence elements are necessary for successful backdoor attacks. Based on this, we propose a data-centric defense. We first detect trigger candidates in training data by finding memorizable elements, i.e., duplicated elements, and then confirm real triggers by testing if the candidates can activate backdoor behaviors (i.e., malicious elements). Results show that our method outperforms state-of-the-art defenses in defending against different types of NLP backdoors. 2025.findings-naacl.316 @@ -3965,9 +3965,9 @@ Infogent: An Agent-Based Framework for Web Information Aggregation RevanthGangi ReddyUniversity of Illinois at Urbana-Champaign SagnikMukherjee - JeonghwanKim - ZhenhailongWang - DilekHakkani-TürUniversity of Illinois at Urbana-Champaign + JeonghwanKim + ZhenhailongWang + DilekHakkani-TürUniversity of Illinois at Urbana-Champaign HengJiUniversity of Illinois, Urbana-Champaign 5745-5758 Despite seemingly performant web agents on the task-completion benchmarks, most existing methods evaluate the agents based on a presupposition: the web navigation task consists of a linear sequence of actions with an end state that marks task completion. In contrast, our work focuses on web navigation for information aggregation, wherein the agent must explore different websites to gather information for a complex query. 
We consider web information aggregation from two different perspectives: i) Direct API-driven Access relies on a text-only view of the Web, leveraging external tools such as Google Search API to navigate the Web and a scraper to extract website contents. (ii) Interactive Visual Access uses screenshots of the webpages and requires interaction with the browser to navigate and access information. Motivated by these diverse information access settings, we introduce Infogent, a novel modular framework for web information aggregation involving three distinct components: Navigator, Extractor, and Aggregator. Experiments on different information access settings demonstrate that Infogent beats an existing SOTA multi-agent search framework by 7% under Direct API-Driven Access on FRAMES and improves over an existing information-seeking web agent by 4.3% under Interactive Visual Access on AssistantBench. @@ -3978,7 +3978,7 @@ On the Role of Key Phrases in Argument Mining NilmadhabDas Vijaya VSaradhi - AshishAnandIndian Institute of Technology, Guwahati + AshishAnandIndian Institute of Technology, Guwahati 5759-5772 Argument mining (AM) focuses on analyzing argumentative structures such as Argument Components (ACs) and Argumentative Relations (ARs). Modeling dependencies between ACs and ARs is challenging due to the complex interactions between ACs. Existing approaches often overlook crucial conceptual links, such as key phrases that connect two related ACs, and tend to rely on cartesian product methods to model these dependencies, which can result in class imbalances. To extract key phrases from the AM benchmarks, we employ a prompt-based strategy utilizing an open-source Large Language Model (LLM). Building on this, we propose a unified text-to-text generation framework that leverages Augmented Natural Language (ANL) formatting and integrates the extracted key phrases inside the ANL itself to efficiently solve multiple AM tasks in a joint formulation. Our method sets new State-of-the-Art (SoTA) on three structurally distinct standard AM benchmarks, surpassing baselines by up to 9.5% F1 score, demonstrating its strong potential. 2025.findings-naacl.319 @@ -3986,7 +3986,7 @@ <fixed-case>T</fixed-case>ab<fixed-case>C</fixed-case>omp: A Dataset for Visual Table Reading Comprehension - SomrajGautamIndian Institute of Technology, Jodhpur + SomrajGautamIndian Institute of Technology, Jodhpur AbhishekBhandariIndian Institute of Technology, Jodhpur GauravHarit 5773-5780 @@ -4001,7 +4001,7 @@ LiningYang YuhuaZhou XuCheng - YibinWang + YibinWang HongguangLiJF SmartInvest Holdings 5781-5795 The efficient compression of large language models (LLMs) has become increasingly popular. However, recovering the performance of compressed LLMs remains a major challenge. The current practice in LLM compression entails the implementation of structural pruning, complemented by a recovery phase that leverages the Low-Rank Adaptation (LoRA) algorithm. Structural pruning’s uneven modification of model architecture, coupled with standard LoRA’s fixed configuration allocation across layers in an online pipeline, leads to suboptimal performance in various downstream tasks for pruned models. To address this challenge, we introduce RankAdaptor, a hierarchical rank allocation method that enables efficient fine-tuning of pruned LLMs according to layerwise specific recovery requirements. We employ a performance model that conducts offline meta-learning and online incremental learning to explore optimal rank values for each layer. 
Comprehensive experiments on popular benchmarks show that RankAdaptor consistently outperforms state-of-the-art methods across a variety of pruning settings and LLM architectures, with improvements ranging from 0.7% to 5.5%. @@ -4010,10 +4010,10 @@ Rationale Behind Essay Scores: Enhancing <fixed-case>S</fixed-case>-<fixed-case>LLM</fixed-case>’s Multi-Trait Essay Scoring with Rationale Generated by <fixed-case>LLM</fixed-case>s - SeongYeubChuKorea Advanced Institute of Science & Technology + SeongYeubChuKorea Advanced Institute of Science & Technology Jong WooKimKorea Advanced Institute of Science & Technology - BryanWong - Mun YongYiKorea Advanced Institute of Science & Technology + BryanWong + Mun YongYiKorea Advanced Institute of Science & Technology 5796-5814 Existing automated essay scoring (AES) has solely relied on essay text without using explanatory rationales for the scores, thereby forgoing an opportunity to capture the specific aspects evaluated by rubric indicators in a fine-grained manner. This paper introduces Rationale-based Multiple Trait Scoring (RMTS), a novel approach for multi-trait essay scoring that integrates prompt-engineering-based large language models (LLMs) with a fine-tuning-based essay scoring model using a smaller large language model (S-LLM). RMTS uses an LLM-based trait-wise rationale generation system where a separate LLM agent generates trait-specific rationales based on rubric guidelines, which the scoring model uses to accurately predict multi-trait scores. Extensive experiments on benchmark datasets, including ASAP, ASAP++, and Feedback Prize, show that RMTS significantly outperforms state-of-the-art models and vanilla S-LLMs in trait-specific scoring. By assisting quantitative assessment with fine-grained qualitative rationales, RMTS enhances the trait-wise reliability, providing partial explanations about essays. The code is available at https://github.com/BBeeChu/RMTS.git. @@ -4022,7 +4022,7 @@ <fixed-case>MTPC</fixed-case>hat: A Multimodal Time-Aware Persona Dataset for Conversational Agents - WanqiYang + WanqiYang YandaLi MengFangUniversity of Liverpool and Eindhoven University of Technology LingChenUniversity of Technology Sydney @@ -4039,7 +4039,7 @@ MianqiuHuang DongZhang YaqianZhouFudan University, Tsinghua University - XipengQiuFudan University + XipengQiuFudan University 5827-5845 Large Language Models (LLMs) acquire extensive knowledge and remarkable abilities from extensive text corpora, making them powerful tools for various applications. To make LLMs more usable, aligning them with human preferences is essential. Existing alignment techniques, such as Reinforcement Learning from Human Feedback (RLHF) and Direct Preference Optimization (DPO), typically embed predefined preferences directly within the model’s parameters. These methods, however, often result in a static alignment that can not account for the diversity of human preferences in practical applications.In response to this challenge, we propose an effective method, MetaAlign, which aims to help LLMs dynamically align with various explicit or implicit preferences specified at inference time. Experimental results show that LLMs optimized on our meticulously constructed MetaAlign Dataset can effectively align with any preferences specified at the inference stage, validating the feasibility of MetaAlign. We hope that our work can provide some insights into the alignment of language models. 
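The inference-time alignment described in the MetaAlign entry above amounts to conditioning the model on a preference specification that arrives with each query, instead of baking a single preference into the weights. A minimal sketch follows, assuming a plain-text prompt format; the template and example preferences are invented for illustration and are not the MetaAlign dataset format.

# Hedged sketch: the preference travels with the query as part of the input.
def build_preference_prompt(preference: str, query: str) -> str:
    return (
        "You must follow this preference when answering.\n"
        f"Preference: {preference}\n"
        f"User: {query}\n"
        "Assistant:"
    )

preferences = [
    "Answer concisely, in at most two sentences.",
    "Explain step by step, assuming no prior background.",
]
for pref in preferences:
    print(build_preference_prompt(pref, "Why does the moon have phases?"))
    print("---")

A model optimized on many such (preference, query, response) triples can then switch behavior at inference simply by swapping the preference text, which is the dynamic alignment the abstract contrasts with static RLHF or DPO.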
2025.findings-naacl.324 @@ -4047,9 +4047,9 @@ <fixed-case>MAQA</fixed-case>: Evaluating Uncertainty Quantification in <fixed-case>LLM</fixed-case>s Regarding Data Uncertainty - YongjinYangUniversity of Toronto + YongjinYangUniversity of Toronto HaneulYooKAIST - HwaranLeeSogang University + HwaranLeeSogang University 5846-5863 Despite the massive advancements in large language models (LLMs), they still suffer from producing plausible but incorrect responses. To improve the reliability of LLMs, recent research has focused on uncertainty quantification to predict whether a response is correct or not. However, most uncertainty quantification methods have been evaluated on single-labeled questions, which removes data uncertainty—the irreducible randomness often present in user queries, which can arise from factors like multiple possible answers. This limitation may cause uncertainty quantification results to be unreliable in practical settings. In this paper, we investigate previous uncertainty quantification methods under the presence of data uncertainty. Our contributions are two-fold: 1) proposing a new Multi-Answer Question Answering dataset, **MAQA**, consisting of world knowledge, mathematical reasoning, and commonsense reasoning tasks to evaluate uncertainty quantification regarding data uncertainty, and 2) assessing 5 uncertainty quantification methods of diverse white- and black-box LLMs. Our findings show that previous methods relatively struggle compared to single-answer settings, though this varies depending on the task. Moreover, we observe that entropy- and consistency-based methods effectively estimate model uncertainty, even in the presence of data uncertainty. 2025.findings-naacl.325 @@ -4060,9 +4060,9 @@ Hyundong JustinChoUSC/ISI KarishmaSharma Nicolaas PaulJedemaAmazon - Leonardo F. R.RibeiroAmazon - JonathanMayUniversity of Southern California and USC/ISI - AlessandroMoschittiAmazon AGI + Leonardo F. R.RibeiroAmazon + JonathanMayUniversity of Southern California and USC/ISI + AlessandroMoschittiAmazon AGI 5864-5885 Language models are aligned to the collective voice of many, resulting in generic outputs that do not align with specific users’ styles. In this work, we present Trial-Error-Explain In-Context Learning (TICL), a tuning-free method that personalizes language models for text generation tasks with fewer than 10 examples per user. TICL iteratively expands an in-context learning prompt via a trial-error-explain process, adding model-generated negative samples and explanations that provide fine-grained guidance towards a specific user’s style. TICL achieves favorable win rates on pairwise comparisons with LLM-as-a-judge up to 91.5% against the previous state-of-the-art and outperforms competitive tuning-free baselines for personalized alignment tasks of writing emails, essays and news articles. Both lexical and qualitative analyses show that the negative samples and explanations enable language models to learn stylistic context more effectively and overcome the bias towards structural and formal phrases observed in their zero-shot outputs. By front-loading inference compute to create a user-specific in-context learning prompt that does not require extra generation steps at test time, presents a novel yet simple approach for personalized alignment. 
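The trial-error-explain loop in the TICL entry above can be sketched as follows. Here `generate`, `critique`, and `judge` are stand-ins for real LLM calls and a quality check, and the prompt format and stopping rule are illustrative assumptions rather than the paper's implementation.

# Hedged sketch of a TICL-style loop: failed drafts plus explanations are
# appended to the in-context prompt before the next attempt.
from typing import Callable, List, Tuple

def ticl_prompt(examples: List[str], failures: List[Tuple[str, str]]) -> str:
    prompt = "Write in this user's style.\n\nUser examples:\n"
    prompt += "\n".join(f"- {e}" for e in examples)
    for attempt, explanation in failures:
        prompt += (f"\n\nBad attempt (do not imitate):\n{attempt}"
                   f"\nWhy it misses the style:\n{explanation}")
    return prompt + "\n\nNew draft:"

def ticl_loop(examples: List[str],
              generate: Callable[[str], str],
              critique: Callable[[str], str],
              judge: Callable[[str], bool],
              max_rounds: int = 5) -> str:
    failures: List[Tuple[str, str]] = []
    draft = ""
    for _ in range(max_rounds):
        draft = generate(ticl_prompt(examples, failures))
        if judge(draft):
            break
        failures.append((draft, critique(draft)))
    return draft

# Trivial stubs so the loop can be exercised end to end.
result = ticl_loop(
    examples=["Short. Punchy. No filler."],
    generate=lambda p: "A short, punchy draft.",
    critique=lambda d: "Too long; the user writes fragments.",
    judge=lambda d: "punchy" in d,
)
print(result)

Note that all the adaptation happens in the prompt: once the loop terminates, the final prompt is a reusable user-specific artifact and no extra generation steps are needed at test time, matching the front-loaded compute argument in the abstract.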
2025.findings-naacl.326


@@ -4094,7 +4094,7 @@
 KushagraBhushan
 YatinNandwaniInternational Business Machines
 DineshKhandelwalInternational Business Machines
- SonamGuptaIndian Institute of Technology, Madras
+ SonamGuptaIndian Institute of Technology, Madras
 GauravPandeyInternational Business Machines
 DineshRaghuIBM Research - New Delhi
 SachindraJoshi
@@ -4105,10 +4105,10 @@
 Find the Intention of Instruction: Comprehensive Evaluation of Instruction Understanding for Large Language Models
- HyeonseokMoonKorea University
- JaehyungSeo
- SeungyoonLeeKorea University
- ChanjunParkKorea University
+ HyeonseokMoonKorea University
+ JaehyungSeo
+ SeungyoonLeeKorea University
+ ChanjunParkKorea University
 HeuiseokLim
 5944-5964
 Through numerous endeavors, large language models (LLMs) have witnessed significant advancements in their instruction-following capability. However, we discern that LLMs are prone to generating responses to instruction-formatted statements in an instinctive manner, rather than comprehending the underlying user intention residing within the given instructions. We also recognize that the significance of instruction understanding capability is largely overlooked in most LLM evaluation benchmarks. To ensure a more comprehensive evaluation of the instruction understanding capability of LLMs, we propose the Intention of Instruction (IntInst) benchmark, whose primary objective is to identify the instruction that accurately directs the generation of a given context. IntInst presents four instruction candidates and requires LLMs to select one among them. Through extensive experiments with several instruction-tuned LLMs, we reveal that most LLMs struggle to grasp the actual intention concealed in the instruction, and we thoroughly analyze the factors influencing instruction understanding.


@@ -4119,9 +4119,9 @@
 Long-Tail Crisis in Nearest Neighbor Language Models
 YutoNishidaNara Institute of Science and Technology, Japan
 MakotoMorishitaFuture Corporation and Tohoku University
- HiroyukiDeguchiNTT Communications
- HidetakaKamigaitoNara Institute of Science and Technology
- TaroWatanabeNara Institute of Science and Technology, Japan
+ HiroyukiDeguchiNTT Communications
+ HidetakaKamigaitoNara Institute of Science and Technology
+ TaroWatanabeNara Institute of Science and Technology, Japan
 5965-5978
 The k-nearest-neighbor language model (kNN-LM), one of the retrieval-augmented language models, improves the perplexity for a given text by directly accessing a large datastore built from any text data during inference. A widely held hypothesis for the success of kNN-LM is that its explicit memory, i.e., the datastore, enhances predictions for long-tail phenomena. However, prior works have primarily shown its ability to retrieve long-tail contexts, leaving the model’s performance in estimating the probabilities of long-tail target tokens during inference underexplored. In this paper, we investigate the behavior of kNN-LM on low-frequency tokens, examining prediction probability, retrieval accuracy, and token distribution in the datastore. Our experimental results reveal that kNN-LM does not improve prediction performance for low-frequency tokens but mainly benefits high-frequency tokens, regardless of long-tail contexts in the datastore.
 2025.findings-naacl.331


@@ -4131,7 +4131,7 @@
 Keep Guessing?
When Considering Inference Scaling, Mind the Baselines GalYonaResearch, Google OrHonovich - OmerLevyFacebook + OmerLevyFacebook RoeeAharoniGoogle 5979-5991 Scaling inference compute in large language models (LLMs) through repeated sampling consistently increases the coverage (fraction of problems solved) as the number of samples increases. We conjecture that this observed improvement is partially due to the answer distribution of standard evaluation benchmarks, which is skewed towards a relatively small set of common answers. To test this conjecture, we define a baseline that enumerates answers according to their prevalence in the training set. Experiments spanning two domains – mathematical reasoning and factual knowledge – reveal that this baseline outperforms repeated model sampling for some LLMs, while the coverage for others is on par with that of a mixture strategy that obtains k answers by using only 10 model samples and similarly guessing the remaining k-10 attempts via enumeration. Our baseline enables a more accurate measurement of how much repeated sampling improves coverage in such settings beyond prompt-agnostic guessing. @@ -4188,11 +4188,11 @@ Can <fixed-case>I</fixed-case> Introduce My Boyfriend to My Grandmother? Evaluating Large Language Models Capabilities on <fixed-case>I</fixed-case>ranian Social Norm Classification - HamidrezaSaffariPolytechnic Institute of Milan + HamidrezaSaffariPolytechnic Institute of Milan MohammadaminShafieiUniversity of Milan - DonyaRooeinBocconi University + DonyaRooeinBocconi University FrancescoPierriPolitecnico di Milano - DeboraNozzaBocconi University + DeboraNozzaBocconi University 6060-6074 Creating globally inclusive AI systems demands datasets reflecting diverse social norms. Iran, with its unique cultural blend, offers an ideal case study, with Farsi adding linguistic complexity. In this work, we introduce the Iranian Social Norms (ISN) dataset, a novel collection of 1,699 Iranian social norms, including environments, demographic features, and scope annotation, alongside English translations. Our evaluation of 6 Large Language Models (LLMs) in classifying Iranian social norms, using a variety of prompts, uncovered critical insights into the impact of geographic and linguistic context. Results revealed a substantial performance gap in LLMs’ comprehension of Iranian norms. Notably, while the geographic context in English prompts enhanced the performance, this effect was absent in Farsi, pointing to nuanced linguistic challenges. Particularly, performance was significantly worse for Iran-specific norms, emphasizing the importance of culturally tailored datasets. As the first Farsi dataset for social norm classification, ISN will facilitate crucial cross-cultural analyses, shedding light on how values differ across contexts and cultures. 2025.findings-naacl.337 @@ -4212,9 +4212,9 @@ Adapting <fixed-case>LLM</fixed-case> Agents with Universal Communication Feedback KuanWang YadongLuMicrosoft - MichaelSantacroceMicrosoft + MichaelSantacroceMicrosoft YeyunGong - ChaoZhangGeorgia Institute of Technology + ChaoZhangGeorgia Institute of Technology YelongShen 6090-6107 Recent advances in large language models (LLMs) have demonstrated potential for LLM agents. To facilitate the training for these agents with both linguistic feedback and non-linguistic reward signals, we introduce Learning through Communication (LTC). 
We design a universal buffer to store all the feedback, and an iterative pipeline to enable an LLM agent to explore and update its policy in a given environment. To optimize agent interactions for task-specific learning with our universal buffer and pipeline, we introduce diverse communication patterns tailored for both single-agent and multi-agent environments. We evaluate the efficacy of our LTC approach on four diverse datasets: ALFWorld (single-agent), HotpotQA (multi-agent collaboration), Chameleon (multi-agent competition), and GSM8k (multi-agent teacher-student). On these datasets, LTC outperforms the supervised instruction fine-tuning baselines by 3.6% to 12%. These results highlight the versatility and efficiency of LTC in facilitating online adaptation for LLM agents.


@@ -4233,7 +4233,7 @@
 <fixed-case>S</fixed-case>ea<fixed-case>E</fixed-case>xam and <fixed-case>S</fixed-case>ea<fixed-case>B</fixed-case>ench: Benchmarking <fixed-case>LLM</fixed-case>s with Local Multilingual Questions in <fixed-case>S</fixed-case>outheast <fixed-case>A</fixed-case>sia
- ChaoqunLiu
+ ChaoqunLiu
 WenxuanZhangSingapore University of Technology and Design
 JiahaoYing
 MahaniAljuniedAlibaba Group
@@ -4256,10 +4256,10 @@
 From Intentions to Techniques: A Comprehensive Taxonomy and Challenges in Text Watermarking for Large Language Models
- Harsh NishantLalai
+ Harsh NishantLalai
 AashishAnantha RamakrishnanPennsylvania State University, Pennsylvania State University
- Raj SanjayShahGeorgia Institute of Technology
- DongwonLeeThe Pennsylvania State University
+ Raj SanjayShahGeorgia Institute of Technology
+ DongwonLeeThe Pennsylvania State University
 6147-6160
 With the rapid growth of Large Language Models (LLMs), safeguarding textual content against unauthorized use is crucial. Watermarking offers a vital solution, protecting both LLM-generated and plain text sources. This paper presents a unified overview of different perspectives behind designing watermarking techniques through a comprehensive survey of the research literature. Our work has two key advantages: (1) We analyze research based on the specific intentions behind different watermarking techniques, evaluation datasets used, and watermarking addition and removal methods to construct a cohesive taxonomy. (2) We highlight the gaps and open challenges in text watermarking to promote research protecting text authorship. This extensive coverage and detailed analysis set our work apart, outlining the evolving landscape of text watermarking in Language Models.
 2025.findings-naacl.343


@@ -4268,9 +4268,9 @@
 <fixed-case>M</fixed-case>-<fixed-case>IFE</fixed-case>val: Multilingual Instruction-Following Evaluation
 AntoineDussolle
- A.Cardeña
+ A.Cardeña
 ShotaSatoLightblue
- PeterDevine
+ PeterDevine
 6161-6176
 Instruction following is a core capability of modern large language models (LLMs), making evaluating this capability essential to understanding these models. The Instruction Following Evaluation (IFEval) benchmark from the literature does this using objective criteria, offering a measure of LLM performance without subjective AI or human judgement. However, it only includes English instructions, limiting its ability to assess LLMs in other languages. We propose the Multilingual Instruction Following Evaluation (M-IFEval) benchmark, expanding the evaluation to French, Japanese, and Spanish, with both general and language-specific instructions.
Applying this benchmark to 8 state-of-the-art LLMs, we find that benchmark performance across languages and instruction types can vary widely, underscoring the importance of a multilingual benchmark for evaluating LLMs in a diverse cultural context.
 2025.findings-naacl.344


@@ -4278,12 +4278,12 @@
 Automatic Annotation Augmentation Boosts Translation between Molecules and Natural Language
- ZhiqiangZhong
+ ZhiqiangZhong
 Simon Sataa-YuLarsen
 HaoyuGuo
- TaoTang
+ TaoTang
- KuangyuZhouMicrosoft
+ KuangyuZhouMicrosoft
- DavideMottinAarhus University
+ DavideMottinAarhus University
 6177-6194
 Recent advancements in AI for biological research focus on integrating molecular data with natural language to accelerate drug discovery. However, the scarcity of high-quality annotations limits progress in this area. This paper introduces LA^3, a Language-based Automatic Annotation Augmentation framework that leverages large language models to augment existing datasets, thereby improving AI training. We demonstrate the effectiveness of LA^3 by creating an enhanced dataset, LaChEBI-20, where we systematically rewrite the annotations of molecules from an established dataset. These rewritten annotations preserve essential molecular information while providing more varied sentence structures and vocabulary. Using LaChEBI-20, we train LaMolT5 based on a benchmark architecture to learn the mapping between molecular representations and augmented annotations. Experimental results on text-based *de novo* molecule generation and molecule captioning demonstrate that LaMolT5 outperforms state-of-the-art models. Notably, incorporating LA^3 leads to improvements of up to 301% over the benchmark architecture. Furthermore, we validate the effectiveness of LA^3 in notable applications across *image*, *text*, and *graph* tasks, affirming its versatility and utility.
 2025.findings-naacl.345


@@ -4292,8 +4292,8 @@
 Let Modalities Teach Each Other: Modal-Collaborative Knowledge Extraction and Fusion for Multimodal Knowledge Graph Completion
 GuoliangZhu
- TaoRen
- DandanWangInstitute of Software Chinese Academy of Sciences
+ TaoRen
+ DandanWangInstitute of Software Chinese Academy of Sciences
 JunHuInstitute of Software, CAS
 6195-6207
 Multimodal knowledge graph completion (MKGC) aims to predict missing triples in MKGs using multimodal information. Recent research typically either extracts information from each modality separately to predict, then ensembles the predictions at the decision stage, or projects multiple modalities into a unified feature space to learn multimodal representations for prediction. However, these methods usually overlook the intrinsic correlation between modalities in MKGs, which should be leveraged in both unimodal knowledge extraction and multimodal knowledge fusion. Motivated by this, we propose a novel Modal-collaborative knowledge learning (Moodle) framework for MKGC, the key idea of which is to foster mutual guidance and collaboration during unimodal knowledge extraction, to let each modality acquire distinct and complementary knowledge that subsequently enhances the multimodal knowledge fusion. Specifically, Moodle preserves the representations of different modalities to learn unimodal knowledge while modeling the mutual guidance through multi-task learning. Furthermore, Moodle performs multimodal knowledge fusion and prediction guided by unimodal knowledge, capturing their synergistic relationships and acquiring fine-grained semantic knowledge through contrastive learning.
Extensive experiments on three real-world datasets demonstrate the advantages of Moodle over state-of-the-art methods. @@ -4302,10 +4302,10 @@ Modeling the Differential Prevalence of Online Supportive Interactions in Private Instant Messages of Adolescents - OndrejSotolarMasaryk University + OndrejSotolarMasaryk University MichałTkaczyk JaromírPlhákMasaryk University - DavidSmahel + DavidSmahel 6208-6226 This paper focuses on modeling gender-based and pair-or-group disparities in online supportive interactions among adolescents. To address the limitations of conventional social science methods in handling large datasets, this research employs language models to detect supportive interactions based on the Social Support Behavioral Code and to model their distribution. The study conceptualizes detection as a classification task, constructs a new dataset, and trains predictive models. The novel dataset comprises 196,772 utterances from 2165 users collected from Instant Messenger apps. The results show that the predictions of language models can be used to effectively model the distribution of supportive interactions in private online dialogues. As a result, this study provides new computational evidence that supports the theory that supportive interactions are more prevalent in online female-to-female conversations. The findings advance our understanding of supportive interactions in adolescent communication and present methods to automate the analysis of large datasets, opening new research avenues in computational social science. 2025.findings-naacl.347 @@ -4316,7 +4316,7 @@ RuiquanZhang RuiZhaoXiamen University ZhicongWu - LiangZhang + LiangZhang HaoqiZhang YidongChen 6227-6239 @@ -4326,7 +4326,7 @@ Selective Self-to-Supervised Fine-Tuning for Generalization in Large Language Models - SonamGuptaIndian Institute of Technology, Madras + SonamGuptaIndian Institute of Technology, Madras YatinNandwaniInternational Business Machines AsafYehudai DineshKhandelwalInternational Business Machines @@ -4339,20 +4339,20 @@ <fixed-case>P</fixed-case>roverb<fixed-case>E</fixed-case>val: Exploring <fixed-case>LLM</fixed-case> Evaluation Challenges for Low-resource Language Understanding - Israel AbebeAzime - Atnafu LambeboTonjaMohamed bin Zayed University of Artificial Intelligence + Israel AbebeAzime + Atnafu LambeboTonjaMohamed bin Zayed University of Artificial Intelligence Tadesse DestawBelay YonasChanie Bontu FufaBalcha - Negasi HaileAbadiLesan AI + Negasi HaileAbadiLesan AI Henok BiadglignAdemtew Mulubrhan AbebeNerea Debela DesalegnYadeta Derartu DagneGeremewAdama Science and Technology University and Gebeya Inc. 
Assefa AtsbihaTesfu - PhilippSlusallekGerman Research Center for Artificial Intelligence (DFKI) and Saarland University + PhilippSlusallekGerman Research Center for Artificial Intelligence (DFKI) and Saarland University ThamarSolorioMohamed bin Zayed University of Artificial Intelligence and University of Houston - DietrichKlakow + DietrichKlakow 6250-6266 2025.findings-naacl.350 azime-etal-2025-proverbeval @@ -4361,7 +4361,7 @@ <fixed-case>MRE</fixed-case>-<fixed-case>MI</fixed-case>: A Multi-image Dataset for Multimodal Relation Extraction in Social Media Posts ShizhouHuang BoXuDonghua University, Shanghai - ChangqunLi + ChangqunLi YangYu Xin AlexLin 6267-6277 @@ -4374,7 +4374,7 @@ Do HuuDat Duc AnhDo Anh TuanLuuNanyang Technological University - WrayBuntineVinUniversity + WrayBuntineVinUniversity 6278-6290 While diffusion models excel at conditionally generating high-quality images, prior works in discrete diffusion models were not evaluated on conditional long-text generation. This work addresses the limitations of prior discrete diffusion models for conditional long-text generation, particularly in the long abstractive summarization task. Despite faster decoding speeds compared to autoregressive methods, previous discrete diffusion models failed on the abstractive summarization task due to the incompatibility between the backbone architectures and the random noising process. To overcome these challenges, we introduce a novel semantic-aware noising process that enables Transformer backbones to handle long sequences effectively. Additionally, we propose CrossMamba, an adaptation of the Mamba model to the encoder-decoder paradigm, which integrates seamlessly with the random absorbing noising process. Our approaches outperform existing discrete diffusion models on three benchmark summarization datasets: Gigaword, CNN/DailyMail, and Arxiv, while also achieving much faster inference speed compared to autoregressive models. 2025.findings-naacl.352 @@ -4412,7 +4412,7 @@ TatsuroInaba KeitoKudo KeisukeSakaguchiTohoku University - KentaroInuiMBZUAI, RIKEN and Tohoku University + KentaroInuiMBZUAI, RIKEN and Tohoku University 6324-6343 According to the stages-of-inference hypothesis, early layers of language models map their subword-tokenized input, which does not necessarily correspond to a linguistically meaningful segmentation, to more meaningful representations that form the model’s “inner vocabulary”.Prior analysis of this *detokenization* stage has predominantly relied on probing and interventions such as path patching, which involve selecting particular inputs, choosing a subset of components that will be patched, and then observing changes in model behavior.Here, we show that several important aspects of the detokenization stage can be understood purely by analyzing model weights, without performing any model inference steps.Specifically, we introduce an analytical decomposition of first-layer attention in GPT-2.Our decomposition yields interpretable terms that quantify the relative contributions of position-related, token-related, and mixed effects.By focusing on terms in this decomposition, we discover weight-based explanations of attention bias toward close tokens and attention for detokenization. 
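A weight-only analysis in the spirit of the detokenization entry above can be sketched directly from GPT-2's matrices: expanding queries and keys over token-plus-position embeddings yields token-token, token-position, position-token, and position-position logit terms. The sketch below, assuming the Hugging Face transformers package, computes two of these terms from the weights alone, with no forward pass; it ignores layer norm, biases, attention scaling, and per-head structure, and is not the paper's exact decomposition.

# Hedged sketch: first-layer attention logit terms from GPT-2 weights only.
import torch
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2")
d = model.config.n_embd                       # 768
wte = model.wte.weight                        # token embeddings   (50257, 768)
wpe = model.wpe.weight                        # position embeddings (1024, 768)

w_attn = model.h[0].attn.c_attn.weight        # (768, 3*768): Q, K, V stacked
w_q, w_k = w_attn[:, :d], w_attn[:, d:2 * d]  # all heads concatenated

with torch.no_grad():
    # Position-position term: attention structure that depends on positions
    # alone (restricted to the first 64 positions for a small example).
    pos_term = (wpe[:64] @ w_q) @ (wpe[:64] @ w_k).T
    # Token-token term for a small slice of the vocabulary.
    tok_term = (wte[:64] @ w_q) @ (wte[:64] @ w_k).T

print("position-position logits:", pos_term.shape)
print("token-token logits:", tok_term.shape)

Inspecting the position-position term already makes biases such as attention toward nearby positions visible without running the model, which is the kind of inference-free analysis the abstract describes.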
2025.findings-naacl.355 @@ -4434,11 +4434,11 @@ <fixed-case>SOLID</fixed-case>: Self-seeding and Multi-intent Self-instructing <fixed-case>LLM</fixed-case>s for Generating Intent-aware Information-Seeking Dialogs ArianAskari RoxanaPetcuUniversity of Amsterdam and University of Amsterdam - ChuanMengUniversity of Amsterdam - MohammadAliannejadiUniversity of Amsterdam + ChuanMengUniversity of Amsterdam + MohammadAliannejadiUniversity of Amsterdam AminAbolghasemi - EvangelosKanoulasUniversity of Amsterdam - SuzanVerberneUniversiteit Leiden + EvangelosKanoulasUniversity of Amsterdam + SuzanVerberneUniversiteit Leiden 6375-6395 Intent prediction in information-seeking dialogs is challenging and requires a substantial amount of data with human-labeled intents for effective model training. While Large Language Models (LLMs) have demonstrated effectiveness in generating synthetic data, existing methods typically rely on human feedback and are tailored to structured, task-oriented intents. In this paper, we leverage LLMs for zero-shot generation of large-scale, open-domain, intent-aware information-seeking dialogs to serve as training data for intent prediction models. We introduce SOLID, a method that generates dialogs turn by turn using novel self-seeding and multi-intent self-instructing strategies. Additionally, we propose SOLID-RL, a finetuned version that generates an entire dialog in one step using data created with SOLID. SOLID and SOLID-RL are each used to generate over 300k intent-aware dialogs, significantly surpassing the size of existing datasets. Experiments show that intent prediction models trained on sampled dialogs generated by SOLID and SOLID-RL outperform those trained solely on human-generated dialogs. Our findings demonstrate the potential of LLMs to expand training datasets, as they provide valuable resources for conversational agents across multiple tasks. Our self-seeding and self-instructing approaches are adaptable to various conversational data types and languages with minimal modifications. 2025.findings-naacl.357 @@ -4450,7 +4450,7 @@ YunkeWangUniversity of Sydney DaochangLiuUniversity of Western Australia BoDuWuhan University - ChangXuUniversity of Sydney + ChangXuUniversity of Sydney 6396-6418 2025.findings-naacl.358 xu-etal-2025-collageprompt @@ -4474,7 +4474,7 @@ JongyoonSongSamsung Research SaehyungLeeSeoul National University JunsungParkSeoul National University - SungrohYoonSeoul National University + SungrohYoonSeoul National University 6435-6455 Multi-hop reasoning, which requires multi-step reasoning based on the supporting documents within a given context, remains challenging for large language models (LLMs). LLMs often struggle to filter out irrelevant documents within the context, and their performance is sensitive to the absolute position of supporting documents within that context. In this paper, we identify an additional challenge: LLMs’ performance is also sensitive to the order, relative position, in which the supporting documents are presented. We refer to this as the misordered context problem. To address this issue, based on the theoretical approach, we propose a simple yet effective method called context repetition (CoRe), which involves prompting the model by repeatedly presenting the context. This ensures that certain contiguous reasoning segments within supporting documents are presented in the optimal order, effectively guiding the model’s reasoning in the appropriate direction. 
Applying CoRe, we improve the F1 score by up to 30%p on multi-hop QA tasks and increase accuracy by up to 70%p on a synthetic task. Additionally, CoRe helps mitigate the well-known “lost-in-the-middle” problem in LLMs and can be effectively combined with retrieval-based approaches utilizing Chain-of-Thought (CoT) reasoning. 2025.findings-naacl.360 @@ -4482,9 +4482,9 @@ Text Annotation via Inductive Coding: Comparing Human Experts to <fixed-case>LLM</fixed-case>s in Qualitative Data Analysis - AngelinaParfenova - AndreasMarfurtHSLU - Lucerne University of Applied Sciences and Arts - JürgenPfefferTechnische Universität München + AngelinaParfenova + AndreasMarfurtHSLU - Lucerne University of Applied Sciences and Arts + JürgenPfefferTechnische Universität München AlexanderDenzlerHSLU - Lucerne University of Applied Sciences and Arts 6456-6469 This paper investigates the automation of qualitative data analysis, focusing on inductive coding using large language models (LLMs). Unlike traditional approaches that rely on deductive methods with predefined labels, this research investigates the inductive process where labels emerge from the data. The study evaluates the performance of six open-source LLMs compared to human experts. As part of the evaluation, experts rated the perceived difficulty of the quotes they coded. The results reveal a peculiar dichotomy: human coders consistently perform well when labeling complex sentences but struggle with simpler ones, while LLMs exhibit the opposite trend. Additionally, the study explores systematic deviations in both human and LLM-generated labels by comparing them to the golden standard from the test set. While human annotations may sometimes differ from the golden standard, they are often rated more favorably by other humans. In contrast, some LLMs demonstrate closer alignment with the true labels but receive lower evaluations from experts. @@ -4503,7 +4503,7 @@ Breaking <fixed-case>R</fixed-case>e<fixed-case>A</fixed-case>ct Agents: Foot-in-the-Door Attack Will Get You In ItayNakashInternational Business Machines - GeorgeKourInternational Business Machines + GeorgeKourInternational Business Machines GuyUzielInternational Business Machines AteretAnaby TavorInternational Business Machines 6484-6509 @@ -4513,8 +4513,8 @@ As easy as <fixed-case>PIE</fixed-case>: understanding when pruning causes language models to disagree - PietroTropeanoCopenhagen University - MariaMaistroUniversity of Copenhagen + PietroTropeanoCopenhagen University + MariaMaistroUniversity of Copenhagen TuukkaRuotsaloLappeenranta University of Technology, University of Copenhagen and University of Helsinki ChristinaLiomaUniversity of Copenhagen 6510-6536 @@ -4524,13 +4524,13 @@ Multi-Agent Simulator Drives Language Models for Legal Intensive Interaction - ShengbinYue + ShengbinYue TingHuang ZhengJia SiyuanWang ShujunLiu YunSong - XuanjingHuang + XuanjingHuang ZhongyuWei 6537-6570 Large Language Models (LLMs) have significantly advanced legal intelligence, but the scarcity of scenario data impedes the progress toward interactive legal scenarios. This paper introduces a Multi-agent Legal Simulation Driver (MASER) to scalably generate synthetic data by simulating interactive legal scenarios. Leveraging real-legal case sources, MASER ensures the consistency of legal attributes between participants and introduces a supervisory mechanism to align participants’ characters and behaviors as well as addressing distractions. 
A Multi-stage Interactive Legal Evaluation (MILE) benchmark is further constructed to evaluate LLMs’ performance in dynamic legal scenarios. Extensive experiments confirm the effectiveness of our framework. @@ -4539,7 +4539,7 @@ Exploring Backward Reasoning in Large Language Models - LeonardoRanaldi + LeonardoRanaldi GiuliaPucci 6571-6586 Multi-step reasoning through in-context learning strategies have been extensively explored, highlighting the abilities of Large Language Models (LLMs) to generate answers derived from step-by-step reasoning. These studies focus the attention on LLMs’ forward reasoning abilities epitomised in a series of general premises leading to a final solution. In this paper, by taking the reverse perspective, we study the backward reasoning abilities of LLMs, namely the inference that leads to the causal hypothesis. Behind formalising the backward problems, we analyse whether the LLMs are able to reason about the conclusion and reconstruct the original question that led to the delivery of the final answer. Operating with question-answering tasks involving symbolic reasoning, understanding, and commonsense abilities, we observe that the proposed models reveal robust comprehension capabilities managing different kinds of input; however, they are not always able to reason in the backward direction. Finally, to challenge this limitation, we demonstrate that instructing LLMs to generate the answer by reconsidering the structure of the problem allows for improved backward reasoning direction. @@ -4561,11 +4561,11 @@ Dynamic Guided and Domain Applicable Safeguards for Enhanced Security in Large Language Models WeidiLuo HeCao - ZijingLiuInternational Digital Economy Academy + ZijingLiuInternational Digital Economy Academy YuWang AidanWongInternational Digital Economy Academy, International Digital Economy Academy BinFengInternational Digital Economy Academy, International Digital Economy Academy - YuanYaoThe Hong Kong University of Science and Technology + YuanYaoThe Hong Kong University of Science and Technology YuLiInternational Digital Economy Academy 6599-6620 With the extensive deployment of Large Language Models (LLMs), ensuring their safety has become increasingly critical. However, existing defense methods often struggle with two key issues: (i) inadequate defense capabilities, particularly in domain-specific scenarios like chemistry, where a lack of specialized knowledge can lead to the generation of harmful responses to malicious queries. (ii) over-defensiveness, which compromises the general utility and responsiveness of LLMs. To mitigate these issues, we introduce a multi-agents-based defense framework, Guide for Defense (G4D), which leverages accurate external information to provide an unbiased summary of user intentions and analytically grounded safety response guidance. Extensive experiments on popular jailbreak attacks and benign datasets show that our G4D can enhance LLM’s robustness against jailbreak attacks on general and domain-specific scenarios without compromising the model’s general functionality. @@ -4596,15 +4596,15 @@ Optimizing <fixed-case>LLM</fixed-case>s for <fixed-case>I</fixed-case>talian: Reducing Token Fertility and Enhancing Efficiency Through Vocabulary Adaptation - LucaMoroniUniversity of Roma “La Sapienza” + LucaMoroniUniversity of Roma “La Sapienza” GiovanniPuccettiCNR - Pere-LluísHuguet CabotFacebook - Andrei StefanBejgu + Pere-LluísHuguet CabotFacebook + Andrei StefanBejgu AlessioMiaschiInstitute for Computational Linguistics “A. 
Zampolli” (CNR-ILC), Pisa
 EdoardoBarbaUniversity of Roma “La Sapienza”
- FeliceDell’OrlettaIstituto di Linguistica Computazionale “A. Zampolli” (ILC)
- AndreaEsuliCNR
- RobertoNavigliSapienza University of Rome
+ FeliceDell’OrlettaIstituto di Linguistica Computazionale “A. Zampolli” (ILC)
+ AndreaEsuliCNR
+ RobertoNavigliSapienza University of Rome
 6646-6660
 The number of pretrained Large Language Models (LLMs) is increasing steadily, though the majority are designed predominantly for the English language. While state-of-the-art LLMs can handle other languages, due to language contamination or some degree of multilingual pretraining data, they are not optimized for non-English languages, leading to inefficient encoding (high token “fertility”) and slower inference speed. In this work, we thoroughly compare a variety of vocabulary adaptation techniques for optimizing English LLMs for the Italian language, and put forward Semantic Alignment Vocabulary Adaptation (SAVA), a novel method that leverages neural mapping for vocabulary substitution. SAVA achieves competitive performance across multiple downstream tasks, enhancing grounded alignment strategies. We adapt two LLMs: Mistral-7B-v0.1, reducing token fertility by 25%, and Llama-3.1-8B, optimizing the vocabulary and reducing the number of parameters by 1 billion. We show that, following the adaptation of the vocabulary, these models can recover their performance with a relatively limited stage of continual training on the target language. Finally, we test the capabilities of the adapted models on various multi-choice and generative tasks.
 2025.findings-naacl.371
 moroni-etal-2025-optimizing


 Beyond the Mode: Sequence-Level Distillation of Multilingual Translation Models for Low-Resource Language Pairs
- AarónGaliano-JiménezUniversidad de Alicante
- Juan AntonioPérez-OrtizUniversidad de Alicante
- FelipeSánchez-MartínezUniversity of Alicante
- Víctor M.Sánchez-CartagenaUniversidad de Alicante
+ AarónGaliano-JiménezUniversidad de Alicante
+ Juan AntonioPérez-OrtizUniversidad de Alicante
+ FelipeSánchez-MartínezUniversity of Alicante
+ Víctor M.Sánchez-CartagenaUniversidad de Alicante
 6661-6676
 This paper delves into sequence-level knowledge distillation (KD) of multilingual pre-trained translation models. We posit that, beyond the approximated mode obtained via beam search, the whole output distribution of the teacher contains valuable insights for students. We explore the potential of n-best lists from beam search to guide student’s learning and then investigate alternative decoding methods to address observed issues like low variability and under-representation of infrequent tokens. Our research in data-limited scenarios reveals that although sampling methods can slightly compromise the translation quality of the teacher output compared to beam search based methods, they enrich the generated corpora with increased variability and lexical richness, ultimately enhancing student model performance and reducing the gender bias amplification commonly associated with KD.
2025.findings-naacl.372 @@ -4623,7 +4623,7 @@ <fixed-case>LLM</fixed-case>s for Extremely Low-Resource <fixed-case>F</fixed-case>inno-<fixed-case>U</fixed-case>gric Languages - TaidoPurasonUniversity of Tartu + TaidoPurasonUniversity of Tartu Hele-AndraKuulmets MarkFishelUniversity of Tartu 6677-6697 @@ -4633,7 +4633,7 @@ <fixed-case>LOFT</fixed-case>: Scalable and More Realistic Long-Context Evaluation - JinhyukLeeGoogle + JinhyukLeeGoogle AnthonyChenGoogle DeepMind ZhuyunDaiGoogle DheeruDuaGoogle @@ -4642,12 +4642,12 @@ YiLuanGoogle SébArnoldGoogle DeepMind VincentPerotGoogle - SiddharthDalmiaGoogle Deepmind + SiddharthDalmiaGoogle Deepmind HexiangHuxAI XudongLinColumbia University PanupongPasupatGoogle AidaAminiUniversity of Washington, Seattle - Jeremy R.ColeGoogle DeepMind + Jeremy R.ColeGoogle DeepMind SebastianRiedelGoogle and University College London IftekharNaimGoogle Ming-WeiChangGoogle Deepmind @@ -4660,7 +4660,7 @@ On the Influence of Context Size and Model Choice in Retrieval-Augmented Generation Systems JurajVladikaTechnische Universität München - FlorianMatthesTechnische Universität München + FlorianMatthesTechnische Universität München 6724-6736 Retrieval-augmented generation (RAG) has emerged as an approach to augment large language models (LLMs) by reducing their reliance on static knowledge and improving answer factuality. RAG retrieves relevant context snippets and generates an answer based on them. Despite its increasing industrial adoption, systematic exploration of RAG components is lacking, particularly regarding the ideal size of provided context, and the choice of base LLM and retrieval method. To help guide development of robust RAG systems, we evaluate various context sizes, BM25 and semantic search as retrievers, and eight base LLMs. Moving away from the usual RAG evaluation with short answers, we explore the more challenging long-form question answering in two domains, where a good answer has to utilize the entire context. Our findings indicate that final QA performance improves steadily with up to 15 snippets but stagnates or declines beyond that. Finally, we show that different general-purpose LLMs excel in the biomedical domain than the encyclopedic one, and that open-domain evidence retrieval in large corpora is challenging. 2025.findings-naacl.375 @@ -4668,10 +4668,10 @@ Aligning Black-box Language Models with Human Judgments - Gerrit J.j.Van Den BurgAmazon + Gerrit J.j.Van Den BurgAmazon GenSuzukiAmazon WeiLiuAmazon - MuratSensoy + MuratSensoy 6737-6749 Large language models (LLMs) are increasingly used as automated judges to evaluate recommendation systems, search engines, and other subjective tasks, where relying on human evaluators can be costly, time-consuming, and unscalable. LLMs offer an efficient solution for continuous, automated evaluation. However, since the systems that are built and improved with these judgments are ultimately designed for human use, it is crucial that LLM judgments align closely with human evaluators to ensure such systems remain human-centered. On the other hand, aligning LLM judgments with human evaluators is challenging due to individual variability and biases in human judgments. We propose a simple yet effective framework to align LLM judgments with individual human evaluators or their aggregated judgments, without retraining or fine-tuning the LLM. 
Our approach learns a linear mapping between the LLM’s outputs and human judgments, achieving over 142% average improvement in agreement across 29 tasks with only a small number of calibration examples used for training. Notably, our method works in zero-shot and few-shot settings, exceeds inter-human agreement on four out of six tasks, and enables smaller LLMs to achieve performance comparable to that of larger models. 2025.findings-naacl.376 @@ -4679,7 +4679,7 @@ Guideline Compliance in Task-Oriented Dialogue: The Chained Prior Approach - XiangyuWenDepartment of Computer Science and Engineering, The Chinese University of Hong Kong + XiangyuWenDepartment of Computer Science and Engineering, The Chinese University of Hong Kong JianyuanZhong ZhijianXu QiangXuThe Chinese University of Hong Kong @@ -4692,8 +4692,8 @@ <fixed-case>A</fixed-case>uto<fixed-case>B</fixed-case>reach: Universal and Adaptive Jailbreaking with Efficient Wordplay-Guided Optimization via Multi-<fixed-case>LLM</fixed-case>s JiaweiChen XiaoYangTsinghua University, Tsinghua University - ZhengweiFang - YuTian + ZhengweiFang + YuTian YinpengDong ZhaoxiaYinEast China Normal University HangSuTsinghua University @@ -4707,7 +4707,7 @@ BingfengChenGuangdong University of Technology ChenjieQiu YifengXie - BoyanXu + BoyanXu RuichuCaiGuangdong University of Technology ZhifengHaoShantou University 6799-6806 @@ -4716,16 +4716,16 @@ <fixed-case>B</fixed-case>an<fixed-case>NERD</fixed-case>: A Benchmark Dataset and Context-Driven Approach for <fixed-case>B</fixed-case>angla Named Entity Recognition - Md. MotaharMahtabGIGATECH, BEXIMCO - Faisal AhamedKhanGiga Tech Limited - Md. EkramulIslamGiga Tech Limited. - Md. Shahad MahmudChowdhuryGiga Tech Limited + Md. MotaharMahtabGIGATECH, BEXIMCO + Faisal AhamedKhanGiga Tech Limited + Md. EkramulIslamGiga Tech Limited. + Md. Shahad MahmudChowdhuryGiga Tech Limited Labib ImamChowdhuryGiga Tech Limited - SadiaAfrinHeinrich-Heine Universität Düsseldorf and Giga Tech Limited + SadiaAfrinHeinrich-Heine Universität Düsseldorf and Giga Tech Limited HazratAliGiga Tech Limited - Mohammad Mamun OrRashid - NabeelMohammedNorth South University - Mohammad RuhulAminFordham University + Mohammad Mamun OrRashid + NabeelMohammedNorth South University + Mohammad RuhulAminFordham University 6807-6828 In this study, we introduce BanNERD, the most extensive human-annotated and validated Bangla Named Entity Recognition Dataset to date, comprising over 85,000 sentences. BanNERD is curated from a diverse array of sources, spanning over 29 domains, thereby offering a comprehensive range of generalized contexts. To ensure the dataset’s quality, expert linguists developed a detailed annotation guideline tailored to the Bangla language. All annotations underwent rigorous validation by a team of validators, with final labels being determined via majority voting, thereby ensuring the highest annotation quality and a high IAA score of 0.88. In a cross-dataset evaluation, models trained on BanNERD consistently outperformed those trained on four existing Bangla NER datasets. Additionally, we propose a method named BanNERCEM (Bangla NER context-ensemble Method) which outperforms existing approaches on Bangla NER datasets and performs competitively on English datasets using lightweight Bangla pretrained LLMs. 
Our approach passes each context separately to the model instead of previous concatenation-based approaches achieving the highest average macro F1 score of 81.85% across 10 NER classes, outperforming previous approaches and ensuring better context utilization. We are making the code and datasets publicly available at https://github.com/eblict-gigatech/BanNERD in order to contribute to the further advancement of Bangla NLP. 2025.findings-naacl.380 @@ -4734,10 +4734,10 @@ Large Language Models Reflect Human Citation Patterns with a Heightened Citation Bias AndresAlgabaVrije Universiteit Brussel - CarmenMazijnVrije Universiteit Brussel - VincentHolstVrije Universiteit Brussel - FlorianoToriVrije Universiteit Brussel - SylviaWenmackersKU Leuven + CarmenMazijnVrije Universiteit Brussel + VincentHolstVrije Universiteit Brussel + FlorianoToriVrije Universiteit Brussel + SylviaWenmackersKU Leuven VincentGinisVrije Universiteit Brussel 6829-6864 Citation practices are crucial in shaping the structure of scientific knowledge, yet they are often influenced by contemporary norms and biases. The emergence of Large Language Models (LLMs) introduces a new dynamic to these practices. Interestingly, the characteristics and potential biases of references recommended by LLMs that entirely rely on their parametric knowledge, and not on search or retrieval-augmented generation, remain unexplored. Here, we analyze these characteristics in an experiment using a dataset from AAAI, NeurIPS, ICML, and ICLR, published after GPT-4’s knowledge cut-off date. In our experiment, LLMs are tasked with suggesting scholarly references for the anonymized in-text citations within these papers. Our findings reveal a remarkable similarity between human and LLM citation patterns, but with a more pronounced high citation bias, which persists even after controlling for publication year, title length, number of authors, and venue. The results hold for both GPT-4, and the more capable models GPT-4o and Claude 3.5 where the papers are part of the training data. Additionally, we observe a large consistency between the characteristics of LLM’s existing and non-existent generated references, indicating the model’s internalization of citation patterns. By analyzing citation graphs, we show that the references recommended are embedded in the relevant citation context, suggesting an even deeper conceptual internalization of the citation networks. While LLMs can aid in citation generation, they may also amplify existing biases, such as the Matthew effect, and introduce new ones, potentially skewing scientific knowledge dissemination. @@ -4747,7 +4747,7 @@ What can Large Language Models Capture about Code Functional Equivalence? NickilMaveliUniversity of Edinburgh, University of Edinburgh - AntonioVergariUniversity of Edinburgh, University of Edinburgh + AntonioVergariUniversity of Edinburgh, University of Edinburgh Shay BCohenUniversity of Edinburgh 6865-6903 Code-LLMs, LLMs pre-trained on large code corpora, have shown great progress in learning rich representations of the structure and syntax of code, successfully using it to generate or classify code fragments. At the same time, understanding if they are able to do so because they capture code semantics, and how well, is still an open question. In this paper, we tackle this problem by introducing SeqCoBench, a benchmark for systematically assessing how Code-LLMs can capture code functional equivalence. 
SeqCoBench contains over 20 code transformations that either preserve or alter the semantics of Python programs. We conduct extensive evaluations in different settings, including zero-shot and parameter-efficient finetuning methods on state-of-the-art (Code)-LLMs to see if they can discern semantically equivalent or different pairs of programs in SeqCoBench. We find that the performance gap between these LLMs and classical match-based retrieval scores is minimal, with both approaches showing a concerning lack of depth in understanding code semantics. @@ -4756,14 +4756,14 @@ Make Every Penny Count: Difficulty-Adaptive Self-Consistency for Cost-Efficient Reasoning - XinglinWang + XinglinWang ShaoxiongFengRedNote YiweiLi - PeiwenYuan - YueqiZhang + PeiwenYuan + YueqiZhang ChuyiTan BoyuanPan - YaoHu + YaoHu KanLi 6904-6917 Self-consistency (SC), a widely used decoding strategy for chain-of-thought reasoning, shows significant gains across various multi-step reasoning tasks but comes with a high cost due to multiple sampling with the preset size. Its variants, Adaptive self-consistency (ASC) and Early-stopping self-consistency (ESC), dynamically adjust the number of samples based on the posterior distribution of a set of pre-samples, reducing the cost of SC with minimal impact on performance. Both methods, however, do not exploit the prior information about question difficulty. It often results in unnecessary repeated sampling for easy questions that could be accurately answered with just one attempt, wasting resources. To tackle this problem, we propose Difficulty-Adaptive Self-Consistency (DSC), which leverages the difficulty information of batch queries from both prior and posterior perspectives to adaptively allocate inference resources, further reducing the overall cost of SC. To demonstrate the effectiveness of DSC, we conduct extensive experiments on three popular categories of reasoning tasks: arithmetic, commonsense and symbolic reasoning on six benchmarks. The empirical results show that DSC consistently surpasses the strong baseline ASC and ESC in terms of costs by a significant margin, while attaining comparable performances. @@ -4774,7 +4774,7 @@ Large Language Models Are Better Logical Fallacy Reasoners with Counterargument, Explanation, and Goal-Aware Prompt Formulation JiwonJeong HyejuJangIndiana University - HogunParkSungkyunkwan University + HogunParkSungkyunkwan University 6918-6937 The advancement of Large Language Models (LLMs) has greatly improved our ability to process complex language. However, accurately detecting logical fallacies remains a significant challenge. This study presents a novel and effective prompt formulation approach for logical fallacy detection, applicable in both supervised (fine-tuned) and unsupervised (zero-shot) settings. Our method enriches input text by incorporating implicit contextual information—counterarguments, explanations, and goals—which we query for validity within the argument’s context. We then rank these queries based on confidence scores to inform classification. We evaluate our approach across multiple datasets from 5 domains, covering 29 distinct fallacy types, using models from GPT and LLaMA series. The results show substantial improvements over state-of-the-art models: up to a 0.57 increase in F1-score in zero-shot settings and up to 0.45 in fine-tuned models. Extensive analyses further illustrate why and how our method excels. 
2025.findings-naacl.384 @@ -4794,7 +4794,7 @@ Unmasking Database Vulnerabilities: Zero-Knowledge Schema Inference Attacks in Text-to-<fixed-case>SQL</fixed-case> Systems - ĐorđeKlisura + ĐorđeKlisura AnthonyRiosUniversity of Texas at San Antonio 6954-6976 Text-to-SQL systems empower users to interact with databases using natural language, automatically translating queries into executable SQL code. However, their reliance on database schema information for SQL generation exposes them to significant security vulnerabilities, particularly schema inference attacks that can lead to unauthorized data access or manipulation. In this paper, we introduce a novel zero-knowledge framework for reconstructing the underlying database schema of text-to-SQL models without any prior knowledge of the database. Our approach systematically probes text-to-SQL models with specially crafted questions and leverages a surrogate GPT-4 model to interpret the outputs, effectively uncovering hidden schema elements—including tables, columns, and data types. We demonstrate that our method achieves high accuracy in reconstructing table names, with F1 scores of up to .99 for generative models and .78 for fine-tuned models, underscoring the severity of schema leakage risks. We also show that our attack can steal prompt information in non-text-to-SQL models. Furthermore, we propose a simple protection mechanism for generative models and empirically show its limitations in mitigating these attacks. @@ -4814,10 +4814,10 @@ Tackling Social Bias against the Poor: a Dataset and a Taxonomy on Aporophobia GeorginaCurtoUnited Nations University Institute in Macau - SvetlanaKiritchenkoNational Research Council Canada + SvetlanaKiritchenkoNational Research Council Canada Muhammad Hammad FahimSiddiqui IsarNejadgholiNational Research Council Canada and University of Ottawa - Kathleen C.FraserNational Research Council Canada + Kathleen C.FraserNational Research Council Canada 6995-7016 Eradicating poverty is the first goal in the U.N. Sustainable Development Goals. However, aporophobia – the societal bias against people living in poverty – constitutes a major obstacle to designing, approving and implementing poverty-mitigation policies. This work presents an initial step towards operationalizing the concept of aporophobia to identify and track harmful beliefs and discriminative actions against poor people on social media. In close collaboration with non-profits and governmental organizations, we conduct data collection and exploration. Then we manually annotate a corpus of English tweets from five world regions for the presence of (1) direct expressions of aporophobia, and (2) statements referring to or criticizing aporophobic views or actions of others, to comprehensively characterize the social media discourse related to bias and discrimination against the poor. Based on the annotated data, we devise a taxonomy of categories of aporophobic attitudes and actions expressed through speech on social media. Finally, we train several classifiers and identify the main challenges for automatic detection of aporophobia in social networks. This work paves the way towards identifying, tracking, and mitigating aporophobic views on social media at scale. 
2025.findings-naacl.388 @@ -4825,12 +4825,12 @@ The <fixed-case>A</fixed-case>merican <fixed-case>S</fixed-case>ign <fixed-case>L</fixed-case>anguage Knowledge Graph: Infusing <fixed-case>ASL</fixed-case> Models with Linguistic Knowledge - LeeKezarUniversity of Southern California + LeeKezarUniversity of Southern California NidhiMunikote ZianZengUniversity of Hawaii System - ZedSehyrChapman University + ZedSehyrChapman University NaomiCaselliBoston University, Boston University - JesseThomasonUniversity of Southern California and Amazon + JesseThomasonUniversity of Southern California and Amazon 7017-7029 Sign language models could make modern language technologies more accessible to those who sign, but the supply of accurately labeled data struggles to meet the demand associated with training large, end-to-end neural models. As an alternative to this approach, we explore how knowledge about the linguistic structure of signs may be used as inductive priors for learning sign recognition and comprehension tasks. We first construct the American Sign Language Knowledge Graph (ASLKG) from 11 sources of linguistic knowledge, with emphasis on features related to signs’ phonological and lexical-semantic properties. Then, we use the ASLKG to train neuro-symbolic models on ASL video input tasks, achieving accuracies of 91% for isolated sign recognition, 14% for predicting the semantic features of unseen signs, and 36% for classifying the topic of Youtube-ASL videos. 2025.findings-naacl.389 @@ -4839,13 +4839,13 @@ Reinforcement Learning for Aligning Large Language Models Agents with Interactive Environments: Quantifying and Mitigating Prompt Overfitting Mohamed SalimAissi - ClémentRomacInria and Hugging Face + ClémentRomacInria and Hugging Face ThomasCarta SylvainLamprierUniversité d’Angers Pierre-YvesOudeyerInria - OlivierSigaudSorbonne Université + OlivierSigaudSorbonne Université LaureSoulierSorbonne Université, CNRS, ISIR - NicolasThomesorbonne université + NicolasThomesorbonne université 7030-7046 Reinforcement learning (RL) is a promising approach for aligning large language models (LLMs) knowledge with sequential decision-making tasks. However, few studies have thoroughly investigated the impact on LLM agents capabilities of fine-tuning them with RL in a specific environment. In this paper, we propose a novel framework to analyze the sensitivity of LLMs to prompt formulations following RL training in a textual environment. Our findings reveal that the performance of LLMs degrades when faced with prompt formulations different from those used during the RL training phase. Besides, we analyze the source of this sensitivity by examining the model’s internal representations and salient tokens. Finally, we propose to use a contrastive loss to mitigate this sensitivity and improve the robustness and generalization capabilities of LLMs. 2025.findings-naacl.390 @@ -4855,11 +4855,11 @@ An empirical study of validating synthetic data for formula generation UsneekSinghMicrosoft JoséCambronero - SumitGulwaniResearch, Microsoft + SumitGulwaniResearch, Microsoft AdityaKanadeMicrosoft - AnirudhKhatryUniversity of Texas at Austin - VuLeMicrosoft - MukulSinghMicrosoft + AnirudhKhatryUniversity of Texas at Austin + VuLeMicrosoft + MukulSinghMicrosoft GustVerbruggenMicrosoft 7047-7054 Large language models (LLMs) can be leveraged to help write formulas in spreadsheets, but formula data resources are scarce, impacting both the base performance of pre-trained models and limiting the ability to fine-tune them. 
Given a corpus of formulas, we can use another model to generate synthetic natural language utterances for fine-tuning. However, it is important to validate whether the natural language (NL) generated by the LLM is accurate for it to be beneficial for fine-tuning. In this paper, we provide empirical results on the impact of validating these synthetic training examples with surrogate objectives that evaluate the accuracy of the synthetic annotations. We demonstrate that validation improves performance over raw data across four models (2 open and 2 closed weight). Interestingly, we show that although validation tends to prune more challenging examples, it increases the complexity of problems that models can solve after being fine-tuned on validated data.
 2025.findings-naacl.391
 singh-etal-2025-empirical


 <fixed-case>T</fixed-case>e<fixed-case>C</fixed-case>o<fixed-case>F</fixed-case>e<fixed-case>S</fixed-case>: Text Column Featurization using Semantic Analysis
 AnanyaSinghaResearch, Microsoft
- MukulSinghMicrosoft
- AshishTiwariMicrosoft
- SumitGulwaniResearch, Microsoft
- VuLeMicrosoft
+ MukulSinghMicrosoft
+ AshishTiwariMicrosoft
+ SumitGulwaniResearch, Microsoft
+ VuLeMicrosoft
 ChrisParninNorth Carolina State University
 7055-7061
 Extracting insights from text columns can be challenging and time-intensive. Existing methods for topic modeling and feature extraction are based on syntactic features and often overlook the semantics. We introduce the semantic text column featurization problem, and present a scalable approach for automatically solving it. We extract a small sample smartly, use a large language model (LLM) to label only the sample, and then lift the labeling to the whole column using text embeddings. We evaluate our approach by turning existing text classification benchmarks into semantic categorization benchmarks. Our approach performs better than baselines and naive use of LLMs.
 2025.findings-naacl.392


 XiXu
 WendaXu
 SiqiOuyangCMU, Carnegie Mellon University
- LeiLiSchool of Computer Science, Carnegie Mellon University
+ LeiLiSchool of Computer Science, Carnegie Mellon University
 7062-7067
 Simultaneous speech translation (SimulST) systems must balance translation quality with response time, making latency measurement crucial for evaluating their real-world performance. However, there has been a longstanding belief that current metrics yield unrealistically high latency measurements in unsegmented streaming settings. In this paper, we investigate this phenomenon, revealing its root cause in a fundamental misconception underlying existing latency evaluation approaches. We demonstrate that this issue affects not only streaming but also segment-level latency evaluation across different metrics. Furthermore, we propose a modification to correctly measure computation-aware latency for SimulST systems, addressing the limitations present in existing metrics.
 2025.findings-naacl.393


 Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on <fixed-case>LLM</fixed-case> Agents
 QiusiZhan
- RichardFang
+ RichardFang
 Henil ShalinPanchal
 DanielKang
 7101-7117


 Flaming-hot Initiation with Regular Execution Sampling for Large Language Models
 WeizheChenUniversity of Southern California
 ZhichengZhangCarnegie Mellon University
- GuanlinLiuByteDance Inc.
+ GuanlinLiuByteDance Inc.
 RenjieZhengByteDance
- WenleiShi
+ WenleiShi
 ChenDunByteDance Inc.
 ZhengWuByteDance Inc.
XingJin @@ -4928,7 +4928,7 @@ <fixed-case>HEISIR</fixed-case>: Hierarchical Expansion of Inverted Semantic Indexing for Training-free Retrieval of Conversational Data using <fixed-case>LLM</fixed-case>s - SangyeopKimCoxwave and Seoul National University + SangyeopKimCoxwave and Seoul National University HangyeulLee YohanLeeCoxwave 7128-7144 @@ -4939,10 +4939,10 @@ “Women do not have heart attacks!” Gender Biases in Automatically Generated Clinical Cases in <fixed-case>F</fixed-case>rench FannyDucelUniversité Paris-Saclay - NicolasHiebelUniversité Paris-Saclay - OlivierFerretCEA - KarënFortUniversity of Lorraine - AurélieNévéolLISN-CNRS / Université Paris Saclay + NicolasHiebelUniversité Paris-Saclay + OlivierFerretCEA + KarënFortUniversity of Lorraine + AurélieNévéolLISN-CNRS / Université Paris Saclay 7145-7159 Healthcare professionals are increasingly including Language Models (LMs) in clinical practice. However, LMs have been shown to exhibit and amplify stereotypical biases that can cause life-threatening harm in a medical context. This study aims to evaluate gender biases in automatically generated clinical cases in French, on ten disorders. Using seven LMs fine-tuned for clinical case generation and an automatic linguistic gender detection tool, we measure the associations between disorders and gender. We unveil that LMs over-generate cases describing male patients, creating synthetic corpora that are not consistent with documented prevalence for these disorders. For instance, when prompts do not specify a gender, LMs generate eight times more clinical cases describing male (vs. female patients) for heart attack. We discuss the ideal synthetic clinical case corpus and establish that explicitly mentioning demographic information in generation instructions appears to be the fairest strategy. In conclusion, we argue that the presence of gender biases in synthetic text raises concerns about LM-induced harm, especially for women and transgender people. 2025.findings-naacl.398 @@ -4966,7 +4966,7 @@ Exploring Large Language Models for Hate Speech Detection in <fixed-case>R</fixed-case>ioplatense <fixed-case>S</fixed-case>panish Juan ManuelPérezUniversidad de San Andres and Universidad de Buenos Aires - PaulaMiguelUniversidad de Buenos Aires and Universidad de Buenos Aires + PaulaMiguelUniversidad de Buenos Aires and Universidad de Buenos Aires VivianaCotikComputer Science Department, University of Buenos Aires 7174-7187 Hate speech detection deals with many language variants, slang, slurs, expression modalities, and cultural nuances. This outlines the importance of working with specific corpora, when addressing hate speech within the scope of Natural Language Processing, recently revolutionized by the irruption of Large Language Models. This work presents a brief analysis of the performance of large language models in the detection of Hate Speech for Rioplatense Spanish. We performed classification experiments leveraging chain-of-thought reasoning with ChatGPT 3.5, Mixtral, and Aya, comparing their results with those of a state-of-the-art BERT classifier. These experiments outline that, even if large language models show a lower precision compared to the fine-tuned BERT classifier and, in some cases, they find hard-to-get slurs or colloquialisms, they still are sensitive to highly nuanced cases (particularly, homophobic/transphobic hate speech). We make our code and models publicly available for future research. 
@@ -4980,7 +4980,7 @@ CharlieCowen-Breen JayWhite DesmondDeVaulPrinceton University - FrederickRiemenschneiderRuprecht-Karls-Universität Heidelberg + FrederickRiemenschneiderRuprecht-Karls-Universität Heidelberg Karthik RNarasimhanPrinceton University BarbaraGraziosiPrinceton University 7188-7202 @@ -4992,20 +4992,20 @@ <fixed-case>W</fixed-case>orld<fixed-case>M</fixed-case>ed<fixed-case>QA</fixed-case>-<fixed-case>V</fixed-case>: a multilingual, multimodal medical examination dataset for multimodal language models evaluation JoãoMatosUniversity of Oxford ShanChen - Siena Kathleen V.Placino + Siena Kathleen V.Placino YingyaLiHarvard University - Juan Carlos ClimentPardo - DaphnaIdan + Juan Carlos ClimentPardo + DaphnaIdan TakeshiTohyamaMassachusetts Institute of Technology DavidRestrepoCentrale Supélec and Massachusetts Institute of Technology - Luis FilipeNakayamaMassachusetts Institute of Technology + Luis FilipeNakayamaMassachusetts Institute of Technology José María MilletPascual-Leone - Guergana KSavovaHarvard University - HugoAertsHarvard University - Leo AnthonyCeliMassachusetts Institute of Technology and Beth Israel Deaconess Medical Center - An-Kwok IanWongDuke University + Guergana KSavovaHarvard University + HugoAertsHarvard University + Leo AnthonyCeliMassachusetts Institute of Technology and Beth Israel Deaconess Medical Center + An-Kwok IanWongDuke University DanielleBittermanHarvard University - JackGallifant + JackGallifant 7203-7216 Multimodal/vision language models (VLMs) are increasingly being deployed in healthcare settings worldwide, necessitating robust benchmarks to ensure their safety, efficacy, and fairness. Multiple-choice question and answer (QA) datasets derived from national medical examinations have long served as valuable evaluation tools, but existing datasets are largely text-only and available in a limited subset of languages and countries. To address these challenges, we present WorldMedQA-V, an updated multilingual, multimodal benchmarking dataset designed to evaluate VLMs in healthcare. WorldMedQA-V includes 568 labeled multiple-choice QAs paired with 568 medical images from four countries (Brazil, Israel, Japan, and Spain), covering original languages and validated English translations by native clinicians, respectively. Baseline performance for common open- and closed-source models are provided in the local language and English translations, and with and without images provided to the model. The WorldMedQA-V benchmark aims to better match AI systems to the diverse healthcare environments in which they are deployed, fostering more equitable, effective, and representative applications. 2025.findings-naacl.402 @@ -5015,10 +5015,10 @@ <fixed-case>B</fixed-case>an<fixed-case>TH</fixed-case>: A Multi-label Hate Speech Detection Dataset for Transliterated <fixed-case>B</fixed-case>angla FabihaHaiderPenta Global Limited Fariha TanjimShifatPenta Global Limited - Md FarhanIshmamIslamic University of Technology + Md FarhanIshmamIslamic University of Technology Md Sakib Ul RahmanSourove Deeparghya DuttaBaruaPenta Global Limited - MdFahimIndependent University, Bangladesh + MdFahimIndependent University, Bangladesh Md Farhad AlamBhuiyan 7217-7236 The proliferation of transliterated texts in digital spaces has emphasized the need for detecting and classifying hate speech in languages beyond English, particularly in low-resource languages. As online discourse can perpetuate discrimination based on target groups, e.g. 
gender, religion, and origin, multi-label classification of hateful content can help in understanding hate motivation and enhance content moderation. While previous efforts have focused on monolingual or binary hate classification tasks, no work has yet addressed the challenge of multi-label hate speech classification in transliterated Bangla. We introduce BanTH, the first multi-label transliterated Bangla hate speech dataset. The samples are sourced from YouTube comments, where each instance is labeled with one or more target groups, reflecting the regional demographic. We propose a novel translation-based LLM prompting strategy that translates or transliterates under-resourced text to higher-resourced text before classifying the hate group(s). Experiments reveal further pre-trained encoders achieving state-of-the-art performance on the BanTH dataset while translation-based prompting outperforms other strategies in the zero-shot setting. We address a critical gap in Bangla hate speech and set the stage for further exploration into code-mixed and multi-label classification in underrepresented languages. @@ -5054,16 +5054,16 @@ Adaptive Parameter Compression for Language Models - JeremiasBohnTechnische Universität München + JeremiasBohnTechnische Universität München FredericMrozinski - GeorgGrohTechnical University Munich + GeorgGrohTechnical University Munich 7269-7286 2025.findings-naacl.406 bohn-etal-2025-adaptive Personalize Your <fixed-case>LLM</fixed-case>: Fake it then Align it - YijingZhangUniversity of Wisconsin - Madison + YijingZhangUniversity of Wisconsin - Madison DyahAdilaUniversity of Wisconsin, Madison ChanghoShinUniversity of Wisconsin, Madison FredericSalaUniversity of Wisconsin, Madison @@ -5089,8 +5089,8 @@ Inference Scaling for Bridging Retrieval and Augmented Generation YoungwonLeeSeoul National University Seung-wonHwangSeoul National University - Daniel FCamposSnowflake - FilipGralińskiSnowflake and Adam Mickiewicz University + Daniel FCamposSnowflake + FilipGralińskiSnowflake and Adam Mickiewicz University ZheweiYaoSnowflake YuxiongHeMicrosoft 7324-7339 @@ -5125,7 +5125,7 @@ When natural language is not enough: The limits of in-context learning demonstrations in multilingual reasoning - LeonardoRanaldi + LeonardoRanaldi BarryHaddowUniversity of Edinburgh AlexandraBirchUniversity of Edinburgh 7369-7396 @@ -5135,7 +5135,7 @@ Uncovering Latent Arguments in Social Media Messaging by Employing <fixed-case>LLM</fixed-case>s-in-the-Loop Strategy - TunazzinaIslam + TunazzinaIslam DanGoldwasserPurdue University and Purdue University 7397-7429 The widespread use of social media has led to a surge in popularity for automated methods of analyzing public opinion. Supervised methods are adept at text categorization, yet the dynamic nature of social media discussions poses a continual challenge for these techniques due to the constant shifting of the focus. On the other hand, traditional unsupervised methods for extracting themes from public discourse, such as topic modeling, often reveal overarching patterns that might not capture specific nuances. Consequently, a significant portion of research into social media discourse still depends on labor-intensive manual coding techniques and a human-in-the-loop approach, which are both time-consuming and costly. In this work, we study the problem of discovering arguments associated with a specific theme. 
We propose a generic **LLMs-in-the-Loop** strategy that leverages the advanced capabilities of Large Language Models (LLMs) to extract latent arguments from social media messaging. To demonstrate our approach, we apply our framework to contentious topics. We use two publicly available datasets: (1) the climate campaigns dataset of 14k Facebook ads with 25 themes and (2) the COVID-19 vaccine campaigns dataset of 9k Facebook ads with 14 themes. Additionally, we design a downstream task as stance prediction by leveraging talking points in climate debates. Furthermore, we analyze demographic targeting and the adaptation of messaging based on real-world events. @@ -5146,7 +5146,7 @@ <fixed-case>A</fixed-case>crostic<fixed-case>S</fixed-case>leuth: Probabilistic Identification and Ranking of Acrostics in Multilingual Corpora AleksandrFedchin IsabelCooperman - PramitChaudhuriUniversity of Texas at Austin + PramitChaudhuriUniversity of Texas at Austin Joseph P.Dexter 7430-7437 For centuries, writers have hidden messages as acrostics, in which initial letters of consecutive lines or paragraphs form meaningful words or phrases. Scholars searching for acrostics manually can only focus on a few authors at a time and often favor qualitative arguments about whether a given acrostic is accidental or intentional. Here we describe AcrosticSleuth, a first-of-its-kind approach to identify acrostics automatically and rank them by the probability that the corresponding sequence of characters does not occur by chance. Since acrostics are rare, we formalize the problem as a binary classification task in the presence of extreme class imbalance. To evaluate AcrosticSleuth, we present the Acrostic Identification Dataset (AcrostID), a collection of acrostics from the WikiSource online database. Despite the class imbalance, AcrosticSleuth achieves F1 scores of 0.39, 0.59, and 0.66 on the French, English, and Russian subdomains of WikiSource, respectively. We further demonstrate that AcrosticSleuth can identify previously unknown instances of wordplay in high-profile literary contexts, including the English philosopher Thomas Hobbes’ signature in the opening paragraphs of The Elements of Law. @@ -5155,12 +5155,12 @@ <fixed-case>M</fixed-case>ed<fixed-case>T</fixed-case>hink: A Rationale-Guided Framework for Explaining Medical Visual Question Answering - XiaotangGai + XiaotangGai ChenyiZhou JiaxiangLiu YangFeng JianWuZhejiang University - ZuozhuLiuZhejiang University + ZuozhuLiuZhejiang University 7438-7450 Medical Visual Question Answering (Med-VQA), which offers language responses to image-based medical inquiries, represents a challenging task and significant advancement in healthcare. It assists medical experts to swiftly interpret medical images, thereby enabling faster and more accurate diagnoses. However, the model interpretability and transparency of existing Med-VQA solutions are often limited, posing challenges in understanding their decision-making processes. To address this issue, we devise a semi-automated annotation process to streamline data preparation and build new benchmark Med-VQA datasets R-RAD, R-SLAKE and R-Path. These datasets provide intermediate medical decision-making rationales generated by multimodal large language models and human annotations for question-answering pairs in existing Med-VQA datasets, i.e., VQA-RAD, SLAKE and PathVQA. Moreover, we design a novel framework, MedThink, which finetunes lightweight pretrained generative models by incorporating medical decision-making rationales. 
MedThink includes three distinct strategies to generate decision outcomes and corresponding rationales, clearly showcasing the medical decision-making process during reasoning. Our comprehensive experiments show that our method achieves an accuracy of 83.5% on R-RAD, 86.3% on R-SLAKE and 87.2% on R-Path. These results significantly exceed those of existing state-of-the-art models with comparable parameters. Datasets and code are available at https://github.com/Tang-xiaoxiao/Medthink. 2025.findings-naacl.415 @@ -5181,8 +5181,8 @@ JoelMire Zubin TrivadiAysolaCMU, Carnegie Mellon University DanielChechelnitskyCMU, Carnegie Mellon University - NicholasDeasColumbia University - ChrysoulaZervaInstituto Superior Técnico + NicholasDeasColumbia University + ChrysoulaZervaInstituto Superior Técnico MaartenSapCarnegie Mellon University 7468-7487 Preference alignment via reward models helps build safe, helpful, and reliable large language models (LLMs). However, subjectivity in preference judgments and the lack of representative sampling in preference data collection can introduce new biases, hindering reward models’ fairness and equity. In this work, we introduce a framework for evaluating dialect biases in reward models and conduct a case study on biases against African American Language (AAL) through several experiments comparing reward model preferences and behavior on paired White Mainstream English (WME) and both machine-translated and human-written AAL corpora. We show that reward models are less aligned with human preferences when processing AAL texts vs. WME ones (-4% accuracy on average), frequently disprefer AAL-aligned texts vs. WME-aligned ones, and steer conversations toward WME, even when prompted with AAL texts. Our findings provide a targeted analysis of anti-AAL biases at a relatively understudied stage in LLM development, highlighting representational harms and ethical questions about the desired behavior of LLMs concerning AAL. @@ -5191,16 +5191,16 @@ Do Large Language Models Align with Core Mental Health Counseling Competencies? - Viet CuongNguyen + Viet CuongNguyen MohammadTaher DongwanHong Vinicius KonkolicsPossobom Vibha ThirunellayiGopalakrishnan EktaRaj - ZihangLi + ZihangLi Heather J.SoledNorthwell Health Michael L.Birnbaum - SrijanKumarGeorgia Institute of Technology + SrijanKumarGeorgia Institute of Technology MunmunDe Choudhury 7488-7511 The rapid evolution of Large Language Models (LLMs) presents a promising solution to the global shortage of mental health professionals. However, their alignment with essential counseling competencies remains underexplored. We introduce CounselingBench, a novel NCMHCE-based benchmark evaluating 22 general-purpose and medical-finetuned LLMs across five key competencies. While frontier models surpass minimum aptitude thresholds, they fall short of expert-level performance, excelling in Intake, Assessment & Diagnosis but struggling with Core Counseling Attributes and Professional Practice & Ethics. Surprisingly, medical LLMs do not outperform generalist models in accuracy, though they provide slightly better justifications while making more context-related errors. These findings highlight the challenges of developing AI for mental health counseling, particularly in competencies requiring empathy and nuanced reasoning. Our results underscore the need for specialized, fine-tuned models aligned with core mental health counseling competencies and supported by human oversight before real-world deployment. 
Code and data associated with this manuscript can be found at: https://github.com/cuongnguyenx/CounselingBench @@ -5212,7 +5212,7 @@ ZizhangChen PeizhaoLiGoogle XiaomengDong - PengyuHongBrandeis University + PengyuHongBrandeis University 7512-7523 To facilitate healthcare delivery, language models (LMs) have significant potential for clinical prediction tasks using electronic health records (EHRs). However, in these high-stakes applications, unreliable decisions can result in significant costs due to compromised patient safety and ethical concerns, thus increasing the need for good uncertainty modelling of automated clinical predictions. To address this, we consider uncertainty quantification of LMs for EHR tasks in both white-box and black-box settings. We first quantify uncertainty in white-box models, where we have access to model parameters and output logits. We show that an effective reduction of model uncertainty can be achieved by using the proposed multi-tasking and ensemble methods in EHRs. Continuing with this idea, we extend our approach to black-box settings, including popular proprietary LMs such as GPT-4. We validate our framework using longitudinal clinical data from over 6,000 patients across ten clinical prediction tasks. Results show that ensembling methods and multi-task prediction prompts reduce uncertainty across different scenarios. These findings increase model transparency in white-box and black-box settings, thereby advancing reliable AI healthcare. 2025.findings-naacl.419 @@ -5222,10 +5222,10 @@ Hypothesis Generation for Materials Discovery and Design Using Goal-Driven and Constraint-Guided <fixed-case>LLM</fixed-case> Agents ShrinidhiKumbharArizona State University VenkateshMishra - KevinCoutinhoArizona State University + KevinCoutinhoArizona State University DivijHandaArizona State University AshifIquebalArizona State University - ChittaBaralArizona State University + ChittaBaralArizona State University 7524-7555 Materials discovery and design are essential for advancing technology across various industries by enabling the development of application-specific materials. Recent research has leveraged Large Language Models (LLMs) to accelerate this process. We explore the potential of LLMs to generate viable hypotheses that, once validated, can expedite materials discovery. Collaborating with materials science experts, we curated a novel dataset from recent journal publications, featuring real-world goals, constraints, and methods for designing real-world applications. Using this dataset, we test LLM-based agents that generate hypotheses for achieving given goals under specific constraints. To assess the relevance and quality of these hypotheses, we propose a novel scalable evaluation metric that emulates the process a materials scientist would use to evaluate a hypothesis critically. Our curated dataset, proposed method, and evaluation framework aim to advance future research in accelerating materials discovery and design with LLMs. 2025.findings-naacl.420 @@ -5234,8 +5234,8 @@ Aligning to What? Limits to <fixed-case>RLHF</fixed-case> Based Alignment LoganBarnhart - RezaAkbarian BafghiUniversity of Colorado at Boulder - StephenBeckerUniversity of Colorado, Boulder + RezaAkbarian BafghiUniversity of Colorado at Boulder + StephenBeckerUniversity of Colorado, Boulder MaziarRaissiUniversity of Colorado at Boulder 7556-7591 Reinforcement Learning from Human Feedback (RLHF) is increasingly used to align large language models (LLMs) with human preferences. 
However, the effectiveness of RLHF in addressing underlying biases remains unclear. This study investigates the relationship between RLHF and both covert and overt biases in LLMs, particularly focusing on biases against African Americans. We applied various RLHF techniques (DPO, ORPO, and RLOO) to Llama 3 8B and evaluated the covert and overt biases of the resulting models using matched-guise probing and explicit bias testing. We performed additional tests with DPO on different base models and datasets; among several implications, we found that SFT before RLHF calcifies model biases. Additionally, we extend the tools for measuring biases to multi-modal models. Through our experiments we collect evidence that indicates that current alignment techniques are inadequate for nebulous tasks such as mitigating covert biases, highlighting the need for capable datasets, data curating techniques, or alignment tools. @@ -5246,7 +5246,7 @@ Beyond Words: Exploring Cultural Value Sensitivity in Multimodal Models SrishtiYadav ZhiZhangUniversity of Amsterdam, University of Amsterdam - DanielHershcovichUniversity of Copenhagen + DanielHershcovichUniversity of Copenhagen EkaterinaShutovaUniversity of Amsterdam 7592-7608 Investigating value alignment in Large Language Models (LLMs) based on cultural context has become a critical area of research. However, similar biases have not been extensively explored in large vision-language models (VLMs). As the scale of multimodal models continues to grow, it becomes increasingly important to assess whether images can serve as reliable proxies for culture and how these values are embedded through the integration of both visual and textual data. In this paper, we conduct a thorough evaluation of multimodal model at different scales, focusing on their alignment with cultural values. Our findings reveal that, much like LLMs, VLMs exhibit sensitivity to cultural values, but their performance in aligning with these values is highly context-dependent. While VLMs show potential in improving value understanding through the use of images, this alignment varies significantly across contexts highlighting the complexities and underexplored challenges in the alignment of multimodal models. @@ -5258,9 +5258,9 @@ JeffreyOlmoBrigham Young University JaredWilsonBrigham Young University MaxForseyBrigham Young University - BryceHepner + BryceHepner Thomas VincentHoweBrigham Young University - DavidWingateBrigham Young University + DavidWingateBrigham Young University 7609-7619 Sparse Autoencoders (SAEs) are a promising approach for extracting neural network representations by learning a sparse and overcomplete decomposition of the network’s internal activations. However, SAEs are traditionally trained considering only activation values and not the effect those activations have on downstream computations. This limits the information available to learn features, and biases the autoencoder towards neglecting features which are represented with small activation values but strongly influence model outputs.To address this, we introduce Gradient SAEs (g-SAEs), which modify the k-sparse autoencoder architecture by augmenting the TopK activation function to rely on the gradients of the input activation when selecting the k elements. 
For a given sparsity level, g-SAEs produce reconstructions that are more faithful to original network performance when propagated through the network. Additionally, we find evidence that g-SAEs learn latents that are on average more effective at steering models in arbitrary contexts. By considering the downstream effects of activations, our approach leverages the dual nature of neural network features as both representations, retrospectively, and actions, prospectively. While previous methods have approached the problem of feature discovery primarily focused on the former aspect, g-SAEs represent a step towards accounting for the latter as well.
 2025.findings-naacl.423
 
@@ -5268,13 +5268,13 @@
 Tooling or Not Tooling? The Impact of Tools on Language Agents for Chemistry Problem Solving
- BotaoYuThe Ohio State University
+ BotaoYuThe Ohio State University
 Frazier N.BakerOhio State University, Columbus
 ZiruChen
 GarrettHerb
 BoyuGouOhio State University, Columbus
- DanielAdu-AmpratwumOhio State University
- XiaNingOhio State University, Columbus
+ DanielAdu-AmpratwumOhio State University
+ XiaNingOhio State University, Columbus
 HuanSunThe Ohio State University, Columbus
 7620-7640
 To enhance large language models (LLMs) for chemistry problem solving, several LLM-based agents augmented with tools have been proposed, such as ChemCrow and Coscientist. However, their evaluations are narrow in scope, leaving a large gap in understanding the benefits of tools across diverse chemistry tasks. To bridge this gap, we develop ChemAgent, an enhanced chemistry agent over ChemCrow, and conduct a comprehensive evaluation of its performance on both specialized chemistry tasks and general chemistry questions. Surprisingly, ChemAgent does not consistently outperform its base LLMs without tools. Our error analysis with a chemistry expert suggests that: For specialized chemistry tasks, such as synthesis prediction, we should augment agents with specialized tools; however, for general chemistry questions like those in exams, agents’ ability to reason correctly with chemistry knowledge matters more, and tool augmentation does not always help.
 2025.findings-naacl.424
 
@@ -5286,10 +5286,10 @@
 ViacheslavVasilev
 JuliaAgafonova
 NikolaiGerasimenko
- AlexanderKapitanovSberDevices
+ AlexanderKapitanovSberDevices
 PolinaMikhailovasalute devices
 EvelinaMironova
- DenisDimitrovAIRI and Sber
+ DenisDimitrovAIRI and Sber
 7641-7657
 Text-to-image generation models have gained popularity among users around the world. However, many of these models exhibit a strong bias toward English-speaking cultures, ignoring or misrepresenting the unique characteristics of other language groups, countries, and nationalities. The lack of cultural awareness can reduce the generation quality and lead to undesirable consequences such as unintentional insult, and the spread of prejudice. In contrast to the field of natural language processing, cultural awareness in computer vision has not been explored as extensively. In this paper, we strive to reduce this gap. We propose a RusCode benchmark for evaluating the quality of text-to-image generation containing elements of the Russian cultural code. To do this, we form a list of 19 categories that best represent the features of Russian visual culture. Our final dataset consists of 1250 text prompts in Russian and their translations into English. The prompts cover a wide range of topics, including complex concepts from art, popular culture, folk traditions, famous people’s names, natural objects, scientific achievements, etc.
We present the results of a human evaluation of the side-by-side comparison of Russian visual concepts representations using popular generative models.
 2025.findings-naacl.425
 
@@ -5301,7 +5301,7 @@
 PranavChitale, State University of New York at Stony Brook
 KhushbooSingh
 NiranjanBalasubramanianState University of New York, Stony Brook
- H.SchwartzStony Brook University (SUNY)
+ H.SchwartzStony Brook University (SUNY)
 7658-7667
 Like most of NLP, models for human-centered NLP tasks—tasks attempting to assess author-level information—predominantly use representations derived from hidden states of Transformer-based LLMs. However, what component of the LM is used for the representation varies widely. Moreover, there is a need for Human Language Models (HuLMs) that implicitly model the author and provide a user-level hidden state. Here, we systematically evaluate different ways of representing documents and users using different LM and HuLM architectures to predict task outcomes as both dynamically changing states and averaged trait-like user-level attributes of valence, arousal, empathy, and distress. We find that representing documents as an average of the token hidden states performs the best generally. Further, while a user-level hidden state itself is rarely the best representation, we find its inclusion in the model strengthens token or document embeddings used to derive document- and user-level representations resulting in best performances.
 2025.findings-naacl.426
 
@@ -5309,17 +5309,17 @@
 Large Language Models and Causal Inference in Collaboration: A Comprehensive Survey
- XiaoyuLiuUniversity of Maryland, College Park
- PaihengXuDepartment of Computer Science, University of Maryland, College Park
+ XiaoyuLiuUniversity of Maryland, College Park
+ PaihengXuDepartment of Computer Science, University of Maryland, College Park
 JundaWuUniversity of California, San Diego
 JiaxinYuan
 YifanYangUniversity of Maryland, College Park
 YuhangZhou
 FuxiaoLiu
 TianruiGuan
- HaoliangWangAdobe Research
- TongYuAdobe Research
- JulianMcAuleyUniversity of California, San Diego, University of California, San Diego
+ HaoliangWangAdobe Research
+ TongYuAdobe Research
+ JulianMcAuleyUniversity of California, San Diego, University of California, San Diego
 WeiAiUniversity of Maryland, College Park
 FurongHuangUniversity of Maryland
 7668-7684
@@ -5348,16 +5348,16 @@
 Using Linguistic Entrainment to Evaluate Large Language Models for Use in Cognitive Behavioral Therapy
- MinaKianUniversity of Southern California
+ MinaKianUniversity of Southern California
 KaleenShresthaUniversity of Southern California
- KatrinFischer
+ KatrinFischer
 XiaoyuanZhu
 JonathanOngUniversity of Southern California
 AryanTrehan
 JessicaWang
 GloriaChang
 SébArnoldGoogle DeepMind
- MajaMataricUniversity of Southern California
+ MajaMataricUniversity of Southern California
 7724-7743
 Entrainment, the responsive communication between interacting individuals, is a crucial process in building a strong relationship between a mental health therapist and their client, leading to positive therapeutic outcomes. However, so far entrainment has not been investigated as a measure of efficacy of large language models (LLMs) delivering mental health therapy. In this work, we evaluate the linguistic entrainment of an LLM (ChatGPT 3.5-turbo) in a mental health dialog setting. We first validate computational measures of linguistic entrainment with two measures of the quality of client self-disclosures: intimacy and engagement (p < 0.05).
We then compare the linguistic entrainment of the LLM to trained therapists and non-expert online peer supporters in a cognitive behavioral therapy (CBT) setting. We show that the LLM is outperformed by humans with respect to linguistic entrainment (p < 0.001). These results support the need to be cautious in using LLMs out-of-the-box for mental health applications.
 2025.findings-naacl.430
 
@@ -5369,7 +5369,7 @@
 AliceRozetUniversity of Florida
 JotsnaGowda
 PryceHouck
- KevinTangHeinrich Heine University Düsseldorf and University of Florida
+ KevinTangHeinrich Heine University Düsseldorf and University of Florida
 SarahMoellerUniversity of Florida
 7744-7756
 African American English (AAE) presents unique challenges in natural language processing (NLP). This research systematically compares the performance of available NLP models—rule-based, transformer-based, and large language models (LLMs)—capable of identifying key grammatical features of AAE, namely Habitual Be and Multiple Negation. These features were selected for their distinct grammatical complexity and frequency of occurrence. The evaluation involved sentence-level binary classification tasks, using both zero-shot and few-shot strategies. The analysis reveals that while LLMs show promise compared to the baseline, they are influenced by biases such as recency and unrelated features in the text such as formality. This study highlights the necessity for improved model training and architectural adjustments to better accommodate AAE’s unique linguistic characteristics. Data and code are available.
 2025.findings-naacl.431
 
@@ -5378,13 +5378,13 @@
 <fixed-case>LLM</fixed-case>-Microscope: Uncovering the Hidden Role of Punctuation in Context Memory of Transformers
- AntonRazzhigaev
+ AntonRazzhigaev
 MatveyMikhalchukArtificial Intelligence Research Institute (AIRI)
 TemurbekRahmatullaev
- ElizavetaGoncharovaArtificial Intelligence Research Institute and Higher School of Economics
- PolinaDruzhininaArtificial Intelligence Research Institute
+ ElizavetaGoncharovaArtificial Intelligence Research Institute and Higher School of Economics
+ PolinaDruzhininaArtificial Intelligence Research Institute
 IvanOseledetsArtificial Intelligence Research Institute, Skolkovo Institute of Science and Technology and Institute of Numerical Mathematics
- AndreyKuznetsovAIRI, Sber and Samara National Research University
+ AndreyKuznetsovAIRI, Sber and Samara National Research University
 7757-7764
 We introduce methods to quantify how Large Language Models (LLMs) encode and store contextual information, revealing that tokens often seen as minor (e.g., determiners, punctuation) carry surprisingly high context. Notably, removing these tokens — especially stopwords, articles, and commas — consistently degrades performance on MMLU and BABILong-4k, even if removing only irrelevant tokens. Our analysis also shows a strong correlation between contextualization and linearity, where linearity measures how closely the transformation from one layer’s embeddings to the next can be approximated by a single linear mapping. These findings underscore the hidden importance of “filler” tokens in maintaining context. For further exploration, we present LLM-Microscope, an open-source toolkit that assesses token-level nonlinearity, evaluates contextual memory, visualizes intermediate layer contributions (via an adapted Logit Lens), and measures the intrinsic dimensionality of representations. This toolkit illuminates how seemingly trivial tokens can be critical for long-range understanding.
2025.findings-naacl.432 @@ -5415,9 +5415,9 @@ MihirParmar SatChidananda JayanthSrinivasa - GaowenLiu - AliPayaniCisco - ChittaBaralArizona State University + GaowenLiu + AliPayaniCisco + ChittaBaralArizona State University 7795-7826 Reasoning abilities of LLMs have been a key focus in recent years. One challenging reasoning domain with interesting nuances is legal reasoning, which requires careful application of rules, and precedents while balancing deductive and analogical reasoning, and conflicts between rules. Although there have been a few works on using LLMs for legal reasoning, their focus has been on overall accuracy. In this paper, we dig deeper to do a step-by-step analysis and figure out where they commit errors. We use the college-level Multiple Choice Question-Answering (MCQA) task from the Civil Procedure dataset and propose a new error taxonomy derived from initial manual analysis of reasoning chains with respect to several LLMs, including two objective measures: soundness and correctness scores. We then develop an LLM-based automated evaluation framework to identify reasoning errors and evaluate the performance of LLMs. The computation of soundness and correctness on the dataset using the auto-evaluator framework reveals several interesting insights. Furthermore, we show that incorporating the error taxonomy as feedback in popular prompting techniques marginally increases LLM performance. Our work will also serve as an evaluation framework that can be used in detailed error analysis of reasoning chains for logic-intensive complex tasks. 2025.findings-naacl.435 @@ -5428,8 +5428,8 @@ SiyiLiu KishaloyHalderAmazon ZhengQiAmazon - WeiXiao - NikolaosPappasAWS AI Labs + WeiXiao + NikolaosPappasAWS AI Labs Phu MonHtutAWS AI Labs NehaAnna John YassineBenajiba @@ -5444,8 +5444,8 @@ HaotengYin JinhaKimAmazon PrashantMathurAmazon - KrishanuSarkerAmazon - ViditBansalAmazon + KrishanuSarkerAmazon + ViditBansalAmazon 7836-7850 Entity matching (EM), which identifies whether two data records refer to the same real-world entity, is crucial for knowledge base construction and enhancing data-driven AI systems. Recent advances in language models (LMs) have shown great potential in resolving entities with rich textual attributes. However, their performance heavily depends on how structured entities are “talked” through serialized text. The impact of this serialization process remains underexplored, particularly for entities with complex relations in knowledge graphs (KGs). In this work, we systematically study entity serialization by benchmarking the effect of common schemes with LMs of different sizes on diverse tabular matching datasets. We apply our findings to propose a novel serialization scheme for KG entities based on random walks and utilize LLMs to encode sampled semantic walks for matching. Using this lightweight approach with open-source LLMs, we achieve a leading performance on EM in canonical and highly heterogeneous KGs, demonstrating significant throughput increases and superior robustness compared to GPT-4-based methods. Our study on serialization provides valuable insights for the deployment of LMs in real-world EM tasks. 
 2025.findings-naacl.437
 
@@ -5454,7 +5454,7 @@
 Accounting for Sycophancy in Language Model Uncertainty Estimation
 AnthonySiciliaNortheastern University
- MertInanNortheastern University
+ MertInanNortheastern University
 MaliheAlikhaniNortheastern University
 7851-7866
 Effective human-machine collaboration requires machine learning models to externalize uncertainty, so users can reflect and intervene when necessary. For language models, these representations of uncertainty may be impacted by sycophancy bias: proclivity to agree with users, even if they are wrong. For instance, models may be over-confident in (incorrect) problem solutions suggested by a user. We study the relationship between sycophancy and uncertainty estimation for the first time. We propose a generalization of the definition of sycophancy bias to measure downstream impacts on uncertainty estimation, and also propose a new algorithm (SyRoUP) to account for sycophancy in the uncertainty estimation process. Unlike previous works, we study a broad array of user behaviors, varying both correctness and confidence of user suggestions to see how model answers (and their certainty) change. Our experiments across conversation forecasting and question-answering tasks show that user confidence plays a critical role in modulating the effects of sycophancy, and that SyRoUP can better predict these effects. From these results, we argue that externalizing both model and user uncertainty can help to mitigate the impacts of sycophancy bias.
 2025.findings-naacl.438
 
@@ -5464,7 +5464,7 @@
 Zero-Shot Keyphrase Generation: Investigating Specialized Instructions and Multi-sample Aggregation on Large Language Models
 JishnuRay ChowdhuryBloomberg
- JayanthMohanUniversity of Illinois at Chicago
+ JayanthMohanUniversity of Illinois at Chicago
 TomasMalik
 CorneliaCarageaUniversity of Illinois at Chicago
 7867-7884
@@ -5485,11 +5485,11 @@
 <fixed-case>CLERC</fixed-case>: A Dataset for <fixed-case>U</fixed-case>. <fixed-case>S</fixed-case>. Legal Case Retrieval and Retrieval-Augmented Analysis Generation
 Abe BohanHou
 OrionWeller
- GuanghuiQin
- EugeneYangJohns Hopkins University
- DawnLawrieJohns Hopkins University
+ GuanghuiQin
+ EugeneYangJohns Hopkins University
+ DawnLawrieJohns Hopkins University
 NilsHolzenbergerTélécom ParisTech
- AndrewBlair-StanekJohns Hopkins University and University of Maryland School of Law
+ AndrewBlair-StanekJohns Hopkins University and University of Maryland School of Law
 BenjaminVan DurmeMicrosoft and Johns Hopkins University
 7898-7913
 Legal professionals need to write analyses that rely on citations to relevant precedents, i.e., previous case decisions. Intelligence systems assisting legal professionals in writing such documents provide great benefits but are challenging to design. Such systems need to help locate, summarize, and reason over salient precedents in order to be useful. To enable systems for such tasks, we work with legal professionals to create a colossal dataset supporting two important backbone tasks: information retrieval (IR) and retrieval-augmented generation (RAG). This dataset, **CLERC** (Case Law Evaluation and Retrieval Corpus), is constructed for training and evaluating models on their ability to (1) find corresponding citations for a given piece of legal analysis and to (2) compile the text of these citations (as well as previous context) into a cogent analysis that supports a reasoning goal.
We benchmark state-of-the-art models on CLERC, showing that current approaches still struggle: GPT-4o generates analyses with the highest ROUGE F-scores but hallucinates the most, while zero-shot IR models only achieve 48.3% recall@1000.
 2025.findings-naacl.441
 
@@ -5504,7 +5504,7 @@
 SebastienDiarra
 Christopher MHoman
 Mamadou K.Keita
- MichaelLeventhalRobotsMali
+ MichaelLeventhalRobotsMali
 7914-7929
 Illiteracy is a predictor of many negative social and personal outcomes. Illiteracy rates are particularly high in countries with underresourced languages, where few books exist that are suitable for children to learn to read from. We present GAIfE (Generative AI for Education), a toolchain and workflow developed through empirical methods, that demonstrates how existing tools can be adapted to address low literacy for an underresourced language. We used GAIfE (a play on the Bambara word for “book”) to construct materials for developing children’s reading competence in Bambara, the vehicular language of Mali. Our approach to the generation and post-generation editing of content skewed by the Global-North-centric bias of available LLMs enabled us to rapidly multiply the content in Bambara available online by 10 times while maintaining high standards of attractiveness of the material to maintain high engagement, accurate representation of the Malian culture and physical and social environment and language quality. Using our materials, pilot reading programs achieved a 67% reduction in the number of children unable to read Bambara. Our approach demonstrated the power of bias-aware application of generative AI to the problem domain as well as the potential impact the application of this technology could have on reducing illiteracy and improving learning outcomes through native language education.
 2025.findings-naacl.442
 
@@ -5522,16 +5522,16 @@
 <fixed-case>UCL</fixed-case>-Bench: A <fixed-case>C</fixed-case>hinese User-Centric Legal Benchmark for Large Language Models
 RuoliGan
- DuanyuFeng
- ChenZhangNational University of Singapore
+ DuanyuFeng
+ ChenZhangNational University of Singapore
 ZhihangLinWestlake Scietrain
 HaochenJia
 HaoWangSichuan University
- ZhenyangCaiThe Chinese University of Hong Kong, Shenzhen
- LeiCui
- QianqianXieWuhan University
- JiminHuangThe Fin AI
- BenyouWangThe Chinese University of Hong Kong, Shenzhen
+ ZhenyangCaiThe Chinese University of Hong Kong, Shenzhen
+ LeiCui
+ QianqianXieWuhan University
+ JiminHuangThe Fin AI
+ BenyouWangThe Chinese University of Hong Kong, Shenzhen
 7945-7988
 Existing legal benchmarks focusing on knowledge and logic effectively evaluate LLMs on various tasks in the legal domain. However, few have explored the practical application of LLMs by actual users. To further assess whether LLMs meet the specific needs of legal practitioners in real-world scenarios, we introduce UCL-Bench, a Chinese User-Centric Legal Benchmark, comprising 22 tasks across 5 distinct legal scenarios. To build the UCL-Bench, we conduct a user survey targeting legal professionals to understand their needs and challenges. Based on the survey results, we craft tasks, verified by legal professionals, and categorized them according to Bloom’s taxonomy. Each task in UCL-Bench mirrors real-world legal scenarios, and instead of relying on pre-defined answers, legal experts provide detailed answer guidance for each task, incorporating both “information” and “needs” elements to mimic the complexities of legal practice.
With the guidance, we use GPT-4 as the user simulator and evaluator, enabling multi-turn dialogues as an answer-guidance-based evaluation framework. Our findings reveal that many recent open-source general models achieve the highest performance, suggesting that they are well-suited to address the needs of legal practitioners. However, these legal LLMs do not outperform ChatGPT, indicating a need for training strategies aligned with users’ needs. Furthermore, we find that the most effective models are able to address legal issues within fewer dialogue turns, highlighting the importance of concise and accurate responses in achieving high performance. The code and dataset are available at https://github.com/wittenberg11/UCL-bench.
 2025.findings-naacl.444
 
@@ -5539,9 +5539,9 @@
 <fixed-case>MIDAS</fixed-case>: Multi-level Intent, Domain, And Slot Knowledge Distillation for Multi-turn <fixed-case>NLU</fixed-case>
- YanLi
- So-EonKimKyung Hee University
- Seong-BaePark
+ YanLi
+ So-EonKimKyung Hee University
+ Seong-BaePark
 CarenHanUniversity of Melbourne, University of Western Australia and University of Sydney
 7989-8012
 Although Large Language Models (LLMs) can generate coherent text, they often struggle to recognise user intent behind queries. In contrast, Natural Language Understanding (NLU) models interpret the purpose and key information of user input for responsive interactions. Existing NLU models typically map utterances to a dual-level semantic frame, involving sentence-level intent (SI) and word-level slot (WS) labels. However, real-life conversations primarily consist of multi-turn dialogues, requiring the interpretation of complex and extended exchanges. Researchers encounter challenges in addressing all facets of multi-turn dialogue using a unified NLU model. This paper introduces MIDAS, a novel approach leveraging multi-level intent, domain, and slot knowledge distillation for multi-turn NLU. We construct distinct teachers for SI detection, WS filling, and conversation-level domain (CD) classification, each fine-tuned for specific knowledge. A multi-teacher loss is proposed to facilitate the integration of these teachers, guiding a student model in multi-turn dialogue tasks. Results demonstrate the efficacy of our model in improving multi-turn conversation understanding, showcasing the potential for advancements in NLU through multi-level dialogue knowledge distillation. Our implementation is open-sourced on GitHub (https://github.com/adlnlp/Midas).
 2025.findings-naacl.445
 
@@ -5555,7 +5555,7 @@
 BarunPatraMicrosoft
 VishravChaudharyMicrosoft
 AlonBenhaimMicrosoft
- JayPujaraUniversity of Southern California
+ JayPujaraUniversity of Southern California
 XiaSongMicrosoft
 8013-8021
 At the forefront of state-of-the-art human alignment methods are preference optimization methods (*PO). Prior research has often concentrated on identifying the best-performing method, typically involving a grid search over hyperparameters, which can be impractical for general practitioners. In this paper, we examine the robustness of existing state-of-the-art methods to varying hyperparameters in a realistic out-of-distribution (OOD) scenario that mirrors real-world applications of human alignment. Our goal is to empirically find the method that increases the likelihood of achieving better results through the lens of various metrics, such as KL divergence and response length.
We also introduce LN-DPO, a simple length-normalized version of DPO that is more stable across hyperparameters, effectively reduces the average response length, and improves performance. Our analysis of state-of-the-art reference-free (i.e., SimPO) and reference-dependent (i.e., DPO and LN-DPO) methods reveals that they perform similarly at their peak (i.e., best possible scenario). However, we uncover that the pattern of change in performance greatly varies as we move away from the best possible scenario. @@ -5577,7 +5577,7 @@ SaaketAgashe YueFan AnthonyReynaUniversity of California, Santa Cruz - Xin EricWangSimular and University of California, Santa Cruz + Xin EricWangSimular and University of California, Santa Cruz 8038-8057 Large Language Models (LLMs) have demonstrated emergent common-sense reasoning and Theory of Mind (ToM) capabilities, making them promising candidates for developing coordination agents. This study introduces the LLM-Coordination Benchmark, a novel benchmark for analyzing LLMs in the context of Pure Coordination Settings, where agents must cooperate to maximize gains. Our benchmark evaluates LLMs through two distinct tasks. The first is Agentic Coordination, where LLMs act as proactive participants in four pure coordination games. The second is Coordination Question Answering (CoordQA), which tests LLMs on 198 multiple-choice questions across these games to evaluate three key abilities: Environment Comprehension, ToM Reasoning, and Joint Planning. Results from Agentic Coordination experiments reveal that LLM-Agents excel in multi-agent coordination settings where decision-making primarily relies on environmental variables but face challenges in scenarios requiring active consideration of partners’ beliefs and intentions. The CoordQA experiments further highlight significant room for improvement in LLMs’ Theory of Mind reasoning and joint planning capabilities. Zero-Shot Coordination (ZSC) experiments in the Agentic Coordination setting demonstrate that LLM agents, unlike RL methods, exhibit robustness to unseen partners. These findings indicate the potential of LLMs as Agents in pure coordination setups and underscore areas for improvement. 2025.findings-naacl.448 @@ -5585,10 +5585,10 @@ <fixed-case>A</fixed-case>ssertion<fixed-case>B</fixed-case>ench: A Benchmark to Evaluate Large-Language Models for Assertion Generation - VaishnaviPulavarthi + VaishnaviPulavarthi DeekshaNandal SohamDanMicrosoft - DebjitPalUniversity of Illinois at Chicago + DebjitPalUniversity of Illinois at Chicago 8058-8065 Assertions have been the de facto collateral for hardware for over a decade. The verification quality, i.e., detection and diagnosis of corner-case design bugs, is critically dependent on the assertion quality. There has been a considerable amount of research to generate high-quality assertions from hardware design source code and design execution trace data. With recent advent of generative AI techniques such as Large-Language Models (LLMs), there has been a renewed interest in deploying LLMs for assertion generation. However, there is little effort to quantitatively establish the effectiveness and suitability of various LLMs for assertion generation. In this paper, we present AssertionBench, a novel benchmark to evaluate LLMs’ effectiveness for assertion generation quantitatively. AssertionBench contains 100 curated Verilog hardware designs from OpenCores and formally verified assertions for each design, generated from GoldMine and HARM. 
We use AssertionBench to compare state-of-the-art LLMs, e.g., GPT-3.5, GPT-4o, CodeLLaMa-2, and LLaMa3-70B, to assess their effectiveness in inferring functionally correct assertions for hardware designs. Our experiments comprehensively demonstrate how LLMs perform relative to each other, the benefits of using more in-context exemplars in generating a higher fraction of functionally correct assertions, and the significant room for improvement for LLM-based assertion generators. 2025.findings-naacl.449 @@ -5630,8 +5630,8 @@ QimingWu ZichenChenUniversity of California, Santa Barbara WillCorcoran - MishaSraUniversity of California, Santa Barbara - AmbujSinghUC Santa Barbara + MishaSraUniversity of California, Santa Barbara + AmbujSinghUC Santa Barbara 8095-8117 Large language models (LLMs) have achieved remarkable success in natural language processing (NLP), demonstrating significant capabilities in processing and understanding text data. However, recent studies have identified limitations in LLMs’ ability to manipulate, program, and reason about structured data, especially graphs. We introduce GraphEval36K, the first comprehensive graph dataset, comprising 40 graph coding problems and 36,900 test cases to evaluate the ability of LLMs on graph problem-solving. Our dataset is categorized into eight primary and four sub-categories to ensure a thorough evaluation across different types of graphs. We benchmark eight LLMs, finding that private models outperform open-source ones, though the gap is narrowing. We also analyze the performance of LLMs across directed vs undirected graphs, different kinds of graph concepts, and network models. Furthermore, to improve the usability of our evaluation framework, we propose Structured Symbolic Decomposition (SSD), an instruction-based method designed to enhance LLM performance on complex graph tasks. Results show that SSD improves the average passing rate of GPT-4, GPT-4o, Gemini-Pro and Claude-3-Sonnet by 8.38%, 6.78%, 29.28% and 25.28%, respectively. 2025.findings-naacl.452 @@ -5639,7 +5639,7 @@ <fixed-case>S</fixed-case>imul<fixed-case>B</fixed-case>ench: Evaluating Language Models with Creative Simulation Tasks - QiJia + QiJia XiangYueCarnegie Mellon University TuneyZheng JieHuangxAI @@ -5651,9 +5651,9 @@ <fixed-case>R</fixed-case>easoning<fixed-case>R</fixed-case>ec: Bridging Personalized Recommendations and Human-Interpretable Explanations through <fixed-case>LLM</fixed-case> Reasoning - MillenniumBismayAmazon + MillenniumBismayAmazon XiangjueDongTexas A&M University - College Station - JamesCaverleeGoogle and Texas A&M University - College Station + JamesCaverleeGoogle and Texas A&M University - College Station 8132-8148 This paper presents ReasoningRec, a reasoning-based recommendation framework that leverages Large Language Models (LLMs) to bridge the gap between recommendations and human-interpretable explanations. In contrast to conventional recommendation systems that rely on implicit user-item interactions, ReasoningRec employs LLMs to model users and items, focusing on preferences, aversions, and explanatory reasoning. The framework utilizes a larger LLM to generate synthetic explanations for user preferences, subsequently used to fine-tune a smaller LLM for enhanced recommendation accuracy and human-interpretable explanation. 
Our experimental study investigates the impact of reasoning and contextual information on personalized recommendations, revealing that the quality of contextual and personalized data significantly influences the LLM’s capacity to generate plausible explanations. Empirical evaluations demonstrate that ReasoningRec surpasses state-of-the-art methods by up to 12.5% in recommendation prediction while concurrently providing human-intelligible explanations.
 2025.findings-naacl.454
 
@@ -5664,13 +5664,13 @@
 ShilongLi
 YanchengHeAlibaba Group
 HuiHuang
- XingyuanBuAlibaba Group
+ XingyuanBuAlibaba Group
 JiahengLiuNanjing University
 HangyuGuoAlibaba Group
 WeixunWang
 JihaoGu
- WenboSuAlibaba Group
- BoZhengAlibaba Group
+ WenboSuAlibaba Group
+ BoZhengAlibaba Group
 8149-8173
 Recent advancements in Direct Preference Optimization (DPO) have significantly enhanced the alignment of Large Language Models (LLMs) with human preferences, owing to its simplicity and effectiveness. However, existing methods typically optimize a scalar score or ranking reward, thereby overlooking the multi-dimensional nature of human preferences. In this work, we propose to extend the preference of DPO to two dimensions: segments and aspects. We first introduce a 2D supervision dataset called HelpSteer-2D. For the segment dimension, we divide the response into sentences and assign scores to each segment. For the aspect dimension, we meticulously design several criteria covering the response quality rubrics. With the 2-dimensional signals as feedback, we develop a 2D-DPO framework, decomposing the overall objective into multi-segment and multi-aspect objectives. Extensive experiments on popular benchmarks demonstrate that 2D-DPO performs better than methods that optimize for scalar or 1-dimensional preferences.
 2025.findings-naacl.455
 
@@ -5678,12 +5678,12 @@
 Demystifying the Power of Large Language Models in Graph Generation
- YuWangUniversity of Oregon and Vanderbilt University
- Ryan A.RossiAdobe Research
+ YuWangUniversity of Oregon and Vanderbilt University
+ Ryan A.RossiAdobe Research
 NamyongParkMeta AI
 Nesreen K.AhmedIntel AI Research
- DanaiKoutraAmazon and University of Michigan - Ann Arbor
- FranckDernoncourt
+ DanaiKoutraAmazon and University of Michigan - Ann Arbor
+ FranckDernoncourt
 TylerDerrVanderbilt University
 8174-8189
 Despite the unprecedented success of applying Large Language Models (LLMs) to graph discriminative tasks such as node classification and link prediction, their potential for graph structure generation remains largely unexplored. To fill this crucial gap, this paper presents a systematic investigation into the capability of LLMs for graph structure generation. Specifically, we design prompts triggering LLMs to generate codes that optimize network properties by injecting domain expertise from network science. Since graphs in different domains exhibit unique structural properties captured by various metrics (e.g., clustering coefficient capturing triangles in social networks while squares reflecting road segments in transportation networks), we first evaluate the capability of LLMs to generate graphs satisfying each structural property in different domains. After that, we select the optimal property configurations and benchmark the graph structure generation performance of LLMs against established graph generative models across multiple domains. Our findings shed light on generating graph structures from an LLM perspective. Our code is publicly available https://github.com/yuwvandy/LLM-GraphGen.
@@ -5698,7 +5698,7 @@
 LeoJin
 JuntingZhou
 ZiqiangLiu
- FeitengFang
+ FeitengFang
 MingshanChang
 TianyuZheng
 XinchengZhang
@@ -5710,9 +5710,9 @@
 WenhaoHuang
 JiajunZhangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
 ChenghuaLinUniversity of Manchester
- JieFuShanghai Artificial Intelligence Laboratory
+ JieFuShanghai Artificial Intelligence Laboratory
 MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
- ShiwenNiShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences
+ ShiwenNiShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences
 GeZhangByteDance Inc.
 8190-8205
 Remarkable progress on large language models (LLMs), particularly in English, has facilitated impressive capabilities in following human instructions. However, there remains a noticeable gap in instruction fine-tuning for Chinese, where the complex linguistic features pose significant challenges. Existing datasets, generally distilled from English-centric LLMs, are not well-aligned with Chinese users’ interaction patterns. To bridge this gap, we introduce COIG-CQIA, a new Chinese instruction tuning dataset derived from various real-world data resources and undergoing comprehensive human verification. We conduct extensive experiments on COIG-CQIA, and compare them with strong baseline models and datasets. The experimental results show that models trained on COIG-CQIA achieve highly competitive performance in diverse benchmarks. Additionally, our findings offer several insights for designing effective Chinese instruction-tuning datasets and data mixing strategies. Our dataset is available at https://huggingface.co/datasets/m-a-p/COIG-CQIA.
@@ -5725,7 +5725,7 @@
 JiaxinZhangIntuit AI Research
 XiangGaoIntuit
 WendiCuiIntuit
- PengLiUC Santa Barbara
+ PengLiUC Santa Barbara
 KamalikaDasIntuit
 8206-8217
 In tasks such as summarization and open-book question answering (QA), Large Language Models (LLMs) frequently experience “contextual hallucination”, where they generate irrelevant or incorrect responses despite having access to accurate information in the input. This issue often stems from the models’ propensity to prioritize self-generated content over input context, leading to a disregard for pertinent details. To address this challenge, we introduce Guided Attention Map Editing (GAME), an innovative approach that dynamically adjusts attention maps to enhance contextual relevance. During inference, GAME employs a trained classifier to identify attention maps likely to induce hallucinations and implements targeted interventions. These interventions, guided by gradient-informed “edit directions”, strategically redistribute attention weights across various heads to efficiently mitigate hallucination. Extensive evaluations on challenging summarization and open-book QA tasks demonstrate that GAME consistently and significantly reduces hallucinations across diverse open-source models, thereby improving the reliability and applicability of LLMs.
@@ -5736,7 +5736,7 @@
 Alleviating Hallucinations of Large Language Models through Induced Hallucinations
 YueZhang
 LeyangCui
- V.W.The Hong Kong University of Science and Technology
+ V.W.The Hong Kong University of Science and Technology
 ShumingShiTencent AI Lab
 8218-8232
 Despite their impressive capabilities, large language models (LLMs) have been observed to generate responses that include inaccurate or fabricated information, a phenomenon commonly known as hallucination.
In this work, we propose a simple Induce-then-Contrast Decoding (ICD) strategy to alleviate hallucinations. We first construct a factually weak LLM by inducing hallucinations from the original LLMs. Then, we penalize these induced hallucinations during decoding to enhance the factuality of the generated content. Concretely, we determine the final next-token predictions by amplifying the predictions from the original model and downplaying the induced untruthful predictions via contrastive decoding. Experimental results on both discrimination-based and generation-based hallucination evaluation benchmarks, such as TruthfulQA and FActScore, demonstrate that our proposed ICD methods can effectively enhance the factuality of LLMs across various task formats, model sizes, and model families. For example, when equipped with ICD, Llama2-7B-Chat and Mistral-7B-Instruct achieve performance comparable to ChatGPT and GPT4 on TruthfulQA, respectively, without compromising their generalization capabilities on other tasks. @@ -5745,7 +5745,7 @@ <fixed-case>M</fixed-case>o<fixed-case>DE</fixed-case>: Effective Multi-task Parameter Efficient Fine-Tuning with a Mixture of Dyadic Experts - LinNingGoogle + LinNingGoogle HarshLaraResearch, Google MeiqiGuoGoogle AbhinavRastogiGoogle @@ -5756,12 +5756,12 @@ Unsupervised Sentence Representation Learning with Syntactically Aligned Negative Samples - ZhilanWang + ZhilanWang ZekaiZhi - RizeJinTiangong University + RizeJinTiangong University KehuiSongTiangong University HeWangtiangong university - Da-JungChoAjou University + Da-JungChoAjou University 8247-8259 Sentence representation learning benefits from data augmentation strategies to improve model performance and generalization, yet existing approaches often encounter issues such as semantic inconsistencies and feature suppression. To address these limitations, we propose a method for generating Syntactically Aligned Negative (SAN) samples through a semantic importance-aware Masked Language Model (MLM) approach. Our method quantifies semantic contributions of individual words to produce negative samples that have substantial textual overlap with the original sentences while conveying different meanings. We further introduce Hierarchical-InfoNCE (HiNCE), a novel contrastive learning objective employing differential temperature weighting to optimize the utilization of both in-batch and syntactically aligned negative samples. Extensive evaluations across seven semantic textual similarity benchmarks demonstrate consistent improvements over state-of-the-art models. 2025.findings-naacl.461 @@ -5769,7 +5769,7 @@ Hierarchical Speculative Decoding with Dynamic Window - ShensianSyuNational Taiwan University + ShensianSyuNational Taiwan University Hung-yiLeeNational Taiwan University 8260-8273 Speculative Decoding (SD) utilizes an efficient draft model to generate multiple tokens, which are subsequently verified in parallel by a target model. This approach has shown significant potential for accelerating inference in large language models (LLMs), with performance heavily reliant on the hyperparameter K—the window size. However, previous methods often depend on simple heuristics to select K or dynamically adjust the window size, which may necessitate additional training or careful resource management to avoid competition.To address these challenges, we propose Hierarchical Speculative Decoding with Dynamic Window (HSDDW), a straightforward framework that eliminates the need for additional training. 
Specifically, we introduce a self-verify mechanism that enables the draft model to autonomously decide when to stop generating tokens. Additionally, by integrating a hierarchical structure that leverages the capabilities of models of different sizes, we significantly enhance the overall speed of the system. HSDDW demonstrates competitive performance across four datasets, achieving notable speedups of 2.91\times on MT-Bench and 2.99\times on Alpaca, outperforming existing state-of-the-art methods.
 2025.findings-naacl.462
 
@@ -5779,8 +5779,8 @@
 <fixed-case>Q</fixed-case>-<fixed-case>FAKER</fixed-case>: Query-free Hard Black-box Attack via Controlled Generation
 CheolWonNaSungkyunkwan University
- YunSeokChoiSungKyunKwan University
- Jee-HyongLeeSungkyunkwan University
+ YunSeokChoiSungKyunKwan University
+ Jee-HyongLeeSungkyunkwan University
 8274-8289
 Many adversarial attack approaches are proposed to verify the vulnerability of language models. However, they require numerous queries and the information on the target model. Even black-box attack methods also require the target model’s output information. They are not applicable in real-world scenarios, as in hard black-box settings where the target model is closed and inaccessible. Even the recently proposed hard black-box attacks still require many queries and demand extremely high costs for training adversarial generators. To address these challenges, we propose Q-faker (Query-free Hard Black-box Attacker), a novel and efficient method that generates adversarial examples without accessing the target model. To avoid accessing the target model, we use a surrogate model instead. The surrogate model generates adversarial sentences for a target-agnostic attack. During this process, we leverage controlled generation techniques. We evaluate our proposed method on eight datasets. Experimental results demonstrate our method’s effectiveness, including high transferability and the high quality of the generated adversarial examples, and prove it is practical in hard black-box settings.
 2025.findings-naacl.463
 
@@ -5791,11 +5791,11 @@
 XiangLi
 ZhiyiYin, Chinese Academy of Sciences
 HexiangTanChinese Academy of Sciences
- ShaolingJing
- DuSuInstitute of Computing Technology, Chinese Academy of Sciences
+ ShaolingJing
+ DuSuInstitute of Computing Technology, Chinese Academy of Sciences
 YiCheng
- HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences
- FeiSunInstitute of Computing Technology, Chinese Academy of Sciences
+ HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences
+ FeiSunInstitute of Computing Technology, Chinese Academy of Sciences
 8290-8301
 As LLM-generated text becomes increasingly prevalent on the internet, often containing hallucinations or biases, detecting such content has emerged as a critical area of research. Recent methods have demonstrated impressive performance in detecting text generated entirely by LLMs. However, in real-world scenarios, users often introduce perturbations to the LLM-generated text, and the robustness of existing detection methods against these perturbations has not been sufficiently explored. This paper empirically investigates this challenge and finds that even minor perturbations can severely degrade the performance of current detection methods.
To address this issue, we find that the syntactic tree is minimally affected by disturbances and exhibits distinct differences between human-written and LLM-generated text. Therefore, we propose a detection method based on syntactic trees, which can capture features invariant to perturbations. It demonstrates significantly improved robustness against perturbation on the HC3 and GPT-3.5-mixed datasets. Moreover, it also has the shortest time expenditure. We provide the code and data at https://github.com/thulx18/PRDetect.
 2025.findings-naacl.464
 
@@ -5817,7 +5817,7 @@
 KritarthPrasadSony Research India, Bangalore
 MohammadiZakiSony Research India, Bangalore
 Pratik RakeshSinghSony Research India
- PankajWasnikSony Research India
+ PankajWasnikSony Research India
 8322-8335
 Ensembling neural machine translation (NMT) models to produce higher-quality translations than the L individual models has been extensively studied. Recent methods typically employ a candidate selection block (CSB) and an encoder-decoder fusion block (FB), requiring inference across all candidate models, leading to significant computational overhead, generally \Omega(L). This paper introduces SmartGen, a reinforcement learning (RL)-based strategy that improves the CSB by selecting a small, fixed number of candidates and identifying optimal groups to pass to the fusion block for each input sentence. Furthermore, previously, the CSB and FB were trained independently, leading to suboptimal NMT performance. Our DQN-based SmartGen addresses this by using feedback from the FB block as a reward during training. We also resolve a key issue in earlier methods, where candidates were passed to the FB without modification, by introducing a Competitive Correction Block (CCB). Finally, we validate our approach with extensive experiments on English-Hindi translation tasks in both directions as well as English to Chinese and English to German.
 2025.findings-naacl.466
 
@@ -5825,10 +5825,10 @@
 Evaluating Numeracy of Language Models as a Natural Language Inference Task
- RahmadMahendraRoyal Melbourne Institute of Technology and Universitas Indonesia
- DamianoSpinaRoyal Melbourne Institute of Technology
+ RahmadMahendraRoyal Melbourne Institute of Technology and Universitas Indonesia
+ DamianoSpinaRoyal Melbourne Institute of Technology
 LawrenceCavedonRoyal Melbourne Institute of Technology
- KarinVerspoorRoyal Melbourne Institute of Technology
+ KarinVerspoorRoyal Melbourne Institute of Technology
 8336-8361
 While recent advancements in large language models (LLMs) have enhanced their capabilities to solve mathematical problems, other aspects of numeracy remain underexplored. In this paper, we propose a benchmark to evaluate the ability of language models to perform basic numeracy tasks. We frame numeracy as a Natural Language Inference (NLI) task to assess the models’ ability to understand both numbers and language contexts. We evaluate 49 language models (LMs), including fine-tuned LMs on NLI datasets, instruction-tuned LLMs, and specialized math-LLMs.
Our findings reveal three main insights: (1) LLMs only clearly outperform smaller LMs in arithmetic tasks, indicating that mathematical reasoning cannot be generalized to other numeracy skills such as number comparison and normalization; (2) while most language models achieve fair to good accuracy for NLI entailment cases, they still struggle to predict contradiction and neutral cases; and (3) the robustness of language models’ numeracy capabilities needs improvement, particularly in understanding the semantics and pragmatics of numbers in linguistic contexts.
 2025.findings-naacl.467
 
@@ -5855,8 +5855,8 @@
 MinjuKim
 Beong-wooKwakYonsei University
 YeonsooLee
- DonghaLeeYonsei University
- JinyoungYeoYonsei University
+ DonghaLeeYonsei University
+ JinyoungYeoYonsei University
 YoungjaeYuYonsei University
 8397-8437
 Recent advancements in Large Language Models (LLMs) have led to their adaptation in various domains as conversational agents. We wonder: can personality tests be applied to these agents to analyze their behavior, similar to humans? We introduce TRAIT, a new benchmark consisting of 8K multi-choice questions designed to assess the personality of LLMs. TRAIT is built on two psychometrically validated small human questionnaires, Big Five Inventory (BFI) and Short Dark Triad (SD-3), enhanced with the ATOMIC-10X knowledge graph to a variety of real-world scenarios. TRAIT also outperforms existing personality tests for LLMs in terms of reliability and validity, achieving the highest scores across four key metrics: Content Validity, Internal Validity, Refusal Rate, and Reliability. Using TRAIT, we reveal two notable insights into personalities of LLMs: 1) LLMs exhibit distinct and consistent personality, which is highly influenced by their training data (e.g., data used for alignment tuning), and 2) current prompting techniques have limited effectiveness in eliciting certain traits, such as high psychopathy or low conscientiousness, suggesting the need for further research in this direction.
 2025.findings-naacl.469
 
@@ -5865,10 +5865,10 @@
 Tell Me What You Know About Sexism: Expert-<fixed-case>LLM</fixed-case> Interaction Strategies and Co-Created Definitions for Zero-Shot Sexism Detection
- MyrtheReuverVrije Universiteit Amsterdam
+ MyrtheReuverVrije Universiteit Amsterdam
 IndiraSenUniversität Mannheim
 MatteoMelis
- GabriellaLapesaGESIS – Leibniz Institute for the Social Sciences and Heinrich-Heine University Düsseldorf
+ GabriellaLapesaGESIS – Leibniz Institute for the Social Sciences and Heinrich-Heine University Düsseldorf
 8438-8467
 This paper investigates hybrid intelligence and collaboration between researchers of sexism and Large Language Models (LLMs), with a four-component pipeline. First, nine sexism researchers answer questions about their knowledge of sexism and of LLMs. They then participate in two interactive experiments involving an LLM (GPT3.5). The first experiment has experts assessing the model’s knowledge about sexism and suitability for use in research. The second experiment tasks them with creating three different definitions of sexism: an expert-written definition, an LLM-written one, and a co-created definition. Lastly, zero-shot classification experiments use the three definitions from each expert in a prompt template for sexism detection, evaluating GPT4o on 2.500 texts sampled from five sexism benchmarks. We then analyze the resulting 67.500 classification decisions. The LLM interactions lead to longer and more complex definitions of sexism.
Expert-written definitions on average perform poorly compared to LLM-generated definitions. However, some experts do improve classification performance with their co-created definitions of sexism, including experts who are inexperienced in using LLMs.
 2025.findings-naacl.470
 
@@ -5877,7 +5877,7 @@
 The Role of Prosody in Spoken Question Answering
 JieChi
- Maureende SeysselApple
+ Maureende SeysselApple
 NatalieSchluterTechnical University of Denmark, Apple and IT University
 8468-8479
 Spoken language understanding research to date has generally carried a heavy text perspective. Most datasets are derived from text, which is then subsequently synthesized into speech, and most models typically rely on automatic transcriptions of speech. This is to the detriment of prosody–additional information carried by the speech signal beyond the phonetics of the words themselves and difficult to recover from text alone. In this work, we investigate the role of prosody in Spoken Question Answering. By isolating prosodic and lexical information on the SLUE-SQA-5 dataset, which consists of natural speech, we demonstrate that models trained on prosodic information alone can perform reasonably well by utilizing prosodic cues. However, we find that when lexical information is available, models tend to predominantly rely on it. Our findings suggest that while prosodic cues provide valuable supplementary information, more effective integration methods are required to ensure prosody contributes more significantly alongside lexical features.
 2025.findings-naacl.471
 
@@ -5887,7 +5887,7 @@
 Target-Augmented Shared Fusion-based Multimodal Sarcasm Explanation Generation
 PalaashGoelIndraprastha Institute of Information Technology, Delhi
- Dushyant SinghChauhan
+ Dushyant SinghChauhan
 Md ShadAkhtarIndraprastha Institute of Information Technology, Delhi
 8480-8493
 Sarcasm is a linguistic phenomenon that intends to ridicule a target (e.g., entity, event, or person) in an inherent way. Multimodal Sarcasm Explanation (MuSE) aims at revealing the intended irony in a sarcastic post using a natural language explanation. Though important, existing systems overlooked the significance of the target of sarcasm in generating explanations. In this paper, we propose a Target-aUgmented shaRed fusion-Based sarcasm explanatiOn model, aka. TURBO. We design a novel shared-fusion mechanism to leverage the inter-modality relationships between an image and its caption. TURBO assumes the target of the sarcasm and guides the multimodal shared fusion mechanism in learning intricacies of the intended irony for explanations. We evaluate our proposed TURBO model on the MORE+ dataset. Comparison against multiple baselines and state-of-the-art models signifies the performance improvement of TURBO by an average margin of +3.3%. Moreover, we explore LLMs in zero and one-shot settings for our task and observe that LLM-generated explanation, though remarkable, often fails to capture the critical nuances of the sarcasm. Furthermore, we supplement our study with extensive human evaluation on TURBO’s generated explanations and find them to be comparatively better than other systems.
@@ -5897,7 +5897,7 @@ Seeds of Discourse: A Multilingual Corpus of Direct Quotations from <fixed-case>A</fixed-case>frican Media on Agricultural Biotechnologies PatriciaChiril - TrevorSpreadburyUniversity of Chicago + TrevorSpreadburyUniversity of Chicago Joeva SeanRockState University of New York at Stony Brook BrianDowd-Uribe DavidUminskyUniversity of Chicago @@ -5908,11 +5908,11 @@ Position Really Matters: Towards a Holistic Approach for Prompt Tuning - XianjunYangFacebook + XianjunYangFacebook WeiChengNEC-Labs XujiangZhaoNEC Labs America WenchaoYuUniversity of California, Los Angeles - Linda RuthPetzoldUniversity of California, Santa Barbara + Linda RuthPetzoldUniversity of California, Santa Barbara HaifengChen 8501-8523 Prompt tuning is highly effective in efficiently extracting knowledge from foundation models, encompassing both language, vision, and vision-language models. However, the efficacy of employing fixed soft prompts with a predetermined position for concatenation with inputs for all instances, irrespective of their inherent disparities, remains uncertain. Variables such as the position, length, and representations of prompts across diverse instances and tasks can substantially influence the performance of prompt tuning. We first provide a theoretical analysis, revealing that optimizing the position of the prompt to encompass the input can capture additional semantic information that traditional prefix or postfix prompt tuning methods fail to capture. Then, we present a holistic parametric prompt tuning strategy that dynamically determines different factors of prompts based on specific tasks or instances. Experimental results underscore the significant performance improvement achieved by dynamic prompt tuning across a wide range of tasks, including NLP, vision recognition, and vision-language tasks. Furthermore, we establish the universal applicability of our approach under full-data, few-shot, and multitask settings. From 7f2c9c7dc37abea44da86bdb24b0094098d8a26f Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:12:05 -0400 Subject: [PATCH 08/18] Add ORCIDs for 2024.conll-babylm --- data/xml/2024.conll.xml | 68 ++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/data/xml/2024.conll.xml b/data/xml/2024.conll.xml index 2bfaebc3b6..acfd6244b3 100644 --- a/data/xml/2024.conll.xml +++ b/data/xml/2024.conll.xml @@ -568,7 +568,7 @@ From Babble to Words: Pre-Training Language Models on Continuous Streams of Phonemes - ZébulonGorielyUniversity of Cambridge + ZébulonGorielyUniversity of Cambridge RichardDiehl MartinezUniversity of Cambridge AndrewCainesUniversity of Cambridge PaulaButteryUniversity of Cambridge @@ -580,10 +580,10 @@ Graphemes vs. phonemes: battling it out in character-based language models - BastianBunzeckUniversität Bielefeld - DanielDuranUniversität Bielefeld - LeonieSchadeUniversität Bielefeld - SinaZarrießBielefeld University + BastianBunzeckUniversität Bielefeld + DanielDuranUniversität Bielefeld + LeonieSchadeUniversität Bielefeld + SinaZarrießBielefeld University 54-64 We present grapheme-llama and phoneme-llama, character-based language models trained for the 2024 BabyLM challenge. Through these models, we explore an under-researched approach to downsizing: replacing subword-based tokenization with character-level tokenization, drastically reducing the vocabulary size. 
The grapheme model is trained on a standard BabyLM dataset, while the phoneme model uses a phoneme-converted version of this dataset. Results show that grapheme-based models perform better overall, achieving scores comparable to subword-based models on grammatical benchmarks. Despite lower performance, phoneme models also demonstrate promising grammatical learning. We argue that our results challenge conventional wisdom on language modeling techniques and open up novel research questions with character- and phoneme-based models as objects of inquiry.
2024.conll-babylm.5


 Exploring Curriculum Learning for Vision-Language Tasks: A Study on Small-Scale Multimodal Training
- RohanSaha
+ RohanSaha
 AbrarFahim, University of Alberta
- AlonaFysheUniversity of Alberta
+ AlonaFysheUniversity of Alberta
- AlexMurphy
+ AlexMurphy
 65-81
 For specialized domains, there is often not a wealth of data with which to train large machine learning models. In such limited data / compute settings, various methods exist aiming to \textit{do more with less}, such as finetuning from a pretrained model, modulating difficulty levels as data are presented to a model (curriculum learning), and considering the role of model type / size. Approaches to efficient \textit{machine} learning also take inspiration from \textit{human} learning by considering use cases where machine learning systems have access to approximately the same number of words experienced by a 13-year-old child (100M words). We investigate the role of 3 primary variables in a limited data regime as part of the multimodal track of the BabyLM challenge. We contrast: (i) curriculum learning, (ii) pretraining (with text-only data), (iii) model type. We modulate these variables and assess them on two types of tasks: (a) multimodal (text+image), and (b) unimodal (text-only) tasks. We find that curriculum learning benefits multimodal evaluations over non-curriculum learning models, particularly when combined with text-only pretraining. On text-only tasks, curriculum learning appears to help models with smaller trainable parameter counts. We suggest possible reasons based on architectural differences and training designs as to why one might observe such results.
2024.conll-babylm.6


@@ -613,7 +613,7 @@
 Choosy Babies Need One Coach: Inducing Mode-Seeking Behavior in <fixed-case>B</fixed-case>aby<fixed-case>L</fixed-case>lama with Reverse <fixed-case>KL</fixed-case> Divergence
 ShaozhenShi
- YevgenMatusevychUniversity of Groningen
+ YevgenMatusevychUniversity of Groningen
 MalvinaNissimUniversity of Groningen
 95-105
 This study presents our submission to the Strict-Small Track of the 2nd BabyLM Challenge. We use a teacher-student distillation setup with the BabyLLaMa model (Timiryasov and Tastet, 2023) as a backbone. To make the student’s learning process more focused, we replace the objective function with a reverse Kullback-Leibler divergence, known to cause mode-seeking (rather than mode-averaging) behaviour in computational learners. We further experiment with having a single teacher (instead of an ensemble of two teachers) and implement additional optimization strategies to improve the distillation process. Our experiments show that under reverse KL divergence, a single-teacher model often outperforms or matches multiple-teacher models across most tasks. Additionally, incorporating advanced optimization techniques further enhances model performance, demonstrating the effectiveness and robustness of our proposed approach.
These findings support our idea that “choosy babies need one coach”.
2024.conll-babylm.7


@@ -624,9 +624,9 @@
 Different Ways to Forget: Linguistic Gates in Recurrent Neural Networks
 CristianoChesiIstituto Universitario di Studi Superiori
 VeronicaBressanUniversity School for Advanced Studies IUSS
- MatildeBarbini
+ MatildeBarbini
- AchilleFuscoIstituto Universitario di Studi Superiori
+ AchilleFuscoIstituto Universitario di Studi Superiori
- Maria Letizia PicciniBianchessiIstituto Universitario di Studi Superiori
+ Maria Letizia PicciniBianchessiIstituto Universitario di Studi Superiori
 SofiaNeriNA
 SarahRossiNA
 TommasoSgrizziIstituto Universitario di Studi Superiori


 Developmentally Plausible Multimodal Language Models Are Highly Modular
 AlinaKleringsUniversität Mannheim
- ChristianBarteltUniversität Mannheim
+ ChristianBarteltUniversität Mannheim
 AaronMuellerNortheastern University and Technion - Israel Institute of Technology, Technion
 118-139
 Large language models demonstrate emergent modularity, where functionally specialized components and circuits arise to handle specific tasks or task formats. If similar modules arise in models trained on more cognitively plausible datasets, it could inform debates surrounding what kinds of knowledge would be learnable given more human-like language learning signals. In this paper, we describe a multimodal vision-language model submitted to the BabyLM Challenge. Our model achieves similar performance to the best-performing architectures from last year, though visual information does not improve performance on text-only tasks over text-only models (in accordance with prior findings). To better understand how the model processes the evaluation tasks of the BabyLM Challenge, we leverage causal interpretability methods to locate the neurons that contribute to the model’s final decisions. We find that the models we train are highly modular: distinct components arise to process related tasks. Furthermore, on text-and-image tasks, adding or removing visual inputs causes the model to use distinct components to process the same textual inputs. This suggests that modal and task-specific specialization is efficiently learned, and that a high degree of functional specialization arises in even small-scale language models.


 Extending the <fixed-case>B</fixed-case>aby<fixed-case>LM</fixed-case> Initiative: Promoting Diversity in Datasets and Metrics through High-Quality Linguistic Corpora
- LaurentPrévotUniversité d’Aix-Marseille
+ LaurentPrévotUniversité d’Aix-Marseille
 Sheng-FuWangAcademia Sinica
 Jou-AnChiNA
 Shu-KaiHsiehNational Taiwan University


 Integrating Quasi-symbolic Conceptual Knowledge into Language Model Pre-training
- GáborBerendUniversity of Szeged
+ GáborBerendUniversity of Szeged
 159-165
 In this paper, we investigate the integration of latent conceptual knowledge into the pre-training of masked language models. Our solution is based on the use of an auxiliary model, from which we extract training signals for training a student model. We determine the training signals from the hidden representations of the student model in an unsupervised way, using sparse coding. Models trained on latent concepts alone have an improved fine-tunability on downstream tasks; however, they perform worse on traditional language modeling, i.e., when the goal is to output missing tokens as opposed to latent semantic classes of words.
In order to preserve the improved fine-tuning capability of the models, while making them better at the task of language modeling, we propose a final stage of pre-training, during which we perform traditional masked language modeling. The final stage of pre-training is based on a model that has already been pre-trained on the task of modeling latent semantic properties, with the weights of the backbone model being frozen. During the final training phase, we only train a lightweight linear classifier layer on top of the logits that the model determines for the latent semantic properties. With this modification, we can obtain the benefits of both the traditional training paradigms and the one which is based on the use of latent semantic properties. We release our source code at github.com/SzegedAI/MLSM.
2024.conll-babylm.13


@@ -687,7 +687,7 @@
 Less is More: Pre-Training Cross-Lingual Small-Scale Language Models with Cognitively-Plausible Curriculum Learning Strategies
 SuchirSalhanUniversity of Cambridge
 RichardDiehl MartinezUniversity of Cambridge
- ZébulonGorielyUniversity of Cambridge
+ ZébulonGorielyUniversity of Cambridge
 PaulaButteryUniversity of Cambridge
 174-188
 Curriculum Learning has been a popular strategy to improve the cognitive plausibility of Small-Scale Language Models (SSLMs) in the BabyLM Challenge. However, it has not led to considerable improvements over non-curriculum models. We assess whether theoretical linguistic acquisition theories can be used to specify more fine-grained curriculum learning strategies, creating age-ordered corpora of Child-Directed Speech for four typologically distant language families to implement SSLMs and acquisition-inspired curricula cross-lingually. Comparing the success of three objective curricula (Growing, Inwards & MMM) that precisely replicate the predictions of acquisition theories on a standard SSLM architecture, we find that fine-grained acquisition-inspired curricula can outperform non-curriculum baselines, and that the performance benefits of curriculum strategies in SSLMs can be derived by specifying fine-grained language-specific curricula that precisely replicate language acquisition theories.


 <fixed-case>C</fixed-case>oncrete<fixed-case>GPT</fixed-case>: A Baby <fixed-case>GPT</fixed-case>-2 Based on Lexical Concreteness and Curriculum Learning
- LucaCaponeUniversity of Pisa
+ LucaCaponeUniversity of Pisa
 AlessandroBondielliUniversità di Pisa, University of Pisa
- AlessandroLenciUniversity of Pisa
+ AlessandroLenciUniversity of Pisa
 189-196
 We present a model for the Strict-Small track of the BabyLM Challenge 2024 (Choshen et al. 2024). We introduce a Curriculum Learning approach for training a specialized version of GPT-2 (Radford et al. 2019), which we name ConcreteGPT. We utilize the norms from (Brysbaert et al. 2014), which provide concreteness ratings for 40,000 English lexical items based on human subjects. Using these norms, we assign a concreteness score to each sentence in the training dataset and develop two curriculum strategies that progressively introduce more complex and abstract language patterns in the training data. Compared to the baselines, our best model shows lower performance on zero-shot tasks but demonstrates superior performance in fine-tuning tasks. Notably, our curriculum-trained models exhibit significant improvements over non-curriculum-based training of the same model.
2024.conll-babylm.16


@@ -725,10 +725,10 @@
 Using Curriculum Masking Based on Child Language Development to Train a Large Language Model with Limited Training Data
 EvanLucasMichigan Technological University
- DylanGainesMichigan Technological University
+ DylanGainesMichigan Technological University
 Tagore RaoKosireddy
 KevinLi
- Timothy C.HavensMichigan Technological University
+ Timothy C.HavensMichigan Technological University
 221-228
 In this paper, we detail our submissions to the Strict and Strict-Small tracks of the 2024 BabyLM Challenge. We approach this challenge with two methodologies: i) use of a novel dataset, and ii) development of a pre-training technique based on the fusion of child language acquisition with traditional masked language modeling, which we call curriculum masking. The novel dataset used for this task is based on user submissions to the Reddit forum (i.e., subreddit) “Explain Like I’m Five”, which explains diverse concepts using simple language. Curriculum masking works by creating learning phases based on a standard child language development timeline, where the masked words learned by the model start with simple nouns and gradually expand to include more complex parts of speech. We show that using internet-based training data shows a small improvement in evaluation scores as compared to baseline training data. Our proposed pre-training method of curriculum masking is conceptually novel and also shows improved rates of learning over typical masked language modeling pre-training, potentially allowing for good performance with fewer total epochs on smaller training datasets. Code for the curriculum masking implementation is shared at https://github.com/evan-person/curriculumMaskingBabyLM2024.
2024.conll-babylm.19


 <fixed-case>W</fixed-case>hat<fixed-case>I</fixed-case>f: Leveraging Word Vectors for Small-Scale Data Augmentation
 AlexLymanBrigham Young University
- BryceHepner
+ BryceHepner
 229-236
 We introduce WhatIf, a lightly supervised data augmentation technique that leverages word vectors to enhance training data for small-scale language models. Inspired by reading prediction strategies used in education, WhatIf creates new samples by substituting semantically similar words in the training data. We evaluate WhatIf on multiple datasets, demonstrating small but consistent improvements in downstream evaluation compared to baseline models. Finally, we compare WhatIf to other small-scale data augmentation techniques and find that it provides comparable quantitative results at a potential tradeoff to qualitative evaluation.
2024.conll-babylm.20


 A surprisal oracle for when every layer counts
- XudongHongSaarland University and Max-Planck Institute for Informatics
+ XudongHongSaarland University and Max-Planck Institute for Informatics
- SharidLoáicigaUniversity of Gothenburg, Sweden
+ SharidLoáicigaUniversity of Gothenburg, Sweden
 AsadSayeedUniversity of Gothenburg
 237-243
 Active Curriculum Language Modeling (ACLM; Hong et al., 2023) is a learner-directed approach to training a language model. We proposed the original version of this process in our submission to the BabyLM 2023 task, and now we propose an updated ACLM process for the BabyLM 2024 task. ACLM involves an iteratively- and dynamically-constructed curriculum informed over the training process by a model of uncertainty; other training items that are similarly uncertain to a least certain candidate item are prioritized.
Our new process improves the similarity model so that it is more dynamic, and we run ACLM over the most successful model from the BabyLM 2023 task: ELC-BERT (Charpentier and Samuel, 2023). We find that while our models underperform on fine-grained grammatical inferences, they outperform the BabyLM 2024 official baselines on common-sense and world-knowledge tasks. We make our code available at https://github.com/asayeed/ActiveBaby.
2024.conll-babylm.21


 Dreaming Out Loud: A Self-Synthesis Approach For Training Vision-Language Models With Developmentally Plausible Data
 BadrAlKhamissiEPFL - EPF Lausanne
 YingtianTang
- AbdülkadirGökceEPFL - EPF Lausanne
+ AbdülkadirGökceEPFL - EPF Lausanne
- JohannesMehrerEPFL - EPF Lausanne
+ JohannesMehrerEPFL - EPF Lausanne
- MartinSchrimpfEPFL - EPF Lausanne
+ MartinSchrimpfEPFL - EPF Lausanne
 244-251
 While today’s large language models exhibit impressive abilities in generating human-like text, they require massive amounts of data during training. We here take inspiration from human cognitive development to train models in limited data conditions. Specifically, we present a self-synthesis approach that iterates through four phases: Phase 1 sets up fundamental language abilities, training the model from scratch on a small corpus. Language is then associated with the visual environment in phase 2, integrating the model with a vision encoder to generate descriptive captions from labeled images. In the “self-synthesis” phase 3, the model generates captions for unlabeled images, which it then uses to further train its language component with a mix of synthetic and previous real-world text. This phase is meant to expand the model’s linguistic repertoire, similar to humans self-annotating new experiences. Finally, phase 4 develops advanced cognitive skills by training the model on specific tasks such as visual question answering and reasoning. Our approach offers a proof of concept for training a multimodal model using a developmentally plausible amount of data.
2024.conll-babylm.22


 AkiyoFukatsuTokyo University, Tokyo Institute of Technology
 MiyuOba
 AriannaBisazzaUniversity of Groningen
- YoheiOsekiUniversity of Tokyo
+ YoheiOsekiUniversity of Tokyo
 252-261
 While current large language models have achieved remarkable success, their data efficiency remains a challenge to overcome. Recently it has been suggested that child-directed speech (CDS) can improve the training data efficiency of modern language models based on Transformer neural networks. However, it is not yet understood which specific properties of CDS are effective for training these models. In the context of the BabyLM Challenge, we focus on Variation Sets (VSs), sets of consecutive utterances expressing a similar intent with slightly different words and structures, which are ubiquitous in CDS. To assess the impact of VSs on training data efficiency, we augment CDS data with different proportions of artificial VSs and use these datasets to train an auto-regressive model, GPT-2. We find that the best proportion of VSs depends on the evaluation benchmark: BLiMP and GLUE scores benefit from the presence of VSs, but EWOK scores do not. Additionally, the results vary depending on multiple factors such as the number of epochs and the order of utterance presentation. Taken together, these findings suggest that VSs can have a beneficial influence on language models, while leaving room for further investigation.
2024.conll-babylm.23 @@ -780,7 +780,7 @@ <fixed-case>GPT</fixed-case> or <fixed-case>BERT</fixed-case>: why not both? Lucas Georges GabrielCharpentierUniversity of Oslo - DavidSamuelUniversity of Oslo + DavidSamuelUniversity of Oslo 262-283 We present a simple way to merge masked language modeling with causal language modeling. This hybrid training objective results in a model that combines the strengths of both modeling paradigms within a single transformer stack – GPT-BERT can be transparently used like any standard causal or masked language model. We test the pretraining process that enables this flexible behavior on the BabyLM Challenge 2024. The results show that the hybrid pretraining outperforms masked-only or causal-only models. We openly release the models, training corpora and code. 2024.conll-babylm.24 @@ -797,7 +797,7 @@ <fixed-case>B</fixed-case>aby<fixed-case>L</fixed-case>lama-2: Ensemble-Distilled Models Consistently Outperform Teachers With Limited Data - Jean-LoupTastetUniversity of Copenhagen + Jean-LoupTastetUniversity of Copenhagen InarTimiryasovCopenhagen University, Niels Bohr Institute 292-301 We present BabyLlama-2, a 345 million parameter model distillation-pretrained from two teachers on a 10 million word corpus for the BabyLM competition. On the BLiMP and SuperGLUE benchmarks, BabyLlama-2 outperforms baselines trained on both 10 and 100 million word datasets with the same data mix, as well as its teacher models. Through an extensive hyperparameter sweep, we demonstrate that the advantages of distillation cannot be attributed to suboptimal hyperparameter selection of the teachers. Our findings underscore the need for further investigation into distillation techniques, particularly in data-limited settings. @@ -816,9 +816,9 @@ <fixed-case>BERT</fixed-case>time Stories: Investigating the Role of Synthetic Story Data in Language Pre-training NikitasTheodoropoulos - GiorgosFilandrianosNational Technical University of Athens + GiorgosFilandrianosNational Technical University of Athens VassilisLyberatos - MariaLymperaiou + MariaLymperaiou GiorgosStamouNational Technical University of Athens 308-323 We describe our contribution to the Strict and Strict-Small tracks of the 2nd iteration of the BabyLM Challenge. The shared task is centered around efficient pre-training given data constraints motivated by human development. In response, we study the effect of synthetic story data in language pre-training using *TinyStories*: a recently introduced dataset of short stories. Initially, we train GPT-Neo models on subsets of *TinyStories*, while varying the amount of available data. We find that, even with access to less than 100M words, the models are able to generate high-quality, original completions to a given story, and acquire substantial linguistic knowledge. To measure the effect of synthetic story data, we train *LTG-BERT* encoder models on a combined dataset of: a subset of *TinyStories*, story completions generated by GPT-Neo, and a subset of the *BabyLM* dataset. Our experimentation reveals that synthetic data can occasionally offer modest gains, but overall have a negative influence on linguistic understanding. Our work offers an initial study on synthesizing story data in low resource settings and underscores their potential for augmentation in data-constrained language modeling. We publicly release our models and implementation on our GitHub. 
@@ -827,11 +827,11 @@
 <fixed-case>A</fixed-case>nt<fixed-case>LM</fixed-case>: Bridging Causal and Masked Language Models
- XinruYu
+ XinruYu
- BinGuo
+ BinGuo
 ShiweiLuo
 JieWang
- TaoJi
+ TaoJi
 YuanbinWu
 324-331
 Causal Language Modeling (CLM) and Masked Language Modeling (MLM) are two mainstream learning paradigms based on Transformer networks, specifically the Decoder-only and Encoder-only architectures. The strengths of each paradigm in downstream tasks have shown a mix of advantages and disadvantages. In the past BabyLM Challenge 2023, although the MLM paradigm achieved the best average performance, the CLM paradigm demonstrated significantly faster convergence rates. For the BabyLM Challenge 2024, we propose a novel language modeling paradigm named \textbf{AntLM}, which integrates both CLM and MLM to leverage the advantages of these two classic paradigms. We chose the strict-small track and conducted experiments on two foundation models: BabyLlama, representing CLM, and LTG-BERT, representing MLM. During the training process for specific foundation models, we alternate between applying CLM or MLM training objectives and causal or bidirectional attention masks. Experimental results show that combining the two pretraining objectives leverages their strengths, enhancing overall training performance. Under the same epochs, AntLM_{BabyLlama} improves Macro-average by 1%, and AntLM_{LTG-BERT} achieves a 2.2% increase over the baselines.

From fe4ba0eb99c78851dcab2e2fdc86c5088e2de135 Mon Sep 17 00:00:00 2001
From: Matt Post 
Date: Fri, 6 Jun 2025 09:12:51 -0400
Subject: [PATCH 09/18] Ingest ORCIDS for 2025.trustnlp-main

---
 data/xml/2025.trustnlp.xml | 104 ++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/data/xml/2025.trustnlp.xml b/data/xml/2025.trustnlp.xml
index 0a09837354..3c277445cd 100644
--- a/data/xml/2025.trustnlp.xml
+++ b/data/xml/2025.trustnlp.xml
@@ -31,7 +31,7 @@
 Beyond Text-to-<fixed-case>SQL</fixed-case> for <fixed-case>I</fixed-case>o<fixed-case>T</fixed-case> Defense: A Comprehensive Framework for Querying and Classifying <fixed-case>I</fixed-case>o<fixed-case>T</fixed-case> Threats
 RyanPavlichNA
- NimaEbadi
+ NimaEbadi
 RichardTarbellNA
 BillyLinaresNA
 AdrianTanNA
 HannahHaleyNA
 JerrisGeorgeNA
 RockySlavinUniversity of Texas at San Antonio
- Kim-Kwang RaymondChooUniversity of Texas at San Antonio
+ Kim-Kwang RaymondChooUniversity of Texas at San Antonio
 GlennDietrichNA
 AnthonyRiosUniversity of Texas at San Antonio
 1-12


 RuoxiCheng
 YizhongDing
 ShuirongCaonanjing university
- ZhiqiangWangbeijing electronic science&technology institute
+ ZhiqiangWangbeijing electronic science&technology institute
 ShitongShao
 13-22
 Audio can disclose PII, particularly when combined with related text data. Therefore, it is essential to develop tools to detect privacy leakage in Contrastive Language-Audio Pretraining (CLAP). Existing MIAs need audio as input, risking exposure of the voiceprint and requiring costly shadow models. We first propose PRMID, a membership inference detector based on the probability ranking given by CLAP, which does not require training shadow models but still requires both audio and text of the individual as input. To address these limitations, we then propose USMID, a textual unimodal speaker-level membership inference detector, querying the target model using only text data. We randomly generate textual gibberish that is clearly not in the training dataset.
Then we extract feature vectors from these texts using the CLAP model and train a set of anomaly detectors on them. During inference, the feature vector of each test text is input into the anomaly detector to determine if the speaker is in the training set (anomalous) or not (normal). If available, USMID can further enhance detection by integrating real audio of the tested speaker. Extensive experiments on various CLAP model architectures and datasets demonstrate that USMID outperforms baseline methods using only text data.
2025.trustnlp-main.2


 RanjieDuan
 XiaoshuangJia
 ShaoweiYuan
- ZhiqiangWangbeijing electronic science&technology institute
+ ZhiqiangWangbeijing electronic science&technology institute
 XiaojunJiaNanyang Technological University
 23-40
 Understanding the vulnerabilities of Large Vision Language Models (LVLMs) to jailbreak attacks is essential for their responsible real-world deployment. Most previous work requires access to model gradients or is based on human knowledge (prompt engineering) to complete the jailbreak, and hardly considers the interaction of images and text, resulting in an inability to jailbreak in black-box scenarios or in poor performance. To overcome these limitations, we propose a Prior-Guided Bimodal Interactive Black-Box Jailbreak Attack for toxicity maximization, referred to as PBI-Attack. Our method begins by extracting malicious features from a harmful corpus using an alternative LVLM and embedding these features into a benign image as prior information. Subsequently, we enhance these features through bidirectional cross-modal interaction optimization, which iteratively optimizes the bimodal perturbations in an alternating manner through greedy search, aiming to maximize the toxicity of the generated response. The toxicity level is quantified using a well-trained evaluation model. Experiments demonstrate that PBI-Attack outperforms previous state-of-the-art jailbreak methods, achieving an average attack success rate of 92.5% across three open-source LVLMs and around 67.3% on three closed-source LVLMs. Disclaimer: This paper contains potentially disturbing and offensive content.
2025.trustnlp-main.3


 Ambiguity Detection and Uncertainty Calibration for Question Answering with Large Language Models
- ZhengyanShi
+ ZhengyanShi
 GiuseppeCastellucciAmazon
 SimoneFiliceTechnology Innovation Institute
- SaarKuziAmazon
+ SaarKuziAmazon
 EladKravi
- EugeneAgichteinEmory University
+ EugeneAgichteinEmory University
 OlegRokhlenko
 ShervinMalmasiAmazon
 41-55


@@ -114,7 +114,7 @@
 Break the Breakout: Reinventing <fixed-case>LM</fixed-case> Defense Against Jailbreak Attacks with Self-Refine
 HeegyuKimAjou University
- HyunsoukChoAjou University
+ HyunsoukChoAjou University
 82-102
 Language models (LMs) are vulnerable to exploitation for adversarial misuse. Training LMs for safety alignment is an extensive process, making it hard to respond immediately to fast-developing attacks such as jailbreaks. We propose self-refine with formatting, which achieves outstanding safety even in non-safety-aligned LMs, and evaluate our method alongside several defense baselines, demonstrating that it is the safest training-free method against jailbreak attacks. Additionally, we propose a formatting method that improves the efficiency of the self-refine process while reducing attack success rates in fewer iterations.
We observed that non-safety-aligned LMs outperform safety-aligned LMs in safety tasks by giving more helpful and safe responses. In conclusion, our findings show that we can achieve lower safety risk with fewer computational costs, allowing non-safety-aligned LMs to be efficiently utilized in real-world service.
2025.trustnlp-main.7


 Minimal Evidence Group Identification for Claim Verification
- XiangciLiAmazon Web Services
+ XiangciLiAmazon Web Services
 SihaoChenMicrosoft
 RajviKapadiaGoogle
 JessicaOuyangUniversity of Texas at Dallas
 103-111


 Cracking the Code: Enhancing Implicit Hate Speech Detection through Coding Classification
 LuWei
 LiangzhiLiMeetyou AI Lab and Qufu Normal University
- TongXiang
+ TongXiang
 LiuXiaoMeetyou AI Lab
 NoaGarciaOsaka University
 112-126


 Rainbow-Teaming for the <fixed-case>P</fixed-case>olish Language: A Reproducibility Study
- AleksandraKrasnodębska
+ AleksandraKrasnodębska
- MaciejChrabaszczWarsaw University of Technology
+ MaciejChrabaszczWarsaw University of Technology
- WojciechKusaNASK - National Research Institute
+ WojciechKusaNASK - National Research Institute
 155-165
 The development of multilingual large language models (LLMs) presents challenges in evaluating their safety across all supported languages. Enhancing safety in one language (e.g., English) may inadvertently introduce vulnerabilities in others. To address this issue, we implement a methodology for the automatic creation of red-teaming datasets for safety evaluation in the Polish language. Our approach generates both harmful and non-harmful prompts by sampling different risk categories and attack styles. We test several open-source models, including those trained on Polish data, and evaluate them using metrics such as Attack Success Rate (ASR) and False Reject Rate (FRR). The results reveal clear gaps in safety performance between models and show that better testing across languages is needed.
2025.trustnlp-main.12


 <fixed-case>B</fixed-case>ias<fixed-case>E</fixed-case>dit: Debiasing Stereotyped Language Models via Model Editing
- XinXu
+ XinXu
 WeiXuGeorgia Institute of Technology
- NingyuZhangZhejiang University
+ NingyuZhangZhejiang University
- JulianMcAuleyUniversity of California, San Diego, University of California, San Diego
+ JulianMcAuleyUniversity of California, San Diego, University of California, San Diego
 166-184
 Previous studies have established that language models manifest stereotyped biases. Existing debiasing strategies, such as retraining a model with counterfactual data, representation projection, and prompting, often fail to efficiently eliminate bias or directly alter the models’ biased internal representations. To address these issues, we propose BiasEdit, an efficient model editing method to remove stereotypical bias from language models through lightweight networks that act as editors to generate parameter updates. BiasEdit employs a *debiasing loss* guiding editor networks to conduct local edits on partial parameters of a language model for debiasing, while preserving the language modeling abilities during editing through a *retention loss*. Experiments on StereoSet and Crows-Pairs demonstrate the effectiveness, efficiency, and robustness of BiasEdit in eliminating bias compared to tangential debiasing baselines, with little to no impact on the language models’ general capabilities.
In addition, we conduct bias tracing to probe bias in various modules and explore bias editing impacts on different components of language models.
2025.trustnlp-main.13


 Do Voters Get the Information They Want? Understanding Authentic Voter <fixed-case>FAQ</fixed-case>s in the <fixed-case>US</fixed-case> and How to Improve for Informed Electoral Participation
- VipulaRawte
+ VipulaRawte
 Deja NScottUniversity of South Carolina
 GauravKumar
 AishneetJuneja
- Bharat SowryaYaddanapalli
+ Bharat SowryaYaddanapalli
 BiplavSrivastavaUniversity of South Carolina
 185-231
 Accurate information is crucial for democracy as it empowers voters to make informed decisions about their representatives and keep them accountable. In the US, state election commissions (SECs), often required by law, are the primary providers of Frequently Asked Questions (FAQs) to voters, and secondary sources like non-profits such as the League of Women Voters (LWV) try to complement their information shortfall. However, surprisingly, to the best of our knowledge, there is neither a single source with comprehensive FAQs nor a study analyzing the data at the national level to identify current practices and ways to improve the status quo. This paper addresses this gap by providing the first dataset on Voter FAQs covering all the US states. Second, we introduce metrics for FAQ information quality (FIQ) with respect to questions, answers, and answers to corresponding questions. Third, we use FIQs to analyze US FAQs to identify leading, mainstream, and lagging content practices and corresponding states. Finally, we identify what states across the spectrum can do to improve FAQ quality and thus the overall information ecosystem. Across all 50 U.S. states, 12% were identified as leaders and 8% as laggards for FIQSvoter, while 14% were leaders and 12% laggards for FIQSdeveloper. The code and sample data are provided at https://anonymous.4open.science/r/election-qa-analysis-BE4E.
2025.trustnlp-main.14


 <fixed-case>V</fixed-case>i<fixed-case>B</fixed-case>e: A Text-to-Video Benchmark for Evaluating Hallucination in Large Multimodal Models
- VipulaRawte
+ VipulaRawte
 SarthakJain
 AarushSinha
 GarvKaushik


 Samyak RajeshJainPowerSchool
 Aishwarya NareshRegantiAmazon
 VinijaJainFacebook
- AmanChadhaAmazon Web Services
+ AmanChadhaAmazon Web Services
- AmitShethUniversity of South Carolina
+ AmitShethUniversity of South Carolina
 AmitavaDasUniversity of South Carolina
 232-246
 Recent advances in Large Multimodal Models (LMMs) have expanded their capabilities to video understanding, with Text-to-Video (T2V) models excelling in generating videos from textual prompts. However, they still frequently produce hallucinated content, revealing AI-generated inconsistencies. We introduce ViBe (https://huggingface.co/datasets/ViBe-T2V-Bench/ViBe): a large-scale dataset of hallucinated videos from open-source T2V models. We identify five major hallucination types: Vanishing Subject, Omission Error, Numeric Variability, Subject Dysmorphia, and Visual Incongruity. Using ten T2V models, we generated and manually annotated 3,782 videos from 837 diverse MS COCO captions. Our proposed benchmark includes a dataset of hallucinated videos and a classification framework using video embeddings. ViBe serves as a critical resource for evaluating T2V reliability and advancing hallucination detection. We establish classification as a baseline, with the TimeSFormer + CNN ensemble achieving the best performance (0.345 accuracy, 0.342 F1 score).
While the initial proposed baselines achieve modest accuracy, this highlights the difficulty of automated hallucination detection and the need for improved methods. Our research aims to drive the development of more robust T2V models and evaluate their outputs based on user preferences. Our code is available at: https://anonymous.4open.science/r/vibe-1840/
2025.trustnlp-main.16


 Know What You do Not Know: Verbalized Uncertainty Estimation Robustness on Corrupted Images in Vision-Language Models
 MirkoBorszukovszki
- Ivo PascalDe Jong
+ Ivo PascalDe Jong
 MatiasValdenegro-ToroUniversity of Groningen
 247-265
 To leverage the full potential of Large Language Models (LLMs), it is crucial to have some information on their answers’ uncertainty. This means that the model has to be able to quantify how certain it is about the correctness of a given response. Bad uncertainty estimates can lead to overconfident wrong answers, undermining trust in these models. Quite a lot of research has been done on language models that work with text inputs and provide text outputs. Still, since visual capabilities have been added to these models only recently, there has not been much progress on the uncertainty of Visual Language Models (VLMs). We tested three state-of-the-art VLMs on corrupted image data. We found that the severity of the corruption negatively impacted the models’ ability to estimate their uncertainty, and the models also showed overconfidence in most of the experiments.
2025.trustnlp-main.17


 Summary the Savior: Harmful Keyword and Query-based Summarization for <fixed-case>LLM</fixed-case> Jailbreak Defense
- ShagotoRahman
+ ShagotoRahman
 IanHarrisUniversity of California-Irvine
 266-275
 Large Language Models (LLMs) are widely used for their capabilities, but face threats from jailbreak attacks, which exploit LLMs to generate inappropriate information and bypass their defense systems. Existing defenses are often specific to individual jailbreak attacks; as a result, a robust, attack-independent solution is needed to address both Natural Language Processing (NLP) ambiguities and attack variability. In this study, we introduce Summary The Savior, a novel jailbreak detection mechanism leveraging harmful keywords and query-based security-aware summary classification. By analyzing the illegal and improper contents of prompts within the summaries, the proposed method remains robust against attack diversity and NLP ambiguities. Two novel datasets, for harmful keyword extraction and for security-aware summaries, have been generated using GPT-4 and Llama-3.1 70B, respectively. Moreover, an “ambiguous harmful” class has been introduced to address content and intent ambiguities. Evaluation results demonstrate that Summary The Savior achieves higher defense performance, outperforming state-of-the-art defense mechanisms, namely Perplexity Filtering, SmoothLLM, and Erase and Check, with the lowest attack success rates across various jailbreak attacks, namely PAIR, GCG, JBC and Random Search, on Llama-2, Vicuna-13B and GPT-4. Our codes, models, and results are available at: https://github.com/shrestho10/SummaryTheSavior
2025.trustnlp-main.18


 Bias A-head?
Analyzing Bias in Transformer-Based Language Model Attention Heads - YiYangHong Kong University of Science and Technology + YiYangHong Kong University of Science and Technology HanyuDuan - AhmedAbbasiUniversity of Notre Dame + AhmedAbbasiUniversity of Notre Dame John P.LalorUniversity of Notre Dame - Kar YanTam + Kar YanTam 276-290 Transformer-based pretrained large language models (PLM) such as BERT and GPT have achieved remarkable success in NLP tasks. However, PLMs are prone to encoding stereotypical biases. Although a burgeoning literature has emerged on stereotypical bias mitigation in PLMs, such as work on debiasing gender and racial stereotyping, how such biases manifest and behave internally within PLMs remains largely unknown. Understanding the internal stereotyping mechanisms may allow better assessment of model fairness and guide the development of effective mitigation strategies. In this work, we focus on attention heads, a major component of the Transformer architecture, and propose a bias analysis framework to explore and identify a small set of biased heads that are found to contribute to a PLM’s stereotypical bias. We conduct extensive experiments to validate the existence of these biased heads and to better understand how they behave. We investigate gender and racial bias in the English language in two types of Transformer-based PLMs: the encoder-based BERT model and the decoder-based autoregressive GPT model, LLaMA-2 (7B), and LLaMA-2-Chat (7B). Overall, the results shed light on understanding the bias behavior in pretrained language models. 2025.trustnlp-main.18 @@ -250,8 +250,8 @@ Mimicking How Humans Interpret Out-of-Context Sentences Through Controlled Toxicity Decoding - Maria MihaelaTrusca - LiesbethAllein + Maria MihaelaTrusca + LiesbethAllein 291-297 Interpretations of a single sentence can vary, particularly when its context is lost. This paper aims to simulate how readers perceive content with varying toxicity levels by generating diverse interpretations of out-of-context sentences. By modeling toxicity we can anticipate misunderstandings and reveal hidden toxic meanings. Our proposed decoding strategy explicitly controls toxicity in the set of generated interpretations by (i) aligning interpretation toxicity with the input, (ii) relaxing toxicity constraints for more toxic input sentences, and (iii) promoting diversity in toxicity levels within the set of generated interpretations. Experimental results show that our method improves alignment with human-written interpretations in both syntax and semantics while reducing model prediction uncertainty. 2025.trustnlp-main.19 @@ -289,7 +289,7 @@ Building Safe <fixed-case>G</fixed-case>en<fixed-case>AI</fixed-case> Applications: An End-to-End Overview of Red Teaming for Large Language Models - AlbertoPurpuraInternational Business Machines + AlbertoPurpuraInternational Business Machines SahilWadhwaCapitalOne JesseZymetCapitalOne AkshayGuptaCapitalOne @@ -305,7 +305,7 @@ Difficulty Estimation in Natural Language Tasks with Action Scores AleksandarAngelov - Tsegaye MisikirTashuUniversity of Groningen + Tsegaye MisikirTashuUniversity of Groningen MatiasValdenegro-ToroUniversity of Groningen 351-364 This study investigates the effectiveness of the action score, a metric originally developed for computer vision tasks, in estimating sample difficulty across various natural language processing (NLP) tasks. 
Using transformer-based models, the action score is applied to sentiment analysis, natural language inference, and abstractive text summarization. The results demonstrate that the action score can effectively identify challenging samples in sentiment analysis and natural language inference, often capturing difficult instances that are missed by more established metrics like entropy. However, the effectiveness of the action score appears to be task-dependent, as evidenced by its performance in the abstractive text summarization task, where it exhibits a nearly linear relationship with entropy. The findings suggest that the action score can provide valuable insights into the characteristics of challenging samples in NLP tasks, particularly in classification settings. However, its application should be carefully considered in the context of each specific task and in light of emerging research on the potential value of hard samples in machine learning.
2025.trustnlp-main.24


 Are Small Language Models Ready to Compete with Large Language Models for Practical Applications?
- NeelabhSinhaGeorgia Institute of Technology
+ NeelabhSinhaGeorgia Institute of Technology
 VinijaJainFacebook
- AmanChadhaAmazon Web Services
+ AmanChadhaAmazon Web Services
 365-398
 The rapid rise of Language Models (LMs) has expanded their use in several applications. Yet, due to constraints of model size, associated cost, or proprietary restrictions, utilizing state-of-the-art (SOTA) LLMs is not always feasible. With open, smaller LMs emerging, more applications can leverage their capabilities, but selecting the right LM can be challenging as smaller LMs do not perform well universally. This work tries to bridge this gap by proposing a framework to experimentally evaluate small, open LMs in practical settings through measuring the semantic correctness of outputs across three practical aspects: task types, application domains, and reasoning types, using diverse prompt styles. It also conducts an in-depth comparison of 10 small, open LMs to identify the best LM and prompt style depending on specific application requirements, using the proposed framework. We also show that, if selected appropriately, these LMs can outperform SOTA LLMs like DeepSeek-v2, GPT-4o-mini, and Gemini-1.5-Pro, and even compete with GPT-4o.
2025.trustnlp-main.25


 A Calibrated Reflection Approach for Enhancing Confidence Estimation in <fixed-case>LLM</fixed-case>s
 UmeshBodhwani
 YuanLing
- ShujingDong
+ ShujingDong
 YarongFeng
 HongfeiLi
 AyushGoyal


 Battling Misinformation: An Empirical Study on Adversarial Factuality in Open-Source Large Language Models
 Shahnewaz KarimSakibUniversity of Tennessee at Chattanooga
 Anindya BijoyDasUniversity of Akron
- ShibbirAhmedTexas State University
+ ShibbirAhmedTexas State University
 432-443
 Adversarial factuality refers to the deliberate insertion of misinformation into input prompts by an adversary, characterized by varying levels of expressed confidence. In this study, we systematically evaluate the performance of several open-source large language models (LLMs) when exposed to such adversarial inputs. Three tiers of adversarial confidence are considered: strongly confident, moderately confident, and limited confidence. Our analysis encompasses eight LLMs: LLaMA 3.1 (8B), Phi 3 (3.8B), Qwen 2.5 (7B), Deepseek-v2 (16B), Gemma2 (9B), Falcon (7B), Mistrallite (7B), and LLaVA (7B).
Empirical results indicate that LLaMA 3.1 (8B) exhibits a robust capability in detecting adversarial inputs, whereas Falcon (7B) shows comparatively lower performance. Notably, for the majority of the models, detection success improves as the adversary’s confidence decreases; however, this trend is reversed for LLaMA 3.1 (8B) and Phi 3 (3.8B), where a reduction in adversarial confidence corresponds with diminished detection performance. Further analysis of the queries that elicited the highest and lowest rates of successful attacks reveals that adversarial attacks are more effective when targeting less commonly referenced or obscure information. 2025.trustnlp-main.28 @@ -358,8 +358,8 @@ Will the Prince Get True Love’s Kiss? On the Model Sensitivity to Gender Perturbation over Fairytale Texts Christina AChanceUniversity of California, Los Angeles DaYin - DakuoWangNortheastern University - Kai-WeiChangUniversity of California, Los Angeles and Amazon + DakuoWangNortheastern University + Kai-WeiChangUniversity of California, Los Angeles and Amazon 444-460 In this paper, we study whether language models are affected by learned gender stereotypes during the comprehension of stories. Specifically, we investigate how models respond to gender stereotype perturbations through counterfactual data augmentation. Focusing on Question Answering (QA) tasks in fairytales, we modify the FairytaleQA dataset by swapping gendered character information and introducing counterfactual gender stereotypes during training. This allows us to assess model robustness and examine whether learned biases influence story comprehension. Our results show that models exhibit slight performance drops when faced with gender perturbations in the test set, indicating sensitivity to learned stereotypes. However, when fine-tuned on counterfactual training data, models become more robust to anti-stereotypical narratives. Additionally, we conduct a case study demonstrating how incorporating counterfactual anti-stereotype examples can improve inclusivity in downstream applications. 2025.trustnlp-main.29 @@ -385,9 +385,9 @@ Defining and Quantifying Visual Hallucinations in Vision-Language Models - VipulaRawte + VipulaRawte AryanMishra - AmitShethUniversity of South Carolina + AmitShethUniversity of South Carolina AmitavaDasUniversity of South Carolina 501-510 The troubling rise of hallucination presents perhaps the most significant impediment to the advancement of responsible AI. In recent times, considerable research has focused on detecting and mitigating hallucination in Large Language Models (LLMs). However, it’s worth noting that hallucination is also quite prevalent in Vision-Language models (VLMs). In this paper, we offer a fine-grained discourse on profiling VLM hallucination based on the image captioning task. We delineate eight fine-grained orientations of visual hallucination: i) Contextual Guessing, ii) Identity Incongruity, iii) Geographical Erratum, iv) Visual Illusion, v) Gender Anomaly, vi) VLM as Classifier, vii) Wrong Reading, and viii) Numeric Discrepancy. We curate Visual HallucInation eLiciTation, a publicly available dataset comprising 2,000 samples generated using eight VLMs across the image captioning task, along with human annotations for the categories as mentioned earlier. 
To establish a method for quantification and to offer a comparative framework enabling the evaluation and ranking of VLMs according to their vulnerability to producing hallucinations, we propose the Visual Hallucination Vulnerability Index (VHVI). In summary, we introduce the VHILT dataset for image-to-text hallucinations and propose the VHVI metric to quantify hallucinations in VLMs, targeting specific visual hallucination types. A subset sample is available at: https://huggingface.co/datasets/vr25/vhil. The full dataset will be publicly released upon acceptance. @@ -397,7 +397,7 @@ Revitalizing Saturated Benchmarks: A Weighted Metric Approach for Differentiating Large Language Model Performance BryanEtzine - MasoudHashemiServiceNow Inc + MasoudHashemiServiceNow Inc NishanthMadhusudhanServiceNow Inc SagarDavasamServiceNow Inc RoshneeSharmaServiceNow Inc @@ -410,10 +410,10 @@ Synthetic Lyrics Detection Across Languages and Genres - YanisLabrak + YanisLabrak MarkusFrohmannJohannes Kepler Universität Linz GabrielMeseguer-BrocalDeezer - Elena V.EpureDeezer + Elena V.EpureDeezer 524-541 In recent years, the use of large language models (LLMs) to generate music content, particularly lyrics, has gained in popularity. These advances provide valuable tools for artists and enhance their creative processes, but they also raise concerns about copyright violations, consumer satisfaction, and content spamming. Previous research has explored content detection in various domains. However, no work has focused on the text modality, lyrics, in music. To address this gap, we curated a diverse dataset of real and synthetic lyrics from multiple languages, music genres, and artists. The generation pipeline was validated using both humans and automated methods. We performed a thorough evaluation of existing synthetic text detection approaches on lyrics, a previously unexplored data type. We also investigated methods to adapt the best-performing features to lyrics through unsupervised domain adaptation. Following both music and industrial constraints, we examined how well these approaches generalize across languages, scale with data availability, handle multilingual language content, and perform on novel genres in few-shot settings. Our findings show promising results that could inform policy decisions around AI-generated music and enhance transparency for users. 2025.trustnlp-main.34 @@ -421,13 +421,13 @@ A Lightweight Multi Aspect Controlled Text Generation Solution For Large Language Models - ChenyangZhang + ChenyangZhang JiayiLin HaiboTong - BingxuanHou + BingxuanHou DongyuZhang - JialinLi - JunliWangTongji University + JialinLi + JunliWangTongji University 542-551 Multi-Aspect Controllable Text Generation (MCTG) introduces fine-grained multiple constraints in natural language generation, i.e. 
control attributes in topics, sentiments, and detoxification. MCTG demonstrates application prospects for the trustworthy generation of Large Language Models (LLMs) but is limited by generalization issues. Existing work exploits additional structures and strategies for solutions, requiring modifications to the LLMs. To activate LLMs’ MCTG ability, we propose a lightweight MCTG pipeline based on data augmentation and instruction tuning. We analyze aspect bias and correlations in traditional datasets and address these concerns with augmented control attributes and sentences. Augmented datasets are feasible for instruction tuning. We conduct experiments for various LLM backbones and parameter sizes, demonstrating general effectiveness on MCTG performance.
2025.trustnlp-main.35


 Gender Bias in Large Language Models across Multiple Languages: A Case Study of <fixed-case>C</fixed-case>hat<fixed-case>GPT</fixed-case>
 YiTianDing
 JinmanZhao
- ChenJiaSI-TECH Information Technology Co., Ltd
+ ChenJiaSI-TECH Information Technology Co., Ltd
 YiningWang
 ZifanQian
 WeizheChen


 VenkateshMishra
 AgneetChatterjeeArizona State University
 AmirSaeidiArizona State University
- RitikaSarkarArizona State University
+ RitikaSarkarArizona State University
- ChittaBaralArizona State University
+ ChittaBaralArizona State University
 580-598
 Large Language Models (LLMs) have achieved remarkable performance across a wide variety of natural language tasks. However, they have been shown to suffer from a critical limitation pertinent to ‘hallucination’ in their output. Recent research has focused on investigating and addressing this problem for a variety of tasks such as biography generation, question answering, abstractive summarization, and dialogue generation. However, the crucial aspect pertaining to ‘negation’ has remained considerably underexplored. Negation is important because it adds depth and nuance to the understanding of language and is also crucial for logical reasoning and inference. In this work, we address the above limitation and particularly focus on studying the impact of negation in LLM hallucinations. Specifically, we study four tasks with negation: ‘false premise completion’, ‘constrained fact generation’, ‘multiple choice question answering’, and ‘fact generation’. We show that open-source state-of-the-art LLMs such as LLaMA-2-chat, Vicuna, and Orca-2 hallucinate considerably on all these tasks involving negation, which underlines a critical shortcoming of these models. Addressing this problem, we further study numerous strategies to mitigate these hallucinations and demonstrate their impact.
2025.trustnlp-main.37 @@ -463,11 +463,11 @@ <fixed-case>FACTOID</fixed-case>: <fixed-case>FAC</fixed-case>tual en<fixed-case>T</fixed-case>ailment f<fixed-case>O</fixed-case>r halluc<fixed-case>I</fixed-case>nation Detection - VipulaRawte - S.m Towhidul IslamTonmoyUniversity of South Carolina + VipulaRawte + S.m Towhidul IslamTonmoyUniversity of South Carolina ShravaniNag - AmanChadhaAmazon Web Services - AmitShethUniversity of South Carolina + AmanChadhaAmazon Web Services + AmitShethUniversity of South Carolina AmitavaDasUniversity of South Carolina 599-617 2025.trustnlp-main.38 From 9f8668a39f9b3b13d1d6035d7a855a5d61fc7298 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:13:46 -0400 Subject: [PATCH 10/18] Ingest ORCIDS for 2025.c3nlp-1 --- data/xml/2025.c3nlp.xml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/data/xml/2025.c3nlp.xml b/data/xml/2025.c3nlp.xml index e4ea10f8af..6b331f32ec 100644 --- a/data/xml/2025.c3nlp.xml +++ b/data/xml/2025.c3nlp.xml @@ -53,9 +53,9 @@ <fixed-case>I</fixed-case>nsp<fixed-case>AI</fixed-case>red: Cross-cultural Inspiration Detection and Analysis in Real and <fixed-case>LLM</fixed-case>-generated Social Media Data - OanaIgnatSanta Clara University + OanaIgnatSanta Clara University Gayathri GaneshLakshmy - RadaMihalceaUniversity of Michigan + RadaMihalceaUniversity of Michigan 35-49 Inspiration is linked to various positive outcomes, such as increased creativity, productivity, and happiness. Although inspiration has great potential, there has been limited effort toward identifying content that is inspiring, as opposed to just engaging or positive. Additionally, most research has concentrated on Western data, with little attention paid to other cultures. This work is the first to study cross-cultural inspiration through machine learning methods. We aim to identify and analyze real and AI-generated cross-cultural inspiring posts. To this end, we compile and make publicly available the InspAIred dataset, which consists of 2,000 real inspiring posts, 2,000 real non-inspiring posts, and 2,000 generated inspiring posts evenly distributed across India and the UK. The real posts are sourced from Reddit, while the generated posts are created using the GPT-4 model. Using this dataset, we conduct extensive computational linguistic analyses to (1) compare inspiring content across cultures, (2) compare AI-generated inspiring posts to real inspiring posts, and (3) determine if detection models can accurately distinguish between inspiring content across cultures and data sources. 2025.c3nlp-1.4 @@ -63,9 +63,9 @@ <fixed-case>D</fixed-case>a<fixed-case>K</fixed-case>ultur: Evaluating the Cultural Awareness of Language Models for <fixed-case>D</fixed-case>anish with Native Speakers - MaxMüller-EbersteinIT University of Copenhagen + MaxMüller-EbersteinIT University of Copenhagen MikeZhang - ElisaBassignana + ElisaBassignana Peter BrunsgaardTrolle Rob Van DerGootIT University of Copenhagen 50-58 @@ -86,7 +86,7 @@ <fixed-case>LLM</fixed-case>-<fixed-case>C</fixed-case>3<fixed-case>MOD</fixed-case>: A Human-<fixed-case>LLM</fixed-case> Collaborative System for Cross-Cultural Hate Speech Moderation JunyeongPark SeogyeongJeongKorea Advanced Institute of Science & Technology - SeyoungSongKAIST + SeyoungSongKAIST YohanLeeElectronics and Telecommunications Research Institute AliceOhKorea Advanced Institute of Science and Technology 71-88 @@ -96,7 +96,7 @@ One world, one opinion? 
The superstar effect in <fixed-case>LLM</fixed-case> responses - SofieGoethals + SofieGoethals LaurenRhueUniversity of Maryland, College Park 89-107 As large language models (LLMs) are shaping the way information is shared and accessed online, their opinions have the potential to influence a wide audience. This study examines who is predicted by the studied LLMs as the most prominent figures across various fields, while using prompts in ten different languages to explore the influence of linguistic diversity. Our findings reveal low diversity in responses, with a small number of figures dominating recognition across languages (also known as the “superstar effect”). These results highlight the risk of narrowing global knowledge representation when LLMs are used to retrieve subjective information. @@ -107,7 +107,7 @@ Towards Region-aware Bias Evaluation Metrics AnganaBorah AparnaGarimellaAdobe Research - RadaMihalceaUniversity of Michigan + RadaMihalceaUniversity of Michigan 108-131 When exposed to human-generated data, language models are known to learn and amplify societal biases. While previous works introduced metrics that can be used to assess the bias in these models, they rely on assumptions that may not be universally true. For instance, a gender bias dimension commonly used by these metrics is that of family–career, but this may not be the only common bias in certain regions of the world. In this paper, we identify topical differences in gender bias across different regions and propose a region-aware bottom-up approach for bias assessment. Several of our proposed region-aware gender bias dimensions are found to be aligned with the human perception of gender biases in these regions. 2025.c3nlp-1.9 @@ -115,15 +115,15 @@ Cross-Cultural Differences in Mental Health Expressions on Social Media - SunnyRaiSchool of Engineering and Applied Science, University of Pennsylvania + SunnyRaiSchool of Engineering and Applied Science, University of Pennsylvania KhushiShelat DevanshJain AshwinKishenNA - Young MinChoUniversity of Pennsylvania + Young MinChoUniversity of Pennsylvania MaitreyiRedkar - SamindaraHardikar-SawantShri Jagdishprasad Jhabarmal Tibrewala University + SamindaraHardikar-SawantShri Jagdishprasad Jhabarmal Tibrewala University LyleUngar - Sharath ChandraGuntukuUniversity of Pennsylvania + Sharath ChandraGuntukuUniversity of Pennsylvania 132-142 Culture moderates the way individuals perceive and express mental distress. Current understandings of mental health expressions on social media, however, are predominantly derived from WEIRD (Western, Educated, Industrialized, Rich, and Democratic) contexts. To address this gap, we examine mental health posts on Reddit made by individuals geolocated in India, to identify variations in social media language specific to the Indian context compared to users from Western nations. Our experiments reveal significant psychosocial variations in emotions and temporal orientation. This study demonstrates the potential of social media platforms for identifying cross-cultural differences in mental health expressions (e.g. seeking advice in India vs seeking support by Western users). Significant linguistic variations in online mental health-related language emphasize the importance of developing precision-targeted interventions that are culturally appropriate. 
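The cross-cultural comparisons these abstracts describe come down to contrasting language use between two groups of posts. A minimal sketch of one standard technique for that (smoothed log-odds over two corpora), in Python; the example posts and the smoothing constant alpha below are toy placeholders, not the papers' data or code:

import math
from collections import Counter

def log_odds(corpus_a, corpus_b, alpha=0.5):
    # Smoothed log-odds of each word in corpus A relative to corpus B.
    ca = Counter(w for doc in corpus_a for w in doc.lower().split())
    cb = Counter(w for doc in corpus_b for w in doc.lower().split())
    vocab = set(ca) | set(cb)
    na = sum(ca.values()) + alpha * len(vocab)
    nb = sum(cb.values()) + alpha * len(vocab)
    return {w: math.log((ca[w] + alpha) / (na - ca[w] - alpha))
              - math.log((cb[w] + alpha) / (nb - cb[w] - alpha))
            for w in vocab}

# Toy posts standing in for geolocated social media data.
india = ["any advice for handling exam stress", "need advice on work stress"]
uk = ["thanks for the support everyone", "this support thread really helps"]
for w, s in sorted(log_odds(india, uk).items(), key=lambda kv: -abs(kv[1]))[:5]:
    print(f"{w}\t{s:+.2f}")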
2025.c3nlp-1.10 @@ -136,7 +136,7 @@ JavadIsmayilzadaKorea Advanced Institute of Science & Technology JunyeongPark EunsuKim - HuzamaAhmadKorea Advanced Institute of Science & Technology + HuzamaAhmadKorea Advanced Institute of Science & Technology Na MinAnKAIST JamesThorneKAIST AliceOhKorea Advanced Institute of Science and Technology From acbf9abb73d50af328a4f521b6f1ac8668579bbd Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:14:31 -0400 Subject: [PATCH 11/18] Ingest ORCIDS for 2025.privatenlp-main --- data/xml/2025.privatenlp.xml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/data/xml/2025.privatenlp.xml b/data/xml/2025.privatenlp.xml index beaa23101f..4e5c6cab0c 100644 --- a/data/xml/2025.privatenlp.xml +++ b/data/xml/2025.privatenlp.xml @@ -34,8 +34,8 @@ <fixed-case>TAROT</fixed-case>: Task-Oriented Authorship Obfuscation Using Policy Optimization Methods GabrielLoiseauINRIA - DamienSileoINRIA - DamienRiquetVade + DamienSileoINRIA + DamienRiquetVade MaximeMeyer MarcTommasiLille University and INRIA 14-31 @@ -54,11 +54,11 @@ Named Entity Inference Attacks on Clinical <fixed-case>LLM</fixed-case>s: Exploring Privacy Risks and the Impact of Mitigation Strategies - AdamSutton - XiBai + AdamSutton + XiBai KawsarNoor ThomasSearle - RichardDobsonKing’s College London, University of London and University College London, University of London + RichardDobsonKing’s College London, University of London and University College London, University of London 42-52 Transformer-based Large Language Models (LLMs) have achieved remarkable success across various domains, including clinical language processing, where they enable state-of-the-art performance in numerous tasks. Like all deep learning models, LLMs are susceptible to inference attacks that exploit sensitive attributes seen during training. AnonCAT, a RoBERTa-based masked language model, has been fine-tuned to de-identify sensitive clinical textual data. The community has a responsibility to explore the privacy risks of these models. This work proposes an attack method to infer sensitive named entities used in the training of AnonCAT models. We perform three experiments; the privacy implications of generating multiple names, the impact of white-box and black-box on attack inference performance, and the privacy-enhancing effects of Differential Privacy (DP) when applied to AnonCAT. By providing real textual predictions and privacy leakage metrics, this research contributes to understanding and mitigating the potential risks associated with exposing LLMs in sensitive domains like healthcare. 2025.privatenlp-main.4 @@ -76,8 +76,8 @@ Beyond Reconstruction: Generating Privacy-Preserving Clinical Letters LiboRen SamuelBelkadi - LifengHan - WarrenDel-PintoUniversity of Manchester + LifengHan + WarrenDel-PintoUniversity of Manchester GoranNenadicUniversity of Manchester 60-74 Due to the sensitive nature of clinical letters, their use in model training, medical research, and education is limited. This work aims to generate diverse, de-identified, and high-quality synthetic clinical letters to enhance privacy protection. This study explores various pre-trained language models (PLMs) for text masking and generation, employing various masking strategies with a focus on Bio_ClinicalBERT. Both qualitative and quantitative methods are used for evaluation, supplemented by a downstream Named Entity Recognition (NER) task. Our results indicate that encoder-only models outperform encoder-decoder models. 
General-domain and clinical-domain PLMs exhibit comparable performance when clinical information is preserved. Preserving clinical entities and document structure yields better performance than fine-tuning alone. Masking stopwords enhances text quality, whereas masking nouns or verbs has a negative impact. BERTScore proves to be the most reliable quantitative evaluation metric in our task. Contextual information has minimal impact, indicating that synthetic letters can effectively replace original ones in downstream tasks. Unlike previous studies that focus primarily on reconstructing original letters or training a privacy-detection and substitution model, this project provides a framework for generating diverse clinical letters while embedding privacy detection, enabling sensitive dataset expansion and facilitating the use of real-world clinical data. Our codes and trained models will be publicly available at https://github.com/HECTA-UoM/Synthetic4Health. @@ -87,7 +87,7 @@ Beyond De-Identification: A Structured Approach for Defining and Detecting Indirect Identifiers in Medical Texts IbrahimBaroudTechnische Universität Berlin - LisaRaithelTechnische Universität Berlin + LisaRaithelTechnische Universität Berlin SebastianMöller RolandRollerGerman Research Center for AI 75-85 @@ -97,10 +97,10 @@ Investigating User Perspectives on Differentially Private Text Privatization - StephenMeisenbacher - AlexandraKlymenko + StephenMeisenbacher + AlexandraKlymenko AlexanderKarppTechnische Universität München - FlorianMatthesTechnische Universität München + FlorianMatthesTechnische Universität München 86-105 Recent literature has seen a considerable uptick in *Differentially Private Natural Language Processing* (DP NLP). This includes DP text privatization, where potentially sensitive input texts are transformed under DP to achieve privatized output texts that ideally mask sensitive information *and* maintain original semantics. Despite continued work to address the open challenges in DP text privatization, there remains a scarcity of work addressing user perceptions of this technology, a crucial aspect which serves as the final barrier to practical adoption. In this work, we conduct a survey study with 721 laypersons around the globe, investigating how the factors of *scenario*, *data sensitivity*, *mechanism type*, and *reason for data collection* impact user preferences for text privatization. We learn that while all these factors play a role in influencing privacy decisions, users are highly sensitive to the utility and coherence of the private output texts. Our findings highlight the socio-technical factors that must be considered in the study of DP NLP, opening the door to further user-based investigations going forward. 
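For readers unfamiliar with the mechanisms this survey asks users about: one common word-level approach in the DP NLP literature perturbs each word's embedding with noise calibrated to epsilon and decodes to the nearest vocabulary item. A hedged sketch of that idea only — the three-word embedding table and epsilon value are toy assumptions, not the paper's mechanism or parameters:

import numpy as np

rng = np.random.default_rng(0)

# Toy embedding table; a real mechanism would use pretrained vectors.
vocab = {
    "doctor": np.array([1.0, 0.0]),
    "nurse": np.array([0.9, 0.2]),
    "lawyer": np.array([0.0, 1.0]),
}

def privatize(word, epsilon=5.0):
    # Noise with density proportional to exp(-epsilon * ||z||): uniform
    # direction, Gamma-distributed radius; then decode to nearest neighbor.
    v = vocab[word]
    d = len(v)
    direction = rng.normal(size=d)
    direction /= np.linalg.norm(direction)
    radius = rng.gamma(shape=d, scale=1.0 / epsilon)
    noisy = v + radius * direction
    return min(vocab, key=lambda w: np.linalg.norm(vocab[w] - noisy))

print([privatize(w) for w in ["doctor", "nurse", "lawyer"]])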
2025.privatenlp-main.8 From 471994db7e98ba2abeeebdccb8c172b5ef52cf9f Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:15:02 -0400 Subject: [PATCH 12/18] Ingest ORCIDS for 2025.knowledgenlp-1 --- data/xml/2025.knowledgenlp.xml | 50 +++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/data/xml/2025.knowledgenlp.xml b/data/xml/2025.knowledgenlp.xml index db67b377e2..14930e367e 100644 --- a/data/xml/2025.knowledgenlp.xml +++ b/data/xml/2025.knowledgenlp.xml @@ -25,8 +25,8 @@ Entity Retrieval for Answering Entity-Centric Questions - HassanShavarani - AnoopSarkarSimon Fraser University + HassanShavarani + AnoopSarkarSimon Fraser University 1-17 The similarity between the question and indexed documents is a key factor in document retrieval for retrieval-augmented question answering. Although this is typically the only method for obtaining the relevant documents, it is not the sole approach when dealing with entity-centric questions. We study Entity Retrieval, an alternative retrieval method, which rather than relying on question-document similarity, depends on the salient entities within the question to identify the retrieval documents. We conduct an in-depth analysis of the performance of both dense and sparse retrieval methods in comparison to Entity Retrieval. Our findings reveal the great potential of entity-driven methods for improving augmentation document retrieval in both accuracy and efficiency. 2025.knowledgenlp-1.1 @@ -34,7 +34,7 @@ <fixed-case>ELECTRA</fixed-case> and <fixed-case>GPT</fixed-case>-4o: Cost-Effective Partners for Sentiment Analysis - James P.BenoStanford Engineering CGOE + James P.BenoStanford Engineering CGOE 18-36 Bidirectional transformers excel at sentiment analysis, and Large Language Models (LLM) are effective zero-shot learners. Might they perform better as a team? This paper explores collaborative approaches between ELECTRA and GPT-4o for three-way sentiment classification. We fine-tuned (FT) four models (ELECTRA Base/Large, GPT-4o/4o-mini) using a mix of reviews from Stanford Sentiment Treebank (SST) and DynaSent. We provided input from ELECTRA to GPT as: predicted label, probabilities, and retrieved examples. Sharing ELECTRA Base FT predictions with GPT-4o-mini significantly improved performance over either model alone (82.50 macro F1 vs. 79.14 ELECTRA Base FT, 79.41 GPT-4o-mini) and yielded the lowest cost/performance ratio ($0.12/F1 point). However, when GPT models were fine-tuned, including predictions decreased performance. GPT-4o FT-M was the top performer (86.99), with GPT-4o-mini FT close behind (86.70) at much less cost ($0.38 vs. $1.59/F1 point). Our results show that augmenting prompts with predictions from fine-tuned encoders is an efficient way to boost performance, and a fine-tuned GPT-4o-mini is nearly as good as GPT-4o FT at 76% less cost. Both are affordable options for projects with limited resources. 2025.knowledgenlp-1.2 @@ -42,8 +42,8 @@ Retrieval of Temporal Event Sequences from Textual Descriptions - ZefangLiuJ.P. Morgan Chase and Georgia Institute of Technology - YinzhuQuanGeorgia Institute of Technology + ZefangLiuJ.P. Morgan Chase and Georgia Institute of Technology + YinzhuQuanGeorgia Institute of Technology 37-49 Retrieving temporal event sequences from textual descriptions is crucial for applications such as analyzing e-commerce behavior, monitoring social media activities, and tracking criminal incidents. 
To advance this task, we introduce TESRBench, a comprehensive benchmark for temporal event sequence retrieval (TESR) from textual descriptions. TESRBench includes diverse real-world datasets with synthesized and reviewed textual descriptions, providing a strong foundation for evaluating retrieval performance and addressing challenges in this domain. Building on this benchmark, we propose TPP-Embedding, a novel model for embedding and retrieving event sequences. The model leverages the TPP-LLM framework, integrating large language models (LLMs) with temporal point processes (TPPs) to encode both event texts and times. By pooling representations and applying a contrastive loss, it unifies temporal dynamics and event semantics in a shared embedding space, aligning sequence-level embeddings of event sequences and their descriptions. TPP-Embedding demonstrates superior performance over baseline models across TESRBench datasets, establishing it as a powerful solution for the temporal event sequence retrieval task. 2025.knowledgenlp-1.3 @@ -52,9 +52,9 @@ Generating Tables from the Parametric Knowledge of Language Models YevgeniBerkovitcheBay Inc. - OrenGlickmanBar-Ilan University + OrenGlickmanBar-Ilan University AmitSomechBar-Ilan University - TomerWolfsonUniversity of Pennsylvania, University of Pennsylvania + TomerWolfsonUniversity of Pennsylvania, University of Pennsylvania 50-65 We explore generating factual tables from the parametric knowledge of large language models (LLMs). While LLMs have demonstrated impressive capabilities in recreating knowledge bases and generating free-form text, their ability to generate structured tabular data has received little attention. To address this gap, we explore the table generation abilities of eight state-of-the-art LLMs, including GPT-4o and Llama3.1-405B, using three prompting methods: full-table, row-by-row, and cell-by-cell. To facilitate evaluation we introduce WikiTabGen, a new benchmark consisting of 119 manually curated Wikipedia tables and their description. Our findings show that table generation remains challenging, with the best performing model (LLaMA3.1-405B) reaching only 25.4% accuracy. We further analyze how properties like table size, popularity, and numerical content impact performance. This study highlights the unique challenges of LLM-based table generation and offers a foundation for future research in this area. All code, data, and prompts are publicly available. 2025.knowledgenlp-1.4 @@ -64,7 +64,7 @@ Investigating Large Language Models for Text-to-<fixed-case>SPARQL</fixed-case> Generation JacopoD’AbramoUniversity of Bologna AndreaZugariniExpert.ai Srl - PaoloTorroniUniversity of Bologna + PaoloTorroniUniversity of Bologna 66-80 Large Language Models (LLMs) have demonstrated strong capabilities in code generation, such as translating natural language questions into SQL queries. However, state-of-the-art solutions often involve a costly fine-tuning step. In this study, we extensively evaluate In-Context Learning (ICL) solutions for text-to-SPARQL generation with different architectures and configurations, based on methods for retrieving relevant demonstrations for few-shot prompting and working with multiple generated hypotheses. In this way, we demonstrate that LLMs can formulate SPARQL queries achieving state-of-the-art results on several Knowledge Graph Question Answering (KGQA) benchmark datasets without fine-tuning. 
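The in-context learning setup evaluated above amounts to packing retrieved (question, query) demonstrations into a prompt. A minimal sketch of that prompt assembly; the demonstrations, instruction wording, and the llm stub are hypothetical stand-ins for the retrieved examples and the actual model call:

DEMOS = [  # stand-ins for retrieved (question, SPARQL) demonstrations
    ("Who wrote Dune?",
     'SELECT ?a WHERE { ?b rdfs:label "Dune"@en . ?b dbo:author ?a }'),
    ("Where was Ada Lovelace born?",
     'SELECT ?p WHERE { dbr:Ada_Lovelace dbo:birthPlace ?p }'),
]

def build_prompt(question, demos):
    parts = ["Translate each question into a SPARQL query.\n"]
    for q, sparql in demos:
        parts.append(f"Question: {q}\nSPARQL: {sparql}\n")
    parts.append(f"Question: {question}\nSPARQL:")
    return "\n".join(parts)

def answer(question, llm=lambda prompt: "SELECT ..."):  # llm: any str -> str
    return llm(build_prompt(question, DEMOS))

print(build_prompt("Who directed Alien?", DEMOS))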
2025.knowledgenlp-1.5 @@ -95,7 +95,7 @@ Enhancing Cross-Language Code Translation via Task-Specific Embedding Alignment in Retrieval-Augmented Generation - ManishBhattaraiLos Alamos National Laboratory + ManishBhattaraiLos Alamos National Laboratory Minh N.VuLos Alamos National Laboratory JavierE. SantosLos Alamos National Laboratory IsmaelIsmaelLos Alamos National Laboratory @@ -108,7 +108,7 @@ <fixed-case>LLM</fixed-case> Reasoning Engine: Specialized Training for Enhanced Mathematical Reasoning ShuguangChenPurdue University - GuangLinPurdue University + GuangLinPurdue University 118-128 Large Language Models (LLMs) have shown remarkable performance in various natural language processing tasks but face challenges in mathematical reasoning, where complex problem-solving requires both linguistic understanding and mathematical reasoning skills. Existing approaches to address this challenge often rely on ensemble methods and suffer from the problem of data scarcity in target domains. In this work, we present a novel method to enhance the capabilities of LLMs in mathematical reasoning tasks. Motivated by the need to bridge this gap, our approach incorporates a question paraphrase strategy, which aims to diversify the linguistic forms of mathematical questions to improve generalization. Additionally, specialized training objectives are employed to guide the model’s learning process, focusing on enhancing its understanding of mathematical concepts and reasoning processes. We conduct experiments on four datasets using different LLMs, and demonstrate the effectiveness of our approach in improving LLMs’ performance on mathematical reasoning tasks. Our findings underscore the significance of our methodology in advancing large language models and their potential implications for real-world applications that require mathematical reasoning abilities. 2025.knowledgenlp-1.9 @@ -121,7 +121,7 @@ TusharVatsaAdobe Systems SuhasSureshaAdobe Systems IshitaVermaAdobe Systems - Tracy HollowayKingAdobe Systems + Tracy HollowayKingAdobe Systems MifriedrMifriedrNA ChengChenAdobe Systems 129-140 @@ -135,7 +135,7 @@ DanielHajialigol ZhongkaiSun JieHaoAmazon - XuanWangVirginia Polytechnic Institute and State University + XuanWangVirginia Polytechnic Institute and State University 141-151 Multi-hop question answering (MHQA) requires a model to retrieve and integrate information from multiple passages to answer a complex question. Recent systems leverage the power of large language models and integrate evidence retrieval with reasoning prompts (e.g., chain-of-thought reasoning) for the MHQA task. However, the complexities in the question types (bridge v.s. comparison questions) and the reasoning types (sequential v.s. parallel reasonings) require more novel and fine-grained prompting methods to enhance the performance of MHQA under the zero-shot setting.In this paper, we propose StoC-ToT, a stochastic tree-of-thought reasoning prompting method with constrained decoding for MHQA and conduct a detailed comparison with other reasoning prompts on different question types and reasoning types. Specifically, we construct a tree-like reasoning structure by prompting the model to break down the original question into smaller sub-questions to form different reasoning paths. In addition, we prompt the model to provide a probability estimation for each reasoning path at each reasoning step. At answer time, we conduct constrained decoding on the model to generate more grounded answers and reduce hallucination. 
Experiments comparing StoC-ToT with other prompting methods on two MHQA datasets and five large language models showed that StoC-ToT outperforms other reasoning prompts by a significant margin.
 2025.knowledgenlp-1.12
 
@@ -178,7 +178,7 @@
 YiqinHuangUniversity of California, Berkeley
 LucasSpangherUniversity of California Berkeley
 SewonMinUniversity of California, Berkeley and Allen Institute for Artificial Intelligence
- MarkDredzeDepartment of Computer Science, Whiting School of Engineering and Bloomberg
+ MarkDredzeDepartment of Computer Science, Whiting School of Engineering and Bloomberg
 180-204
 Multi-document retrieval approaches often overlook the ways different retrievals complement each other when addressing complex queries. In this work, we study journalist source selection in news article writing and examine the discourse roles that different sources serve when paired together, finding that discourse function (not simply informational content) is an important component of source usage. Then, we introduce a novel IR task to benchmark how well language models can reason about this narrative process. We extract a journalist’s initial query and the sources they used from news articles and aim to recover the sources that support this query. We demonstrate that large language models (LLMs) can be employed in multi-step query planning, identifying informational gaps and enhancing retrieval performance, but current approaches to interleave queries fall short. By training auxiliary discourse planners and incorporating this information into LLMs, we enhance query planning, achieving a significant 5% improvement in precision and a 2% increase in F1 score over the previous SOTA, all while maintaining recall.
 2025.knowledgenlp-1.18
 
 
 <fixed-case>HEAL</fixed-case>: Hierarchical Embedding Alignment Loss for Improved Retrieval and Representation Learning
- ManishBhattaraiLos Alamos National Laboratory
- RyanBarron, University of Maryland, Baltimore County and Los Alamos National Laboratory
 Maksim E.Eren
 Minh N.VuLos Alamos National Laboratory
- VesselinGrantcharov
 IsmaelIsmaelLos Alamos National Laboratory
 ValentinStanev
- CynthiaMatuszekUniversity of Maryland, Baltimore County
 Vladimir IValtchinov
 KimRasmussen
- Boian S.Alexandrov
 205-214
 Retrieval-Augmented Generation (RAG) enhances Large Language Models (LLMs) by integrating external document retrieval to provide domain-specific or up-to-date knowledge. The effectiveness of RAG depends on the relevance of retrieved documents, which is influenced by the semantic alignment of embeddings with the domain’s specialized content. Although full fine-tuning can align language models to specific domains, it is computationally intensive and demands substantial data. This paper introduces Hierarchical Embedding Alignment Loss (HEAL), a novel method that leverages hierarchical fuzzy clustering with matrix factorization within contrastive learning to efficiently align LLM embeddings with domain-specific content. HEAL computes level/depth-wise contrastive losses and incorporates hierarchical penalties to align embeddings with the underlying relationships in label hierarchies. This approach enhances retrieval relevance and document classification, effectively reducing hallucinations in LLM outputs.
In our experiments, we benchmark and evaluate HEAL across diverse domains, including Healthcare, Material Science, Cyber-security, and Applied Maths. 2025.knowledgenlp-1.19 @@ -208,7 +208,7 @@ AmitAgarwalOracle HansaMeghwani Hitesh LaxmichandPatelOracle - SrikantPandaOracle + SrikantPandaOracle 215-229 Retrieval-Augmented Generation (RAG) systems and large language model (LLM)-powered chatbots have significantly advanced conversational AI by combining generative capabilities with external knowledge retrieval. Despite their success, enterprise-scale deployments face critical challenges, including diverse user queries, high latency, hallucinations, and difficulty integrating frequently updated domain-specific knowledge. This paper introduces a novel hybrid framework that integrates RAG with intent-based canned responses, leveraging predefined high-confidence responses for efficiency while dynamically routing complex or ambiguous queries to the RAG pipeline. Our framework employs a dialogue context manager to ensure coherence in multi-turn interactions and incorporates a feedback loop to refine intents, dynamically adjust confidence thresholds, and expand response coverage over time. Experimental results demonstrate that the proposed framework achieves a balance of high accuracy (95%) and low latency (180ms), outperforming RAG and intent-based systems across diverse query types, positioning it as a scalable and adaptive solution for enterprise conversational AI applications. 2025.knowledgenlp-1.20 @@ -216,7 +216,7 @@ Chain of Evidences and Evidence to Generate: Prompting for Context Grounded and Retrieval Augmented Reasoning - Md RizwanParvezQatar Computing Research Institute + Md RizwanParvezQatar Computing Research Institute 230-245 While chain-of-thoughts (CoT) prompting has revolutionized how LLMs perform reasoning tasks, its current methods and variations (e.g, Self-consistency, ReACT, Reflexion, Tree-of-Thoughts (ToT), Cumulative Reasoning (CR) etc.,) suffer from limitations like limited context grounding, hallucination/inconsistent output generation, and iterative sluggishness. To overcome these challenges, we introduce a novel mono/dual-step zero-shot prompting framework built upon two unique strategies Chain of Evidences (CoE) and Evidence to Generate (E2G). Instead of unverified reasoning claims, our innovative approaches leverage the power of “evidence for decision making” by first focusing exclusively on the thought sequences explicitly mentioned in the context which then serve as extracted evidence, guiding the LLM’s output generation process with greater precision and efficiency. This simple yet potent approach unlocks the full potential of chain-of-thoughts prompting, facilitating faster, more reliable, and contextually aware reasoning in LLMs. Our framework consistently achieves remarkable results across various knowledge-intensive reasoning and generation tasks, surpassing baseline approaches with state-of-the-art LLMs. For instance, (i) on the LogiQA benchmark using GPT-4, CoE achieves a new state-of-the-art accuracy of 53.8%, surpassing CoT by 18%, ToT by 11%, and CR by 9%; (ii) CoE with PaLM-2 outperforms the variable-shot performance of Gemini Ultra by 0.9 F1 points, achieving an F1 score of 83.3 on DROP. We release our prompts and outputs on these benchmarks as a new instruction tuning dataset for future research at Hugging Face. 
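As a rough sketch of the dual-step evidence-then-generate idea described above (not the paper's released prompts): step one asks the model only for the context sentences that support an answer, and step two answers from that evidence alone. The prompt wording and the llm callable here are assumptions:

EVIDENCE_PROMPT = ("Context:\n{context}\n\nQuestion: {question}\n"
                   "List only the context sentences needed to answer.")
ANSWER_PROMPT = ("Evidence:\n{evidence}\n\nQuestion: {question}\n"
                 "Answer using only the evidence above.")

def evidence_to_generate(context, question, llm):
    # llm: any callable str -> str (API client, local model, ...).
    evidence = llm(EVIDENCE_PROMPT.format(context=context, question=question))
    return llm(ANSWER_PROMPT.format(evidence=evidence, question=question))

# Trivial stub so the sketch runs end to end.
stub = lambda prompt: prompt.splitlines()[1]
print(evidence_to_generate("Paris is the capital of France.",
                           "What is the capital of France?", stub))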
2025.knowledgenlp-1.21 @@ -236,7 +236,7 @@ <fixed-case>MSR</fixed-case><tex-math>^2</tex-math>: A Benchmark for Multi-Source Retrieval and Reasoning in Visual Question Answering Kuo-HanHung Hung-ChiehFangNational Taiwan University - Chao-WeiHuangNational Taiwan University + Chao-WeiHuangNational Taiwan University Yun-NungChenDepartment of Computer Science and Informational Engineering, National Taiwan University 259-271 This paper introduces MSR^2, a benchmark for multi-source retrieval and reasoning in visual question answering. Unlike previous knowledge-based visual question answering datasets, MSR^2 focuses on questions involving multiple fine-grained entities, providing a unique opportunity to assess a model’s spatial reasoning ability and its capacity to retrieve and aggregate information from various sources for different entities. Through comprehensive evaluation using MSR^2, we gain valuable insights into the capabilities and limitations of state-of-the-art large vision-language models (LVLMs).Our findings reveal that even state-of-the-art LVLMs struggle with questions requiring multi-entities and knowledge-intensive reasoning, highlighting important new directions for future research.Additionally, we demonstrate that enhanced visual entity recognition and knowledge retrieval can significantly improve performance on MSR^2, pinpointing key areas for advancement. @@ -254,8 +254,8 @@ <fixed-case>C</fixed-case>laim<fixed-case>C</fixed-case>heck: Automatic Fact-Checking of Textual Claims using Web Evidence - Akshith ReddyPutta - JacobDevasierUniversity of Texas at Arlington + Akshith ReddyPutta + JacobDevasierUniversity of Texas at Arlington ChengkaiLiUniversity of Texas at Arlington 303-316 We introduce ClaimCheck, an efficient fact-checking system that verifies textual claims using smaller, open-source large language models. ClaimCheck integrates two fact-checking strategies, claim-matching and novel claim processing. Claim-matching uses related fact-checks from trusted organizations to fact-check a claim. Novel claim processing breaks down fact-checking into manageable subtasks—generating targeted questions, retrieving Web evidence, extracting answers, and synthesizing verdicts. Evaluation on the AVeriTeC benchmark demonstrates 62.6% verdict prediction accuracy, with claim-matching providing a 2.8% improvement. ClaimCheck approaches the performance of state-of-the-art systems while requiring significantly fewer computational resources, demonstrating the effectiveness of using small language models for fact-checking tasks. Furthermore, our code is publicly available to help make automated fact-checking more accessible. @@ -266,7 +266,7 @@ Can dependency parses facilitate generalization in language models? A case study of cross-lingual relation extraction RitamDuttCarnegie Mellon University ShounakSural - CarolynRoseSchool of Computer Science, Carnegie Mellon University + CarolynRoseSchool of Computer Science, Carnegie Mellon University 317-337 In this work, we propose DEPGEN, a framework for evaluating the generalization capabilities of language models on the task of relation extraction, with dependency parses as scaffolds. We use a GNN-based framework that takes dependency parses as input and learns embeddings of entities which are augmented to a baseline multilingual encoder. We also investigate the role of dependency parses when they are included as part of the prompt to LLMs in a zero-shot learning setup. 
We observe that including off-the-shelf dependency parses can aid relation extraction, with the best performing model having a mild relative improvement of 0.91% and 1.5% in the in-domain and zero-shot setting respectively across two datasets. For the in-context learning setup, we observe an average improvement of 1.67%, with significant gains for low-performing LLMs. We also carry out extensive statistical analysis to investigate how different factors such as the choice of the dependency parser or the nature of the prompt impact performance. We make our code and results publicly available for the research community at https://github.com/ShoRit/multilingual-re.git. 2025.knowledgenlp-1.27 @@ -276,7 +276,7 @@ Can dependency parses facilitate generalization in language models? A case study of cross-lingual relation extraction RitamDuttCarnegie Mellon University ShounakSural - CarolynRoseSchool of Computer Science, Carnegie Mellon University + CarolynRoseSchool of Computer Science, Carnegie Mellon University 338-358 In this work, we propose DEPGEN, a framework for evaluating the generalization capabilities of language models on the task of relation extraction, with dependency parses as scaffolds. We use a GNN-based framework that takes dependency parses as input and learns embeddings of entities which are augmented to a baseline multilingual encoder. We also investigate the role of dependency parses when they are included as part of the prompt to LLMs in a zero-shot learning setup. We observe that including off-the-shelf dependency parses can aid relation extraction, with the best performing model having a mild relative improvement of 0.91% and 1.5% in the in-domain and zero-shot setting respectively across two datasets. For the in-context learning setup, we observe an average improvement of 1.67%, with significant gains for low-performing LLMs. We also carry out extensive statistical analysis to investigate how different factors such as the choice of the dependency parser or the nature of the prompt impact performance. We make our code and results publicly available for the research community at https://github.com/ShoRit/multilingual-re.git. 2025.knowledgenlp-1.28 From b9c058106a1f11c031d5924798dce5fca0d8e11f Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:15:39 -0400 Subject: [PATCH 13/18] Ingest ORCIDS for 2025.queerinai-main --- data/xml/2025.queerinai.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data/xml/2025.queerinai.xml b/data/xml/2025.queerinai.xml index 7f4a76c4be..f32395bdab 100644 --- a/data/xml/2025.queerinai.xml +++ b/data/xml/2025.queerinai.xml @@ -26,7 +26,7 @@ Studying the Representation of the <fixed-case>LGBTQ</fixed-case>+ Community in <fixed-case>R</fixed-case>u<fixed-case>P</fixed-case>aul’s Drag Race with <fixed-case>LLM</fixed-case>-Based Topic Modeling - MikaHämäläinenMetropolia University of Applied Sciences + MikaHämäläinenMetropolia University of Applied Sciences 1-5 This study investigates the representation of LGBTQ+ community in the widely acclaimed reality television series RuPaul’s Drag Race through a novel application of large language model (LLM)-based topic modeling. By analyzing subtitles from seasons 1 to 16, the research identifies a spectrum of topics ranging from empowering themes, such as self-expression through drag, community support and positive body image, to challenges faced by the LGBTQ+ community, including homophobia, HIV and mental health. 
Employing an LLM allowed for nuanced exploration of these themes, overcoming the limitations of traditional word-based topic modeling.
 2025.queerinai-main.1
 
@@ -56,7 +56,7 @@
 Quoc-ToanNguyen
 JoshNguyenNA
 TuanPhamState University of New York at Binghamton
- William JohnTeahan
+ William JohnTeahan
 26-34
 Anti-LGBTQIA+ texts in user-generated content pose significant risks to online safety and inclusivity. This study investigates the capabilities and limitations of five widely adopted Large Language Models (LLMs)—DeepSeek-V3, GPT-4o, GPT-4o-mini, GPT-o1-mini, and Llama3.3-70B—in detecting such harmful content. Our findings reveal that while LLMs demonstrate potential in identifying offensive language, their effectiveness varies across models and metrics, with notable shortcomings in calibration. Furthermore, linguistic analysis exposes deeply embedded patterns of discrimination, reinforcing the urgency for improved detection mechanisms for this marginalised population. In summary, this study demonstrates the significant potential of LLMs for practical application in detecting anti-LGBTQIA+ user-generated texts and provides valuable insights from text analysis that can inform topic modelling. These findings contribute to developing safer digital platforms and enhancing protection for LGBTQIA+ individuals.
 2025.queerinai-main.4
 
 
 A <fixed-case>B</fixed-case>ayesian account of pronoun and neopronoun acquisition
 Cassandra LJacobsState University of New York, Buffalo
- MorganGrobolUniversité Paris Nanterre
+ MorganGrobolUniversité Paris Nanterre
 35-40
 A major challenge to equity among members of queer communities is the use of one’s chosen forms of reference, such as personal names or pronouns. Speakers often dismiss errors in pronominal use as unintentional, and claim that their errors reflect many decades of fossilized mainstream language use, including attitudes or expectations about the relationship between one’s appearance and acceptable forms of reference. Here, we propose a modeling framework that allows language use and speech communities to change over time, including the adoption of neopronouns and other forms for self-reference. We present a probabilistic graphical modeling approach to pronominal reference that is flexible in the face of change and experience while also moving beyond form-to-meaning mappings. The model critically also does not rely on lexical covariance structure to learn referring expressions. We show that such a model can account for individual differences in how quickly pronouns or names are integrated into symbolic knowledge and can empower computational systems to be both flexible and respectful of queer people with diverse gender expression.
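A toy Dirichlet-multinomial update illustrates the kind of flexibility this abstract describes: posterior beliefs about a referent's forms shift as new uses are observed, and new forms can enter with small prior mass. This is a deliberate simplification of the paper's graphical model, with made-up pseudo-counts:

def posterior(prior, observations):
    # Dirichlet-multinomial posterior mean over a referent's forms.
    counts = dict(prior)
    for form in observations:
        counts[form] = counts.get(form, 0) + 1
    total = sum(counts.values())
    return {form: round(c / total, 3) for form, c in sorted(counts.items())}

# Weak pseudo-counts; values are illustrative only.
prior = {"he": 1.0, "she": 1.0, "they": 1.0, "xe": 0.5}

print(posterior(prior, []))                          # before any evidence
print(posterior(prior, ["they"] * 3))                # after a few observed uses
print(posterior(prior, ["they"] * 3 + ["xe"] * 8))   # usage shifting over time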
2025.queerinai-main.5 From efe1d3b94bcc1f7196eb7d4de23dcbf5c3cda293 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:19:55 -0400 Subject: [PATCH 14/18] Adjust outputs on script --- bin/ingest_orcids.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/bin/ingest_orcids.py b/bin/ingest_orcids.py index 07aef22d4c..35df3ab837 100755 --- a/bin/ingest_orcids.py +++ b/bin/ingest_orcids.py @@ -88,15 +88,15 @@ def main( # Load the papers.yaml file, skipping non-archival papers papers = [p for p in parse_paper_yaml(paper_yaml) if p["archival"]] - print(f"Found {len(papers)} archival papers", file=sys.stderr) + # print(f"Found {len(papers)} archival papers", file=sys.stderr) - for paper in papers: - print("PAPER:", paper['id'], file=sys.stderr) - for author in paper['authors']: - print( - f" {author['first_name']} {author['last_name']} ({author.get('institution', '')})", - file=sys.stderr, - ) + # for paper in papers: + # print("PAPER:", paper['id'], file=sys.stderr) + # for author in paper['authors']: + # print( + # f" {author['first_name']} {author['last_name']} ({author.get('institution', '')})", + # file=sys.stderr, + # ) collection_id, volume_name = full_volume_id.split('-') @@ -119,14 +119,11 @@ def main( volume_node.findall('./paper') ), f"Number of papers in YAML ({len(papers)}) does not match number in XML ({len(volume_node.findall('./paper'))})" + num_added = 0 for paper, paper_node in zip(papers, volume_node.findall('./paper')): # paper_num = int(paper["id"]) paper_num = int(paper_node.attrib['id']) - print(f"PAPER: YAML={paper_num}", file=sys.stderr) - - # assert paper_num == paper_id_xml, ( - # f"Paper ID mismatch: YAML={paper_num}, XML={paper_id_xml}" - # ) + # print(f"PAPER: YAML={paper_num}", file=sys.stderr) def get_author_xml(author_xml): name = "" @@ -141,10 +138,10 @@ def get_author_xml(author_xml): for author_yaml, author_node in zip( paper['authors'], paper_node.findall('./author') ): - print( - f"- Author YAML={author_yaml['first_name']} {author_yaml['last_name']} XML={get_author_xml(author_node)}", - file=sys.stderr, - ) + # print( + # f"- Author YAML={author_yaml['first_name']} {author_yaml['last_name']} XML={get_author_xml(author_node)}", + # file=sys.stderr, + # ) if orcid := author_yaml.get('orcid'): # grab ORCID pattern from orcid: \d{4}-\d{4}-\d{4}-\d{3}[0-9X] orcid_pattern = r'\d{4}-\d{4}-\d{4}-\d{3}[0-9X]' @@ -152,6 +149,7 @@ def get_author_xml(author_xml): if match: # If the ORCID is in the expected format, use it directly orcid = match.group(0) + num_added += 1 else: print(f"Invalid ORCID format: {orcid}", file=sys.stderr) continue @@ -161,6 +159,7 @@ def get_author_xml(author_xml): indent(root_node) tree = etree.ElementTree(root_node) tree.write(collection_file, encoding='UTF-8', xml_declaration=True, with_tail=True) + print(f"Added {num_added} ORCIDs for {full_volume_id} to {collection_file}", file=sys.stderr) if __name__ == '__main__': From 1af01e9b6b9e199ed0a028e6b4ab868aec1bbdad Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:20:01 -0400 Subject: [PATCH 15/18] Add ORCIDS for 2025.cmcl-1 --- data/xml/2025.cmcl.xml | 44 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/data/xml/2025.cmcl.xml b/data/xml/2025.cmcl.xml index 720d42f758..ea8f3e69e6 100644 --- a/data/xml/2025.cmcl.xml +++ b/data/xml/2025.cmcl.xml @@ -42,7 +42,7 @@ Capturing Online <fixed-case>SRC</fixed-case>/<fixed-case>ORC</fixed-case> Effort with Memory 
Measures from a Minimalist Parser - AnielloDe SantoUniversity of Utah + AnielloDe SantoUniversity of Utah 24-35 A parser for Minimalist grammars (Stabler, 2013) has been shown to successfully model sentence processing preferences across an array of languages and phenomena when combined with complexity metrics that relate parsing behavior to memory usage (Gerth, 2015; Graf et al., 2017; De Santo, 2020, a.o.). This model provides a quantifiable theory of the effects of fine-grained grammatical structure on cognitive cost, and can help strengthen the link between generative syntactic theory and sentence processing.However, work on it has focused on offline asymmetries.Here, we extend this approach by showing how memory-based measures of effort that explicitly consider minimalist-like structure-building operations improve our ability to account for word-by-word (online) behavioral data. 2025.cmcl-1.5 @@ -61,7 +61,7 @@ Profiling neural grammar induction on morphemically tokenised child-directed speech MilaMarcheva - TheresaBiberauerUniversity of the Western Cape, University of Stellenbosch and University of Cambridge + TheresaBiberauerUniversity of the Western Cape, University of Stellenbosch and University of Cambridge WeiweiSunUniversity of Cambridge 47-54 We investigate the performance of state-of-the-art (SotA) neural grammar induction (GI) models on a morphemically tokenised English dataset based on the CHILDES treebank (Pearl and Sprouse, 2013). Using implementations from Yang et al. (2021a), we train models and evaluate them with the standard F1 score. We introduce novel evaluation metrics—depth-of-morpheme and sibling-of-morpheme—which measure phenomena around bound morpheme attachment. Our results reveal that models with the highest F1 scores do not necessarily induce linguistically plausible structures for bound morpheme attachment, highlighting a key challenge for cognitively plausible GI. @@ -73,7 +73,7 @@ FermínTraviComputer Science Department, University of Buenos Aires Gabriel AiméLeclercqUniversidad de Buenos Aires Diego FernandezSlezakComputer Science Department, Universidad de Buenos Aires - BrunoBianchi + BrunoBianchi Juan EKamienkowski 55-65 Reading, while structured, is a non-linear process. Readers may skip some words, linger on others, or revisit earlier text. Emerging work has started exploring the incorporation of reading behaviour through eye-tracking into the training of specific language tasks. In this work, we investigate the broader question of how gaze data can shape word embeddings by using text as read by human participants and predicting gaze measures from them. To that end, we conducted an eye-tracking experiment with 76 participants reading 20 short stories in Spanish and fine-tuned Word2Vec and LSTM models on the collected data. Evaluations with representational similarity analysis and word pair similarities showed a limited, but largely consistent, gain from gaze incorporation, suggesting future work should expand linguistic diversity and use cognitively aligned evaluations to better understand its role in bridging computational and human language representations. 
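The mapping from word representations to gaze measures evaluated above can be illustrated with closed-form ridge regression; the study itself fine-tunes Word2Vec and LSTM models on the eye-tracking data, so this is only the flavor of that evaluation, with random arrays standing in for embeddings and a measure such as first fixation duration:

import numpy as np

rng = np.random.default_rng(0)
n_words, dim = 500, 50
X = rng.normal(size=(n_words, dim))                   # word embeddings (random here)
true_w = rng.normal(size=dim)
y = X @ true_w + rng.normal(scale=0.5, size=n_words)  # stand-in gaze measure

def ridge(X, y, lam=1.0):
    # Closed form: w = (X'X + lam*I)^-1 X'y
    return np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)

w = ridge(X[:400], y[:400])
r = np.corrcoef(X[400:] @ w, y[400:])[0, 1]
print(f"held-out correlation: {r:.3f}")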
@@ -82,9 +82,9 @@ Unzipping the Causality of <fixed-case>Z</fixed-case>ipf’s Law and Other Lexical Trade-offs - AmandaDoucetteMcGill University, McGill University - Timothy J.O’DonnellMcGill University, Mila and McGill University - MorganSondereggerMcGill University + AmandaDoucetteMcGill University, McGill University + Timothy J.O’DonnellMcGill University, Mila and McGill University + MorganSondereggerMcGill University 66-76 There are strong constraints on the structure of a possible lexicon. For example, the negative correlation between word frequency and length known as Zipf’s law, and a negative correlation between word length and phonotactic complexity appear to hold across languages. While lexical trade-offs like these have been examined individually, it is unclear how they interact as a system. In this paper, we propose causal discovery as a method for identifying lexical biases and their interactions in a set of variables. We represent the lexicon as a causal model, and apply the Fast Causal Discovery algorithm (Spirtes et al., 1995) to identify both causal relationships between measured variables and the existence of possible unmeasured confounding variables. We apply this method to lexical data including measures of word length, frequency, phonotactic complexity, and morphological irregularity for 25 languages and find evidence of universal associations involving word length with a high likelihood of involving an unmeasured confounder, suggesting that additional variables need to be measured to determine how they are related. We also find evidence of variation across languages in relationships between the remaining variables, and suggest that given a larger dataset, causal discovery algorithms can be a useful tool in assessing the universality of lexical biases. 2025.cmcl-1.11 @@ -92,10 +92,10 @@ Quantifying Semantic Functional Specialization in the Brain Using Encoding Models of Natural Language - JiaqiChen + JiaqiChen RichardAntonelloColumbia University KaavyaChaparalaThe MITRE Corporation - CoenArrowUniversity of Western Australia + CoenArrowUniversity of Western Australia NimaMesgaraniColumbia University 77-90 Although functional specialization in the brain - a phenomenon where different regions process different types of information - is well documented, we still lack precise mathematical methods with which to measure it. This work proposes a technique to quantify how brain regions respond to distinct categories of information. Using a topic encoding model, we identify brain regions that respond strongly to specific semantic categories while responding minimally to all others. We then use a language model to characterize the common themes across each region’s preferred categories. Our technique successfully identifies previously known functionally selective regions and reveals consistent patterns across subjects while also highlighting new areas of high specialization worthy of further study. @@ -104,7 +104,7 @@ “Is There Anything Else?”: Examining Administrator Influence on Linguistic Features from the Cookie Theft Picture Description Cognitive Test - ChangyeLiUniversity of Washington + ChangyeLiUniversity of Washington ZhechengSheng TrevorCohenUniversity of Washington Serguei V. 
S.PakhomovUniversity of Minnesota - Twin Cities @@ -125,7 +125,7 @@ Distinct social-linguistic processing between humans and large audio-language models: Evidence from model-brain alignment HanlinWuThe Chinese University of Hong Kong XufengDuan - ZhenguangCai + ZhenguangCai 135-143 Voice-based AI development faces unique challenges in processing both linguistic and paralinguistic information. This study compares how large audio-language models (LALMs) and humans integrate speaker characteristics during speech comprehension, asking whether LALMs process speaker-contextualized language in ways that parallel human cognitive mechanisms. We compared two LALMs’ (Qwen2-Audio and Ultravox 0.5) processing patterns with human EEG responses. Using surprisal and entropy metrics from the models, we analyzed their sensitivity to speaker-content incongruency across social stereotype violations (e.g., a man claiming to regularly get manicures) and biological knowledge violations (e.g., a man claiming to be pregnant). Results revealed that Qwen2-Audio exhibited increased surprisal for speaker-incongruent content and its surprisal values significantly predicted human N400 responses, while Ultravox 0.5 showed limited sensitivity to speaker characteristics. Importantly, neither model replicated the human-like processing distinction between social violations (eliciting N400 effects) and biological violations (eliciting P600 effects). These findings reveal both the potential and limitations of current LALMs in processing speaker-contextualized language, and suggest differences in social-linguistic processing mechanisms between humans and LALMs. 2025.cmcl-1.18 @@ -153,7 +153,7 @@ Towards a <fixed-case>B</fixed-case>ayesian hierarchical model of lexical processing Cassandra LJacobsState University of New York, Buffalo - LoïcGrobolUniversité Paris Nanterre + LoïcGrobolUniversité Paris Nanterre 165-171 In cases of pervasive uncertainty, cognitive systems benefit from heuristics or committing to more general hypotheses. Here we have presented a hierarchical cognitive model of lexical processing that synthesizes advances in early rational cognitive models with modern-day neural architectures. Probabilities of higher-order categories derived from layers extracted from the middle layers of an encoder language model have predictive power in accounting for several reading measures for both predicted and unpredicted words and influence even early first fixation duration behavior. The results suggest that lexical processing can take place within a latent, but nevertheless discrete, space in cases of uncertainty. 2025.cmcl-1.21 @@ -162,7 +162,7 @@ Modeling <fixed-case>C</fixed-case>hinese <fixed-case>L</fixed-case>2 Writing Development: The <fixed-case>LLM</fixed-case>-Surprisal Perspective JingyingHu - YanCongPurdue University + YanCongPurdue University 172-183 LLM-surprisal is a computational measure of how unexpected a word or character is given the preceding context, as estimated by large language models (LLMs). This study investigated the effectiveness of LLM-surprisal in modeling second language (L2) writing development, focusing on Chinese L2 writing as a case to test its cross-linguistical generalizability. We selected three types of LLMs with different pretraining settings: a multilingual model trained on various languages, a Chinese-general model trained on both Simplified and Traditional Chinese, and a Traditional-Chinese-specific model. 
This comparison allowed us to explore how model architecture and training data affect LLM-surprisal estimates of learners’ essays written in Traditional Chinese, which in turn influence the modeling of L2 proficiency and development. We also correlated LLM-surprisals with 16 classic linguistic complexity indices (e.g., character sophistication, lexical diversity, syntactic complexity, and discourse coherence) to evaluate its interpretability and validity as a measure of L2 writing assessment. Our findings demonstrate the potential of LLM-surprisal as a robust, interpretable, cross-linguistically applicable metric for automatic writing assessment and contribute to bridging computational and linguistic approaches in understanding and modeling L2 writing development. All analysis scripts are available at https://github.com/JingyingHu/ChineseL2Writing-Surprisals.
 2025.cmcl-1.22
 
 
 Beyond Binary <fixed-case>A</fixed-case>nimacy: A Multi-Method Investigation of <fixed-case>LM</fixed-case>s’ Sensitivity in <fixed-case>E</fixed-case>nglish Object Relative Clauses
- YueLi
- YanCongPurdue University
- Elaine J.FrancisPurdue University
+ YueLi
+ YanCongPurdue University
+ Elaine J.FrancisPurdue University
 184-196
 Animacy is a well-documented factor affecting language production, but its influence on Language Models (LMs) in complex structures like Object Relative Clauses (ORCs) remains underexplored. This study examines LMs’ sensitivity to animacy in English ORC structure choice (passive vs. active) using surprisal-based and prompting-based analyses, alongside human baselines. In surprisal-based analysis, DistilGPT-2 best mirrored human preferences, while GPT-Neo and BERT-base showed rigid biases, diverging from human patterns. Prompting-based analysis expanded testing to GPT-4o-mini, Gemini models, and DeepSeek-R1, revealing GPT-4o-mini’s stronger human alignment but limited animacy sensitivity in Gemini models and DeepSeek-R1. Some LMs exhibited inconsistencies between analyses, reinforcing that prompting alone is unreliable for assessing linguistic competence. Corpus analysis confirmed that training data alone cannot fully explain animacy sensitivity, suggesting emergent animacy-aware representations. These findings underscore the interaction between training data, model architecture, and linguistic generalization, highlighting the need for integrating structured linguistic knowledge into LMs to enhance their alignment with human sentence processing mechanisms.
 2025.cmcl-1.23
 
 
 An Empirical Study of Language Syllabification using Syllabary and Lexical Networks
- RusaliSaha
+ RusaliSaha
 YannickMarchandDalhousie University
 197-206
 Language syllabification is the separation of a word into written or spoken syllables. The study of syllabification plays a pivotal role in morphology and there have been previous attempts to study this phenomenon using graphs or networks. Previous approaches have claimed through visual estimation that the degree distribution of language networks follows the Power Law distribution; however, there have not been any empirically grounded metrics to determine the same. In our study, we implement two kinds of language networks, namely, syllabary and lexical networks, and investigate the syllabification of four European languages: English, French, German and Spanish using network analysis and examine their small-world, random and scale-free nature.
We additionally empirically prove that contrary to claims in previous works, although the degree distribution of these networks appears to follow a power law distribution, they are actually more in agreement with a log-normal distribution, when a numerically grounded curve-fitting is applied. Finally, we explore how syllabary and lexical networks for the English language change over time using a database of age-of-acquisition rating words. Our analysis further shows that the preferential attachment mechanism appears to be a well-grounded explanation for the degree distribution of the syllabary network.
 
 
 When Men Bite Dogs: Testing Good-Enough Parsing in <fixed-case>T</fixed-case>urkish with Humans and Large Language Models
- OnurKeleşBogazici University
- Nazik DinctopalDenizBogazici University
+ OnurKeleşBogazici University
+ Nazik DinctopalDenizBogazici University
 219-231
 This paper investigates good-enough parsing in Turkish by comparing human self-paced reading performance to the surprisal and attention patterns of three Turkish Large Language Models (LLMs), GPT-2-Base, GPT-2-Large, and LLaMA-3. The results show that Turkish speakers rely on good-enough parsing for implausible but grammatically permissible sentences (e.g., interpreting sentences such as ‘the man bit the dog’ as ‘the dog bit the man’). Although the smaller LLMs (e.g., GPT-2) were better predictors of human RTs, they seem to have relied more heavily on semantic plausibility than humans. Comparably, larger LLMs (e.g., LLaMA-3) tended to make more probabilistic parsing based on word order, exhibiting less good-enough parsing behavior. Therefore, we conclude that LLMs take syntactic and semantic constraints into account when processing thematic roles, but not to the same extent as human parsers.
 2025.cmcl-1.26
 
 
 Transformers Can Model Human Hyperprediction in Buzzer Quiz
 YoichiroYamashitaThe University of Tokyo
 YutoHaradaTokyo University, Tokyo Institute of Technology
- YoheiOsekiUniversity of Tokyo
+ YoheiOsekiUniversity of Tokyo
 232-243
 Humans tend to predict the next words during sentence comprehension, but under unique circumstances, they demonstrate an ability for longer coherent word sequence prediction. In this paper, we investigate whether Transformers can model such hyperprediction observed in humans during sentence processing, specifically in the context of Japanese buzzer quizzes. We conducted eye-tracking experiments where the participants read the first half of buzzer quiz questions and predicted the second half, while we modeled their reading time using the GPT-2. By modeling the reading times of each word in the first half of the question using GPT-2 surprisal, we examined under what conditions fine-tuned language models can better predict reading times. As a result, we found that GPT-2 surprisal effectively explains the reading times of quiz experts as they read the first half of the question while predicting the latter half. When the language model was fine-tuned with quiz questions, the perplexity value decreased. Lower perplexity corresponded to higher psychometric predictive power; however, excessive data for fine-tuning led to a decrease in perplexity and the fine-tuned model exhibited a low psychometric predictive power. Overall, our findings suggest that a moderate amount of data is required for fine-tuning in order to model human hyperprediction.
 2025.cmcl-1.27
 
 
 What to Predict?
Exploring How Sentence Structure Influences Contrast Predictions in Humans and Large Language Models ShuqiWangThe Chinese University of Hong Kong XufengDuan - ZhenguangCai + ZhenguangCai 244-252 This study examines how sentence structure shapes contrast predictions in both humans and large language models (LLMs). Using Mandarin ditransitive constructions — double object (DO, “She gave the girl the candy, but not...”) vs. prepositional object (PO, “She gave the candy to the girl, but not...”) — as a testbed, we employed a sentence continuation task involving three human groups (written, spoken, and prosodically normalized spoken stimuli) and three LLMs (GPT-4o, LLaMA-3, and Qwen-2.5). Two principal findings emerged: (1) Although human participants predominantly focused on the theme (e.g., “the candy”), contrast predictions were significantly modulated by sentence structure—particularly in spoken contexts, where the sentence-final element drew more attention. (2) While LLMs showed a similar reliance on structure, they displayed a larger effect size and more closely resembled human spoken data than written data, indicating a stronger emphasis on linear order in generating contrast predictions. By adopting a unified psycholinguistic paradigm, this study advances our understanding of predictive language processing for both humans and LLMs and informs research on human–model alignment in linguistic tasks. 2025.cmcl-1.28 @@ -230,9 +230,9 @@ Investigating noun-noun compound relation representations in autoregressive large language models SaffronKendrickThe Queen’s University Belfast - MarkOrmerodThe Queen’s University Belfast + MarkOrmerodThe Queen’s University Belfast HuiWangThe Queen’s University Belfast - BarryDevereuxQueen’s University Belfast + BarryDevereuxQueen’s University Belfast 253-263 This paper uses autoregressive large language models to explore at which points in a given input sentence the semantic information is decodable. Using representational similarity analysis and probing, the results show that autoregressive models are capable of extracting the semantic relation information from a dataset of noun-noun compounds. When considering the effect of processing the head and modifier nouns in context, the extracted representations show greater correlation after processing both constituent nouns in the same sentence. The linguistic properties of the head nouns may influence the ability of LLMs to extract relation information when the head and modifier words are processed separately. Probing suggests that Phi-1 and LLaMA-3.2 are exposed to relation information during training, as they are able to predict the relation vectors for compounds from separate word representations to a similar degree as using compositional compound representations. However, the difference in processing condition for GPT-2 and DeepSeek-R1 indicates that these models are actively processing the contextual semantic relation information of the compound.
2025.cmcl-1.30 From 701ae50ca12902029643a6e76503dbccf74603df Mon Sep 17 00:00:00 2001 From: Matt Post Date: Fri, 6 Jun 2025 09:23:42 -0400 Subject: [PATCH 16/18] Ingest ORCIDS for a number of 2025 workshops --- data/xml/2024.clicit.xml | 2 +- data/xml/2025.aisd.xml | 28 +-- data/xml/2025.dravidianlangtech.xml | 338 ++++++++++++++-------------- data/xml/2025.in2writing.xml | 32 +-- data/xml/2025.loresmt.xml | 42 ++-- data/xml/2025.mwe.xml | 44 ++-- data/xml/2025.naacl.xml | 118 +++++----- data/xml/2025.repl4nlp.xml | 22 +- data/xml/2025.wnut.xml | 28 +-- 9 files changed, 327 insertions(+), 327 deletions(-) diff --git a/data/xml/2024.clicit.xml b/data/xml/2024.clicit.xml index e4614b9308..a34647ea4f 100644 --- a/data/xml/2024.clicit.xml +++ b/data/xml/2024.clicit.xml @@ -21,7 +21,7 @@ Preface to the <fixed-case>CL</fixed-case>i<fixed-case>C</fixed-case>-it 2024 Proceedings - FeliceDell’OrlettaItaliaNLP Lab @ Institute for Computational Linguistics “Antonio Zampolli”, ILC - CNR + FeliceDell’OrlettaItaliaNLP Lab @ Institute for Computational Linguistics “Antonio Zampolli”, ILC - CNR AlessandroLenciUniversity of Pisa SimonettaMontemagniIstituto di Linguistica Computazionale “Antonio Zampolli” RacheleSprugnoliUniversity of Parma diff --git a/data/xml/2025.aisd.xml b/data/xml/2025.aisd.xml index 5681117f70..630150bea6 100644 --- a/data/xml/2025.aisd.xml +++ b/data/xml/2025.aisd.xml @@ -26,11 +26,11 @@ Variable Extraction for Model Recovery in Scientific Literature - ChunweiLiuMassachusetts Institute of Technology - EnriqueNoriega-AtalaUniversity of Arizona - AdarshPyarelalUniversity of Arizona - Clayton TMorrisonUniversity of Arizona - MikeCafarellaMassachusetts Institute of Technology + ChunweiLiuMassachusetts Institute of Technology + EnriqueNoriega-AtalaUniversity of Arizona + AdarshPyarelalUniversity of Arizona + Clayton TMorrisonUniversity of Arizona + MikeCafarellaMassachusetts Institute of Technology 1-12 Due to the increasing productivity in the scientific community, it is difficult to keep up with the literature without the assistance of AI methods. This paper evaluates various methods for extracting mathematical model variables from epidemiological studies, such as “infection rate (\alpha),” “recovery rate (\gamma),” and “mortality rate (\mu).” Variable extraction appears to be a basic task, but plays a pivotal role in recovering models from scientific literature. Once extracted, we can use these variables for automatic mathematical modeling, simulation, and replication of published results. We also introduce a benchmark dataset comprising manually-annotated variable descriptions and variable values extracted from scientific papers. Our analysis shows that LLM-based solutions perform the best. Despite the incremental benefits of combining rule-based extraction outputs with LLMs, the leap in performance attributed to the transfer-learning and instruction-tuning capabilities of LLMs themselves is far more significant. This investigation demonstrates the potential of LLMs to enhance automatic comprehension of scientific artifacts and for automatic model recovery and simulation. 2025.aisd-main.1 @@ -39,8 +39,8 @@ How Well Do Large Language Models Extract Keywords? A Systematic Evaluation on Scientific Corpora Nacef BenMansour - HamedRahimi - MotasemAlrahabi + HamedRahimi + MotasemAlrahabi 13-21 Automatic keyword extraction from scientific articles is pivotal for organizing scholarly archives, powering semantic search engines, and mapping interdisciplinary research trends.
However, existing methods—including statistical and graph-based approaches—struggle to handle domain-specific challenges such as technical terminology, cross-disciplinary ambiguity, and dynamic scientific jargon. This paper presents an empirical comparison of traditional keyword extraction methods (e.g., TextRank and YAKE) with approaches based on Large Language Models. We introduce a novel evaluation framework that combines fuzzy semantic matching based on Levenshtein Distance with exact-match metrics (F1, precision, recall) to address inconsistencies in keyword normalization across scientific corpora. Through an extensive ablation study across nine different LLMs, we analyze their performance and associated costs. Our findings reveal that LLM-based methods consistently achieve superior precision and relevance compared to traditional approaches. This performance advantage suggests significant potential for improving scientific search systems and information retrieval in academic contexts. 2025.aisd-main.2 @@ -48,7 +48,7 @@ A Human-<fixed-case>LLM</fixed-case> Note-Taking System with Case-Based Reasoning as Framework for Scientific Discovery - Douglas BCraigUniversity of Michigan - Ann Arbor + Douglas BCraigUniversity of Michigan - Ann Arbor 22-30 Scientific discovery is an iterative process that requires transparent reasoning, empirical validation, and structured problem-solving. This work presents a novel human-in-the-loop AI system that leverages case-based reasoning to facilitate structured scientific inquiry. The system is designed to be note-centric, using the Obsidian note-taking application as the primary interface where all components, including user inputs, system cases, and tool specifications, are represented as plain-text notes. This approach ensures that every step of the research process is visible, editable, and revisable by both the user and the AI. The system dynamically retrieves relevant cases from past experience, refines hypotheses, and structures research workflows in a transparent and iterative manner. The methodology is demonstrated through a case study investigating the role of TLR4 in sepsis, illustrating how the system supports problem framing, literature review, hypothesis formulation, and empirical validation. The results highlight the potential of AI-assisted scientific workflows to enhance research efficiency while preserving human oversight and interpretability. 2025.aisd-main.3 @@ -56,11 +56,11 @@ Towards <fixed-case>AI</fixed-case>-assisted Academic Writing - Daniel J.LieblingGoogle + Daniel J.LieblingGoogle MalcolmKaneGoogle MadeleineGrunde-McLaughlinUniversity of Washington IanLangGoogle - SubhashiniVenugopalanGoogle + SubhashiniVenugopalanGoogle MichaelBrennerHarvard University 31-45 We present components of an AI-assisted academic writing system including citation recommendation and introduction writing. The system recommends citations by considering the user’s current document context to provide relevant suggestions. It generates introductions in a structured fashion, situating the contributions of the research relative to prior work. We demonstrate the effectiveness of the components through quantitative evaluations. Finally, the paper presents qualitative research exploring how researchers incorporate citations into their writing workflows. Our findings indicate that there is demand for precise AI-assisted writing systems and simple, effective methods for meeting those needs.
@@ -70,8 +70,8 @@ Evaluating and Enhancing Large Language Models for Novelty Assessment in Scholarly Publications EthanLin - ZhiyuanPeng - YiFangSanta Clara University + ZhiyuanPeng + YiFangSanta Clara University 46-57 Recent studies have evaluated the creativity of large language models (LLMs), of which novelty is an important aspect, primarily from a semantic perspective, using benchmarks from cognitive science. However, assessing novelty in scholarly publications, a critical facet of evaluating LLMs as scientific discovery assistants, remains underexplored, despite its potential to accelerate research cycles and prioritize high-impact contributions in scientific workflows. We introduce SchNovel, a benchmark to evaluate LLMs’ ability to assess novelty in scholarly papers, a task central to streamlining the discovery pipeline. SchNovel consists of 15,000 pairs of papers across six fields sampled from the arXiv dataset with publication dates spanning 2 to 10 years apart. In each pair, the more recently published paper is assumed to be more novel. Additionally, we propose RAG-Novelty, a retrieval-augmented method that mirrors human peer review by grounding novelty assessment in retrieved context. Extensive experiments provide insights into the capabilities of different LLMs to assess novelty and demonstrate that RAG-Novelty outperforms recent baseline models, highlighting LLMs’ promise as tools for automating novelty detection in scientific workflows. 2025.aisd-main.5 @@ -79,8 +79,8 @@ <fixed-case>LLM</fixed-case>-Assisted Translation of Legacy <fixed-case>FORTRAN</fixed-case> Codes to <fixed-case>C</fixed-case>++: A Cross-Platform Study - Nishath RajivRanasinghe - Shawn M.JonesLos Alamos National Laboratory + Nishath RajivRanasinghe + Shawn M.JonesLos Alamos National Laboratory MichalKucerLos Alamos National Laboratory AyanBiswasLos Alamos National Laboratory DanielO’MalleyLos Alamos National Laboratory diff --git a/data/xml/2025.dravidianlangtech.xml b/data/xml/2025.dravidianlangtech.xml index 3102140339..5ae9656bc2 100644 --- a/data/xml/2025.dravidianlangtech.xml +++ b/data/xml/2025.dravidianlangtech.xml @@ -39,9 +39,9 @@ Eureka-<fixed-case>CIOL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Using Customized <fixed-case>BERT</fixed-case>s for Sentiment Analysis of <fixed-case>T</fixed-case>amil Political Comments Enjamamul HaqueEram - AnishaAhmed + AnishaAhmed Sabrina AfrozMitu - Azmine ToushikWasi + Azmine ToushikWasi 6-11 Sentiment analysis on social media platforms plays a crucial role in understanding public opinion and the decision-making process on political matters. As a significant number of individuals express their views on social media, analyzing these opinions is essential for monitoring political trends and assessing voter sentiment. However, sentiment analysis for low-resource languages, such as Tamil, presents considerable challenges due to the limited availability of annotated datasets and linguistic complexities. To address this gap, we utilize a novel dataset encompassing seven sentiment classes, offering a unique opportunity to explore sentiment variations in Tamil political discourse. In this study, we evaluate multiple pre-trained models from the Hugging Face library and experiment with various hyperparameter configurations to optimize model performance.
Our findings aim to contribute to the development of more effective sentiment analysis tools tailored for low-resource languages, ultimately empowering Tamil-speaking communities by providing deeper insights into their political sentiments. Our full experimental codebase is publicly available at: ciol-researchlab/NAACL25-Eureka-Sentiment-Analysis-Tamil 2025.dravidianlangtech-1.2 @@ -49,10 +49,10 @@ Akatsuki-<fixed-case>CIOL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Ensemble-Based Approach Using Pre-Trained Models for Fake News Detection in <fixed-case>D</fixed-case>ravidian Languages - Mahfuz AhmedAnik + Mahfuz AhmedAnik Md. IqramulHoque - WahidFaisal - Azmine ToushikWasi + WahidFaisal + Azmine ToushikWasi Md ManjurulAhsanUniversity of Oklahoma 12-18 The widespread dissemination of fake news on social media poses significant challenges, particularly for low-resource languages like Malayalam. The accessibility of social platforms accelerates misinformation, leading to societal polarization and poor decision-making. Detecting fake news in Malayalam is complex due to its linguistic diversity, code-mixing, and dialectal variations, compounded by the lack of large labeled datasets and tailored models. To address these challenges, we developed a fine-tuned transformer-based model for binary and multiclass fake news detection. The binary classifier achieved a macro F1 score of 0.814, while the multiclass model, using multimodal embeddings, achieved a score of 0.1978. Our system ranked 14th and 11th in the shared task competition, highlighting the need for specialized techniques in underrepresented languages. Our full experimental codebase is publicly available at: ciol-researchlab/NAACL25-Akatsuki-Fake-News-Detection. @@ -83,7 +83,7 @@ <fixed-case>JAS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Abusive <fixed-case>T</fixed-case>amil Text targeting Women on Social Media BSaathvik JaneshvarSivakumar - ThenmozhiDurairaj + ThenmozhiDurairaj 28-32 This paper presents our submission for Abusive Comment Detection in Tamil - DravidianLangTech@NAACL 2025. The aim is to classify whether a given comment is abusive towards women. Google’s MuRIL (Khanuja et al., 2021), a transformer-based multilingual model, is fine-tuned using the provided dataset to build the classification model. The dataset is preprocessed, tokenised, and formatted for model training. The model is trained and evaluated using accuracy, F1-score, precision, and recall. Our approach achieved an evaluation accuracy of 77.76% and an F1-score of 77.65%. The lack of large, high-quality datasets for low-resource languages has also been acknowledged. 2025.dravidianlangtech-1.6 @@ -106,7 +106,7 @@ <fixed-case>NLP</fixed-case>ops<fixed-case>CIOL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Classification of Abusive <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Text Targeting Women Using Pre-trained Models Abdullah AlNahian Mst RafiaIslam - Azmine ToushikWasi + Azmine ToushikWasi Md ManjurulAhsanUniversity of Oklahoma 38-45 Hate speech detection in multilingual and code-mixed contexts remains a significant challenge due to linguistic diversity and overlapping syntactic structures. This paper presents a study on the detection of hate speech in Tamil and Malayalam using transformer-based models.
Our goal is to address underfitting and develop effective models for hate speech classification. We evaluate several pre-trained models, including MuRIL and XLM-RoBERTa, and show that fine-tuning is crucial for better performance. The test results show a Macro-F1 score of 0.7039 for Tamil and 0.6402 for Malayalam, highlighting the promise of these models with further improvements in fine-tuning. We also discuss data preprocessing techniques, model implementations, and experimental findings. Our full experimental codebase is publicly available at: github.com/ciol-researchlab/NAACL25-NLPops-Classification-Abusive-Text. @@ -191,7 +191,7 @@ <fixed-case>SSNCSE</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian Languages SreejaKSri Sivasubramaniya Nadar College of Engineering - BharathiB + BharathiB 98-102 Hate speech detection is a serious challenge due to the diverse forms of digital media communication, particularly in low-resource languages. This research focuses on the problem of multimodal hate speech detection by incorporating both textual and audio modalities. In the context of social media platforms, hate speech is conveyed not only through text but also through audio, which may further amplify harmful content. In order to manage the issue, we provide a multiclass classification model that leverages both text and audio features to detect and categorize hate speech in low-resource languages. The model uses machine learning models for text analysis and audio processing, allowing it to efficiently capture the complex relationships between the two modalities. A class-weight mechanism is used to avoid overfitting. The final prediction is obtained using a majority fusion technique. Performance is measured using a macro average F1 score metric. The best F1-scores for the three languages—Tamil, Malayalam, and Telugu—are 0.59, 0.52, and 0.33, respectively. 2025.dravidianlangtech-1.17 @@ -201,7 +201,7 @@ Bridging Linguistic Complexity: Sentiment Analysis of <fixed-case>T</fixed-case>amil Code-Mixed Text Using Meta-Model Anusha M DGowda DeepthiVikram - Parameshwar RHegdeYenepoya University + Parameshwar RHegdeYenepoya University 103-108 Sentiment analysis in code-mixed languages poses significant challenges due to the complex nature of mixed-language text. This study explores sentiment analysis on Tamil code-mixed text using deep learning models such as Long Short-Term Memory (LSTM), hybrid models like Convolutional Neural Network (CNN) + Gated Recurrent Unit (GRU) and LSTM + GRU, along with meta-models including Logistic Regression, Random Forest, and Decision Tree. The LSTM+GRU hybrid model achieved an accuracy of 0.31, while the CNN+GRU hybrid model reached 0.28. The Random Forest meta-model demonstrated exceptional performance on the development set with an accuracy of 0.99. However, its performance dropped significantly on the test set, achieving an accuracy of 0.1333. The study results emphasize the potential of meta-model-based classification for improving performance in NLP tasks.
2025.dravidianlangtech-1.18 @@ -210,7 +210,7 @@ <fixed-case>Y</fixed-case>en<fixed-case>CS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Integrating Hybrid Architectures for Fake News Detection in Low-Resource <fixed-case>D</fixed-case>ravidian Languages Anusha M DGowda - Parameshwar RHegdeYenepoya University + Parameshwar RHegdeYenepoya University 109-113 Detecting fake news in under-resourced Dravidian languages is a challenging task due to the scarcity of annotated datasets and the intricate nature of code-mixed text. This study tackles these issues by employing advanced machine learning techniques for two key classification tasks. The first task involves binary classification, achieving a macro-average F1-score of 0.792 using a hybrid fusion model that integrates Bidirectional Recurrent Neural Network (Bi-RNN) and Long Short-Term Memory (LSTM)-Recurrent Neural Network (RNN) with weighted averaging. The second task focuses on fine-grained classification, categorizing news articles, where an LSTM-GRU hybrid model attained a macro-average F1-score of 0.26. These findings highlight the effectiveness of hybrid models in improving fake news detection for under-resourced languages. Additionally, this study provides a foundational framework that can be adapted to address similar challenges in other under-resourced languages, emphasizing the need for further research in this area. 2025.dravidianlangtech-1.19 @@ -219,12 +219,12 @@ Overview of the Shared Task on Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian languages: <fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech@<fixed-case>NAACL</fixed-case> 2025 Jyothish LalGAmrita Vishwa Vidyapeetham (Deemed University) - PremjithBAmrita Vishwa Vidyapeetham (Deemed University) - Bharathi RajaChakravarthiUniversity of Galway + PremjithBAmrita Vishwa Vidyapeetham (Deemed University) + Bharathi RajaChakravarthiUniversity of Galway SaranyaRajiakodiCentral University of Tamil Nadu - BharathiB + BharathiB RajeswariNatarajan - RatnavelRajalakshmiVellore Institute of Technology + RatnavelRajalakshmiVellore Institute of Technology 114-122 The detection of hate speech on social media platforms is crucial, due to its adverse impact on mental health, social harmony, and online safety. This paper presents the overview of the shared task on Multimodal Hate Speech Detection in Dravidian Languages organized as part of DravidianLangTech@NAACL 2025. The task emphasizes detecting hate speech in social media content that combines speech and text. Here, we focus on three low-resource Dravidian languages: Malayalam, Tamil, and Telugu. Participants were required to classify hate speech in three sub-tasks, each corresponding to one of these languages. The dataset was curated by collecting speech and corresponding text from YouTube videos. Various machine learning and deep learning-based models, including transformer-based architectures and multimodal frameworks, were employed by the participants. The submissions were evaluated using the macro F1 score. Experimental results underline the potential of multimodal approaches in advancing hate speech detection for low-resource languages. Team SSNTrio achieved the highest F1 scores in Malayalam and Tamil of 0.7511 and 0.7332, respectively. Team lowes scored the best F1 score of 0.3817 in the Telugu sub-task.
2025.dravidianlangtech-1.20 @@ -232,13 +232,13 @@ Overview of the Shared Task on Detecting <fixed-case>AI</fixed-case> Generated Product Reviews in <fixed-case>D</fixed-case>ravidian Languages: <fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech@<fixed-case>NAACL</fixed-case> 2025 - PremjithBAmrita Vishwa Vidyapeetham (Deemed University) - NandhiniKumareshCentral university of Tamil Nadu - Bharathi RajaChakravarthiUniversity of Galway - ThenmozhiDurairaj + PremjithBAmrita Vishwa Vidyapeetham (Deemed University) + NandhiniKumareshCentral university of Tamil Nadu + Bharathi RajaChakravarthiUniversity of Galway + ThenmozhiDurairaj BalasubramanianPalaniIIIT Kottayam SajeethaThavareesan - Prasanna KumarKumaresanData Science Institution, University of Galway, Ireland + Prasanna KumarKumaresanData Science Institution, University of Galway, Ireland 123-132 The detection of AI-generated product reviews is critical due to the increased use of large language models (LLMs) and their capability to generate convincing sentences. AI-generated reviews can affect consumers and businesses, as they influence trust and decision-making. This paper presents the overview of the shared task on “Detecting AI-generated product reviews in Dravidian Languages” organized as part of DravidianLangTech@NAACL 2025. This task involves two subtasks—one in Malayalam and another in Tamil, both of which are binary classifications where a review is to be classified as human-generated or AI-generated. The dataset was curated by collecting comments from YouTube videos. Various machine learning and deep learning-based models ranging from SVM to transformer-based architectures were employed by the participants. 2025.dravidianlangtech-1.21 @@ -246,12 +246,12 @@ Girma@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Detecting <fixed-case>AI</fixed-case> Generated Product Reviews - Girma YohannisBade + Girma YohannisBade Muhammad TayyabZamir - OlgaKolesnikovaInstituto Politécnico Nacional + OlgaKolesnikovaInstituto Politécnico Nacional José LuisOropeza - GrigoriSidorovInstituto Politécnico Nacional - AlexanderGelbukhInstituto Politécnico Nacional + GrigoriSidorovInstituto Politécnico Nacional + AlexanderGelbukhInstituto Politécnico Nacional 133-138 The increasing prevalence of AI-generated content, including fake product reviews, poses significant challenges in maintaining authenticity and trust in e-commerce systems. While much work has focused on detecting such reviews in high-resource languages, limited attention has been given to low-resource languages like Malayalam and Tamil. This study aims to address this gap by developing a robust framework to identify AI-generated product reviews in these languages. We explore a BERT-based approach for this task. Our methodology involves fine-tuning a BERT-based model specifically on Malayalam and Tamil datasets. The experiments are conducted using labeled datasets that contain a mix of human-written and AI-generated reviews. Performance is evaluated using the macro F1 score. The results show that the BERT-based model achieved a macro F1 score of 0.6394 for Tamil and 0.8849 for Malayalam. Preliminary results indicate that the BERT-based model performs significantly better for Malayalam than for Tamil in terms of the average Macro F1 score, leveraging its ability to capture the complex linguistic features of these languages.
Finally, we release the source code of the implementation in the GitHub repository: AI-Generated-Product-Review-Code 2025.dravidianlangtech-1.22 @@ -259,8 +259,8 @@ <fixed-case>B</fixed-case>eyond_<fixed-case>T</fixed-case>ech@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Political Multiclass Sentiment Analysis using Machine Learning and Neural Network - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian SanjaiR MohammedSameer MotheeswaranK @@ -271,11 +271,11 @@ Misogynistic Meme Detection in <fixed-case>D</fixed-case>ravidian Languages Using Kolmogorov Arnold-based Networks - ManashaArunachalam - Navneet KrishnaChukka - Harish VijayV - PremjithBAmrita Vishwa Vidyapeetham (Deemed University) - Bharathi RajaChakravarthiUniversity of Galway + ManashaArunachalam + Navneet KrishnaChukka + Harish VijayV + PremjithBAmrita Vishwa Vidyapeetham (Deemed University) + Bharathi RajaChakravarthiUniversity of Galway 144-151 The prevalence of misogynistic content online poses significant challenges to ensuring a safe and inclusive digital space for women. This study presents a pipeline to classify online memes as misogynistic or non-misogynistic. The pipeline combines contextual image embeddings generated using the Vision Transformer Encoder (ViTE) model with text embeddings extracted from the memes using ModernBERT. These multimodal embeddings were fused and trained using three advanced types of Kolmogorov-Arnold Networks (KAN): PyKAN, FastKAN, and Chebyshev KAN. The models were evaluated based on their F1 scores, demonstrating their effectiveness in addressing this issue. This research marks an important step towards reducing offensive online content, promoting safer and more respectful interactions in the digital world. 2025.dravidianlangtech-1.24 @@ -287,7 +287,7 @@ Kankipati VenkataMeghana KondakindiSupriya TaraSamiksha - PremjithBAmrita Vishwa Vidyapeetham (Deemed University) + PremjithBAmrita Vishwa Vidyapeetham (Deemed University) 152-156 Detecting abusive and similarly toxic content posted on a social media platform is challenging due to the complexities of the language, data imbalance, and the code-mixed nature of the text. In this paper, we present our submissions for the shared task on abusive Tamil and Malayalam texts targeting women on social media—DravidianLangTech@NAACL 2025. We propose a hybrid embedding model that integrates embeddings generated using term frequency-inverse document frequency (TF-IDF) and BERT. To reconcile the differences in embedding dimensions, we applied a dimensionality reduction method to the TF-IDF embedding. We submitted two more runs to the shared task, which involve a model based on TF-IDF embedding and another based on BERT-based embedding. The code for the submissions is available at https://github.com/Tarrruh/NLP_HTMS.
2025.dravidianlangtech-1.25 @@ -295,8 +295,8 @@ <fixed-case>T</fixed-case>eam_<fixed-case>C</fixed-case>atalysts@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Leveraging Political Sentiment Analysis using Machine Learning Techniques for Classifying <fixed-case>T</fixed-case>amil Tweets - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian SubhadeviK Sowbharanika JananiSivakumar RahulK @@ -307,8 +307,8 @@ <fixed-case>I</fixed-case>nnovation<fixed-case>E</fixed-case>ngineers@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Enhanced <fixed-case>CNN</fixed-case> Models for Detecting Misogyny in <fixed-case>T</fixed-case>amil Memes Using Image and Text Classification - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian Pooja SreeM PalanimuruganV Roshini PriyaK @@ -320,9 +320,9 @@ <fixed-case>M</fixed-case>ystic<fixed-case>CIOL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: A Hybrid Framework for Sentiment Analysis in <fixed-case>T</fixed-case>amil and <fixed-case>T</fixed-case>ulu Using Fine-Tuned <fixed-case>SBERT</fixed-case> Embeddings and Custom <fixed-case>MLP</fixed-case> Architectures MinhazChowdhury - ArnabLaskar + ArnabLaskar TajAhmad - Azmine ToushikWasi + Azmine ToushikWasi 167-172 Sentiment analysis is a crucial NLP task used to analyze opinions in various domains, including marketing, politics, and social media. While transformer-based models like BERT and SBERT have significantly improved sentiment classification, their effectiveness in low-resource languages remains limited. Tamil and Tulu, despite their widespread use, suffer from data scarcity, dialectal variations, and code-mixing challenges, making sentiment analysis difficult. Existing methods rely on traditional classifiers or word embeddings, which struggle to generalize in these settings. To address this, we propose a hybrid framework that integrates fine-tuned SBERT embeddings with a Multi-Layer Perceptron (MLP) classifier, enhancing contextual representation and classification robustness. Our framework achieves validation F1-scores of 0.4218 for Tamil and 0.3935 for Tulu, and test F1-scores of 0.4299 for Tamil and 0.1546 for Tulu, demonstrating its effectiveness. This research provides a scalable solution for sentiment classification in low-resource languages, with future improvements planned through data augmentation and transfer learning. Our full experimental codebase is publicly available at: github.com/ciol-researchlab/NAACL25-Mystic-Tamil-Sentiment-Analysis.
2025.dravidianlangtech-1.28 @@ -330,8 +330,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>AI</fixed-case>_<fixed-case>DATA</fixed-case>_<fixed-case>DRIFTERS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Fake News Detection in <fixed-case>D</fixed-case>ravidian Languages - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian Vishali KS PriyankaB Naveen KumarK @@ -342,8 +342,8 @@ <fixed-case>KECE</fixed-case>mpower@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Abusive <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Text targeting Women on Social Media - MalligaSubramanian - KogilavaniShanmugavadivel + MalligaSubramanian + KogilavaniShanmugavadivel Indhuja VS KowshikP JayasuryaS @@ -354,8 +354,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>AI</fixed-case>_<fixed-case>GRYFFINDOR</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian languages - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian ShahidKhanS ShriSashmitha.s YashicaS @@ -366,11 +366,11 @@ <fixed-case>KECL</fixed-case>ingu<fixed-case>AI</fixed-case>sts@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Detecting <fixed-case>AI</fixed-case>-generated Product Reviews in <fixed-case>D</fixed-case>ravidian Languages - MalligaSubramanian + MalligaSubramanian RojithaR Mithun ChakravarthyY Renusri RV - KogilavaniShanmugavadivel + KogilavaniShanmugavadivel 187-190 With the surge of AI-generated content in online spaces, ensuring the authenticity of product reviews has become a critical challenge. This paper addresses the task of detecting AI-generated product reviews in Dravidian languages, specifically Tamil and Malayalam, which present unique hurdles due to their complex morphology, rich syntactic structures, and code-mixed nature. We introduce a novel methodology combining machine learning classifiers with advanced multilingual transformer models to identify AI-generated reviews. Our approach not only accounts for the linguistic intricacies of these languages but also leverages domain-specific datasets to improve detection accuracy. For Tamil, we evaluate Logistic Regression, Random Forest, and XGBoost, while for Malayalam, we explore Logistic Regression, Multinomial Naive Bayes (MNB), and Support Vector Machines (SVM). Transformer-based models significantly outperform these traditional classifiers, demonstrating superior performance across multiple metrics. 2025.dravidianlangtech-1.32 @@ -380,7 +380,7 @@ Dll5143@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Majority Voting-Based Framework for Misogyny Meme Detection in <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam SarbajeetPattanaik AshokYadav - VrijendraSingh + VrijendraSingh 191-199 Misogyny memes pose a significant challenge on social networks, particularly in Dravidian-scripted languages, where subtle expressions can propagate harmful narratives against women. This paper presents our approach for the “Shared Task on Misogyny Meme Detection,” organized as part of DravidianLangTech@NAACL 2025, focusing on misogyny meme detection in Tamil and Malayalam.
To tackle this problem, we proposed a multi-model framework that integrates three distinct models: M1 (ResNet-50 + google/muril-large-cased), M2 (openai/clip-vit-base-patch32 + ai4bharat/indic-bert), and M3 (ResNet-50 + ai4bharat/indic-bert). The final classification is determined using a majority voting mechanism, ensuring robustness by leveraging the complementary strengths of these models. This approach enhances classification performance by reducing biases and improving generalization. Our model achieved an F1 score of 0.77 for Tamil, significantly improving misogyny detection in the language. For Malayalam, the framework achieved an F1 score of 0.84, demonstrating strong performance. Overall, our method ranked 5th in Tamil and 4th in Malayalam, highlighting its competitive effectiveness in misogyny meme detection. 2025.dravidianlangtech-1.33 @@ -388,8 +388,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>AI</fixed-case>_<fixed-case>VSS</fixed-case>_run2@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Abusive <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Text targeting Women on Social Media - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian SathiyaseelanS Suresh BabuK VasikaranS @@ -400,9 +400,9 @@ <fixed-case>T</fixed-case>he_<fixed-case>D</fixed-case>eathly_<fixed-case>H</fixed-case>allows@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: <fixed-case>AI</fixed-case> Content Detection in <fixed-case>D</fixed-case>ravidian Languages - KogilavaniShanmugavadivel - MalligaSubramanian - VasantharanK + KogilavaniShanmugavadivel + MalligaSubramanian + VasantharanK Prethish GA VijayakumaranS 205-209 @@ -423,7 +423,7 @@ <fixed-case>I</fixed-case>nnovate<fixed-case>X</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Detecting <fixed-case>AI</fixed-case>-Generated Product Reviews in <fixed-case>D</fixed-case>ravidian Languages MoogambigaiA PandiarajanD - BharathiB + BharathiB 215-220 This paper presents our approach to the Shared Task on Detecting AI-Generated Product Reviews in Dravidian Languages as part of DravidianLangTech@NAACL 2025. The task focuses on distinguishing between human-written and AI-generated reviews in Tamil and Malayalam, languages rich in linguistic complexities. Using the provided datasets, we implemented machine learning and deep learning models, including Logistic Regression (LR), Support Vector Machine (SVM), and BERT. Through preprocessing techniques like tokenization and TF-IDF vectorization, we achieved competitive results, with our SVM and BERT models demonstrating superior performance in Tamil and Malayalam, respectively. Our findings underscore the unique challenges of working with Dravidian languages in this domain and highlight the importance of robust feature extraction.
2025.dravidianlangtech-1.37 @@ -442,8 +442,8 @@ <fixed-case>B</fixed-case>lue<fixed-case>R</fixed-case>ay@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech-2025: Fake News Detection in <fixed-case>D</fixed-case>ravidian Languages - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian AiswaryaM ArunaT JeevaananthS @@ -454,8 +454,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>AI</fixed-case>_<fixed-case>ZEROWATTS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian languages - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian Naveenram CE VishalRs SrineshS @@ -495,10 +495,10 @@ <fixed-case>C</fixed-case>ode_<fixed-case>C</fixed-case>onquerors@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Deep Learning Approach for Sentiment Analysis in <fixed-case>T</fixed-case>amil and <fixed-case>T</fixed-case>ulu - Harish VijayV + Harish VijayV Ippatapu VenkataSrichandra Pathange OmkareshwaraRao - PremjithBAmrita Vishwa Vidyapeetham (Deemed University) + PremjithBAmrita Vishwa Vidyapeetham (Deemed University) 254-258 In this paper, we propose a novel approach to sentiment analysis in code-mixed Dravidian languages, specifically Tamil-English and Tulu-English social media text. We introduce an innovative hybrid deep learning architecture that uniquely combines convolutional and recurrent neural networks to effectively capture both local patterns and long-term dependencies in code-mixed text. Our model addresses critical challenges in low-resource language processing through a comprehensive preprocessing pipeline and specialized handling of class imbalance and out-of-vocabulary words. Evaluated on a substantial dataset of social media comments, our approach achieved competitive macro F1 scores of 0.3357 for Tamil (ranked 18) and 0.3628 for Tulu (ranked 13). 2025.dravidianlangtech-1.44 @@ -506,8 +506,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>TECH</fixed-case>_<fixed-case>TITANS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Abusive Text Detection in <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Social Media Comments Using Machine Learning - MalligaSubramanian - KogilavaniShanmugavadivel + MalligaSubramanian + KogilavaniShanmugavadivel DeepigaP DharshiniS AnanthakumarS @@ -519,9 +519,9 @@ F<tex-math>^2</tex-math> (<fixed-case>F</fixed-case>uture<fixed-case>F</fixed-case>iction): Detection of Fake News on Futuristic Technology - MsvpjSathvikIIIT Dharwad + MsvpjSathvikIIIT Dharwad VenkateshVelugubantlaMeridian Cooperative - Ravi TejaPotlaNVIDIA + Ravi TejaPotlaNVIDIA 264-272 Misinformation about futuristic technology and society is widespread. To accurately detect such news, the algorithms require up-to-date knowledge. Large Language Models excel at NLP but cannot retrieve ongoing events or innovations; for example, GPT and its variants are restricted to knowledge available up to 2021. We introduce a new methodology for the identification of fake news pertaining to futuristic technology and society. Leveraging the power of Google Knowledge, we enhance the capabilities of the GPT-3.5 language model, thereby elevating its performance in the detection of misinformation.
The proposed framework exhibits superior efficacy compared to established baselines, with an accuracy of 81.04%. Moreover, we propose a novel dataset of around 21,000 fake news items in three languages (English, Telugu, and Tenglish), collected from various sources. 2025.dravidianlangtech-1.46 @@ -531,7 +531,7 @@ <fixed-case>J</fixed-case>ust<fixed-case>AT</fixed-case>alented<fixed-case>T</fixed-case>eam@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: A Study of <fixed-case>ML</fixed-case> and <fixed-case>DL</fixed-case> approaches for Sentiment Analysis in Code-Mixed <fixed-case>T</fixed-case>amil and <fixed-case>T</fixed-case>ulu Texts Ponsubash RajR Paruvatha PriyaB - BharathiB + BharathiB 273-277 The growing prevalence of code-mixed text on social media presents unique challenges for sentiment analysis, particularly in low-resource languages like Tamil and Tulu. This paper explores sentiment classification in Tamil-English and Tulu-English code-mixed datasets using both machine learning (ML) and deep learning (DL) approaches. The ML model utilizes TF-IDF feature extraction combined with a Logistic Regression classifier, while the DL model employs FastText embeddings and a BiLSTM network enhanced with an attention mechanism. Experimental results reveal that the ML model outperforms the DL model in terms of macro F1-score for both languages. Specifically, for Tamil, the ML model achieves a macro F1-score of 0.46, surpassing the DL model’s score of 0.43. For Tulu, the ML model significantly outperforms the DL model, achieving 0.60 compared to 0.48. This performance disparity is more pronounced in Tulu due to its smaller dataset size of 13,308 samples compared to Tamil’s 31,122 samples, highlighting the data efficiency of ML models in low-resource settings. The study provides insights into the strengths and limitations of each approach, demonstrating that traditional ML techniques remain competitive for code-mixed sentiment analysis when data is limited. These findings contribute to ongoing research in multilingual NLP and offer practical implications for applications such as social media monitoring, customer feedback analysis, and conversational AI in Dravidian languages.
2025.dravidianlangtech-1.47 @@ -539,8 +539,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>TECH</fixed-case>_<fixed-case>TITANS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Sentiment Analysis for Low-Resource Languages: Insights from <fixed-case>T</fixed-case>amil and <fixed-case>T</fixed-case>ulu using Deep Learning and Machine Learning Models - MalligaSubramanian - KogilavaniShanmugavadivel + MalligaSubramanian + KogilavaniShanmugavadivel DharshiniS DeepigaP PraveenkumarC @@ -553,7 +553,7 @@ <fixed-case>C</fixed-case>ode_<fixed-case>C</fixed-case>onquerors@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Misogyny Detection in <fixed-case>D</fixed-case>ravidian Languages Using Vision Transformer and <fixed-case>BERT</fixed-case> Pathange OmkareshwaraRao - Harish VijayV + Harish VijayV Ippatapu VenkataSrichandra NeethuMohan Sachin KumarSAmrita Vishwa Vidyapeetham (Deemed University) @@ -565,7 +565,7 @@ <fixed-case>Y</fixed-case>en<fixed-case>LP</fixed-case>_<fixed-case>CS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Sentiment Analysis on Code-Mixed <fixed-case>T</fixed-case>amil-<fixed-case>T</fixed-case>ulu Data Using Machine Learning and Deep Learning Models RakshaAdyanthaya - Rathnakara ShettyPYenepoya University + Rathnakara ShettyPYenepoya University 288-292 Sentiment analysis in code-mixed Dravidian languages such as Tamil-English and Tulu-English is the focus of this study, as these languages present difficulties for conventional techniques. In this work, we used ensembles, multilingual Bidirectional Encoder Representations (mBERT), Bidirectional Long Short Term Memory (BiLSTM), Random Forest (RF), Support Vector Machine (SVM), and preprocessing in conjunction with Term Frequency-Inverse Document Frequency (TF-IDF) and Word2Vec feature extraction. mBERT obtained an accuracy of 64% for Tamil and 68% for Tulu on the development datasets. On the test sets, the ensemble model gave Tamil a macro F1-score of 0.4117, while mBERT gave Tulu a macro F1-score of 0.5511. With regularization and data augmentation, these results demonstrate the approach’s potential for further advancements. 2025.dravidianlangtech-1.50 @@ -588,12 +588,12 @@ <fixed-case>KEC</fixed-case>-Elite-Analysts@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Deciphering Emotions in <fixed-case>T</fixed-case>amil-<fixed-case>E</fixed-case>nglish and Code-Mixed Social Media Tweets - MalligaSubramanian + MalligaSubramanian ArunaA AnbarasanT AmudhavanM JahaganapathiS - KogilavaniShanmugavadivel + KogilavaniShanmugavadivel 299-303 Sentiment analysis in code-mixed languages, particularly Tamil-English, is a growing challenge in natural language processing (NLP) due to the prevalence of multilingual communities on social media. This paper explores various machine learning and transformer-based models, including Logistic Regression, Support Vector Machines (SVM), K-Nearest Neighbors (KNN), BERT, and mBERT, for sentiment classification of Tamil-English code-mixed text. The models are evaluated on a shared task dataset provided by DravidianLangTech@NAACL 2025, with performance measured through accuracy, precision, recall, and F1-score. Our results demonstrate that transformer-based models, particularly mBERT, outperform traditional classifiers in identifying sentiment polarity.
Future work aims to address the challenges posed by code-switching and class imbalance through advanced model architectures and data augmentation techniques. 2025.dravidianlangtech-1.52 @@ -629,7 +629,7 @@ <fixed-case>CUET</fixed-case>_<fixed-case>A</fixed-case>gile@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Fine-tuning Transformers for Detecting Abusive Text Targeting Women from <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Texts Tareque MdHanif - Md RashadurRahmanChittagong University of Engineering and Technology + Md RashadurRahmanChittagong University of Engineering and Technology 315-319 As social media has grown, so has online abuse, with women often facing harmful online behavior. This discourages their free participation and expression online. This paper outlines the approach adopted by our team for detecting abusive comments in Tamil and Malayalam. The task focuses on classifying whether a given comment contains abusive language towards women. We experimented with transformer-based models by fine-tuning Tamil-BERT for Tamil and Malayalam-BERT for Malayalam. Additionally, we fine-tuned IndicBERT v2 on both Tamil and Malayalam datasets. To evaluate the effect of pre-processing, we also conducted experiments using non-preprocessed text. Results demonstrate that IndicBERT v2 outperformed the language-specific BERT models in both languages. Pre-processing the data showed mixed results, with a slight improvement in the Tamil dataset but no significant benefit for the Malayalam dataset. Our approach secured first place in Tamil with a macro F1-score of 0.7883 and second place in Malayalam with a macro F1-score of 0.7234. The implementation details of the task can be found in the GitHub repository. 2025.dravidianlangtech-1.55 @@ -670,11 +670,11 @@ <fixed-case>SSNT</fixed-case>rio@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Identification of <fixed-case>AI</fixed-case> Generated Content in <fixed-case>D</fixed-case>ravidian Languages using Transformers - JBhuvana + JBhuvana MirnalineeT T RohanR DiyaSeshan - AvaneeshKoushik + AvaneeshKoushik 335-339 The increasing prevalence of AI-generated content has raised concerns about the authenticity and reliability of online reviews, particularly in resource-limited languages like Tamil and Malayalam. This paper presents an approach to the Shared Task on Detecting AI-generated Product Reviews in Dravidian Languages at NAACL 2025, which focuses on distinguishing AI-generated reviews from human-written ones in Tamil and Malayalam. Several transformer-based models, including IndicBERT, RoBERTa, mBERT, and XLM-R, were evaluated, with language-specific BERT models for Tamil and Malayalam demonstrating the best performance. The chosen methodologies were evaluated using the Macro Average F1 score. In the rank list released by the organizers, team SSNTrio achieved ranks of 3rd and 29th for the Malayalam and Tamil datasets, with Macro Average F1 scores of 0.914 and 0.598, respectively.
2025.dravidianlangtech-1.59 @@ -682,11 +682,11 @@ <fixed-case>SSNT</fixed-case>rio@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Sentiment Analysis in <fixed-case>D</fixed-case>ravidian Languages using Multilingual <fixed-case>BERT</fixed-case> - JBhuvana + JBhuvana MirnalineeT T DiyaSeshan RohanR - AvaneeshKoushik + AvaneeshKoushik 340-344 This paper presents an approach to sentiment analysis for code-mixed Tamil-English and Tulu-English datasets as part of the DravidianLangTech@NAACL 2025 shared task. Sentiment analysis, the process of determining the emotional tone or subjective opinion in text, has become a critical tool in analyzing public sentiment on social media platforms. The approach discussed here uses multilingual BERT (mBERT) fine-tuned on the provided datasets to classify sentiment polarity into various predefined categories: for Tulu, the categories were positive, negative, not_tulu, mixed, and neutral; for Tamil, the categories were positive, negative, unknown, mixed_feelings, and neutral. The mBERT model demonstrates its effectiveness in handling sentiment analysis for code-mixed and resource-constrained languages by achieving an F1-score of 0.44 for Tamil, securing the 6th position in the rank list, and 0.56 for Tulu, ranking 5th in the respective task. 2025.dravidianlangtech-1.60 @@ -694,9 +694,9 @@ <fixed-case>NLP</fixed-case>_goats@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Detecting Fake News in <fixed-case>D</fixed-case>ravidian Languages: A Text Classification Approach - Srihari VK - Vijay KarthickVaidyanathan - ThenmozhiDurairaj + Srihari VK + Vijay KarthickVaidyanathan + ThenmozhiDurairaj 345-349 The advent and expansion of social media have transformed global communication. Despite its numerous advantages, it has also created an avenue for the rapid spread of fake news, which can impact people’s decision-making and judgment. This study explores detecting fake news as part of the DravidianLangTech@NAACL 2025 shared task, focusing on two key tasks. The aim of Task 1 is to classify Malayalam social media posts as either original or fake, and Task 2 categorizes Malayalam-language news articles into five levels of truthfulness: False, Half True, Mostly False, Partly False, and Mostly True. We accomplished the tasks using transformer models (e.g., M-BERT) and classifiers like Naive Bayes. Our results were promising, with M-BERT achieving the best results. We achieved a macro-F1 score of 0.83 for distinguishing between fake and original content in Task 1 and a score of 0.54 for classifying news articles in Task 2, ranking us 11th and 4th, respectively. 2025.dravidianlangtech-1.61 @@ -704,9 +704,9 @@ <fixed-case>NLP</fixed-case>_goats@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Towards Safer Social Media: Detecting Abusive Language Directed at Women in <fixed-case>D</fixed-case>ravidian Languages - Vijay KarthickVaidyanathan - Srihari VK - ThenmozhiDurairaj + Vijay KarthickVaidyanathan + Srihari VK + ThenmozhiDurairaj 350-354 Social media in the present world is an essential communication platform for information sharing. But its emergence has also led to an increase in online abuse, in particular against women, in the form of abusive and offensive messages.
Such abuse reflects broader social inequalities and has a profound psychological and social impact on its victims, underscoring the importance of detecting abusive language. This DravidianLangTech@NAACL 2025 work aims at developing an automated system for detecting abusive content directed towards women in Tamil and Malayalam, two of the Dravidian languages. Based on a dataset of YouTube comments about sensitive issues, the study uses multilingual BERT (mBERT) to detect abusive comments versus non-abusive ones. We achieved F1 scores of 0.75 in Tamil and 0.68 in Malayalam, placing us 13th and 9th, respectively. 2025.dravidianlangtech-1.62 @@ -716,9 +716,9 @@ <fixed-case>H</fixed-case>er<fixed-case>WILL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Ensemble Approach for Misogyny Detection in Memes Using Pre-trained Text and Vision Transformers Neelima MonjushaPreetiJahangirnagar University TrinaChakrabortyShahjalal University of Science and Technology - Noor Mairukh KhanArnobUniversity of Asia Pacific - SaiyaraMahmud - Azmine ToushikWasi + Noor Mairukh KhanArnobUniversity of Asia Pacific + SaiyaraMahmud + Azmine ToushikWasi 355-360 Misogynistic memes on social media perpetuate gender stereotypes, contribute to harassment, and suppress feminist activism. However, most existing misogyny detection models focus on high-resource languages, leaving a gap in low-resource settings. This work addresses that gap by focusing on misogynistic memes in Tamil and Malayalam, two Dravidian languages with limited resources. We combine computer vision and natural language processing for multi-modal detection, using CLIP embeddings for the vision component and BERT models trained on code-mixed hate speech datasets for the text component. Our results show that this integrated approach effectively captures the unique characteristics of misogynistic memes in these languages, achieving competitive performance with a Macro F1 Score of 0.7800 for the Tamil test set and 0.8748 for the Malayalam test set. These findings highlight the potential of multimodal models and the adaptation of pre-trained models to specific linguistic and cultural contexts, advancing misogyny detection in low-resource settings. Code available at https://github.com/HerWILL-Inc/NAACL-2025 2025.dravidianlangtech-1.63 @@ -727,7 +727,7 @@ Cognitext@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech2025: Fake News Classification in <fixed-case>M</fixed-case>alayalam Using m<fixed-case>BERT</fixed-case> and <fixed-case>LSTM</fixed-case> ShriyaAlladi - BharathiB + BharathiB 361-365 Fake news detection is a crucial task in combating misinformation, particularly in underrepresented languages such as Malayalam. This paper focuses on detecting fake news in Dravidian languages using two tasks: Social Media Text Classification and News Classification. We employ a fine-tuned multilingual BERT (mBERT) model for classifying a given social media text as original or fake, and an LSTM-based architecture for accurately detecting and classifying fake news articles in the Malayalam language into different categories. Extensive preprocessing techniques, such as tokenization and text cleaning, were used to ensure data quality. Our experiments achieved significant accuracy rates and F1-scores.
The study’s contributions include applying advanced machine learning techniques to the Malayalam language, addressing the lack of research on low-resource languages, and highlighting the challenges of fake news detection in multilingual and code-mixed environments. 2025.dravidianlangtech-1.64 @@ -735,10 +735,10 @@ <fixed-case>NLP</fixed-case>_goats_<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech_2025__<fixed-case>D</fixed-case>etecting_<fixed-case>AI</fixed-case>_<fixed-case>W</fixed-case>ritten_<fixed-case>R</fixed-case>eviews_for_<fixed-case>C</fixed-case>onsumer_<fixed-case>T</fixed-case>rust - Srihari VK - Vijay KarthickVaidyanathan + Srihari VK + Vijay KarthickVaidyanathan Mugilkrishna DU - ThenmozhiDurairaj + ThenmozhiDurairaj 366-370 The rise of AI-generated content has introduced challenges in distinguishing machine-generated text from human-written text, particularly in low-resource languages. The identification of artificial intelligence (AI)-based reviews is of significant importance to preserve trust and authenticity on online platforms. The Shared Task on Detecting AI-Generated Product Reviews in Dravidian languages deals with the task of detecting AI-generated and human-written reviews in Tamil and Malayalam. To solve this problem, we specifically fine-tuned mBERT for binary classification. Our system achieved 10th place in Tamil with a macro F1-score of 0.90 and 28th place in Malayalam with a macro F1-score of 0.68, as reported by the NAACL 2025 organizers. The findings demonstrate the complexity involved in the separation of AI-derived text from human-authored writing, with a call for continued advances in detection methods. 2025.dravidianlangtech-1.65 @@ -755,8 +755,8 @@ <fixed-case>DLRG</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian Languages - RatnavelRajalakshmiVellore Institute of Technology - RameshKannan + RatnavelRajalakshmiVellore Institute of Technology + RameshKannan MeeteshSaini BitanMallik 376-380 @@ -826,7 +826,7 @@ shimig@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech2025: Stratification of Abusive content on Women in Social Media GersomeShimiMadras Christian College Jerin MahibhaCMeenakshi Sundararajan Engineering College - ThenmozhiDurairaj + ThenmozhiDurairaj 409-414 The social network is a trending medium for interaction and sharing content globally. The content is sensitive since it can create an impact and change the trends of stakeholder’s thought as well as behavior. When the content is targeted towards women, it may be abusive or non-abusive and the identification is a tedious task. The content posted on social networks can be in English, code mix, or any low-resource language. The shared task Abusive Tamil and Malayalam Text targeting Women on Social Media was conducted as part of DravidianLangTech@NAACL 2025 organized by DravidianLangTech. The task is to identify the content given in Tamil or Malayalam or code mix as abusive or non-abusive. The task is accomplished for the South Indian languages Tamil and Malayalam using pretrained transformer model, BERT base multilingual cased and achieved the accuracy measure of 0.765 and 0.677. 
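Several systems in this volume, including the shimig entry just above, fine-tune bert-base-multilingual-cased for binary abusive/non-abusive text classification. The following is a minimal sketch of that setup using Hugging Face transformers and datasets; it is an illustration, not any team's published code, and the CSV file names and the "text"/"label" column names are assumptions.

# Minimal sketch: fine-tuning multilingual BERT for binary abusive-text
# classification. File and column names ("text", "label") are assumptions.
import pandas as pd
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

MODEL = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

def tokenize(batch):
    # Social-media comments are short; 128 subword tokens is usually enough.
    return tokenizer(batch["text"], truncation=True, max_length=128,
                     padding="max_length")

train = Dataset.from_pandas(pd.read_csv("train.csv")).map(tokenize, batched=True)
dev = Dataset.from_pandas(pd.read_csv("dev.csv")).map(tokenize, batched=True)

args = TrainingArguments(output_dir="mbert-abusive", num_train_epochs=3,
                         per_device_train_batch_size=16, learning_rate=2e-5)
Trainer(model=model, args=args, train_dataset=train, eval_dataset=dev).train()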
2025.dravidianlangtech-1.73
@@ -835,8 +835,8 @@
<fixed-case>SSNT</fixed-case>rio@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech2025: <fixed-case>LLM</fixed-case> Based Techniques for Detection of Abusive Text Targeting Women
MirnalineeT T
- JBhuvana
- AvaneeshKoushik
+ JBhuvana
+ AvaneeshKoushik
DiyaSeshan
RohanR
415-419

@@ -849,7 +849,7 @@
Md MinhazulKabirChittagong University of Engineering and Technology
Md.Mohiuddin
KawsarAhmed
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
420-426
2025.dravidianlangtech-1.75
kabir-etal-2025-cuet


NazmusSakib
Md. AlamMiahChittagong University of Engineering and Technology
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
427-434
Memes have become one of the main mediums for expressing ideas, humor, and opinions through visual-textual content on social media. The same medium has been used to propagate harmful ideologies, such as misogyny, that undermine gender equality and perpetuate harmful stereotypes. Identifying misogynistic memes is particularly challenging in low-resource languages (LRLs), such as Tamil and Malayalam, due to the scarcity of annotated datasets and sophisticated tools. Therefore, DravidianLangTech@NAACL 2025 launched a Shared Task on Misogyny Meme Detection to identify misogyny memes. For this task, this work explored an extensive array of models: machine learning models (LR, RF, SVM, and XGBoost) and deep learning models (CNN, BiLSTM+CNN, CNN+GRU, and LSTM) were used to extract textual features, while CNN, BiLSTM + CNN, ResNet50, and DenseNet121 were utilized for visual features. Furthermore, we have explored feature-level and decision-level fusion techniques with several model combinations like MuRIL with ResNet50, MuRIL with BiLSTM+CNN, T5+MuRIL with ResNet50, and mBERT with ResNet50. The evaluation results demonstrated that BERT + ResNet50 performed best, obtaining an F1 score of 0.81716 (Tamil), and was ranked 2nd in the task. The early fusion of MuRIL+ResNet50 showed the highest F1 score of 0.82531 and received a 9th rank in Malayalam.
2025.dravidianlangtech-1.76

@@ -883,7 +883,7 @@
Md. RefajHossan
AlamgirHossainCUET
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
440-447
The rapid growth of digital platforms and social media has significantly contributed to spreading fake news, posing serious societal challenges. While extensive research has been conducted on detecting fake news in high-resource languages (HRLs) such as English, relatively little attention has been given to low-resource languages (LRLs) like Malayalam due to insufficient data and computational tools. To address this challenge, the DravidianLangTech 2025 workshop organized a shared task on fake news detection in Dravidian languages. The task was divided into two sub-tasks, and our team participated in Task 1, which focused on classifying social media texts as original or fake. We explored a range of machine learning (ML) techniques, including Logistic Regression (LR), Multinomial Naïve Bayes (MNB), and Support Vector Machines (SVM), as well as deep learning (DL) models such as CNN, BiLSTM, and a hybrid CNN+BiLSTM.
Additionally, this work examined several transformer-based models, including m-BERT, Indic-BERT, XLM-Roberta, and MuRIL-BERT, to exploit the task. Our team achieved 6th place in Task 1, with MuRIL-BERT delivering the best performance, achieving an F1 score of 0.874.
2025.dravidianlangtech-1.78

@@ -902,11 +902,11 @@
<fixed-case>SSNT</fixed-case>rio @ <fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Hybrid Approach for Hate Speech Detection in <fixed-case>D</fixed-case>ravidian Languages with Text and Audio Modalities
- JBhuvana
+ JBhuvana
MirnalineeT T
RohanR
DiyaSeshan
- AvaneeshKoushik
+ AvaneeshKoushik
454-458
This paper presents the approach and findings from the Multimodal Social Media Data Analysis in Dravidian Languages (MSMDA-DL) shared task at DravidianLangTech@NAACL 2025. The task focuses on detecting multimodal hate speech in Tamil, Malayalam, and Telugu, requiring models to analyze both text and speech components from social media content. The proposed methodology uses language-specific BERT models for the provided text transcripts, followed by multimodal feature extraction techniques, and classification using a Random Forest classifier to enhance performance across the three languages. The models achieved a macro-F1 score of 0.7332 (Rank 1) in Tamil, 0.7511 (Rank 1) in Malayalam, and 0.3758 (Rank 2) in Telugu, demonstrating the effectiveness of the approach in multilingual settings. The models performed well despite the challenges posed by limited resources, highlighting the potential of language-specific BERT models and multimodal techniques in hate speech detection for Dravidian languages.
2025.dravidianlangtech-1.80

@@ -929,7 +929,7 @@
DolaChakraborty
ShamimaAfroz
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
465-471
The rapid spread of misinformation in the digital era presents critical challenges for fake news detection, especially in low-resource languages (LRLs) like Malayalam, which lack extensive datasets and pre-trained models for widely spoken languages. This gap in resources makes it harder to build robust systems for combating misinformation despite the significant societal and political consequences it can have. To address these challenges, this work proposes a transformer-based approach for Task 1 of the Fake News Detection in Dravidian Languages (DravidianLangTech@NAACL 2025), which focuses on classifying Malayalam social media texts as either original or fake. The experiments involved a range of ML techniques (Logistic Regression (LR), Support Vector Machines (SVM), and Decision Trees (DT)) and DL architectures (BiLSTM, BiLSTM-LSTM, and BiLSTM-CNN). This work also explored transformer-based models, including IndicBERT, MuRiL, XLM-RoBERTa, and Malayalam BERT. Among these, Malayalam BERT achieved the best performance, with the highest macro F1-score of 0.892, securing a rank of 3rd in the competition.
2025.dravidianlangtech-1.82

@@ -951,7 +951,7 @@
HarshitaSharma
SimranSimranInstitute of Informatics and Communication
VajratiyaVajrobol
- NitishaAggarwal
+ NitishaAggarwal
478-482
Misogyny has become a pervasive issue in digital spaces. Misleading gender stereotypes are getting communicated through digital content. This content is majorly displayed as text-and-image memes.
With the growing prevalence of online content, it is essential to develop automated systems capable of detecting such harmful content to ensure safer online environments. This study focuses on the detection of misogynistic memes in two Dravidian languages, Tamil and Malayalam. The proposed model utilizes a pre-trained XLM-RoBERTa (XLM-R) model for text analysis and a Vision Transformer (ViT) for image feature extraction. A custom neural network classifier was trained on integrating the outputs of both modalities to form a unified representation. This model predicts whether the meme represents misogyny or not. This follows an early-fusion strategy since features of both modalities are combined before feeding into the classification model. This approach achieved promising results using a macro F1-score of 0.84066 on the Malayalam test dataset and 0.68830 on the Tamil test dataset. In addition, it is worth noting that this approach secured Rank 7 and 11 in Malayalam and Tamil classification respectively in the shared task of Misogyny Meme Detection (MMD). The findings demonstrate that the multimodal approach significantly enhances the accuracy of detecting misogynistic content compared to text-only or image-only models.
2025.dravidianlangtech-1.84

@@ -974,7 +974,7 @@
Symom HossainShohan
Ashraful IslamParan
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
489-495
The rise of social media has significantly facilitated the rapid spread of hate speech. Detecting hate speech for content moderation is challenging, especially in low-resource languages (LRLs) like Telugu. Although some progress has been noticed in hate speech detection in Telugu concerning unimodal (text or image) in recent years, there is a lack of research on hate speech detection based on multimodal content detection (specifically using audio and text). In this regard, DravidianLangTech has arranged a shared task to address this challenge. This work explored three machine learning (ML), three deep learning (DL), and seven transformer-based models that integrate text and audio modalities using cross-modal attention for hate speech detection. The evaluation results demonstrate that mBERT achieved the highest F-1 score of 49.68% using text. However, the proposed multimodal attention-based approach with Whisper-small+TeluguBERT-3 achieved an F-1 score of 43.68%, which helped us achieve a rank of 3rd in the shared task competition.
2025.dravidianlangtech-1.86

@@ -992,11 +992,11 @@
<fixed-case>CIC</fixed-case>-<fixed-case>NLP</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Detecting <fixed-case>AI</fixed-case>-generated Product Reviews in <fixed-case>D</fixed-case>ravidian Languages
- TewodrosAchamaleh
+ TewodrosAchamaleh
Tolulope OlalekanAbiola
- Lemlem EyobKawo
+ Lemlem EyobKawo
MikiyasMebraihtu
- GrigoriSidorovInstituto Politécnico Nacional
+ GrigoriSidorovInstituto Politécnico Nacional
502-507
AI-generated text now matches human writing so well that telling them apart is very difficult. Our CIC-NLP team submits results for the DravidianLangTech@NAACL 2025 shared task to reveal AI-generated product reviews in Dravidian languages. We performed a binary classification task with XLM-RoBERTa-Base using the DravidianLangTech@NAACL 2025 datasets offered by the event organizers.
Through training the model correctly, our system could distinguish between human- and AI-generated reviews with scores of 0.96 for Tamil and 0.88 for Malayalam in the evaluation test set. This paper presents detailed information about preprocessing, model architecture, hyperparameter fine-tuning settings, the experimental process, and the results. The source code is available on GitHub.
2025.dravidianlangtech-1.88

@@ -1007,7 +1007,7 @@
DolaChakraborty
ShamimaAfroz
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
508-513
Misogyny memes are a form of online content that spreads harmful and damaging ideas about women. By combining images and text, they often aim to mock, disrespect, or insult women, sometimes overtly and other times in more subtle, insidious ways. Detecting misogyny memes is crucial for fostering safer and more respectful online communities. While extensive research has been conducted on high-resource languages (HRLs) like English, low-resource languages (LRLs) such as Dravidian (e.g., Tamil and Malayalam) remain largely overlooked. The shared task on Misogyny Meme Detection, organized as part of DravidianLangTech@NAACL 2025, provided a platform to tackle the challenge of identifying misogynistic content in memes, specifically in Malayalam. We participated in the competition and adopted a multimodal approach to contribute to this effort. For image analysis, we employed a ResNet18 model to extract visual features, while for text analysis, we utilized the IndicBERT model. Our system achieved an impressive F1-score of 0.87, earning us the 3rd rank in the task.
2025.dravidianlangtech-1.89

@@ -1018,7 +1018,7 @@
Md.Mohiuddin
Md MinhazulKabirChittagong University of Engineering and Technology
KawsarAhmed
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
514-521
Misogyny memes, a form of digital content, reflect societal prejudices by discriminating against women through shaming and stereotyping. In this study, we present a multimodal approach combining Indic-BERT and ViT-base-patch16-224 to address misogyny memes. We explored various Machine Learning, Deep Learning, and Transformer models for unimodal and multimodal classification using the provided Tamil and Malayalam meme datasets. Our findings highlight the challenges traditional ML and DL models face in understanding the nuances of Dravidian languages, while emphasizing the importance of transformer models in capturing these complexities. Our multimodal method achieved F1-scores of 77.18% and 84.11% in Tamil and Malayalam, respectively, securing 6th place for both languages among the participants.
2025.dravidianlangtech-1.90

@@ -1030,7 +1030,7 @@
Tofayel AhmmedBabuChittagong University of Engineering and Technology
MD Musa KalimullahRatul
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
522-528
E-commerce platforms face growing challenges regarding consumer trust and review authenticity because of the growing number of AI-generated product reviews. Low-resource languages (LRLs) such as Tamil and Malayalam face limited investigation by AI detection techniques because these languages experience constraints from sparse data sources and complex linguistic structures.
The research team at CUET_NetworkSociety took part in the AI-Generated Review Detection contest during the DravidianLangTech@NAACL 2025 event to fill this knowledge void. Using a combination of machine learning, deep learning, and transformer-based models, we detected AI-generated and human-written reviews in both Tamil and Malayalam. The developed method employed DistilBERT, which underwent an advanced preprocessing pipeline and hyperparameter optimization using the Transformers library. This approach achieved a Macro F1-score of 0.81 for Tamil (Subtask 1), securing 18th place, and a score of 0.7287 for Malayalam (Subtask 2), ranking 25th.
2025.dravidianlangtech-1.91

@@ -1042,7 +1042,7 @@
SabikAftahee
Tofayel AhmmedBabuChittagong University of Engineering and Technology
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
529-535
Memes are commonly used for communication on social media platforms, and some of them can propagate misogynistic content, spreading harmful messages. Detecting such misogynistic memes has become a significant challenge, especially for low-resource languages like Tamil and Malayalam, due to their complex linguistic structures. To tackle this issue, a shared task on detecting misogynistic memes was organized at DravidianLangTech@NAACL 2025. This paper proposes a multimodal deep learning approach for detecting misogynistic memes in Tamil and Malayalam. The proposed model combines fine-tuned ResNet18 for visual feature extraction and indicBERT for analyzing textual content. The fused model was applied to the test dataset, achieving macro F1 scores of 76.32% for Tamil and 80.35% for Malayalam. Our approach led to 7th and 12th positions for Tamil and Malayalam, respectively.
2025.dravidianlangtech-1.92

@@ -1054,7 +1054,7 @@
MD Musa KalimullahRatul
SabikAftahee
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
536-542
Social media has become an established medium of public communication and opinions on every aspect of life, but especially politics. This has resulted in a growing need for tools that can process the large amount of unstructured data that is produced on these platforms, providing actionable insights in domains such as social trends and political opinion. Low-resource languages like Tamil present challenges due to limited tools and annotated data, highlighting the need for NLP focus on understudied languages. To address this, a shared task has been organized by DravidianLangTech@NAACL 2025 for political sentiment analysis for low-resource languages, with a specific focus on Tamil. In this task, we have explored several machine learning methods such as SVM, AdaBoost, GB, deep learning methods including CNN, LSTM, GRU, BiLSTM, and the ensemble of different deep learning models, and transformer-based methods including mBERT, T5, XLM-R. The mBERT model performed best by achieving a macro F1 score of 0.2178 and placing our team 22nd in the rank list.
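Several of the meme-detection entries above fuse a fine-tuned ResNet18 image encoder with an IndicBERT text encoder. Below is a minimal PyTorch sketch of that kind of two-stream early fusion; it is an illustration, not any team's published code, and the checkpoint name, [CLS]-style pooling choice, and feature sizes are assumptions (the indic-bert tokenizer also needs sentencepiece installed).

# Minimal early-fusion sketch: pooled text and image features concatenated
# into a linear classification head. Checkpoint and sizes are assumptions.
import torch
import torch.nn as nn
from torchvision.models import resnet18, ResNet18_Weights
from transformers import AutoModel, AutoTokenizer

class MemeClassifier(nn.Module):
    def __init__(self, text_ckpt="ai4bharat/indic-bert", num_labels=2):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(text_ckpt)
        self.image_encoder = resnet18(weights=ResNet18_Weights.DEFAULT)
        self.image_encoder.fc = nn.Identity()  # expose 512-d pooled features
        hidden = self.text_encoder.config.hidden_size
        self.head = nn.Linear(hidden + 512, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values):
        text = self.text_encoder(input_ids=input_ids,
                                 attention_mask=attention_mask)
        text_feat = text.last_hidden_state[:, 0]     # [CLS]-style pooling
        img_feat = self.image_encoder(pixel_values)  # (batch, 512)
        return self.head(torch.cat([text_feat, img_feat], dim=-1))

tok = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
model = MemeClassifier()
batch = tok(["sample transcribed meme text"], return_tensors="pt")
logits = model(batch["input_ids"], batch["attention_mask"],
               torch.randn(1, 3, 224, 224))  # dummy image tensor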
2025.dravidianlangtech-1.93

@@ -1062,7 +1062,7 @@
cantnlp@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech-2025: A Bag-of-Sounds Approach to Multimodal Hate Speech Detection
- SidneyWongUniversity of Canterbury
+ SidneyWongUniversity of Canterbury
AndrewLi
543-551
This paper presents the systems and results for the Multimodal Social Media Data Analysis in Dravidian Languages (MSMDA-DL) shared task at the Fifth Workshop on Speech, Vision, and Language Technologies for Dravidian Languages (DravidianLangTech-2025). We took a ‘bag-of-sounds’ approach by training our hate speech detection system on the speech (audio) data using transformed Mel spectrogram measures. While our candidate model performed poorly on the test set, our approach offered promising results during training and development for Malayalam and Tamil. With sufficient and well-balanced training data, our results show that it is feasible to use both text and speech (audio) data in the development of multimodal hate speech detection systems.
@@ -1093,9 +1093,9 @@
Detection of Religious Hate Speech During Elections in <fixed-case>K</fixed-case>arnataka
- MsvpjSathvikIIIT Dharwad
+ MsvpjSathvikIIIT Dharwad
RajSonani
- Ravi TejaPotlaNVIDIA
+ Ravi TejaPotlaNVIDIA
562-566
We propose a novel dataset for detecting religious hate speech in the context of elections in Karnataka, with a particular focus on Kannada and Kannada-English code-mixed text. The data was collected during the Karnataka state elections and includes 3,000 labeled samples that reflect various forms of online discourse related to religion. This dataset aims to address the growing concern of religious intolerance and hate speech during election periods; it is a dataset of multilingual, code-mixed language. To evaluate the effectiveness of this dataset, we benchmarked it using the latest state-of-the-art algorithms. We achieved an accuracy of 78.61%.
2025.dravidianlangtech-1.97

@@ -1103,8 +1103,8 @@
<fixed-case>DLTCNITPY</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025 Abusive Code-mixed Text Detection System Targeting Women for <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Languages using Deep Learning Technique
- HabibaA
- Dr GAghila
+ HabibaA
+ Dr GAghila
567-572
The growing use of social communication platforms has seen women facing higher degrees of online violence than ever before. This paper presents how a deep learning abuse detection system can be applied to inappropriate text directed at women on social media. Because of the diversity of languages and the casual nature of online communication, coupled with the cultural diversity around the world, the detection of such content is often severely lacking. This research utilized Long Short-Term Memory (LSTM) for abuse text detection in the Malayalam and Tamil languages. This model delivers a high F1 score of 0.75 for Malayalam and 0.72 for Tamil, achieving the desired balance of identifying abusive and non-abusive content and achieving high performance rates. The designed model, based on the dataset provided in DravidianLangTech@NAACL2025 (shared task) comprising code-mixed abusive and non-abusive social media posts in Malayalam and Tamil, showcases high detection accuracy and indicates the likely success of deep learning-based models for abuse text detection in resource-constrained languages.
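The DLTCNITPY system just described trains an LSTM for abusive-text detection. A minimal PyTorch sketch of such a classifier follows; the vocabulary size and layer dimensions are assumptions, and the tokenization/vocabulary-building step is elided.

# Minimal sketch of an LSTM abusive-text classifier; dimensions are assumed.
import torch
import torch.nn as nn

class LSTMAbuseClassifier(nn.Module):
    def __init__(self, vocab_size=30000, embed_dim=128, hidden_dim=256,
                 num_labels=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.head = nn.Linear(hidden_dim, num_labels)

    def forward(self, token_ids):
        embedded = self.embedding(token_ids)       # (batch, seq, embed)
        _, (last_hidden, _) = self.lstm(embedded)  # final hidden state
        return self.head(last_hidden[-1])          # (batch, num_labels)

model = LSTMAbuseClassifier()
dummy = torch.randint(1, 30000, (4, 64))  # 4 comments, 64 token ids each
print(model(dummy).shape)                 # torch.Size([4, 2])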
2025.dravidianlangtech-1.98

@@ -1114,7 +1114,7 @@
<fixed-case>TSD</fixed-case>: Towards Computational Processing of <fixed-case>T</fixed-case>amil Similes - A <fixed-case>T</fixed-case>amil Simile Dataset
AathavanNithiyananthan
JathushanRaveendra
- UthayasankerThayasivamUniversity of Moratuwa
+ UthayasankerThayasivamUniversity of Moratuwa
573-579
A simile is a powerful figure of speech that makes a comparison between two different things via shared properties, often using words like “like” or “as” to create vivid imagery, convey emotions, and enhance understanding. However, computational research on similes is limited in low-resource languages like Tamil due to the lack of simile datasets. This work introduces a manually annotated Tamil Simile Dataset (TSD) comprising around 1.5k simile sentences drawn from various sources. Our data annotation guidelines ensure that all the simile sentences are annotated with the three components, namely tenor, vehicle, and context. We benchmark our dataset for simile interpretation and simile generation tasks using chosen pre-trained language models (PLMs) and present the results. Our findings highlight the challenges of simile tasks in Tamil, suggesting areas for further improvement. We believe that TSD will drive progress in computational simile processing for Tamil and other low-resource languages, further advancing simile related tasks in Natural Language Processing.
2025.dravidianlangtech-1.99

@@ -1123,8 +1123,8 @@
Hydrangea@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>an<fixed-case>T</fixed-case>ech2025: Abusive language Identification from <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Text using Transformer Models
ShanmithaThirumoorthy
- ThenmozhiDurairaj
- RatnavelRajalakshmiVellore Institute of Technology
+ ThenmozhiDurairaj
+ RatnavelRajalakshmiVellore Institute of Technology
580-584
Abusive language toward women on the Internet has always been perceived as a danger to free speech and safe online spaces. In this paper, we discuss three transformer-based models - BERT, XLM-RoBERTa, and DistilBERT - in identifying gender-abusive comments in Tamil and Malayalam YouTube content. We fine-tune and compare these models using a dataset provided by the DravidianLangTech 2025 shared task for identifying abusive content on social media. Among the three models, XLM-RoBERTa performed best, reaching F1 scores of 0.7708 for Tamil and 0.6876 for Malayalam. BERT followed with scores of 0.7658 (Tamil) and 0.6671 (Malayalam). DistilBERT’s performance varied between the two languages. The large difference in performance between the models, especially in the case of Malayalam, indicates that working in low-resource languages is difficult, and the choice of model is critical in applying abusive language detection. The findings provide important information for effective content moderation systems in linguistically diverse contexts. In general, they can help promote safe online spaces for women in South Indian language communities.
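The Hydrangea entry above compares BERT, XLM-RoBERTa, and DistilBERT by their F1 scores, and most shared tasks in this volume rank systems by macro-averaged F1, which weights all classes equally regardless of class imbalance. A minimal scikit-learn sketch of that comparison follows; the label lists are placeholders standing in for each fine-tuned model's dev-set predictions.

# Minimal sketch: ranking candidate systems by macro-averaged F1.
# The gold labels and prediction lists below are placeholders.
from sklearn.metrics import f1_score

gold = [1, 0, 0, 1, 1, 0]
predictions = {
    "bert-base-multilingual-cased": [1, 0, 1, 1, 1, 0],
    "xlm-roberta-base": [1, 0, 0, 1, 0, 0],
    "distilbert-base-multilingual-cased": [0, 0, 0, 1, 1, 1],
}
for name, pred in predictions.items():
    print(f"{name}: macro-F1 = {f1_score(gold, pred, average='macro'):.4f}")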
2025.dravidianlangtech-1.100

@@ -1133,9 +1133,9 @@
Towards Effective Emotion Analysis in Low-Resource <fixed-case>T</fixed-case>amil Texts
PriyatharshanBalachandran
- UthayasankerThayasivamUniversity of Moratuwa
- RandilPushpanandaUniversity of Colombo School of Computing
- RuvanWeerasingheInformatics Institute of Technology
+ UthayasankerThayasivamUniversity of Moratuwa
+ RandilPushpanandaUniversity of Colombo School of Computing
+ RuvanWeerasingheInformatics Institute of Technology
585-598
Emotion analysis plays a significant role in understanding human behavior and communication, yet research in the Tamil language remains limited. This study focuses on building an emotion classifier for Tamil texts using machine learning (ML) and deep learning (DL), along with creating an emotion-annotated Tamil corpus for Ekman’s basic emotions. Our dataset combines publicly available data with re-annotation and translations. Along with traditional ML models, we investigated the use of Transfer Learning (TL) with state-of-the-art models, such as BERT- and Electra-based models. Experiments were conducted on unbalanced and balanced datasets using data augmentation techniques. The results indicate that Multinomial Naive Bayes (MNB) and Support Vector Machine (SVM) performed well with TF-IDF and BoW representations, while among Transfer Learning models, LaBSE achieved the highest accuracy (63% balanced, 69% unbalanced), followed by TamilBERT and IndicBERT.
2025.dravidianlangtech-1.101

@@ -1147,15 +1147,15 @@
Safiul AlamSarker
MD Musa KalimullahRatul
KawsarAhmed
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
599-604
2025.dravidianlangtech-1.102
hasan-etal-2025-cuet


<fixed-case>NAYEL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech-2025: Character N-gram and Machine Learning Coordination for Fake News Detection in <fixed-case>D</fixed-case>ravidian Languages
- HamadaNayelPrince Sattam bin Abdulaziz University and Benha University
- MohammedAldawsariPrince Sattam bin Abdulaziz University
+ HamadaNayelPrince Sattam bin Abdulaziz University and Benha University
+ MohammedAldawsariPrince Sattam bin Abdulaziz University
Hosahalli LakshmaiahShashirekhaMangalore University
605-608
This paper presents a detailed description of the model submitted by team NAYEL to the Fake News Detection in Dravidian Languages shared task. The proposed model uses a simple character n-gram TF-IDF as a feature extraction approach integrated with an ensemble of various classical machine learning classification algorithms. Despite the simplicity of the proposed model structure, it outperforms other, more complex models, as the shared task results show. The proposed model achieved an F1-score of 87.5% and secured the 5th rank.
@@ -1167,7 +1167,7 @@
AbiramiJayaraman
Aruna DeviShanmugam
DharunikaSasikumar
- BharathiB
+ BharathiB
609-613
The shared task on Detecting AI-generated Product Reviews in Dravidian Languages is aimed at addressing the growing concern of AI-generated product reviews, specifically in Malayalam and Tamil. As AI tools become more advanced, the ability to distinguish between human-written and AI-generated content has become increasingly crucial, especially in the domain of online reviews where authenticity is essential for consumer decision-making.
In our approach, we used the ALBERT, IndicBERT, and Support Vector Machine (SVM) models to classify the reviews. The results of our experiments demonstrate the effectiveness of our methods in detecting AI-generated content. 2025.dravidianlangtech-1.104 @@ -1178,7 +1178,7 @@ AbiramiJayaraman Aruna DeviShanmugam DharunikaSasikumar - BharathiB + BharathiB 614-618 Sentiment analysis is recognized as an important area in Natural Language Processing (NLP) that aims at understanding and classifying opinions or emotions in text. In the political field, public sentiment is analyzed to gain insight into opinions, address issues, and shape better policies. Social media platforms like Twitter (now X) are widely used to express thoughts and have become a valuable source of real-time political discussions. In this paper, the shared task of Political Multiclass Sentiment Analysis of Tamil tweets is examined, where the objective is to classify tweets into specific sentiment categories. The proposed approach is explained, which involves preprocessing Tamil text, extracting useful features, and applying machine learning and deep learning models for classification. The effectiveness of the methods is demonstrated through experimental results and the challenges encountered while working on the analysis of Tamil political sentiment are discussed. 2025.dravidianlangtech-1.105 @@ -1186,8 +1186,8 @@ <fixed-case>TEAM</fixed-case>_<fixed-case>STRIKERS</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech2025: Misogyny Meme Detection in <fixed-case>T</fixed-case>amil Using Multimodal Deep Learning - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian Mohamed ArsathH RamyaK RagavR @@ -1200,8 +1200,8 @@ <fixed-case>KCRL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multi-Pooling Feature Fusion with <fixed-case>XLM</fixed-case>-<fixed-case>R</fixed-case>o<fixed-case>BERT</fixed-case>a for <fixed-case>M</fixed-case>alayalam Fake News Detection and Classification FarihaHaq Md. Tanvir AhammedShawon - Md AyonMiaDhaka International University - Golam SarwarMd. MursalinMemorial University of Newfoundland + Md AyonMiaDhaka International University + Golam SarwarMd. MursalinMemorial University of Newfoundland Muhammad IbrahimKhan 624-629 The rapid spread of misinformation on social media platforms necessitates robust detection mechanisms, particularly for languages with limited computational resources. This paper presents our system for the DravidianLangTech 2025 shared task on Fake News Detection in Malayalam YouTube comments, addressing both binary and multiclass classification challenges. We propose a Multi-Pooling Feature Fusion (MPFF) architecture that leverages [CLS] + Mean + Max pooling strategy with transformer models. Our system demonstrates strong performance across both tasks, achieving a macro-averaged F1 score of 0.874, ranking 6th in binary classification, and 0.628, securing 1st position in multiclass classification. Experimental results show that our MPFF approach with XLM-RoBERTa significantly outperforms traditional machine learning and deep learning baselines, particularly excelling in the more challenging multiclass scenario. 
These findings highlight the effectiveness of our methodology in capturing nuanced linguistic features for fake news detection in Malayalam, contributing to the advancement of automated verification systems for Dravidian languages.
@@ -1210,10 +1210,10 @@
<fixed-case>KCRL</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multi-View Feature Fusion with <fixed-case>XLM</fixed-case>-<fixed-case>R</fixed-case> for <fixed-case>T</fixed-case>amil Political Sentiment Analysis
- Md AyonMiaDhaka International University
+ Md AyonMiaDhaka International University
FarihaHaq
Md. Tanvir AhammedShawon
- Golam SarwarMd. MursalinMemorial University of Newfoundland
+ Golam SarwarMd. MursalinMemorial University of Newfoundland
Muhammad IbrahimKhan
630-635
Political discourse on social media platforms significantly influences public opinion, necessitating accurate sentiment analysis for understanding societal perspectives. This paper presents a system developed for the shared task of Political Multiclass Sentiment Analysis in Tamil tweets. The task aims to classify tweets into seven distinct sentiment categories: Substantiated, Sarcastic, Opinionated, Positive, Negative, Neutral, and None of the above. We propose a Multi-View Feature Fusion (MVFF) architecture that leverages XLM-R with a CLS-Attention-Mean mechanism for sentiment classification. Our experimental results demonstrate the effectiveness of our approach, achieving a macro-average F1-score of 0.37 on the test set and securing the 2nd position in the shared task. Through comprehensive error analysis, we identify specific classification challenges and demonstrate how our model effectively navigates the linguistic complexities of Tamil political discourse while maintaining robust classification performance across multiple sentiment categories.
@@ -1233,7 +1233,7 @@
<fixed-case>T</fixed-case>eam<fixed-case>V</fixed-case>ision@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Detecting <fixed-case>AI</fixed-case> generated product reviews in <fixed-case>D</fixed-case>ravidian Languages
Shankari SR
SarumathiP
- BharathiB
+ BharathiB
642-646
Recent advancements in natural language processing (NLP) have enabled artificial intelligence (AI) models to generate product reviews that are indistinguishable from those written by humans. To address these concerns, this study proposes an effective AI detector model capable of differentiating between AI-generated and human-written product reviews. Our methodology incorporates various machine learning techniques, including Naive Bayes, Random Forest, Logistic Regression, SVM, and deep learning approaches based on the BERT architecture. Our findings reveal that BERT outperforms other models in detecting AI-generated content in both Tamil product reviews and Malayalam product reviews.
2025.dravidianlangtech-1.110


<fixed-case>CIC</fixed-case>-<fixed-case>NLP</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Fake News Detection in <fixed-case>D</fixed-case>ravidian Languages
- TewodrosAchamaleh
- NidaHafeez
+ TewodrosAchamaleh
+ NidaHafeez
MikiyasMebraihtu
FatimaUroosa
- GrigoriSidorovInstituto Politécnico Nacional
+ GrigoriSidorovInstituto Politécnico Nacional
647-654
Misinformation is a growing problem for technology companies and for society.
Although there exists a large body of related work on identifying fake news in predominantly high-resource languages, there is unfortunately a lack of such studies in low-resource languages (LRLs). Because corpora and annotated data are scarce in LRLs, the identification of false information remains at an exploratory stage. Fake news detection is critical in this digital era to avoid spreading misleading information. This research work presents an approach to Detect Fake News in Dravidian Languages. Our team CIC-NLP’s work primarily targets Task 1, which involves identifying whether a given social platform news item is original or fake. For the fake news detection (FND) problem, we used the mBERT model and utilized the dataset that was provided by the organizers of the workshop. In this work, we describe our findings and the results of the proposed method. Our mBERT model achieved an F1 score of 0.853.
2025.dravidianlangtech-1.111

@@ -1265,9 +1265,9 @@
<fixed-case>T</fixed-case>he_<fixed-case>D</fixed-case>eathly_<fixed-case>H</fixed-case>allows@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian Languages
- KogilavaniShanmugavadivel
- MalligaSubramanian
- VasantharanK
+ KogilavaniShanmugavadivel
+ MalligaSubramanian
+ VasantharanK
Prethish GA
SanthoshS
661-665
@@ -1288,17 +1288,17 @@
Findings of the Shared Task on Abusive <fixed-case>T</fixed-case>amil and <fixed-case>M</fixed-case>alayalam Text Targeting Women on Social Media: <fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech@<fixed-case>NAACL</fixed-case> 2025
SaranyaRajiakodiCentral University of Tamil Nadu
- Bharathi RajaChakravarthiUniversity of Galway
- Shunmuga Priya MuthusamyChinnan
+ Bharathi RajaChakravarthiUniversity of Galway
+ Shunmuga Priya MuthusamyChinnan
RubaPriyadharshiniThe Gandhigram Rural Institute - Deemed University
- Raja MeenakshiJVellore Institute of Technology
- KathiravanPannerselvamCentral University of Tamil Nadu
- RahulPonnusamy
+ Raja MeenakshiJVellore Institute of Technology
+ KathiravanPannerselvamCentral University of Tamil Nadu
+ RahulPonnusamy
BhuvaneswariSivagnanamCentral University of Tamil Nadu
- PaulBuitelaarUniversity of Galway
+ PaulBuitelaarUniversity of Galway
BhavanimeenaK
JananayaganJananayagan
- Kishore KumarPonnusamy
+ Kishore KumarPonnusamy
671-681
This overview paper presents the findings of the Shared Task on Abusive Tamil and Malayalam Text Targeting Women on Social Media, organized as part of DravidianLangTech@NAACL 2025. The task aimed to encourage the development of robust systems to detect abusive content targeting women in Tamil and Malayalam, two low-resource Dravidian languages. Participants were provided with annotated datasets containing abusive and nonabusive text curated from YouTube comments. We present an overview of the approaches and analyse the results of the shared task submissions. We believe the findings presented in this paper will be useful to researchers working in Dravidian language technology.
2025.dravidianlangtech-1.115
@@ -1324,7 +1324,7 @@
Syeda AlishaNoor
SadiaAnjum
Syed AhmadReza
- Md RashadurRahmanChittagong University of Engineering and Technology
+ Md RashadurRahmanChittagong University of Engineering and Technology
688-693
Fake news detection in Malayalam is difficult due to limited data and language challenges. This study compares machine learning, deep learning, and transformer models for classification.
The dataset is balanced and divided into training, development and test sets. Machine learning models (SVM, Random Forest, Naive Bayes) used TF-IDF features and deep learning models (LSTM, BiLSTM, CNN) worked with tokenized sequences. We fine-tuned transformer models like IndicBERT, MuRIL, mBERT, and Malayalam-Bert. Among them, the Malayalam-Bert model performed the best and achieved an F1 score of 86%. On the other hand mBERT performed best at spotting fake news. However, the models struggled with mixed-language text and complex writing. Despite these challenges, transformer models turned out to be the most effective for detecting fake news in Malayalam. 2025.dravidianlangtech-1.117 @@ -1388,13 +1388,13 @@ Findings of the Shared Task on Misogyny Meme Detection: <fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech@<fixed-case>NAACL</fixed-case> 2025 - Bharathi RajaChakravarthiUniversity of Galway - RahulPonnusamy + Bharathi RajaChakravarthiUniversity of Galway + RahulPonnusamy SaranyaRajiakodiCentral University of Tamil Nadu - Shunmuga Priya MuthusamyChinnan - PaulBuitelaarUniversity of Galway + Shunmuga Priya MuthusamyChinnan + PaulBuitelaarUniversity of Galway BhuvaneswariSivagnanamCentral University of Tamil Nadu - Anshid KAWMO IMAM GAZZALI ARTS AND SCIENCE COLLEGE + Anshid KAWMO IMAM GAZZALI ARTS AND SCIENCE COLLEGE 721-731 The rapid expansion of social media has facilitated communication but also enabled the spread of misogynistic memes, reinforcing gender stereotypes and toxic online environments. Detecting such content is challenging due to the multimodal nature of memes, where meaning emerges from the interplay of text and images. The Misogyny Meme Detection shared task at DravidianLangTech@NAACL 2025 focused on Tamil and Malayalam, encouraging the development of multimodal approaches. With 114 teams registered and 23 submitting predictions, participants leveraged various pretrained language models and vision models through fusion techniques. The best models achieved high macro F1 scores (0.83682 for Tamil, 0.87631 for Malayalam), highlighting the effectiveness of multimodal learning. Despite these advances, challenges such as bias in the data set, class imbalance, and cultural variations persist. Future research should refine multimodal detection methods to improve accuracy and adaptability, fostering safer and more inclusive online spaces. 2025.dravidianlangtech-1.123 @@ -1402,16 +1402,16 @@ Overview of the Shared Task on Sentiment Analysis in <fixed-case>T</fixed-case>amil and <fixed-case>T</fixed-case>ulu - ThenmozhiDurairaj - Bharathi RajaChakravarthiUniversity of Galway + ThenmozhiDurairaj + Bharathi RajaChakravarthiUniversity of Galway AshaHegdeMangalore University Hosahalli LakshmaiahShashirekhaMangalore University RajeswariNatarajan SajeethaThavareesan - RatnasingamSakuntharajEastern University of Sri Lanka - KrishnakumariK + RatnasingamSakuntharajEastern University of Sri Lanka + KrishnakumariK CharmathiRajkumar - PoorviShetty + PoorviShetty Harshitha SKumar 732-738 Sentiment analysis is an essential task for interpreting subjective opinions and emotions in textual data, with significant implications across commercial and societal applications. This paper provides an overview of the shared task on Sentiment Analysis in Tamil and Tulu, organized as part of DravidianLangTech@NAACL 2025. 
The task comprises two components: one addressing Tamil and the other focusing on Tulu, both designed as multi-class classification challenges, wherein the sentiment of a given text must be categorized as positive, negative, neutral and unknown. The dataset was diligently organized by aggregating user-generated content from social media platforms such as YouTube and Twitter, ensuring linguistic diversity and real-world applicability. Participants applied a variety of computational approaches, ranging from traditional machine learning models to deep learning models, pre-trained language models, and other feature representation techniques, to tackle the challenges posed by linguistic code-mixing, orthographic variations, and resource scarcity in these low resource languages.
@@ -1424,7 +1424,7 @@
Md. Saikat HossainShohag
AlamgirHossainChittagong University of Engineering and Technology
JawadHossain
- Mohammed MoshiulHoqueChittagong University of Engineering and Technology
+ Mohammed MoshiulHoqueChittagong University of Engineering and Technology
739-745
With the exponential growth of social media usage, the prevalence of abusive language targeting women has become a pressing issue, particularly in low-resource languages (LRLs) like Tamil and Malayalam. This study is part of the shared task at DravidianLangTech@NAACL 2025, which focuses on detecting abusive comments in Tamil social media content. The provided dataset consists of binary-labeled comments (Abusive or Non-Abusive), gathered from YouTube, reflecting explicit abuse, implicit bias, stereotypes, and coded language. We developed and evaluated multiple models for this task, including traditional machine learning algorithms (Logistic Regression, Support Vector Machine, Random Forest Classifier, and Multinomial Naive Bayes), deep learning models (CNN, BiLSTM, and CNN+BiLSTM), and transformer-based architectures (DistilBERT, Multilingual BERT, XLM-RoBERTa), and fine-tuned variants of these models. Our best-performing model, Multilingual BERT, achieved a weighted F1-score of 0.7203, ranking 19th in the competition.
2025.dravidianlangtech-1.125

@@ -1432,14 +1432,14 @@
Overview on Political Multiclass Sentiment Analysis of <fixed-case>T</fixed-case>amil <fixed-case>X</fixed-case> (<fixed-case>T</fixed-case>witter) Comments: <fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech@<fixed-case>NAACL</fixed-case> 2025
- Bharathi RajaChakravarthiUniversity of Galway
+ Bharathi RajaChakravarthiUniversity of Galway
SaranyaRajiakodiCentral University of Tamil Nadu
- ThenmozhiDurairaj
- SathiyarajThangasamy
- RatnasingamSakuntharajEastern University of Sri Lanka
- Prasanna KumarKumaresanData Science Institution, University of Galway, Ireland
- Kishore KumarPonnusamy
- Arunaggiri PandianKarunanidhiMicron Technology
+ ThenmozhiDurairaj
+ SathiyarajThangasamy
+ RatnasingamSakuntharajEastern University of Sri Lanka
+ Prasanna KumarKumaresanData Science Institution, University of Galway, Ireland
+ Kishore KumarPonnusamy
+ Arunaggiri PandianKarunanidhiMicron Technology
RohanR
746-753
Political multiclass detection is the task of classifying comments into seven predefined political sentiment classes. In this paper, we report an overview of the findings on the “Political Multiclass Sentiment Analysis of Tamil X(Twitter) Comments” shared task conducted at the workshop on DravidianLangTech@NAACL 2025.
The participants were provided with annotated Twitter comments, which are split into training, development, and unlabelled test datasets. A total of 139 participants registered for this shared task, and 25 teams finally submitted their results. The performance of the submitted systems was evaluated and ranked in terms of the macro-F1 score. @@ -1448,8 +1448,8 @@ <fixed-case>KEC</fixed-case>_<fixed-case>AI</fixed-case>_<fixed-case>BRIGHTRED</fixed-case>@<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech 2025: Multimodal Hate Speech Detection in <fixed-case>D</fixed-case>ravidian languages - KogilavaniShanmugavadivel - MalligaSubramanian + KogilavaniShanmugavadivel + MalligaSubramanian NishdharaniP SanthiyaE Yaswanth RajE @@ -1460,12 +1460,12 @@ Overview of the Shared Task on Fake News Detection in <fixed-case>D</fixed-case>ravidian Languages-<fixed-case>D</fixed-case>ravidian<fixed-case>L</fixed-case>ang<fixed-case>T</fixed-case>ech@<fixed-case>NAACL</fixed-case> 2025 - MalligaSubramanian - PremjithBAmrita Vishwa Vidyapeetham (Deemed University) - KogilavaniShanmugavadivel + MalligaSubramanian + PremjithBAmrita Vishwa Vidyapeetham (Deemed University) + KogilavaniShanmugavadivel SanthiyaPandiyan BalasubramanianPalaniIIIT Kottayam - Bharathi RajaChakravarthiUniversity of Galway + Bharathi RajaChakravarthiUniversity of Galway 759-767 Detecting and mitigating fake news on social media is critical for preventing misinformation, protecting democratic processes, preventing public distress, mitigating hate speech, reducing financial fraud, maintaining information reliability, etc. This paper summarizes the findings of the shared task “Fake News Detection in Dravidian Languages—DravidianLangTech@NAACL 2025.” The goal of this task is to detect fake content in social media posts in Malayalam. It consists of two subtasks: the first focuses on binary classification (Fake or Original), while the second categorizes the fake news into five types—False, Half True, Mostly False, Partly False, and Mostly True. In Task 1, 22 teams submitted machine learning techniques like SVM, Naïve Bayes, and SGD, as well as BERT-based architectures. Among these, XLM-RoBERTa had the highest macro F1 score of 89.8%. For Task 2, 11 teams submitted models using LSTM, GRU, XLM-RoBERTa, and SVM. XLM-RoBERTa once again outperformed other models, attaining the highest macro F1 score of 68.2%. 2025.dravidianlangtech-1.128 diff --git a/data/xml/2025.in2writing.xml b/data/xml/2025.in2writing.xml index e7da1906c7..4748ff5404 100644 --- a/data/xml/2025.in2writing.xml +++ b/data/xml/2025.in2writing.xml @@ -27,12 +27,12 @@ Understanding Writing Assistants for Scientific Figure Captions: A Thematic Analysis Ho Yin SamNgPennsylvania State University Ting-YaoHsuPennsylvania State University - JiyooMinSeoul City University - SungchulKimAdobe Systems - Ryan A.RossiAdobe Research - TongYuAdobe Research - HyungguJungSeoul National University - Ting-Hao KennethHuangPennsylvania State University + JiyooMinSeoul City University + SungchulKimAdobe Systems + Ryan A.RossiAdobe Research + TongYuAdobe Research + HyungguJungSeoul National University + Ting-Hao KennethHuangPennsylvania State University 1-10 Scientific figure captions are essential for communicating complex data but are often overlooked, leading to unclear or redundant descriptions. While many studies focus on generating captions as an ‘output’, little attention has been given to the writer’s process of crafting captions for scientific figures. 
This study examines how researchers use AI-generated captions to support caption writing. Through thematic analysis of interviews and video recordings with 18 participants from diverse disciplines, we identified four key themes: (1) integrating captions with figures and text, (2) bridging gaps between language proficiency and domain expertise, (3) leveraging multiple AI-generated suggestions, and (4) adapting to diverse writing norms. These findings provide actionable design insights for developing AI writing assistants that better support researchers in creating effective scientific figure captions. 2025.in2writing-1.1 @@ -40,10 +40,10 @@ <fixed-case>ARWI</fixed-case>: <fixed-case>A</fixed-case>rabic Write and Improve - KirillChirkunov + KirillChirkunov BasharAlhafniNew York University ChatrineQwaiderMohamed bin Zayed University of Artificial Intelligence and Chalmers University of Technology - NizarHabashNew York University Abu Dhabi + NizarHabashNew York University Abu Dhabi TedBriscoeMohamed bin Zayed University of Artificial Intelligence 11-18 Although Arabic is spoken by over 400 million people, advanced Arabic writing assistance tools remain limited. To address this gap, we present ARWI, a new writing assistant that helps learners improve essay writing in Modern Standard Arabic. ARWI is the first publicly available Arabic writing assistant to include a prompt database for different proficiency levels, an Arabic text editor, state-of-the-art grammatical error detection and correction, and automated essay scoring aligned with the Common European Framework of Reference standards for language attainment (https://arwi.mbzuai.ac.ae/). Moreover, ARWI can be used to gather a growing auto-annotated corpus, facilitating further research on Arabic grammar correction and essay scoring, as well as profiling patterns of errors made by native speakers and non-native learners. A preliminary user study shows that ARWI provides actionable feedback, helping learners identify grammatical gaps, assess language proficiency, and guide improvement. @@ -53,7 +53,7 @@ <fixed-case>R</fixed-case>ead<fixed-case>C</fixed-case>trl: Personalizing text generation with readability-controlled instruction learning HieuTran - ZonghaiYaoUniversity of Massachusetts at Amherst + ZonghaiYaoUniversity of Massachusetts at Amherst LingxiLiUniversity of Massachusetts at Amherst HongYuColumbia University 19-36 @@ -63,7 +63,7 @@ <fixed-case>AI</fixed-case> Writing Assistants in Tanzanian Universities: Adoption Trends, Challenges, and Opportunities - Alfred MalengoKondoro + Alfred MalengoKondoro 37-46 This study examines the adoption, challenges, and impact of AI writing assistants in Tanzanian universities, with a focus on their role in supporting academic writing, enhancing accessibility, and accommodating low-resource languages such as Swahili. Through a structured survey of 1,005 university students, we analyze AI usage patterns, key barriers to adoption, and the improvements needed to make AI writing assistants more inclusive and effective. Findings reveal that limited Swahili integration, affordability constraints, and ethical concerns hinder AI adoption, disproportionately affecting students in resource-constrained settings. To address these challenges, we propose strategies for adapting AI models to diverse linguistic, academic, and infrastructural contexts, emphasizing Swahili-language support, AI literacy initiatives, and accessibility-focused AI development. 
By bridging these gaps, this study contributes to the development of AI-driven educational tools that are more equitable, contextually relevant, and effective for students in Tanzania and beyond. 2025.in2writing-1.4 @@ -81,7 +81,7 @@ Interaction-Required Suggestions for Control, Ownership, and Awareness in Human-<fixed-case>AI</fixed-case> Co-Writing Kenneth C.ArnoldCalvin University - JihoKimCalvin University + JihoKimCalvin University 62-68 This paper explores interaction designs for generative AI interfaces that necessitate human involvement throughout the generation process. We argue that such interfaces can promote cognitive engagement, agency, and thoughtful decision-making. Through a case study in text revision, we present and analyze two interaction techniques: (1) using a predictive-text interaction to type the agent’s response to a revision request, and (2) highlighting potential edit opportunities in a document. Our implementations demonstrate how these approaches reveal the landscape of writing possibilities and enable fine-grained control. We discuss implications for human-AI writing partnerships and future interaction design directions. 2025.in2writing-1.6 @@ -89,7 +89,7 @@ Voice Interaction With Conversational <fixed-case>AI</fixed-case> Could Facilitate Thoughtful Reflection and Substantive Revision in Writing - JihoKimCalvin University + JihoKimCalvin University PhilippeLabanMicrosoft XiangChenUniversity of California, Los Angeles Kenneth C.ArnoldCalvin University @@ -102,7 +102,7 @@ <fixed-case>RONA</fixed-case>: Pragmatically Diverse Image Captioning with Coherence Relations AashishAnantha RamakrishnanPennsylvania State University, Pennsylvania State University Aadarsh AnanthaRamakrishnan - DongwonLeeThe Pennsylvania State University + DongwonLeeThe Pennsylvania State University 74-86 Writing Assistants (e.g., Grammarly, Microsoft Copilot) traditionally generate diverse image captions by employing syntactic and semantic variations to describe image components. However, human-written captions prioritize conveying a central message alongside visual descriptions using pragmatic cues. To enhance caption diversity, it is essential to explore alternative ways of communicating these messages in conjunction with visual content. We propose RONA, a novel prompting strategy for Multi-modal Large Language Models (MLLM) that leverages Coherence Relations as a controllable axis for pragmatic variations. We demonstrate that RONA generates captions with better overall diversity and ground-truth alignment, compared to MLLM baselines across multiple domains. Our code is available at: https://github.com/aashish2000/RONA 2025.in2writing-1.8 @@ -112,8 +112,8 @@ Multi-Agent Based Character Simulation for Story Writing TianYuDepartment of Computer Science, University of Toronto KenShiDepartment of Computer Science, University of Toronto - ZixinZhaoDepartment of Computer Science, University of Toronto - GeraldPennDepartment of Computer Science, University of Toronto + ZixinZhaoDepartment of Computer Science, University of Toronto + GeraldPennDepartment of Computer Science, University of Toronto 87-108 This work proposes a novel multi-agent story-generation system that writes stories from a narrative plan. Traditional approaches tend to generate a section of text directly from its outline. 
Our system, by contrast, divides this elaboration process into role-play and rewrite steps, where the former step enacts the story in chronological order with LLM-backed character agents, and the latter step refines the role-play result to align with a narrative plan. We show that the stories produced by our system are preferable to two other LLM-based story-generation approaches. We attribute this advancement to the benefits of incorporating a character-based simulation strategy. 2025.in2writing-1.9 @@ -122,7 +122,7 @@ An Analysis of Scoring Methods for Reranking in Large Language Model Story Generation MeganDeeringUniversity of Toronto - GeraldPennDepartment of Computer Science, University of Toronto + GeraldPennDepartment of Computer Science, University of Toronto 109-120 Outline-conditioned story generation using Large Language Models (LLMs) offers a promising approach for automating narrative creation. Some outline-conditioned story generation methods use automatic scoring during the generation process in order to improve the story quality. However, current research has shown that automatic scoring is not ideal for assessing story quality. This paper evaluates three proposed automatic story-scoring methods to improve the reranking of outputs during the generation process. These scoring methods leverage different prompting strategies and fine-tuning techniques to enhance the accuracy and relevance of the assessments. By experimenting with these approaches within a beam search framework, we aim to identify the most effective methods for optimizing story-generation outcomes. While we have found no significant overall difference between these methods in terms of their agreement with human ratings during story generation, the overall story ratings by human evaluators are average. These findings motivate the need for improved automatic scoring techniques and datasets while also indicating that simpler, more easily implementable scoring methods for reranking perform comparably to more complex approaches. 2025.in2writing-1.10 diff --git a/data/xml/2025.loresmt.xml b/data/xml/2025.loresmt.xml index 766d4cc5b4..1b2cb6cdab 100644 --- a/data/xml/2025.loresmt.xml +++ b/data/xml/2025.loresmt.xml @@ -25,9 +25,9 @@ Comparative Evaluation of Machine Translation Models Using Human-Translated Social Media Posts as References: Human-Translated Datasets - Shareefa AhmedAl Amer + Shareefa AhmedAl Amer Mark G.Lee - PhillipSmithUniversity of Birmingham + PhillipSmithUniversity of Birmingham 1-9 Machine translation (MT) of social media text presents unique challenges due to its informal nature, linguistic variations, and rapid evolution of language trends. In this paper, we propose a human-translated English dataset to Arabic, Italian, and Spanish, and a human-translated Arabic dataset to Modern Standard Arabic (MSA) and English. We also perform a comprehensive analysis of three publicly accessible MT models using human translations as a reference. We investigate the impact of social media informality on translation quality by translating the MSA version of the text and comparing BLEU and METEOR scores with the direct translation of the original social media posts. Our findings reveal that MarianMT provides the closest translations to human for Italian and Spanish among the three models, with METEOR scores of 0.583 and 0.640, respectively, while Google Translate provides the closest translations for Arabic, with a METEOR score of 0.354. 
By comparing the translation of the original social media posts with the MSA version, we confirm that the informality of social media text significantly impacts translation quality, with an increase of 12 percentage points in METEOR scores over the original posts. Additionally, we investigate inter-model alignment and the degree to which the outputs of these MT models align. 2025.loresmt-1.1 @@ -64,7 +64,7 @@ Wenzhou Dialect Speech to <fixed-case>M</fixed-case>andarin Text Conversion ZhipengGaoDoshisha University AkihiroTamuraDoshisha University - TsuneoKato + TsuneoKato 36-43 The Wenzhou dialect is a Chinese dialect that is significantly distinct from Mandarin, the official language of China. It is among the most complex Chinese dialects and is nearly incomprehensible to people from regions such as Northern China, thereby creating substantial communication barriers. Therefore, the conversion between the Wenzhou dialect and Mandarin is essential to facilitate communication between Wenzhou dialect speakers and those from other Chinese regions. However, as a low-resource language, the Wenzhou dialect lacks publicly available datasets, and such conversion technologies have not been extensively researched. Thus, in this study, we create a parallel dataset containing Wenzhou dialect speech and the corresponding Mandarin text and build benchmark models for Wenzhou dialect speech-to-Mandarin text conversion. In particular, we fine-tune two self-supervised learning-based pretrained models, that is, TeleSpeech-ASR1.0 and Wav2Vec2-XLS-R, with our training dataset and report their performance on our test dataset as baselines for future research. 2025.loresmt-1.5 @@ -81,10 +81,10 @@ Low-resource Machine Translation: what for? who for? An observational study on a dedicated Tetun language translation service - RaphaelMerxUniversity of Melbourne + RaphaelMerxUniversity of Melbourne Adérito José GuterresCorreia - HannaSuominenAustralian National University - EkaterinaVylomovaThe University of Melbourne + HannaSuominenAustralian National University + EkaterinaVylomovaThe University of Melbourne 54-65 Low-resource machine translation (MT) presents a diversity of community needs and application challenges that remain poorly understood. To complement surveys and focus groups, which tend to rely on small samples of respondents, we propose an observational study on actual usage patterns of a specialized MT service for the Tetun language, which is the lingua franca in Timor-Leste. Our analysis of 100,000 translation requests reveals patterns that challenge assumptions based on existing corpora. We find that users, many of them students on mobile devices, typically translate text from a high-resource language into Tetun across diverse domains including science, healthcare, and daily life. This contrasts sharply with available Tetun corpora, which are dominated by news articles covering government and social issues. Our results suggest that MT systems for institutionalized minority languages like Tetun should prioritize accuracy on domains relevant to educational contexts, in the high-resource to low-resource direction. More broadly, this study demonstrates how observational analysis can inform low-resource language technology development, by grounding research in practical community needs.
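The comparative MT evaluation above scores system output against human references with BLEU and METEOR. A minimal sketch of that kind of reference-based scoring, assuming the sacrebleu and nltk packages are installed; the sentence pairs are invented and this is not the authors' exact pipeline:

# Sketch of reference-based MT scoring, in the spirit of the comparative
# evaluation above. Toy data, not the paper's pipeline.
# Requires: pip install sacrebleu nltk
import sacrebleu
import nltk
from nltk.translate.meteor_score import meteor_score

nltk.download("wordnet", quiet=True)  # METEOR needs WordNet

# One human reference per system output (sacrebleu also supports several).
hypotheses = ["the weather is nice today", "he bought two book"]
references = [["the weather is nice today", "she bought two books"]]

# Corpus-level BLEU and chrF over the whole test set.
bleu = sacrebleu.corpus_bleu(hypotheses, references)
chrf = sacrebleu.corpus_chrf(hypotheses, references)
print(f"BLEU: {bleu.score:.1f}  chrF: {chrf.score:.1f}")

# METEOR is computed per sentence on pre-tokenized text (recent NLTK
# versions expect token lists), then averaged over the test set.
meteor = sum(
    meteor_score([ref.split()], hyp.split())
    for hyp, ref in zip(hypotheses, references[0])
) / len(hypotheses)
print(f"METEOR: {meteor:.3f}")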
2025.loresmt-1.7 @@ -92,9 +92,9 @@ Jamo-Level Subword Tokenization in Low-Resource <fixed-case>K</fixed-case>orean Machine Translation - JunyoungLeeHome Team Science and Technology Agency + JunyoungLeeHome Team Science and Technology Agency MarcoCognettaTokyo Institute of Technology, Tokyo Institute of Technology and Google - SangwhanMoonGoogle and Tokyo Institute of Technology + SangwhanMoonGoogle and Tokyo Institute of Technology NaoakiOkazakiInstitute of Science Tokyo 66-80 Subword tokenization, where text is represented in an intermediate form between full words and characters, is ubiquitous in modern NLP due to its ability to represent any input sentence with a small vocabulary. However for Korean, where there are 11,172 base characters (*syllables*) in its alphabet, it is difficult to have a vocabulary large enough to succinctly encode text while fitting within parameter-budget constraints. This motivates us to explore an alternative representation for Korean which relies on the decompositional nature of Korean syllables: a syllable can be uniquely decomposed into a sequence of two or three subcharacters (*jamo*), of which there are only 68. Using jamo as the basis for subword tokenization (e.g., byte-pair encoding) leads to shorter tokenized sequences with fewer vocabulary parameters, exposes the model to sub-syllable-level morphological information, and increases the amount of augmentation gained from subword regularization. We evaluate jamo-level subword tokenization on several Korean translation tasks and find that jamo-level subword models consistently outperform syllable- and byte-level models in low-resource and restricted-vocabulary settings. @@ -113,14 +113,14 @@ <fixed-case>M</fixed-case>ode<fixed-case>L</fixed-case>ing: A Novel Dataset for Testing Linguistic Reasoning in Language Models - Nathan AndrewChiStanford University + Nathan AndrewChiStanford University TeodorMalchev RileyKong Ryan AndrewChi LucasHuang Ethan AChiHudson River Trading R. ThomasMcCoyYale University - DragomirRadevYale University + DragomirRadevYale University 105-114 We introduce ModeLing, a novel benchmark of Linguistics Olympiad-style puzzles which tests few-shot reasoning in AI systems. Solving these puzzles necessitates inferring aspects of a language’s grammatical structure from a small number of examples. Such puzzles provide a natural testbed for language models, as they require compositional generalization and few-shot inductive reasoning. Consisting solely of new puzzles written specifically for this work, ModeLing has no risk of appearing in the training data of existing AI systems: this ameliorates the risk of data leakage, a potential confounder for many prior evaluations of reasoning. Evaluating several large open source language models and GPT on our benchmark, we observe non-negligible accuracy, demonstrating few-shot emergent reasoning ability which cannot merely be attributed to shallow memorization. However, imperfect model performance suggests that ModeLing can be used to measure further progress in linguistic reasoning. 2025.loresmt-1.10 @@ -128,10 +128,10 @@ Multilingual State Space Models for Structured Question Answering in <fixed-case>I</fixed-case>ndic Languages - ArpitaVats + ArpitaVats RahulRajaLinkedIn MrinalMathurGoogle and ByteDance Inc.
- AmanChadhaAmazon Web Services + AmanChadhaAmazon Web Services VinijaJainFacebook 115-128 The diversity and complexity of Indic languages present unique challenges for natural language processing (NLP) tasks, particularly in the domain of question answering (QA). To address these challenges, this paper explores the application of State Space Models (SSMs) to build efficient and contextually aware QA systems tailored for Indic languages. SSMs are particularly suited for this task due to their ability to model long-term and short-term dependencies in sequential data, making them well-equipped to handle the rich morphology, complex syntax, and contextual intricacies characteristic of Indian languages. We evaluated multiple SSM architectures across diverse datasets representing various Indic languages and conducted a comparative analysis of their performance. Our results demonstrate that these models effectively capture linguistic subtleties, leading to significant improvements in question interpretation, context alignment, and answer generation. This work represents the first application of SSMs to question answering tasks in Indic languages, establishing a foundational benchmark for future research in this domain. Furthermore, we propose enhancements to existing SSM frameworks, optimizing their applicability to low-resource settings and multilingual scenarios prevalent in Indic languages. @@ -141,7 +141,7 @@ Parallel Corpora for Machine Translation in Low-Resource <fixed-case>I</fixed-case>ndic Languages: A Comprehensive Review RahulRajaLinkedIn - ArpitaVats + ArpitaVats 129-143 Parallel corpora play an important role in training machine translation (MT) models, particularly for low-resource languages where high-quality bilingual data is scarce. This review provides a comprehensive overview of available parallel corpora for Indic languages, which span diverse linguistic families, scripts, and regional variations. We categorize these corpora into text-to-text, code-switched, and various categories of multimodal datasets, highlighting their significance in the development of robust multilingual MT systems. Beyond resource enumeration, we critically examine the challenges faced in corpus creation, including linguistic diversity, script variation, data scarcity, and the prevalence of informal textual content. We also discuss and evaluate these corpora in terms of alignment quality and domain representativeness. Furthermore, we address open challenges such as data imbalance across Indic languages, the trade-off between quality and quantity, and the impact of noisy, informal, and dialectal data on MT performance. Finally, we outline future directions, including leveraging cross-lingual transfer learning, expanding multilingual datasets, and integrating multimodal resources to enhance translation quality. To the best of our knowledge, this paper presents the first comprehensive review of parallel corpora specifically tailored for low-resource Indic languages in the context of machine translation. 2025.loresmt-1.12 @@ -160,8 +160,8 @@ Building Data Infrastructure for Low-Resource Languages Sarah K. K.LugerConsumer Reports - RafaelMosquera + RafaelMosquera - PedroOrtiz SuarezCommon Crawl Foundation + PedroOrtiz SuarezCommon Crawl Foundation 154-160 The MLCommons Datasets Working Group presents a comprehensive initiative to advance the development and accessibility of artificial intelligence (AI) training and testing resources.
This paper introduces three key projects aimed at addressing critical gaps in the AI data ecosystem: the Unsupervised People’s Speech Dataset, containing over 821,000 hours of speech across 89+ languages; a strategic collaboration with Common Crawl to enhance web crawling capabilities for low-resource languages; and a framework for knowledge graph extraction evaluation. By focusing on languages other than English (LOTE) and creating permissively licensed, high-quality datasets, these initiatives aim to democratize AI development and improve model performance across diverse linguistic contexts. This work represents a significant step toward more inclusive and capable AI systems that can serve global communities. 2025.loresmt-1.14 @@ -170,8 +170,8 @@ Encoder-Aware Sequence-Level Knowledge Distillation for Low-Resource Neural Machine Translation MenanVelayuthanUniversity of Moratuwa - NisansaDe SilvaUniversity of Moratuwa - SurangikaRanathungaMassey University + NisansaDe SilvaUniversity of Moratuwa + SurangikaRanathungaMassey University 161-170 Domain adaptation in Neural Machine Translation (NMT) is commonly achieved through fine-tuning, but this approach becomes inefficient as the number of domains increases. Knowledge distillation (KD) provides a scalable alternative by training a compact model on distilled data from a larger model. However, we hypothesize that vanilla sequence-level KD primarily distills the decoder while neglecting encoder knowledge, leading to suboptimal knowledge transfer and limiting its effectiveness in low-resource settings, where both data and computational resources are constrained. To address this, we propose an improved sequence-level KD method that enhances encoder knowledge transfer through a cosine-based alignment loss. Our approach first trains a large model on a mixed-domain dataset and generates a Distilled Mixed Dataset (DMD). A small model is then trained on this dataset via sequence-level KD with encoder alignment. Experiments in a low-resource setting validate our hypothesis, demonstrating that our approach outperforms vanilla sequence-level KD, improves generalization to out-of-domain data, and facilitates efficient domain adaptation while reducing model size and computational cost. 2025.loresmt-1.15 @@ -183,7 +183,7 @@ ParnianFazel FarzanehGoshtasbInstitution for Humanities and Cultural Studies NadiaHajipourInstitute for Humanities and Cultural Studies - SadraSabouri + SadraSabouri EhsaneddinAsgariQatar Computing Research Institute and University of California, Berkeley HosseinSametiSharif University of Technology 171-182 @@ -193,9 +193,9 @@ Limitations of Religious Data and the Importance of the Target Domain: Towards Machine Translation for <fixed-case>G</fixed-case>uinea-<fixed-case>B</fixed-case>issau Creole - JacquelineRowe - EdwardGow-Smith - MarkHeppleUniversity of Sheffield + JacquelineRowe + EdwardGow-Smith + MarkHeppleUniversity of Sheffield 183-200 We introduce a new dataset for machine translation of Guinea-Bissau Creole (Kiriol), comprising around 40 thousand parallel sentences to English and Portuguese. This dataset is made up of predominantly religious data (from the Bible and texts from the Jehovah’s Witnesses), but also a small amount of general domain data (from a dictionary). This mirrors the typical resource availability of many low resource languages. We train a number of transformer-based models to investigate how to improve domain transfer from religious data to a more general domain. 
We find that adding even 300 sentences from the target domain when training substantially improves the translation performance, highlighting the importance and need for data collection for low-resource languages, even on a small scale. We additionally find that Portuguese-to-Kiriol translation models perform better on average than other source and target language pairs, and investigate how this relates to the morphological complexity of the languages involved and the degree of lexical overlap between creoles and lexifiers. Overall, we hope our work will stimulate research into Kiriol and into how machine translation might better support creole languages in general. 2025.loresmt-1.17 diff --git a/data/xml/2025.mwe.xml b/data/xml/2025.mwe.xml index 0e0e177344..e1a60aa520 100644 --- a/data/xml/2025.mwe.xml +++ b/data/xml/2025.mwe.xml @@ -37,7 +37,7 @@ Probing Internal Representations of Multi-Word Verbs in Large Language Models HassaneKissane AchimSchillingFriedrich-Alexander Universität Erlangen-Nürnberg - PatrickKraussUniversity Erlangen-Nuremberg + PatrickKraussUniversity Erlangen-Nuremberg 7-13 This study investigates the internal representations of verb-particle combinations, called multi-word verbs, within transformer-based large language models (LLMs), specifically examining how these models capture lexical and syntactic properties at different neural network layers. Using the BERT architecture, we analyze the representations of its layers for two different verb-particle constructions: phrasal verbs like “give up” and prepositional verbs like “look at”. Our methodology includes training probing classifiers on the model output to classify these categories at both word and sentence levels. The results indicate that the model’s middle layers achieve the highest classification accuracies. To further analyze the nature of these distinctions, we conduct a data separability test using the Generalized Discrimination Value (GDV). While GDV results show weak linear separability between the two verb types, probing classifiers still achieve high accuracy, suggesting that representations of these linguistic categories may be “non-linearly separable”. This aligns with previous research indicating that linguistic distinctions in neural networks are not always encoded in a linearly separable manner. These findings computationally support usage-based claims on the representation of verb-particle constructions and highlight the complex interaction between neural network architectures and linguistic structures. 2025.mwe-1.2 @@ -45,14 +45,14 @@ <fixed-case>VMWE</fixed-case> identification with models trained on <fixed-case>GUD</fixed-case> (a <fixed-case>UD</fixed-case>v.2 treebank of Standard <fixed-case>M</fixed-case>odern <fixed-case>G</fixed-case>reek) - StellaMarkantonatou + StellaMarkantonatou VivianStamouILSP - “Athena” Research Center - StavrosBompolasARCHIMEDES Unit | Athena Research Center + StavrosBompolasARCHIMEDES Unit | Athena Research Center KaterinaAnastasopoulouHellenic American University and University of Athens Irianna LinardakiVasileiadi KonstantinosDiamantopoulos YannisKazosAthena Research and Innovation Centre - AntoniosAnastasopoulosAthena Research Center and George Mason University + AntoniosAnastasopoulosAthena Research Center and George Mason University 14-20 UD_Greek-GUD (GUD) is the most recent Universal Dependencies (UD) treebank for Standard Modern Greek (SMG) and the first SMG UD treebank to annotate Verbal Multiword Expressions (VMWEs).
GUD contains material from fiction texts and various sites that use colloquial SMG. We describe the special annotation decisions we implemented with GUD, the pipeline we developed to facilitate the active annotation of new material, and we report on the method we designed to evaluate the performance of models trained on GUD as regards VMWE identification tasks. 2025.mwe-1.3 @@ -61,9 +61,9 @@ Using <fixed-case>LLM</fixed-case>s to Advance Idiom Corpus Construction DoğukanArslan - Hüseyin AnılÇakmak - GulsenEryigit - JoakimNivreUppsala University + Hüseyin AnılÇakmak + GulsenEryigit + JoakimNivreUppsala University 21-31 Idiom corpora typically include both idiomatic and literal examples of potentially idiomatic expressions, but creating such corpora traditionally requires substantial expert effort and cost. In this article, we explore the use of large language models (LLMs) to generate synthetic idiom corpora as a more time- and cost-efficient alternative. We evaluate the effectiveness of synthetic data in training task-specific models and testing GPT-4 in a few-shot prompting setting using synthetic data for idiomaticity detection. Our findings reveal that although models trained on synthetic data perform worse than those trained on human-generated data, synthetic data generation offers considerable advantages in terms of cost and time. Specifically, task-specific idiomaticity detection models trained on synthetic data outperform the general-purpose LLM that generated the data when evaluated in a zero-shot setting, achieving an average improvement of 11 percentage points across four languages. Moreover, synthetic data enhances the LLM’s performance, enabling it to match the task-specific models trained with synthetic data when few-shot prompting is applied. 2025.mwe-1.4 @@ -71,8 +71,8 @@ Gathering Compositionality Ratings of Ambiguous Noun-Adjective Multiword Expressions in <fixed-case>G</fixed-case>alician - LauraCastroUniversidad de Santiago de Compostela + LauraCastroUniversidad de Santiago de Compostela - MarcosGarciaUniversidade de Santiago de Compostela + MarcosGarciaUniversidade de Santiago de Compostela 32-40 Multiword expressions pose numerous challenges to most NLP tasks, and so do their compositionality and semantic ambiguity. The need for resources that make it possible to explore such phenomena is rather pressing, even more so in the case of low-resource languages. In this paper, we present a dataset of noun-adjective compounds in Galician with compositionality scores at token level. These MWEs are ambiguous due to being potentially idiomatic expressions, as well as due to the ambiguity and productivity of their constituents. The dataset comprises 240 MWEs that amount to 322 senses, which are contextualized in two sets of sentences, manually created, and extracted from corpora, totaling 1,858 examples. For this dataset, we gathered human judgments on compositionality levels for compounds, heads, and modifiers. Furthermore, we obtained frequency, ambiguity, and productivity data for compounds and their constituents, and we explored potential correlations between mean compositionality scores and these three properties in terms of compounds, heads, and modifiers. This valuable resource helps evaluate language models on (non-)compositionality and ambiguity, key challenges in NLP, and is especially relevant for Galician, a low-resource variety lacking annotated datasets for such linguistic phenomena.
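The Galician dataset work above correlates mean compositionality scores with frequency, ambiguity, and productivity. A minimal sketch of that kind of correlation analysis with scipy; all numbers below are invented, not values from the dataset:

# Sketch: correlating mean compositionality judgments with corpus properties,
# as in the Galician noun-adjective compound study above. Numbers are invented.
# Requires: pip install scipy
from scipy.stats import spearmanr

# Mean human compositionality score per compound (low = idiomatic).
compositionality = [0.8, 1.5, 2.2, 3.9, 4.4, 4.8]
# Log corpus frequency of each compound (hypothetical values).
log_frequency = [3.1, 2.4, 2.9, 1.2, 1.0, 0.7]

rho, p_value = spearmanr(compositionality, log_frequency)
print(f"Spearman rho = {rho:.2f} (p = {p_value:.3f})")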
2025.mwe-1.5 @@ -81,14 +81,14 @@ Survey on Lexical Resources Focused on Multiword Expressions for the Purposes of <fixed-case>NLP</fixed-case> VerginicaMititelu - VoulaGiouliAristotle University of Thessaloniki and ILSP - “Athena” Research Center - GražinaKorvelVilnius University + VoulaGiouliAristotle University of Thessaloniki and ILSP - “Athena” Research Center + GražinaKorvelVilnius University ChayaLiebeskind - IrinaLobzhanidzeIlia Chavchavadze State University - RusudanMakhachashviliBorys Grinchenko Kyiv Metropolitan University and Borys Grinchenko Kyiv Metropolitan University - StellaMarkantonatou - AleksandraMarkovicInstitute for the Serbian Language of SASA - IvelinaStoyanovaDeaf Studies Institute and Institute for Bulgarian Language, Bulgarian Academy of Sciences + IrinaLobzhanidzeIlia Chavchavadze State University + RusudanMakhachashviliBorys Grinchenko Kyiv Metropolitan University and Borys Grinchenko Kyiv Metropolitan University + StellaMarkantonatou + AleksandraMarkovicInstitute for the Serbian Language of SASA + IvelinaStoyanovaDeaf Studies Institute and Institute for Bulgarian Language, Bulgarian Academy of Sciences 41-57 Lexica of MWEs have always been a valuable resource for various NLP tasks. This paper presents the results of a comprehensive survey on multiword lexical resources that extends a previous one from 2016 to the present. We analyze a diverse set of lexica across multiple languages, reporting on aspects such as creation date, intended usage, languages covered and linguality type, content, acquisition method, accessibility, and linkage to other language resources. Our findings highlight trends in MWE lexicon development focusing on the representation level of languages. This survey aims to support future efforts in creating MWE lexica for NLP applications by identifying these gaps and opportunities. 2025.mwe-1.6 @@ -96,8 +96,8 @@ A <fixed-case>E</fixed-case>uropean <fixed-case>P</fixed-case>ortuguese corpus annotated for verbal idioms - DavidAntunesINESC-ID Lisboa - JorgeBaptistaINESC ID Lisboa and Universidade do Algarve + DavidAntunesINESC-ID Lisboa + JorgeBaptistaINESC ID Lisboa and Universidade do Algarve Nuno J.MamedeInstituto Superior Técnico and INESC-ID 58-66 This paper presents the construction of VIDiom-PT, a corpus in European Portuguese annotated for verbal idioms (e.g. O Rui bateu a bota, lit.: Rui hit the boot ‘Rui died’). This linguistic resource aims to support the development of systems capable of processing such constructions in this language variety. To assist in the annotation effort, two tools were built. The first allows for the detection of possible instances of verbal idioms in texts, while the second provides a graphical interface for annotating them. This effort culminated in the annotation of a total of 5,178 instances of 747 different verbal idioms in more than 200,000 sentences in European Portuguese. A highly reliable inter-annotator agreement was achieved, using Krippendorff’s alpha for nominal data (0.869) with 5% of the data independently annotated by 3 experts. Part of the annotated corpus is also made publicly available. 
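VIDiom-PT above reports inter-annotator agreement as Krippendorff's alpha for nominal data (0.869) over a triple-annotated 5% sample. A sketch of how such a figure can be computed with the krippendorff package; the label matrix below is invented:

# Sketch: Krippendorff's alpha for nominal labels with three annotators,
# as reported for VIDiom-PT above. The toy matrix is invented.
# Requires: pip install krippendorff numpy
import numpy as np
import krippendorff

# Rows = annotators, columns = annotated instances.
# 1 = verbal idiom, 0 = literal; np.nan = instance not labeled by that annotator.
reliability_data = np.array([
    [1, 0, 1, 1, 0, np.nan, 1, 0],
    [1, 0, 1, 0, 0, 1, 1, 0],
    [1, 0, 1, 1, 0, 1, np.nan, 0],
])

alpha = krippendorff.alpha(reliability_data=reliability_data,
                           level_of_measurement="nominal")
print(f"Krippendorff's alpha (nominal): {alpha:.3f}")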
@@ -109,7 +109,7 @@ UlianaSentsova DeboraCiminariUniversity of Bologna Josef VanGenabithGerman Research Center for AI and Universität des Saarlandes - CristinaEspaña-BonetBarcelona Supercomputing Center and German Research Center for AI + CristinaEspaña-BonetBarcelona Supercomputing Center and German Research Center for AI 67-81 Language models are able to handle compositionality and, to some extent, non-compositional phenomena such as semantic idiosyncrasy, a feature most prominent in the case of idioms. This work introduces the MultiCoPIE corpus that includes potentially idiomatic expressions in Catalan, Italian, and Russian, extending the language coverage of PIE corpus data. The new corpus provides additional linguistic features of idioms, such as their semantic compositionality, part-of-speech of idiom head as well as their corresponding idiomatic expressions in English. With this new resource at hand, we first fine-tune an XLM-RoBERTa model to classify figurative and literal usage of potentially idiomatic expressions in English. We then study cross-lingual transfer to the languages represented in the MultiCoPIE corpus, evaluating the model’s ability to generalize an idiom-related task to languages not seen during fine-tuning. We show the effect of ‘cross-lingual lexical overlap’: the performance of the model, fine-tuned on English idiomatic expressions and tested on the MultiCoPIE languages, increases significantly when classifying ‘shared idioms’, idiomatic expressions that have direct counterparts in English with similar form and meaning. While this observation raises questions about the generalizability of cross-lingual learning, the results from experiments on PIEs demonstrate strong evidence of effective cross-lingual transfer, even when accounting for idioms similar across languages. 2025.mwe-1.8 @@ -117,11 +117,11 @@ Named Entity Recognition for the <fixed-case>I</fixed-case>rish Language - JaneAdkins + JaneAdkins HugoCollins - JoachimWagnerDublin City University + JoachimWagnerDublin City University AbigailWalshDublin City University - BrianDavisDublin City University + BrianDavisDublin City University 82-96 The Irish language has been deemed ‘definitely endangered’ (Moseley, 2012) and has been classified as having ‘weak or no support’ (Lynn, 2023) regarding digital resources in spite of its status as the first official and national language of the Republic of Ireland. This research develops the first named entity recognition (NER) tool for the Irish language, one of the essential tasks identified by the Digital Plan for Irish (Ní Chasaide et al., 2022). In this study, we produce a small gold-standard NER-annotated corpus and compare both monolingual and multilingual BERT models fine-tuned on this task. We experiment with different model architectures and low-resource language approaches to enrich our dataset. We test our models on a mix of single- and multi-word named entities as well as a specific multi-word named entity test set. Our proposed gaBERT model with the implementation of random data augmentation and a conditional random fields layer demonstrates significant performance improvements over baseline models, alternative architectures, and multilingual models, achieving an F1 score of 76.52. This study contributes to advancing Irish language technologies and supporting Irish language digital resources, providing a basis for Irish NER and identification of other MWE types.
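The MultiCoPIE experiments above fine-tune XLM-RoBERTa as a figurative-vs-literal classifier. A compressed sketch of that setup with Hugging Face transformers; the two training sentences, the label convention, and the single optimization step are invented, not the paper's training recipe:

# Sketch: fine-tuning XLM-RoBERTa to classify potentially idiomatic expressions
# as figurative (1) or literal (0), as in the MultiCoPIE setup above.
# Toy data and one optimization step only.
# Requires: pip install torch transformers sentencepiece
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=2
)

texts = [
    "He finally kicked the bucket after a long illness.",       # figurative
    "She kicked the bucket across the yard in frustration.",    # literal
]
labels = torch.tensor([1, 0])

batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

model.train()
outputs = model(**batch, labels=labels)  # cross-entropy loss computed internally
outputs.loss.backward()
optimizer.step()
print(f"loss after one step: {outputs.loss.item():.3f}")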
2025.mwe-1.9 diff --git a/data/xml/2025.naacl.xml b/data/xml/2025.naacl.xml index 9dd567ccf6..7f891926cb 100644 --- a/data/xml/2025.naacl.xml +++ b/data/xml/2025.naacl.xml @@ -10095,7 +10095,7 @@ <fixed-case>INSIGHTBUDDY</fixed-case>-<fixed-case>AI</fixed-case>: Medication Extraction and Entity Linking using Pre-Trained Language Models and Ensemble Learning PabloRomero - LifengHan + LifengHan GoranNenadicUniversity of Manchester 18-27 This paper presents our system, InsightBuddy-AI, designed for extracting medication mentions and their associated attributes, and for linking these entities to established clinical terminology resources, including SNOMED-CT, the British National Formulary (BNF), ICD, and the Dictionary of Medicines and Devices (dm+d). To perform medication extraction, we investigated various ensemble learning approaches, including stacked and voting ensembles (using first, average, and max voting methods) built upon eight pre-trained language models (PLMs). These models include general-domain PLMs—BERT, RoBERTa, and RoBERTa-Large—as well as domain-specific models such as BioBERT, BioClinicalBERT, BioMedRoBERTa, ClinicalBERT, and PubMedBERT. The system targets the extraction of drug-related attributes such as adverse drug effects (ADEs), dosage, duration, form, frequency, reason, route, and strength. Experiments conducted on the n2c2-2018 shared task dataset demonstrate that ensemble learning methods outperformed individually fine-tuned models, with notable improvements of 2.43% in Precision and 1.35% in F1-score. We have also developed cross-platform desktop applications for both entity recognition and entity linking, available for Windows and macOS. The InsightBuddy-AI application is freely accessible for research use at https://github.com/HECTA-UoM/InsightBuddy-AI. @@ -10105,7 +10105,7 @@ Linguistic Features in <fixed-case>G</fixed-case>erman <fixed-case>BERT</fixed-case>: The Role of Morphology, Syntax, and Semantics in Multi-Class Text Classification HenrikeBeyerUniversity of Dundee - DiegoFrassinelliLudwig-Maximilians-Universität München + DiegoFrassinelliLudwig-Maximilians-Universität München 28-39 Most studies on the linguistic information encoded by BERT primarily focus on English. Our study examines a monolingual German BERT model using a semantic classification task on newspaper articles, analysing the linguistic features influencing classification decisions through SHAP values. We use the TüBa-D/Z corpus, a resource with gold-standard annotations for a set of linguistic features, including POS, inflectional morphology, phrasal, clausal, and dependency structures. Semantic features of nouns are evaluated via the GermaNet ontology using shared hypernyms. Our results indicate that the features identified in English also affect classification in German but suggest important language- and task-specific features as well. 2025.naacl-srw.3 @@ -10125,7 +10125,7 @@ ZoeCaballero-DomínguezInstituto Tecnológico y de Estudios Superiores de Monterrey Valeria J.Ramírez-MacíasInstituto Tecnológico y de Estudios Superiores de Monterrey SaburButt - HectorCeballosTecnologico de Monterrey + HectorCeballosTecnologico de Monterrey 48-54 In the digital age, social media platforms like Twitter serve as an extensive repository of public discourse, including instances of sexism. It is important to identify such behavior since radicalized ideologies can lead to real-world violent acts.
This project aims to develop a deep learning-based tool that leverages a combination of BERT (both English and multilingual versions) and GraphSAGE, a Graph Neural Network (GNN) model, alongside sentiment analysis and natural language processing (NLP) techniques. The tool is designed to analyze tweets for sexism detection and classify them into five categories. 2025.naacl-srw.5 @@ -10135,10 +10135,10 @@ Towards Codec-<fixed-case>LM</fixed-case> Co-design for Neural Codec Language Models Shih-LunWuMassachusetts Institute of Technology AakashLahoti - Arjun DDesai + Arjun DDesai KaranGoelStanford University - ChrisDonahueCarnegie Mellon University and Google + ChrisDonahueCarnegie Mellon University and Google - AlbertGuCarnegie Mellon University + AlbertGuCarnegie Mellon University 55-65 Neural codec language models (or codec LMs) are emerging as a powerful framework for audio generation tasks like text-to-speech (TTS). These models leverage advancements in language modeling and residual vector quantization (RVQ)-based audio codecs, which compress audios into discrete codes for LMs to process. Despite the close interdependence of codecs and LMs in these systems, research on codecs and LMs has largely remained siloed. In this work, we propose three techniques for better codec-LM co-design: (i) a frame-wise codec encoder that improves both LM log-likelihood and end-to-end TTS metrics, (ii) LM codebook level dropout, a method to efficiently navigate a portion of the codec-LM design space by training a single LM, and (iii) increased codec frame duration, which we show can accelerate inference while maintaining end-to-end performance. Our experiments demonstrate that combining all three co-design techniques results in doubled inference speed, and improvements in intelligibility, audio quality, and speaker control in TTS relative to a siloed baseline. 2025.naacl-srw.6 @@ -10147,7 +10147,7 @@ Low-resource Machine Translation for Code-switched <fixed-case>K</fixed-case>azakh-<fixed-case>R</fixed-case>ussian Language Pair MaksimBorisov - ZhanibekKozhirbayevNational Laboratory Astana, Nazarbayev University + ZhanibekKozhirbayevNational Laboratory Astana, Nazarbayev University ValentinMalykh 66-76 Machine translation for low-resource language pairs is a challenging task. This task could become extremely difficult once a speaker uses code switching. We present the first code-switching Kazakh-Russian parallel corpus. Additionally, we propose a method to build a machine translation model for the code-switched Kazakh-Russian language pair with no labeled data. Our method is based on the generation of synthetic data. This method results in a model that beats an existing commercial system in human evaluation. 2025.naacl-srw.7 @@ -10156,11 +10156,11 @@ Generative Product Recommendations for Implicit Superlative Queries KaustubhDholeEmory University NikhitaVedulaAmazon - SaarKuziAmazon + SaarKuziAmazon GiuseppeCastellucciAmazon - EugeneAgichteinEmory University + EugeneAgichteinEmory University ShervinMalmasiAmazon 77-91 In recommender systems, users often seek the best products through indirect, vague, or under-specified queries such as “best shoes for trail running.” These queries, referred to as implicit superlative queries, pose a challenge for standard retrieval and ranking systems due to their lack of explicit attribute mentions and the need for identifying and reasoning over complex attributes.
We investigate how Large Language Models (LLMs) can generate implicit attributes for ranking and reason over them to improve product recommendations for such queries. As a first step, we propose a novel four-point schema, called SUPERB, for annotating the best product candidates for superlative queries, paired with LLM-based product annotations. We then empirically evaluate several existing retrieval and ranking approaches on our newly created dataset, providing insights and discussing how to integrate these findings into real-world e-commerce production systems. @@ -10172,7 +10172,7 @@ YichengFu ZikuiWang LiuxinYang - MeiqingHuo + MeiqingHuo ZhongdongmingDaiUniversity of California, San Diego 92-104 Quizzes play a crucial role in education by reinforcing students’ understanding of key concepts and encouraging self-directed exploration. However, compiling high-quality quizzes can be challenging and require deep expertise and insight into specific subject matter. Although LLMs have greatly enhanced the efficiency of quiz generation, concerns remain regarding the quality of these AI-generated quizzes and their educational impact on students. To address these issues, we introduce ConQuer, a concept-based quiz generation framework that leverages external knowledge sources. We employ comprehensive evaluation dimensions to assess the quality of the generated quizzes, using LLMs as judges. Our experiment results demonstrate a 4.8% improvement in evaluation scores and a 77.52% win rate in pairwise comparisons against baseline quiz sets. Ablation studies further underscore the effectiveness of each component in our framework. @@ -10182,10 +10182,10 @@ What is it? Towards a Generalizable Native <fixed-case>A</fixed-case>merican Language Identification System IvoryYang - WeichengMaGeorgia Institute of Technology + WeichengMaGeorgia Institute of Technology Carlos GuerreroAlvarezDartmouth College and Dartmouth College WilliamDinauer - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College 105-111 This paper presents a research thesis proposal to develop a generalizable Native American language identification system. Despite their cultural and historical significance, Native American languages remain entirely unsupported by major commercial language identification systems. This omission not only underscores the systemic neglect of endangered languages in technological development, but also highlights the urgent need for dedicated, community-driven solutions. We propose a two-pronged approach: (1) systematically curating linguistic resources across all Native American languages for robust training, and (2) tailored data augmentation to generate synthetic yet linguistically coherent training samples. As proof of concept, we extend an existing rudimentary Athabaskan language classifier by integrating Plains Apache, an extinct Southern Athabaskan language, as an additional language class. We also adapt a data generation framework for low-resource languages to create synthetic Plains Apache data, highlighting the potential of data augmentation. This proposal advocates for a community-driven, technological approach to supporting Native American languages. 2025.naacl-srw.10 @@ -10203,7 +10203,7 @@ Sentimatic: Sentiment-guided Automatic Generation of Preference Datasets for Customer Support Dialogue System - SuhyunLee + SuhyunLee ChangHeonHan 120-128 Supervised Fine-tuning (SFT) and preference optimization (PO) are key methods for enhancing language models and aligning them with human preferences. 
However, scaling preference datasets for PO training is challenging, leading AI customer support systems to rely on SFT. To address this, we propose the Sentiment-guided Automatic Generation of Preference Datasets (Sentimatic) methodology to automatically generate customer preference datasets without human intervention using a publicly available dataset constructed for SFT. Our approach classifies responses by sentiment, fine-tunes models on them, and applies advanced sampling and evaluation techniques to ensure diversity and quality. Ultimately, we generated 1,174 customer preference datasets based on 357 test datasets, and through experiments, we confirmed that the AI customer support system trained on these datasets is capable of carefully considering customer emotions and generating professional and appropriate responses. @@ -10212,7 +10212,7 @@ Privacy-Preserving Federated Learning for Hate Speech Detection - Ivode Souza Bueno Júnior + Ivode Souza Bueno Júnior HaotianYe AxelWisiorek HinrichSchütze @@ -10223,9 +10223,9 @@ From Annotation to Adaptation: Metrics, Synthetic Data, and Aspect Extraction for Aspect-Based Sentiment Analysis with Large Language Models - NikitaNeveditsinSt. Mary’s University - PawanLingrasSt. Mary’s University - Vijay KumarMagoYork University + NikitaNeveditsinSt. Mary’s University + PawanLingrasSt. Mary’s University + Vijay KumarMagoYork University 142-161 This study examines the performance of Large Language Models (LLMs) in Aspect-Based Sentiment Analysis (ABSA), with a focus on implicit aspect extraction in a novel domain. Using a synthetic sports feedback dataset, we evaluate open-weight LLMs’ ability to extract aspect-polarity pairs and propose a metric to facilitate the evaluation of aspect extraction with generative models. Our findings highlight both the potential and limitations of LLMs in the ABSA task. 2025.naacl-srw.14 @@ -10234,9 +10234,9 @@ Developing <fixed-case>J</fixed-case>apanese <fixed-case>CLIP</fixed-case> Models Leveraging an Open-weight <fixed-case>LLM</fixed-case> for Large-scale Dataset Translation IssaSugiuraKyoto University - ShuheiKuritaNational Institute of Informatics and New York University + ShuheiKuritaNational Institute of Informatics and New York University YusukeOdaNational Institute of Informatics and Nara Institute of Science and Technology - DaisukeKawaharaWaseda University + DaisukeKawaharaWaseda University NaoakiOkazakiInstitute of Science Tokyo 162-170 CLIP is a foundational model that bridges images and text, widely adopted as a key component in numerous vision-language models.However, the lack of large-scale open Japanese image-text pairs poses a significant barrier to the development of Japanese vision-language models.In this study, we constructed a Japanese image-text pair dataset with 1.5 billion examples using machine translation with open-weight LLMs and pre-trained Japanese CLIP models on the dataset.The performance of the pre-trained models was evaluated across seven benchmark datasets, achieving competitive average scores compared to models of similar size without the need for extensive data curation. However, the results also revealed relatively low performance on tasks specific to Japanese culture, highlighting the limitations of translation-based approaches in capturing cultural nuances. Our dataset, models, and code are publicly available. 
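The Japanese CLIP work above rests on CLIP's image-text similarity. A minimal sketch of zero-shot image-text matching with the public English CLIP checkpoint from transformers; this checkpoint is only a stand-in (the paper trains its own Japanese models), and the solid-color test image is invented to keep the example self-contained:

# Sketch: CLIP-style zero-shot image-text matching, the mechanism behind the
# Japanese CLIP models above. Uses the public English checkpoint as a stand-in.
# Requires: pip install torch transformers pillow
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# A synthetic solid-red image keeps the example self-contained.
image = Image.new("RGB", (224, 224), color=(255, 0, 0))
captions = ["a red square", "a photo of a cat", "a snowy mountain"]

inputs = processor(text=captions, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Higher probability means the caption matches the image better.
probs = outputs.logits_per_image.softmax(dim=1).squeeze().tolist()
for caption, p in zip(captions, probs):
    print(f"{p:.3f}  {caption}")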
@@ -10279,11 +10279,11 @@ Multilingual Native Language Identification with Large Language Models - DhimanGoswamiGeorge Mason University + DhimanGoswamiGeorge Mason University MarcosZampieriGeorge Mason University KaiNorth ShervinMalmasiAmazon - AntoniosAnastasopoulosAthena Research Center and George Mason University + AntoniosAnastasopoulosAthena Research Center and George Mason University 193-199 Native Language Identification (NLI) is the task of automatically identifying the native language (L1) of individuals based on their second language (L2) production. The introduction of Large Language Models (LLMs) with billions of parameters has renewed interest in text-based NLI, with new studies exploring LLM-based approaches to NLI on English L2. The capabilities of state-of-the-art LLMs on non-English NLI corpora, however, have not yet been fully evaluated. To fill this important gap, we present the first evaluation of LLMs for multilingual NLI. We evaluated the performance of several LLMs compared to traditional statistical machine learning models and language-specific BERT-based models on NLI corpora in English, Italian, Norwegian, and Portuguese. Our results show that fine-tuned GPT-4 models achieve state-of-the-art NLI performance. 2025.naacl-srw.19 @@ -10294,7 +10294,7 @@ SamuelBelkadi LiboRen NicoloMicheletti - LifengHan + LifengHan GoranNenadicUniversity of Manchester 200-206 The abundance of medical records holds great promise for enhancing healthcare and advancing biomedical research. However, due to privacy constraints, access to such data is typically limited to internal use. Recent studies have attempted to overcome this challenge by generating synthetic data through Causal Language Modelling. Yet, this approach often fails to ensure patient anonymity and offers limited control over output diversity—unless additional computational cost is introduced. In response, we propose a method for generating synthetic free-text medical records based on Masked Language Modelling. Our approach retains key medical details while introducing variability in the generated texts and reducing the risk of patient re-identification. With a relatively lightweight architecture of approximately 120 million parameters, the system ensures low inference costs. Experimental results show that our method produces high-quality synthetic data, achieving a HIPAA-compliant PHI recall of 96% and a re-identification risk of only 3.5%. Furthermore, downstream evaluations reveal that models trained on the synthetic data perform comparably to those trained on real-world data. Our trained models are publicly available on Github as SynDeidMLM (at https://github.com/SamySam0/SynDeidMLM) (meaning synthetic and de-identified data generation using MLM). 2025.naacl-srw.20 @@ -10304,7 +10304,7 @@ How many words does it take to understand a low-resource language? EmilyChang - NadaBasitUniversity of Virginia, Charlottesville + NadaBasitUniversity of Virginia, Charlottesville 207-224 When developing language technology, researchers have routinely turned to transfer learning to resolve the data scarcity conundrum presented in low-resource languages. As far as we know, this study is the first to evaluate the amount of documentation needed for transfer learning, specifically the smallest vocabulary size needed to create a sentence embedding space.
In adopting widely spoken languages as a proxy for low-resource languages, our experiments show that the relationship between a sentence embedding’s vocabulary size and performance is logarithmic with performance leveling at a vocabulary size of 25,000. It should be noted that this relationship cannot be replicated across all languages and this level of documentation does not exist for many low-resource languages. We do observe, however, that performance accelerates at a vocabulary size of \le 1000, a quantity that is present in most low-resource language documentation. These results can aid researchers in understanding whether a low-resource language has enough documentation necessary to support the creation of a sentence embedding and language model. 2025.naacl-srw.21 @@ -10332,7 +10332,7 @@ Tighter Clusters, Safer Code? Improving Vulnerability Detection with Enhanced Contrastive Loss PranavKapparad - Biju RMohan + Biju RMohan 247-252 Distinguishing vulnerable code from non-vulnerable code is challenging due to high inter-class similarity. Supervised contrastive learning (SCL) improves embedding separation but struggles with intra-class clustering, especially when variations within the same class are subtle. We propose Cluster-Enhanced Supervised Contrastive Loss (CESCL), an extension of SCL with a distance-based regularization term that tightens intra-class clustering while maintaining inter-class separation. Evaluating on CodeBERT and GraphCodeBERT with Binary Cross Entropy (BCE), BCE + SCL, and BCE + CESCL, our method improves F1 score by 1.76% on CodeBERT and 4.1% on GraphCodeBERT, demonstrating its effectiveness in code vulnerability detection and broader applicability to high-similarity classification tasks. 2025.naacl-srw.24 @@ -10342,7 +10342,7 @@ Text Extraction and Script Completion in Images of <fixed-case>A</fixed-case>rabic Script-Based Calligraphy: A Thesis Proposal Dilara ZeynepGürer ÜmitAtlamazBogazici University - Şaziye BetülÖzateşBoğaziçi University + Şaziye BetülÖzateşBoğaziçi University 253-259 Arabic calligraphy carries rich historical information and meaning. However, the complexity of its artistic elements and the absence of a consistent baseline make text extraction from such works highly challenging. In this paper, we provide an in-depth analysis of the unique obstacles in processing and interpreting these images, including the variability in calligraphic styles, the influence of artistic distortions, and the challenges posed by missing or damaged text elements. We explore potential solutions by leveraging state-of-the-art architectures and deep learning models, including visual language models, to improve text extraction and script completion. 2025.naacl-srw.25 @@ -10351,7 +10351,7 @@ Subasa - Adapting Language Models for Low-resourced Offensive Language Detection in <fixed-case>S</fixed-case>inhala ShanilkaHaturusinghe - Tharindu CyrilWeerasooriyaAccenture + Tharindu CyrilWeerasooriyaAccenture Christopher MHoman MarcosZampieriGeorge Mason University Sidath RavindraLiyanage @@ -10363,8 +10363,8 @@ Integrating Symbolic Execution into the Fine-Tuning of Code-Generating <fixed-case>LLM</fixed-case>s MarinaSakharova - AbhinavAnandTechnische Universität Darmstadt - MiraMeziniTechnische Universität Darmstadt + AbhinavAnandTechnische Universität Darmstadt + MiraMeziniTechnische Universität Darmstadt 271-278 Code-generating Large Language Models (LLMs) have become essential tools in modern software development, enhancing productivity and accelerating development. 
This paper aims to investigate the fine-tuning of code-generating LLMs using Reinforcement Learning and Direct Preference Optimization, further improving their performance. To achieve this, we enhance the training data for the reward model with the help of symbolic execution techniques, ensuring more comprehensive and objective data. With symbolic execution, we create a custom dataset that better captures the nuances in code evaluation. Our reward models, fine-tuned on this dataset, demonstrate significant improvements over the baseline, CodeRL, in estimating the quality of generated code. Our code-generating LLMs, trained with the help of reward model feedback, achieve similar results compared to the CodeRL benchmark. 2025.naacl-srw.27 @@ -10375,7 +10375,7 @@ EliseiRykovSkolkovo Institute of Science and Technology and Tinkoff KseniiaPetrushina KseniiaTitova - AntonRazzhigaev + AntonRazzhigaev AlexanderPanchenkoSkoltech VasilyKonovalovAIRI 279-293 @@ -10385,9 +10385,9 @@ <fixed-case>C</fixed-case>olor<fixed-case>F</fixed-case>oil: Investigating Color Blindness in Large Vision and Language Models - Ahnaf MozibSamin - M FirozAhmed - Md. Mushtaq ShahriyarRafeeMetropolitan University + Ahnaf MozibSamin + M FirozAhmed + Md. Mushtaq ShahriyarRafeeMetropolitan University 294-300 With the utilization of Transformer architecture, large Vision and Language (V&L) models have shown promising performance in even zero-shot settings. Several studies, however, indicate a lack of robustness of the models when dealing with complex linguistics and visual attributes. In this work, we introduce a novel V&L benchmark - ColorFoil, by creating color-related foils to assess the models’ perception ability to detect colors like red, white, green, etc. We evaluate seven state-of-the-art V&L models including CLIP, ViLT, GroupViT, and BridgeTower, etc. in a zero-shot setting and present intriguing findings from the V&L models. The experimental evaluation indicates that ViLT and BridgeTower demonstrate much better color perception capabilities compared to CLIP and its variants and GroupViT. Moreover, CLIP-based models and GroupViT struggle to distinguish colors that are visually distinct to humans with normal color perception ability. 2025.naacl-srw.29 @@ -10406,7 +10406,7 @@ Anik MahmudShanto Mst. Sanjida JamalPriya Fahim ShakilTamim - Mohammed MoshiulHoqueChittagong University of Engineering and Technology + Mohammed MoshiulHoqueChittagong University of Engineering and Technology 311-320 Identifying commercial posts in resource-constrained languages among diverse and unstructured content remains a significant challenge for automatic text classification tasks. To address this, this work introduces a novel dataset named MDC^3 (Multimodal Dataset for Commercial Content Classification), comprising 5,007 annotated Bengali social media posts classified as commercial and noncommercial. A comprehensive annotation guideline accompanying the dataset is included to aid future dataset creation in resource-constrained languages. Furthermore, we performed extensive experiments on MDC^3 considering both unimodal and multimodal domains. Specifically, the late fusion of textual (mBERT) and visual (ViT) models (i.e., ViT+mBERT) achieves the highest F1 score of 90.91, significantly surpassing other baselines. 
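The MDC^3 result above comes from a late fusion of mBERT (text) and ViT (image). The sketch below shows one common reading of decision-level late fusion, with random tensors standing in for the real pooled encoder outputs; the head sizes and probability averaging are assumptions, since the abstract does not publish the exact fusion architecture:

# Sketch: decision-level late fusion over text and image encoders, one common
# reading of the ViT+mBERT fusion above. Random tensors stand in for the pooled
# mBERT/ViT outputs; head sizes and probability averaging are assumptions.
# Requires: pip install torch
import torch
import torch.nn as nn

class LateFusionClassifier(nn.Module):
    def __init__(self, text_dim=768, image_dim=768, num_classes=2):
        super().__init__()
        # One classifier per modality; their probabilities are averaged.
        self.text_head = nn.Linear(text_dim, num_classes)
        self.image_head = nn.Linear(image_dim, num_classes)

    def forward(self, text_feat, image_feat):
        text_probs = self.text_head(text_feat).softmax(dim=-1)
        image_probs = self.image_head(image_feat).softmax(dim=-1)
        return (text_probs + image_probs) / 2  # late (decision-level) fusion

batch_size = 4
text_feat = torch.randn(batch_size, 768)   # would come from mBERT's pooled output
image_feat = torch.randn(batch_size, 768)  # would come from ViT's pooled output

fused = LateFusionClassifier()(text_feat, image_feat)
print(fused.argmax(dim=-1))  # predicted class per post: commercial vs. non-commercial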
2025.naacl-srw.31 @@ -10429,7 +10429,7 @@ LisPereiraNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology FeiChengKyoto University Wan JouSheKyoto Institute of Technology - EijiAramakiNara Institute of Science and Technology, Japan + EijiAramakiNara Institute of Science and Technology, Japan 333-342 Existing in-context learning (ICL) methods for relation extraction (RE) often prioritize language similarity over structural similarity, which may result in overlooking entity relationships. We propose an AMR-enhanced retrieval-based ICL method for RE to address this issue. Our model retrieves in-context examples based on semantic structure similarity between task inputs and training samples. We conducted experiments in the supervised setting on four standard English RE datasets. The results show that our method achieves state-of-the-art performance on three datasets and competitive results on the fourth. Furthermore, our method outperforms baselines by a large margin across all datasets in the more demanding unsupervised setting. 2025.naacl-srw.33 @@ -10437,8 +10437,8 @@ Linguistic Analysis of Veteran Job Interviews to Assess Effectiveness in Translating Military Expertise to the Civilian Workforce - Caroline J.WendtUniversity of Colorado at Boulder - Ehsanul HaqueNirjhar + Caroline J.WendtUniversity of Colorado at Boulder + Ehsanul HaqueNirjhar TheodoraChaspariUniversity of Colorado at Boulder 343-355 The ways in which natural language processing (NLP) can inform how veterans can improve effectiveness in translating military experience to workforce utility is underexplored. We design NLP experiments to evaluate the degree of explanation in veteran job interview responses as a proxy for perceived hireability. We examine linguistic and psycholinguistic features, context, and participant variability to investigate the mechanics of effective communication in employee selection. Results yield good performance when distinguishing between varying degrees of explanation in responses using LIWC features, indicating robustness of linguistic feature integration. Classifying Over- and Under-explained responses reflects challenges of class imbalance and the limitations of tested NLP methods for detecting subtleties in overly verbose or concise communication. Our findings have immediate applications for assistive technologies in job interview settings, and broader implications for enhancing automated communication assessment tools and refining strategies for training and interventions in communication-heavy fields. @@ -10449,7 +10449,7 @@ <fixed-case>M</fixed-case>eta<fixed-case>M</fixed-case>eme: A Dataset for Meme Template and Meta-Category Classification BenjaminLambrightBrandeis University JordanYouner - ConstantineLignosBrandeis University + ConstantineLignosBrandeis University 356-367 This paper introduces a new dataset for classifying memes by their template and communicative intent. It includes a broad selection of meme templates and examples scraped from imgflip and a smaller hand-annotated set of memes scraped from Reddit. The Reddit memes have been annotated for meta-category using a novel annotation scheme that classifies memes by the structure of the perspective they are being used to communicate. YOLOv11 and ChatGPT 4o are used to provide baseline modeling results. We find that YOLO struggles with template classification on real-world data but outperforms ChatGPT in classifying meta-categories.
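The AMR-enhanced ICL paper above retrieves in-context examples by semantic-structure similarity. The sketch below keeps the retrieve-then-prompt skeleton but substitutes plain TF-IDF cosine similarity for AMR similarity, so the scoring function, toy example pool, and prompt template are all assumptions:

# Sketch: retrieval-based in-context example selection for relation extraction.
# TF-IDF cosine similarity stands in for the AMR structural similarity used in
# the paper above; the example pool and prompt template are invented.
# Requires: pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_pool = [
    "Marie Curie was born in Warsaw.",       # relation: born_in
    "Apple acquired the startup in 2019.",   # relation: acquired
    "The Seine flows through Paris.",        # relation: located_in
]
query = "Alan Turing was born in London."

vectorizer = TfidfVectorizer().fit(train_pool + [query])
sims = cosine_similarity(
    vectorizer.transform([query]), vectorizer.transform(train_pool)
)[0]

# Pick the top-k most similar training examples as in-context demonstrations.
top_k = sims.argsort()[::-1][:2]
demonstrations = "\n".join(train_pool[i] for i in top_k)
prompt = f"{demonstrations}\n{query}\nRelation:"
print(prompt)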
2025.naacl-srw.35 @@ -10458,8 +10458,8 @@ Representing and Clustering Errors in Offensive Language Detection JoodOteyOakland University - LauraBiesterMiddlebury College - Steven RWilsonUniversity of Michigan - Flint + LauraBiesterMiddlebury College + Steven RWilsonUniversity of Michigan - Flint 368-380 Content moderation is essential in preventing the spread of harmful content on the Internet. However, there are instances where moderation fails and it is important to understand when and why that happens. Workflows that aim to uncover a system’s weakness typically use clustering of the data points’ embeddings to group errors together. In this paper, we evaluate the K-Means clustering of four text representations for the task of offensive language detection in English and Levantine Arabic. We find Sentence-BERT (SBERT) embeddings give the most human-interpretable clustering for English errors and the grouping is mainly based on the targeted group in the text. Meanwhile, SBERT embeddings of Large Language Model (LLM)-generated linguistic features give the most interpretable clustering for Arabic errors. 2025.naacl-srw.36 @@ -10478,7 +10478,7 @@ Can Large Language Models Advance Crosswalks? The Case of <fixed-case>D</fixed-case>anish Occupation Codes BoleiMaLudwig-Maximilians-Universität München - Cynthia A.HuangMonash University + Cynthia A.HuangMonash University Anna-CarolinaHaenschUniversity of Maryland, College Park and Ludwig-Maximilians-Universität München 392-399 Crosswalks, which map one classification system to another, are critical tools for harmonizing data across time, countries, or frameworks. However, constructing crosswalks is labor-intensive and often requires domain expertise. This paper investigates the potential of Large Language Models (LLMs) to assist in creating crosswalks, focusing on two Danish occupational classification systems from different time periods as a case study. We propose a two-stage, prompt-based framework for this task, where LLMs perform similarity assessments between classification codes and identify final mappings through a guided decision process. Using four instruction-tuned LLMs and comparing them against an embedding-based baseline, we evaluate the performance of different models in crosswalks. Our results highlight the strengths of LLMs in crosswalk creation compared to the embedding-based baseline, showing the effectiveness of the interactive prompt-based framework for conducting crosswalks by LLMs. Furthermore, we analyze the impact of model combinations across two interactive rounds, highlighting the importance of model selection and consistency. This work contributes to the growing field of NLP applications for domain-specific knowledge mapping and demonstrates the potential of LLMs in advancing crosswalk methodologies. @@ -10498,7 +10498,7 @@ Do Video Language Models really understand the video contexts? - JeongwanShinKyungpook National University + JeongwanShinKyungpook National University JinhyeongLimHD Korea Shipbuilding & Offshore Engineering HyeyoungParkKyungpook National University 408-417 @@ -10509,9 +10509,9 @@ Evaluating Text Style Transfer Evaluation: Are There Any Reliable Metrics? 
SourabrataMukherjee
- Atul Kr.OjhaUniversity of Galway, Ireland, Insight SFI Research Centre for Data Analytics, DSI, University of Galway, Ireland and Panlingua Languague Processing LLP, India
- John PhilipMcCraeNational University of Ireland Galway
- OndrejDusekCharles University, Prague
+ Atul Kr.OjhaUniversity of Galway, Ireland, Insight SFI Research Centre for Data Analytics, DSI, University of Galway, Ireland and Panlingua Languague Processing LLP, India
+ John PhilipMcCraeNational University of Ireland Galway
+ OndrejDusekCharles University, Prague
418-434
Text style transfer (TST) is the task of transforming a text to reflect a particular style while preserving its original content. Evaluating TST outputs is a multidimensional challenge, requiring the assessment of style transfer accuracy, content preservation, and naturalness. Using human evaluation is ideal but costly, as is common in other natural language processing (NLP) tasks; however, automatic metrics for TST have not received as much attention as metrics for, e.g., machine translation or summarization. In this paper, we examine both sets of existing and novel metrics from broader NLP tasks for TST evaluation, focusing on two popular subtasks—sentiment transfer and detoxification—in a multilingual context comprising English, Hindi, and Bengali. By conducting meta-evaluation through correlation with human judgments, we demonstrate the effectiveness of these metrics when used individually and in ensembles. Additionally, we investigate the potential of large language models (LLMs) as tools for TST evaluation. Our findings highlight that newly applied advanced NLP metrics and LLM-based evaluations provide better insights than existing TST metrics. Our oracle ensemble approaches show even more potential.
2025.naacl-srw.41


@@ -10520,8 +10520,8 @@

(<fixed-case>CPER</fixed-case>) From Guessing to Asking: An Approach to Resolving Persona Knowledge Gap in <fixed-case>LLM</fixed-case>s during Multi-Turn Conversations
SarveshBaskar
- ManasGaurUniversity of Maryland Baltimore County
- SrinivasanParthasarathyOhio State University
+ ManasGaurUniversity of Maryland Baltimore County
+ SrinivasanParthasarathyOhio State University
Tanmay TulsidasVerlekarBITS Pilani, Goa campus
435-447
In multi-turn dialogues, large language models face a critical challenge of ensuring coherence while adapting to user-specific information. This study introduces the persona knowledge gap, the discrepancy between a model’s internal understanding and the knowledge required for coherent, personalized conversations. While prior research has recognized these gaps, computational methods for their identification and resolution remain underexplored. We propose Conversation Preference Elicitation and Recommendation (CPER), a novel framework that dynamically detects and resolves persona knowledge gaps using intrinsic uncertainty quantification and feedback-driven refinement. CPER consists of three key modules: a Contextual Understanding Module for preference extraction, a Dynamic Feedback Module for measuring uncertainty and refining persona alignment, and a Persona-Driven Response Generation module for adapting responses based on accumulated user context. We evaluate CPER on two real-world datasets: CCPE-M for preferential movie recommendations and ESConv for mental health support. Using A/B testing, human evaluators preferred CPER’s responses 42% more often than baseline models in CCPE-M and 27% more often in ESConv.
A qualitative human evaluation confirms that CPER’s responses are preferred for maintaining contextual relevance and coherence, particularly in longer (12+ turn) conversations. @@ -10532,7 +10532,7 @@ Streamlining <fixed-case>LLM</fixed-case>s: Adaptive Knowledge Distillation for Tailored Language Models PrajviSaxenaGerman Research Center for AI SabineJanzen - WolfgangMaassUniversität des Saarlandes + WolfgangMaassUniversität des Saarlandes 448-455 Large language models (LLMs) like GPT-4 and LLaMA-3 offer transformative potential across industries, e.g., enhancing customer service, revolutionizing medical diagnostics, or identifying crises in news articles. However, deploying LLMs faces challenges such as limited training data, high computational costs, and issues with transparency and explainability. Our research focuses on distilling compact, parameter-efficient tailored language models (TLMs) from LLMs for domain-specific tasks with comparable performance. Current approaches like knowledge distillation, fine-tuning, and model parallelism address computational efficiency but lack hybrid strategies to balance efficiency, adaptability, and accuracy. We present ANON - an adaptive knowledge distillation framework integrating knowledge distillation with adapters to generate computationally efficient TLMs without relying on labeled datasets. ANON uses cross-entropy loss to transfer knowledge from the teacher’s outputs and internal representations while employing adaptive prompt engineering and a progressive distillation strategy for phased knowledge transfer. We evaluated ANON’s performance in the crisis domain, where accuracy is critical and labeled data is scarce. Experiments showed that ANON outperforms recent approaches of knowledge distillation, both in terms of the resulting TLM performance and in reducing the computational costs for training and maintaining accuracy compared to LLMs for domain-specific applications. 2025.naacl-srw.43 @@ -10553,7 +10553,7 @@ <fixed-case>A</fixed-case>uto<fixed-case>ML</fixed-case> Meets Hugging Face: Domain-Aware Pretrained Model Selection for Text Classification ParisaSafikhaniOtto-von-Guericke-Universität Magdeburg - Engineering, Otto von Guericke University Magdeburg and The German Centre for Higher Education Research and Science Studies (DZHW) - DavidBroneskeDeutsches Zentrum für Hochschul- und Wissenschaftsforschung + DavidBroneskeDeutsches Zentrum für Hochschul- und Wissenschaftsforschung 466-473 The effectiveness of embedding methods is crucial for optimizing text classification performance in Automated Machine Learning (AutoML). However, selecting the most suitable pre-trained model for a given task remains challenging. This study introduces the Corpus-Driven Domain Mapping (CDDM) pipeline, which utilizes a domain-annotated corpus of pre-fine-tuned models from the Hugging Face Model Hub to improve model selection. Integrating these models into AutoML systems significantly boosts classification performance across multiple datasets compared to baseline methods. Despite some domain recognition inaccuracies, results demonstrate CDDM’s potential to enhance model selection, streamline AutoML workflows, and reduce computational costs. 2025.naacl-srw.45 @@ -10570,7 +10570,7 @@ Detecting, Generating, and Evaluating in the Writing Style of Different Authors - MosabRezaeiNorthern Illinois University + MosabRezaeiNorthern Illinois University 485-491 In recent years, stylometry has been investigated in many different fields. 
Hence, in this work, we tackle this problem of detecting, generating, and evaluating textual documents according to writing style by leveraging state-of-the-art models. In the first step, the sentences will be extracted from several different books, each belonging to a different author, to create a dataset. Then the selected models will be trained to detect the author of sentences in the dataset. After that, generator models are utilized to generate sentences based on the authors’ writing styles with unpaired samples in the dataset. Finally, to evaluate the performance of the generators, the previously trained models will be used to assess the generated sentences and to compare the distribution of various syntactic features between the original and generated sentences. We hope the results show that models can be trained to detect and generate textual documents for the given authors according to their writing style.
2025.naacl-srw.47


Collaborative Data Exploration through Visualization: A Thesis Proposal Analyzing Impact of Conversational Assistants
- AbariBhattacharyaUniversity of Illinois at Chicago
- BarbaraDi EugenioUniversity of Illinois, Chicago
+ AbariBhattacharyaUniversity of Illinois at Chicago
+ BarbaraDi EugenioUniversity of Illinois, Chicago
492-500
Data visualization is integral to any Exploratory Data Analysis (EDA) task. However, generating visualizations requires expertise, presenting a steep learning curve and a significant cognitive load. Natural language interfaces for EDA aim to lower this barrier by allowing users to generate visualizations through natural language queries. However, complexity remains when EDA is performed collaboratively, requiring an environment to support multi-user interaction. In this thesis proposal, we discuss challenges in user-system interaction in a collaborative multi-user setup, such as errors in visualization generation due to misinterpretation of user requests. We hypothesize that a Conversational Assistant (CA) capable of understanding user-initiated clarification requests and generating accurate responses can improve user experience and support collaborative EDA tasks. To this end, we propose to develop such a CA and evaluate it through a user study, thus examining its impact on user experience in a collaborative environment for EDA.
2025.naacl-srw.48


@@ -10591,7 +10591,7 @@
PriyanshuPriya
Armita ManiTripathi
PradeepikaVermaIndian Institute of Technology, Patna, Dhirubhai Ambani Institute Of Information and Communication Technology
- AsifEkbalIndian Institute of Technology, Jodhpur
+ AsifEkbalIndian Institute of Technology, Jodhpur
501-516
Commonsense inference and domain-specific expertise are crucial for understanding and responding to emotional, cognitive, and topic-specific cues in counseling conversations with crime victims. However, these key pieces of evidence are often dispersed across multiple utterances, making it difficult to capture them through single-hop reasoning. To address this, we propose MENDER, a novel Multi-hop commonsensE and domaiN-specific Chain-of-Thought (CoT) reasoning framework for knowleDge-grounded empathEtic Response generation in counseling dialogues. MENDER leverages large language models (LLMs) to integrate commonsense and domain knowledge via multi-hop reasoning over the dialogue context. It employs two specialized reasoning chains, viz.
Commonsense Knowledge-driven CoT and Domain Knowledge-driven CoT rationales, which extract and aggregate dispersed emotional, cognitive, and topical evidence to generate knowledge-grounded empathetic counseling responses. Experimental evaluations on the counseling dialogue dataset POEM validate MENDER’s efficacy in generating coherent, empathetic, knowledge-grounded responses.
2025.naacl-srw.49


<fixed-case>S</fixed-case>kip<fixed-case>CLM</fixed-case>: Enhancing Crosslingual Alignment of Decoder Transformer Models via Contrastive Learning and Skip Connection
NikitaSushko
AlexanderPanchenkoSkoltech
- ElenaTutubalinaKazan Federal University
+ ElenaTutubalinaKazan Federal University
517-528
This paper proposes SkipCLM, a novel method for improving multilingual machine translation in Decoder Transformers. We augment contrastive learning for cross-lingual alignment with a trainable skip connection to preserve information crucial for accurate target language generation. Experiments with XGLM-564M on the Flores-101 benchmark demonstrate improved performance, particularly for en-de and en-zh direction translations, compared to direct sequence-to-sequence training and existing contrastive learning methods. Code is available at: https://github.com/s-nlp/skipclm.
2025.naacl-srw.50


Towards <fixed-case>LLM</fixed-case>s Robustness to Changes in Prompt Format Styles
LilianNgweta
- KiranKateInternational Business Machines
+ KiranKateInternational Business Machines
JasonTsayIBM Research
YaraRizkInternational Business Machines
529-537
@@ -10625,10 +10625,10 @@
SeijiGobara
RyoTsujimotoNara Institute of Science and Technology, Japan
HibikiNakataniNara Institute of Science and Technology
- KazukiHayashi
+ KazukiHayashi
YusukeSakaiNara Institute of Science and Technology, Japan
- HidetakaKamigaitoNara Institute of Science and Technology
- TaroWatanabeNara Institute of Science and Technology, Japan
+ HidetakaKamigaitoNara Institute of Science and Technology
+ TaroWatanabeNara Institute of Science and Technology, Japan
538-550
The proportion of responses to a question and its options, known as the response distribution, enables detailed analysis of human society. Recent studies highlight the use of Large Language Models (LLMs) for predicting response distributions as a cost-effective survey method. However, the reliability of these predictions remains unclear. LLMs often generate answers by blindly following instructions rather than applying rational reasoning based on pretraining-acquired knowledge. This study investigates whether LLMs can rationally estimate distributions when presented with explanations of “artificially generated distributions” that are against commonsense. Specifically, we assess whether LLMs recognize counterintuitive explanations and adjust their predictions or simply follow these inconsistent explanations. Results indicate that smaller or less human-optimized LLMs tend to follow explanations uncritically, while larger or more optimized models are better at resisting counterintuitive explanations by leveraging their pretraining-acquired knowledge. These findings shed light on factors influencing distribution prediction performance in LLMs and are crucial for developing reliable distribution predictions using language models.
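One way to operationalize the comparison the abstract above describes is to measure how far a model-predicted response distribution sits from the artificially generated, counterintuitive one. A small self-contained sketch with made-up numbers:

# Compare a predicted response distribution against an artificial one.
# The distributions here are invented purely for illustration.
def total_variation(p, q):
    """Total variation distance between two distributions over the same options."""
    return 0.5 * sum(abs(p[k] - q[k]) for k in p)

artificial = {"agree": 0.1, "neutral": 0.1, "disagree": 0.8}   # against commonsense
predicted = {"agree": 0.6, "neutral": 0.25, "disagree": 0.15}  # model's estimate

# A small distance means the model followed the explanation uncritically;
# a large one suggests it resisted, relying on pretraining-acquired knowledge.
print(f"TV distance: {total_variation(artificial, predicted):.2f}")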
2025.naacl-srw.52 @@ -10636,7 +10636,7 @@ Rosetta-<fixed-case>PL</fixed-case>: Propositional Logic as a Benchmark for Large Language Model Reasoning - Shaun LeeBaek + Shaun LeeBaek ShaunEsua-Mensah CyrusTsui SejanVigneswaralingamTrinity School diff --git a/data/xml/2025.repl4nlp.xml b/data/xml/2025.repl4nlp.xml index 36e7515dd8..41d6348832 100644 --- a/data/xml/2025.repl4nlp.xml +++ b/data/xml/2025.repl4nlp.xml @@ -26,7 +26,7 @@ <fixed-case>DEPTH</fixed-case>: Discourse Education through Pre-Training Hierarchically Zachary ElishaBamberger OfekGlick - ChaimBaskinBen Gurion University of the Negev + ChaimBaskinBen Gurion University of the Negev YonatanBelinkovTechnion, Technion 1-25 2025.repl4nlp-1.1 @@ -60,7 +60,7 @@ ZhiZhongSony Group Corporation Wei-HsiangLiaoSony Corporation HiromiWakakiSony Group Corporation - YukiMitsufujiSony AI, Sony Group Corporation and Tokyo Institute of Technology, Tokyo Institute of Technology + YukiMitsufujiSony AI, Sony Group Corporation and Tokyo Institute of Technology, Tokyo Institute of Technology 51-58 Music-to-music-video generation is a challenging task due to the intrinsic differences between the music and video modalities. The advent of powerful text-to-video diffusion models has opened a promising pathway for music-video (MV) generation by first addressing the music-to-MV description task and subsequently leveraging these models for video generation. In this study, we focus on the MV description generation task and propose a comprehensive pipeline encompassing training data construction and multimodal model fine-tuning. We fine-tune existing pre-trained multimodal models on our newly constructed music-to-MV description dataset based on the Music4All dataset, which integrates both musical and visual information. Our experimental results demonstrate that music representations can be effectively mapped to textual domains, enabling the generation of meaningful MV description directly from music inputs. We also identify key components in the dataset construction pipeline that critically impact the quality of MV description and highlight specific musical attributes that warrant greater focus for improved MV description generation. 2025.repl4nlp-1.4 @@ -97,10 +97,10 @@ Investigating Adapters for Parameter-efficient Low-resource Automatic Speech Recognition - Ahnaf MozibSamin - ShekharNayakUniversity of Groningen/Campus Fryslan and Institute for Infocomm Research, A*STAR + Ahnaf MozibSamin + ShekharNayakUniversity of Groningen/Campus Fryslan and Institute for Infocomm Research, A*STAR AndreaDe MarcoNA - ClaudiaBorgUniversity of Malta + ClaudiaBorgUniversity of Malta 100-107 Recent years have witnessed the adoption of parameter-efficient adapters in pre-trained language models for natural language processing. Yet, their application in speech processing remains less studied. In this work, we explore the adapters for low-resource speech recognition, introducing a novel technique - ConvAdapt into pre-trained speech models. We investigate various aspects such as data requirements, transfer learning within adapters, and scaling of feed-forward layers in adapters. Our findings reveal that bottleneck adapters offer competitiveness with full fine-tuning with at least 10 hours of data, but they are not as effective in few-shot learning scenarios. Notably, ConvAdapt demonstrates improved performance in such cases. In addition, transfer learning in adapters shows promise, necessitating research in related languages. 
Furthermore, employing larger speech models for adapter-tuning surpasses fine-tuning with ample data, potentially due to reduced overfitting compared to fine-tuning.
2025.repl4nlp-1.8


@@ -112,8 +112,8 @@
ArneBinderGerman Research Center for AI
DavidHarbeckeGerman Research Center for AI
StalinVaranasi
- LeonhardHennigGerman Research Center for AI
- SimonOstermannGerman Research Center for AI
+ LeonhardHennigGerman Research Center for AI
+ SimonOstermannGerman Research Center for AI
SebastianMöller
Josef VanGenabithGerman Research Center for AI and Universität des Saarlandes
108-119


Punctuation Restoration Improves Structure Understanding without Supervision
- JunghyunMin
+ JunghyunMin
MinhoLee
WoochulLee
YeonsooLee


Amuro & Char: Analyzing the Relationship between Pre-Training and Fine-Tuning of Large Language Models
KaiserSunDepartment of Computer Science, Whiting School of Engineering
- MarkDredzeDepartment of Computer Science, Whiting School of Engineering and Bloomberg
+ MarkDredzeDepartment of Computer Science, Whiting School of Engineering and Bloomberg
131-151
Large language model development relies on the pre-train-then-align paradigm, in which the model is typically pre-trained on a large text corpus and undergoes a tuning stage to align the model with human preference or downstream tasks. We investigate the relationship between pre-training and supervised fine-tuning by considering multiple tasks as well as different pre-trained model checkpoints. Our results on 18 datasets and two models suggest that i) although the model benefits significantly through supervised fine-tuning, it may forget previously known domain knowledge and tasks that are not seen during fine-tuning; ii) the model exhibits high sensitivity to evaluation prompts after supervised fine-tuning, but this sensitivity can be alleviated through further pre-training; iii) continual pre-training improves the model in a latent way that manifests after fine-tuning; iv) The model can already solve some tasks after pre-training while fine-tuning most benefits datasets where the model does not show capability during pre-training.
2025.repl4nlp-1.11


State Space Models are Strong Text Rerankers
- ZhichaoXu
+ ZhichaoXu
JinghuaYan
AshimGupta
VivekSrikumarUniversity of Utah


Large Language Models Are Overparameterized Text Encoders
Thennal DK
- TimFischerUniversity of Hamburg
+ TimFischerUniversity of Hamburg
ChrisBiemannU Hamburg
170-184
Large language models (LLMs) demonstrate strong performance as text embedding models when finetuned with supervised contrastive training. However, their large size balloons inference time and memory requirements. In this paper, we show that by pruning the last % layers of an LLM before supervised training for only 1000 steps, we can achieve a proportional reduction in memory and inference time. We evaluate four different state-of-the-art LLMs on text embedding tasks and find that our method can prune up to 30% of layers with negligible impact on performance and up to 80% with only a modest drop. With only three lines of code, our method is easily implemented in any pipeline for transforming LLMs to text encoders. We also propose L3Prune, a novel layer-pruning strategy based on the model’s initial loss that provides two optimal pruning configurations: a large variant with negligible performance loss and a small variant for resource-constrained settings.
On average, the large variant prunes 21% of the parameters with a performance drop, and the small variant only suffers from a decrease while pruning 74% of the model. We consider these results strong evidence that LLMs are overparameterized for text embedding tasks, and can be easily pruned.
diff --git a/data/xml/2025.wnut.xml b/data/xml/2025.wnut.xml
index ad03451d36..938c7cbffd 100644
--- a/data/xml/2025.wnut.xml
+++ b/data/xml/2025.wnut.xml
@@ -37,9 +37,9 @@

Sentiment Analysis on Video Transcripts: Comparing the Value of Textual and Multimodal Annotations
- QuanqiDu
+ QuanqiDu
LoicDe Langhe
- ElsLefeverGhent University
+ ElsLefeverGhent University
VeroniqueHosteUniversiteit Gent
10-15
This study explores the differences between textual and multimodal sentiment annotations on videos and their impact on transcript-based sentiment modelling. Using the UniC and CH-SIMS datasets which are annotated at both the unimodal and multimodal level, we conducted a statistical analysis and sentiment modelling experiments. Results reveal significant differences between the two annotation types, with textual annotations yielding better performance in sentiment modelling and demonstrating superior generalization ability. These findings highlight the challenges of cross-modality generalization and provide insights for advancing sentiment analysis.


Identifying and analyzing ‘noisy’ spelling errors in a second language corpus
- AlanJuffsUniversity of Pittsburgh
+ AlanJuffsUniversity of Pittsburgh
BenNaismithNA
26-37
This paper addresses the problem of identifying and analyzing ‘noisy’ spelling errors in texts written by second language (L2) learners in a written corpus. Using Python, spelling errors were identified in 5774 texts greater than or equal to 66 words (total=1,814,209 words), selected from a corpus of 4.2 million words (Authors-1). The statistical analysis used hurdle() models in R, which are appropriate for non-normal, count data, with many zeros.


We’re Calling an Intervention: Exploring Fundamental Hurdles in Adapting Language Models to Nonstandard Text
AarohiSrivastavaUniversity of Notre Dame
- DavidChiangUniversity of Notre Dame
+ DavidChiangUniversity of Notre Dame
45-56
We present a suite of experiments that allow us to understand the underlying challenges of language model adaptation to nonstandard text. We do so by designing interventions that approximate core features of user-generated text and their interactions with existing biases of language models. Applying our interventions during language model adaptation to nonstandard text variations, we gain important insights into when such adaptation is successful, as well as the aspects of text variation and noise that are particularly difficult for language models to handle. For instance, on text with character-level variation, out-of-the-box performance improves even with a few additional training examples but approaches a plateau, suggesting that more data is not the solution. In contrast, on text with variation involving new words or meanings, far more data is needed, but it leads to a massive breakthrough in performance. Our findings reveal that existing models lack the necessary infrastructure to handle diverse forms of nonstandard text, guiding the development of more resilient language modeling techniques. We make the code for our interventions, which can be applied to any English text data, publicly available.
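A toy version of a character-level intervention of the kind the abstract above describes: randomly swapping adjacent letters to approximate character-level variation. This is an illustrative stand-in, not the paper's released code.

# Approximate character-level variation by swapping adjacent letters.
import random

def char_swap_noise(text, rate=0.1, seed=0):
    rng = random.Random(seed)
    chars = list(text)
    i = 0
    while i < len(chars) - 1:
        if chars[i].isalpha() and chars[i + 1].isalpha() and rng.random() < rate:
            chars[i], chars[i + 1] = chars[i + 1], chars[i]
            i += 2  # skip the swapped pair so it is not swapped back
        else:
            i += 1
    return "".join(chars)

print(char_swap_noise("language model adaptation to nonstandard text", rate=0.2))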
2025.wnut-1.6


RuneBirkmose
Nathan MørkebergReece
Esben HofstedtNorvin
- JohannesBjervaAalborg University
+ JohannesBjervaAalborg University
MikeZhang
57-67
This paper investigates whether Large Language Models (LLMs), fine-tuned on synthetic but domain-representative data, can perform the twofold task of (i) slot and intent detection and (ii) natural language response generation for a smart home assistant, while running solely on resource-limited, CPU-only edge hardware. We fine-tune LLMs to produce both JSON action calls and text responses. Our experiments show that 16-bit and 8-bit quantized variants preserve high accuracy on slot and intent detection and maintain strong semantic coherence in generated text, while the 4-bit model, though retaining generative fluency, suffers a noticeable drop in device-service classification accuracy. Further evaluations on noisy human (non-synthetic) prompts and out-of-domain intents confirm the models’ generalization ability, obtaining around 80–86% accuracy. While the average inference time is 5–6 seconds per query—acceptable for one-shot commands but suboptimal for multi-turn dialogue—our results affirm that an on-device LLM can effectively unify command interpretation and flexible response generation for home automation without relying on specialized hardware.


Applying Transformer Architectures to Detect Cynical Comments in <fixed-case>S</fixed-case>panish Social Media
SamuelGonzalez-LopezUniversidad Tecnológica de Nogales
- StevenBethardUniversity of Arizona
+ StevenBethardUniversity of Arizona
RogelioPlatt-MolinaNA
FranciscaOrozcoNA
68-77


<fixed-case>F</fixed-case>a<fixed-case>BERT</fixed-case>: Pre-training <fixed-case>BERT</fixed-case> on <fixed-case>P</fixed-case>ersian Blogs
MostafaMasumiSharif University of Technology
Seyed SoroushMajd
- MehrnoushShamsfardShahid Beheshti University
+ MehrnoushShamsfardShahid Beheshti University
HamidBeigy
85-96
We introduce FaBERT, a Persian BERT-base model pre-trained on the HmBlogs corpus, encompassing both informal and formal Persian texts. FaBERT is designed to excel in traditional Natural Language Understanding (NLU) tasks, addressing the intricacies of diverse sentence structures and linguistic styles prevalent in the Persian language. In our comprehensive evaluation of FaBERT on 12 datasets in various downstream tasks, encompassing Sentiment Analysis (SA), Named Entity Recognition (NER), Natural Language Inference (NLI), Question Answering (QA), and Question Paraphrasing (QP), it consistently demonstrated improved performance, all achieved within a compact model size. The findings highlight the importance of utilizing diverse corpora, such as HmBlogs, to enhance the performance of language models like BERT in Persian Natural Language Processing (NLP) applications.


Automatically Generating <fixed-case>C</fixed-case>hinese Homophone Words to Probe Machine Translation Estimation Systems
ShenbinQian
ConstantinOrasanUniversity of Surrey
- DipteshKanojiaUniversity of Surrey
- FélixDo CarmoUniversity of Surrey
+ DipteshKanojiaUniversity of Surrey
+ FélixDo CarmoUniversity of Surrey
97-107
Evaluating machine translation (MT) of user-generated content (UGC) involves unique challenges such as checking whether the nuance of emotions from the source is preserved in the target text.
Recent studies have proposed emotion-related datasets, frameworks and models to automatically evaluate MT quality of Chinese UGC, without relying on reference translations. However, whether these models are robust to the challenge of preserving emotional nuances has been left largely unexplored. To this end, we introduce a novel method inspired by information theory which generates challenging Chinese homophone words related to emotions, by leveraging the concept of *self-information*. Our approach generates homophones that were observed to cause translation errors in emotion preservation, and exposes vulnerabilities in MT models struggling to preserve relevant emotions. We evaluate the efficacy of our method using human evaluation and compare it with an existing one, showing that our method achieves higher correlation with human judgments. The generated Chinese homophones, along with their manual translations, are utilized to generate perturbations and to probe the robustness of existing quality evaluation models, including models trained using multi-task learning, fine-tuned variants of multilingual language models, as well as large language models (LLMs). Our results indicate that LLMs with larger size exhibit higher stability and robustness to such perturbations. We release our data and code for reproducibility and further research.
2025.wnut-1.11


<fixed-case>W</fixed-case>ikipedia is Not a Dictionary, Delete! Text Classification as a Proxy for Analysing <fixed-case>W</fixed-case>iki Deletion Discussions
- HsuvasBorkakotyCardiff University
+ HsuvasBorkakotyCardiff University
LuisEspinosa-AnkeCardiff University and AMPLYFI
133-142
Automated content moderation for collaborative knowledge hubs like Wikipedia or Wikidata is an important yet challenging task due to multiple factors. In this paper, we construct a database of discussions happening around articles marked for deletion in several Wikis and in three languages, which we then use to evaluate a range of LMs on different tasks (from predicting the outcome of the discussion to identifying the implicit policy an individual comment might be pointing to). Our results reveal, among others, that discussions leading to deletion are easier to predict, and that, surprisingly, self-produced tags (keep, delete or redirect) don’t always help guide the classifiers, presumably because of users’ hesitation or deliberation within comments.


From Conversational Speech to Readable Text: Post-Processing Noisy Transcripts in a Low-Resource Setting
ArtursZnotins
- NormundsGruzitis
+ NormundsGruzitis
RobertsDargis
143-148
We present ongoing research on automatic post-processing approaches to enhance the readability of noisy speech transcripts in low-resource languages, with a focus on conversational speech in Latvian. We compare transformer-based sequence-labeling models and large language models (LLMs) for the standard punctuation and capitalization restoration task, while also considering automatic correction of mispronounced words and disfluency, and partial inverse text normalization. Our results show that very small LLMs (approx. 2B parameters), fine-tuned on a modest text corpus, can achieve near state-of-the-art performance, rivaling orders of magnitude larger LLMs. Additionally, we demonstrate that a fine-tuned Whisper model, leveraging acoustic cues, outperforms text-only systems on challenging conversational data, even for a low-resource language.
Error analysis reveals recurring pitfalls in sentence boundary determination and disfluency handling, emphasizing the importance of consistent annotation and domain adaptation for robust post-processing. Our findings highlight the feasibility of developing efficient post-processing solutions that significantly refine ASR output in low-resource settings, while opening new possibilities for editing and formatting speech transcripts beyond mere restoration of punctuation and capitalization.


KokiHoriguchi
TomoyukiKajiwaraEhime University
TakashiNinomiyaEhime University
- HideakiHayashiOsaka University
- YutaNakashimaOsaka University
- HajimeNagaharaOsaka University
+ HideakiHayashiOsaka University
+ YutaNakashimaOsaka University
+ HajimeNagaharaOsaka University
149-157
We manually normalize noisy Japanese expressions on social networking services (SNS) to improve the performance of sentiment polarity classification. Despite advances in pre-trained language models, informal expressions found in social media still plague natural language processing. In this study, we analyzed 6,000 posts from a sentiment analysis corpus for Japanese SNS text, and constructed a text normalization taxonomy consisting of 33 types of editing operations. Text normalization according to our taxonomy significantly improved the performance of BERT-based sentiment analysis in Japanese. Detailed analysis reveals that most types of editing operations each contribute to improve the performance of sentiment analysis.
2025.wnut-1.16

From 0d40acabd44bf85bf388ea0033bf9c89cae8983b Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Fri, 6 Jun 2025 09:58:07 -0400
Subject: [PATCH 17/18] black

---
 bin/ingest_orcids.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/bin/ingest_orcids.py b/bin/ingest_orcids.py
index 35df3ab837..b01abdcf12 100755
--- a/bin/ingest_orcids.py
+++ b/bin/ingest_orcids.py
@@ -91,12 +91,12 @@ def main(
     # print(f"Found {len(papers)} archival papers", file=sys.stderr)
     # for paper in papers:
-    #     print("PAPER:", paper['id'], file=sys.stderr)
-    #     for author in paper['authors']:
-    #         print(
-    #             f"  {author['first_name']} {author['last_name']} ({author.get('institution', '')})",
-    #             file=sys.stderr,
-    #         )
+    # print("PAPER:", paper['id'], file=sys.stderr)
+    # for author in paper['authors']:
+    # print(
+    # f"  {author['first_name']} {author['last_name']} ({author.get('institution', '')})",
+    # file=sys.stderr,
+    # )
 
     collection_id, volume_name = full_volume_id.split('-')
 
@@ -122,7 +122,7 @@
     num_added = 0
     for paper, paper_node in zip(papers, volume_node.findall('./paper')):
         # paper_num = int(paper["id"])
-        paper_num = int(paper_node.attrib['id'])
+        # paper_num = int(paper_node.attrib['id'])
 
         # print(f"PAPER: YAML={paper_num}", file=sys.stderr)
 
     def get_author_xml(author_xml):
@@ -159,7 +159,10 @@ def get_author_xml(author_xml):
     indent(root_node)
     tree = etree.ElementTree(root_node)
     tree.write(collection_file, encoding='UTF-8', xml_declaration=True, with_tail=True)
-    print(f"Added {num_added} ORCIDs for {full_volume_id} to {collection_file}", file=sys.stderr)
+    print(
+        f"Added {num_added} ORCIDs for {full_volume_id} to {collection_file}",
+        file=sys.stderr,
+    )
 
 
 if __name__ == '__main__':

From 962eec2b0ee6cfde569ac8a7330402fb81cf8821 Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Fri, 6 Jun 2025 09:58:17 -0400
Subject: [PATCH 18/18] Add ORCIDS for 2024 venues

---
 data/xml/2024.acl.xml | 150 +-
 data/xml/2024.blackboxnlp.xml | 86 +-
 data/xml/2024.conll.xml | 96 +-
data/xml/2024.customnlp4u.xml | 42 +- data/xml/2024.emnlp.xml | 4256 ++++++++++++++++----------------- data/xml/2024.fever.xml | 60 +- data/xml/2024.findings.xml | 2864 +++++++++++----------- data/xml/2024.genbench.xml | 44 +- data/xml/2024.mrl.xml | 70 +- data/xml/2024.nlp4pi.xml | 62 +- data/xml/2024.nlp4science.xml | 30 +- 11 files changed, 3880 insertions(+), 3880 deletions(-) diff --git a/data/xml/2024.acl.xml b/data/xml/2024.acl.xml index 8a0101adda..496eb59ac6 100644 --- a/data/xml/2024.acl.xml +++ b/data/xml/2024.acl.xml @@ -13270,7 +13270,7 @@ <fixed-case>O</fixed-case>pen<fixed-case>VNA</fixed-case>: A Framework for Analyzing the Behavior of Multimodal Language Understanding System under Noisy Scenarios - ZiqiYuan + ZiqiYuan BaozhengZhang HuaXuTsinghua University, Tsinghua University ZhiyunLiang @@ -13286,7 +13286,7 @@ HaoFeiNational University of Singapore MeishanZhangHarbin Institute of Technology (Shenzhen), China and Tianjin University, China MinZhangHarbin Institute of Technology, Shenzhen - Tat-SengChuaNational University of Singapore + Tat-SengChuaNational University of Singapore 19-30 Structured Natural Language Processing (XNLP) is an important subset of NLP that entails understanding the underlying semantic or syntactic structure of texts, which serves as a foundational component for many downstream applications. Despite certain recent efforts to explore universal solutions for specific categories of XNLP tasks, a comprehensive and effective approach for unifying all XNLP tasks long remains underdeveloped. Meanwhile, while XNLP demonstration systems are vital for researchers exploring various XNLP tasks, existing platforms can be limited to, e.g., supporting few XNLP tasks, lacking interactivity and universalness. To this end, we propose an advanced XNLP demonstration system, where we leverage LLM to achieve universal XNLP, with one model for all with high generalizability. Overall, our system advances in multiple aspects, including universal XNLP modeling, high performance, interpretability, scalability, and interactivity, offering a unified platform for exploring diverse XNLP tasks in the community. 2024.acl-demos.3 @@ -13296,7 +13296,7 @@ Towards the <fixed-case>T</fixed-case>op<fixed-case>M</fixed-case>ost: A Topic Modeling System Toolkit XiaobaoWuNanyang Technological University - FengjunPan + FengjunPan Anh TuanLuuNanyang Technological University 31-41 Topic models have a rich history with various applications and have recently been reinvigorated by neural topic modeling. However, these numerous topic models adopt totally distinct datasets, implementations, and evaluations. This impedes quick utilization and fair comparisons, and thereby hinders their research progress and applications. To tackle this challenge, we in this paper propose a Topic Modeling System Toolkit (TopMost). Compared to existing toolkits, TopMost stands out by supporting more extensive features. It covers a broader spectrum of topic modeling scenarios with their complete lifecycles, including datasets, preprocessing, models, training, and evaluations. Thanks to its highly cohesive and decoupled modular design, TopMost enables rapid utilization, fair comparisons, and flexible extensions of diverse cutting-edge topic models. Our code, tutorials, and documentation are available at https://github.com/bobxwu/topmost. 
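For readers unfamiliar with the lifecycle a toolkit like TopMost unifies (dataset, preprocessing, model, training, evaluation), here is a generic scikit-learn rendition of that pipeline; it deliberately does not use TopMost's own API, and the corpus and hyperparameters are illustrative.

# Generic topic-modeling lifecycle: data -> preprocessing -> model -> topics.
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = fetch_20newsgroups(remove=("headers", "footers", "quotes")).data[:500]
vectorizer = CountVectorizer(max_df=0.9, min_df=5, stop_words="english")
counts = vectorizer.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=10, random_state=0).fit(counts)
vocab = vectorizer.get_feature_names_out()
for k, weights in enumerate(lda.components_):
    top = [vocab[i] for i in weights.argsort()[-8:][::-1]]
    print(f"topic {k}: {' '.join(top)}")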
@@ -13306,7 +13306,7 @@ Wordflow: Social Prompt Engineering for Large Language Models - ZijieWangGeorgia Institute of Technology + ZijieWangGeorgia Institute of Technology AishwaryaChakravarthy DavidMunechika Duen HorngChauGeorgia Institute of Technology @@ -13319,7 +13319,7 @@ <fixed-case>LM</fixed-case> Transparency Tool: Interactive Tool for Analyzing Transformer Language Models IgorTufanovFacebook - KarenHambardzumyanFacebook and University College London, University of London + KarenHambardzumyanFacebook and University College London, University of London JavierFerrando ElenaVoitaFAIR at Meta AI and University of Amsterdam 51-60 @@ -13334,8 +13334,8 @@ HanZhangXidian University BinWang LiziLiaoSingapore Management University - QianLiuUniversity of Auckland - ErikCambriaNanyang Technological University + QianLiuUniversity of Auckland + ErikCambriaNanyang Technological University 61-71 This paper introduces EmpathyEar, a pioneering open-source, avatar-based multimodal empathetic chatbot, to fill the gap in traditional text-only empathetic response generation (ERG) systems. Leveraging the advancements of a large language model, combined with multimodal encoders and generators, EmpathyEar supports user inputs in any combination of text, sound, and vision, and produces multimodal empathetic responses, offering users, not just textual responses but also digital avatars with talking faces and synchronized speeches. A series of emotion-aware instruction-tuning is performed for comprehensive emotional understanding and generation capabilities. In this way, EmpathyEar provides users with responses that achieve a deeper emotional resonance, closely emulating human-like empathy. The system paves the way for the next emotional intelligence, for which we open-source the code for public access. 2024.acl-demos.7 @@ -13344,8 +13344,8 @@ <fixed-case>O</fixed-case>pen<fixed-case>W</fixed-case>eb<fixed-case>A</fixed-case>gent: An Open Toolkit to Enable Web Agents on Large Language Models - Iat LongIong - XiaoLiu + Iat LongIong + XiaoLiu YuxuanChen HanyuLai ShuntianYaoBeijing University of Posts and Telecommunications @@ -13361,13 +13361,13 @@ <fixed-case>E</fixed-case>asy<fixed-case>E</fixed-case>dit: An Easy-to-use Knowledge Editing Framework for Large Language Models - PengWang + PengWang NingyuZhangZhejiang University BozhongTian ZekunXi YunzhiYao ZiwenXuZhejiang University - MengruWangZhejiang University + MengruWangZhejiang University ShengyuMao XiaohanWangZhejiang University SiyuanCheng @@ -13383,7 +13383,7 @@ <fixed-case>E</fixed-case>asy<fixed-case>I</fixed-case>nstruct: An Easy-to-use Instruction Processing Framework for Large Language Models - YixinOu + YixinOu NingyuZhangZhejiang University HonghaoGui ZiwenXuZhejiang University @@ -13406,7 +13406,7 @@ YuyangHuang ZixunLuUniversity of Southern California TianliTong - JonathanMayUniversity of Southern California and USC/ISI + JonathanMayUniversity of Southern California and USC/ISI 107-116 Following the rapid progress in natural language processing (NLP) models, language models are applied to increasingly more complex interactive tasks such as negotiations and conversation moderations. Having human evaluators directly interact with these NLP models is essential for adequately evaluating the performance on such interactive tasks. 
We develop BotEval, an easily customizable, open-source evaluation toolkit that focuses on enabling human-bot interactions as part of the evaluation process, as opposed to human evaluators making judgements for a static input. BotEval balances flexibility for customization and user-friendliness by providing templates for common use cases that span various degrees of complexity and built-in compatibility with popular crowdsourcing platforms. We showcase the numerous useful features of BotEval through a study that evaluates the performance of various chatbots on their effectiveness for conversational moderation and discuss how BotEval differs from other annotation tools.
2024.acl-demos.11


<fixed-case>G</fixed-case>en<fixed-case>GO</fixed-case>: <fixed-case>ACL</fixed-case> Paper Explorer with Semantic Features
- SotaroTakeshitaUniversität Mannheim
- SimonePonzettoUniversity of Mannheim
- KaiEckertMannheim University of Applied Sciences
+ SotaroTakeshitaUniversität Mannheim
+ SimonePonzettoUniversity of Mannheim
+ KaiEckertMannheim University of Applied Sciences
117-126
We present GenGO, a system for exploring papers published in ACL conferences. Paper data stored in our database is enriched with multi-aspect summaries, extracted named entities, a field of study label, and text embeddings by our data processing pipeline. These metadata are used in our web-based user interface to enable researchers to quickly find papers relevant to their interests, and grasp an overview of papers without reading their full text. To keep GenGO available online for as long as possible, we design GenGO to be simple and efficient to reduce maintenance and financial costs. In addition, the modularity of our data processing pipeline lets developers easily extend it to add new features. We make our code available to foster open development and transparency: https://gengo.sotaro.io.
2024.acl-demos.12


<fixed-case>NLP</fixed-case>-<fixed-case>KG</fixed-case>: A System for Exploratory Search of Scientific Literature in Natural Language Processing
- TimSchopf
- FlorianMatthesTechnische Universität München
+ TimSchopf
+ FlorianMatthesTechnische Universität München
127-135
Scientific literature searches are often exploratory, whereby users are not yet familiar with a particular field or concept but are interested in learning more about it. However, existing systems for scientific literature search are typically tailored to keyword-based lookup searches, limiting the possibilities for exploration. We propose NLP-KG, a feature-rich system designed to support the exploration of research literature in unfamiliar natural language processing (NLP) fields. In addition to a semantic search, NLP-KG allows users to easily find survey papers that provide a quick introduction to a field of interest. Further, a Fields of Study hierarchy graph enables users to familiarize themselves with a field and its related areas. Finally, a chat interface allows users to ask questions about unfamiliar concepts or specific articles in NLP and obtain answers grounded in knowledge retrieved from scientific publications. Our system provides users with comprehensive exploration possibilities, supporting them in investigating the relationships between different fields, understanding unfamiliar concepts in NLP, and finding relevant research literature. Demo, video, and code are available at: https://github.com/NLP-Knowledge-Graph/NLP-KG-WebApp.
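The semantic-search component that systems like GenGO and NLP-KG build on can be sketched with sentence-transformers. The paper titles and query below are hypothetical stand-ins for a real anthology index.

# Embed a small "corpus" of titles, then rank them against a query.
from sentence_transformers import SentenceTransformer, util

papers = [
    "Neural topic models for short texts",
    "Evaluating machine translation of user-generated content",
    "Parameter-efficient adapters for low-resource speech recognition",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
corpus_emb = model.encode(papers, convert_to_tensor=True)
query_emb = model.encode("adapter tuning for low-resource ASR", convert_to_tensor=True)

for hit in util.semantic_search(query_emb, corpus_emb, top_k=2)[0]:
    print(round(hit["score"], 3), papers[hit["corpus_id"]])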
2024.acl-demos.13


<fixed-case>JORA</fixed-case>: <fixed-case>JAX</fixed-case> Tensor-Parallel <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> Library for Retrieval Augmented Fine-Tuning
- AniqueTahirArizona State University
- LuChengUniversity of Illinois at Chicago
- HuanLiuArizona State University
+ AniqueTahirArizona State University
+ LuChengUniversity of Illinois at Chicago
+ HuanLiuArizona State University
152-159
The scaling of Large Language Models (LLMs) for retrieval-based tasks, particularly in Retrieval Augmented Generation (RAG), faces significant memory constraints, especially when fine-tuning extensive prompt sequences. Current open-source libraries support full-model inference and fine-tuning across multiple GPUs but fall short of accommodating the efficient parameter distribution required for retrieved context. Addressing this gap, we introduce a novel framework for PEFT-compatible fine-tuning of GPT models, leveraging distributed training. Our framework uniquely utilizes JAX’s just-in-time (JIT) compilation and tensor-sharding for efficient resource management, thereby enabling accelerated fine-tuning with reduced memory requirements. This advancement significantly improves the scalability and feasibility of fine-tuning LLMs for complex RAG applications, even on systems with limited GPU resources. Our experiments show more than 12x improvement in runtime compared to Hugging Face/DeepSpeed implementation with four GPUs while consuming less than half the VRAM per GPU.
2024.acl-demos.15


<fixed-case>IMGTB</fixed-case>: A Framework for Machine-Generated Text Detection Benchmarking
MichalSpiegelKempelen Institute of Intelligent Technologies
- DominikMackoKempelen Institute of Intelligent Technologies
+ DominikMackoKempelen Institute of Intelligent Technologies
172-179
In the era of large language models generating high quality texts, it is a necessity to develop methods for detection of machine-generated text to avoid their harmful use or simply for annotation purposes. It is, however, also important to properly evaluate and compare such developed methods. Recently, a few benchmarks have been proposed for this purpose; however, integration of newest detection methods is rather challenging, since new methods appear each month and provide slightly different evaluation pipelines. In this paper, we present the IMGTB framework, which simplifies the benchmarking of machine-generated text detection methods by easy integration of custom (new) methods and evaluation datasets. In comparison to existing frameworks, it makes it possible to objectively compare statistical metric-based zero-shot detectors with classification-based detectors and with differently fine-tuned detectors. Its configurability and flexibility make research and development of new detection methods easier, especially their comparison to the existing state-of-the-art detectors.
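A statistical metric-based zero-shot detector of the kind such benchmarks compare can be as simple as mean token log-likelihood under a small language model; machine-generated text tends to be more probable than human text. A sketch, where using GPT-2 as the scoring model is an assumption:

# Score a text by its mean token log-likelihood under GPT-2.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2")

def avg_log_likelihood(text):
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        loss = lm(ids, labels=ids).loss  # mean negative log-likelihood per token
    return -loss.item()

print(avg_log_likelihood("The quick brown fox jumps over the lazy dog."))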
2024.acl-demos.17 @@ -13483,9 +13483,9 @@ <fixed-case>D</fixed-case>rug<fixed-case>W</fixed-case>atch: A Comprehensive Multi-Source Data Visualisation Platform for Drug Safety Information ArtemBobrovKing’s College London, University of London DomantasSaltenis - ZhaoyueSunUniversity of Warwick + ZhaoyueSunUniversity of Warwick GabrielePergolaUniversity of Warwick - YulanHeKing’s College London, University of London + YulanHeKing’s College London, University of London 180-189 Drug safety research is crucial for maintaining public health, often requiring comprehensive data support. However, the resources currently available to the public are limited and fail to provide a comprehensive understanding of the relationship between drugs and their side effects. This paper introduces “DrugWatch”, an easy-to-use and interactive multi-source information visualisation platform for drug safety study. It allows users to understand common side effects of drugs and their statistical information, flexibly retrieve relevant medical reports, or annotate their own medical texts with our automated annotation tool. Supported by NLP technology and enriched with interactive visual components, we are committed to providing researchers and practitioners with a one-stop information analysis, retrieval, and annotation service. The demonstration video is available at https://www.youtube.com/watch?v=RTqDgxzETjw. We also deployed an online demonstration system at https://drugwatch.net/. 2024.acl-demos.18 @@ -13499,7 +13499,7 @@ JiaxuanLiTianjin University RenrenJin YufeiHuang - LingShi + LingShi JunhuiZhang XinmengJi TingtingCui @@ -13507,7 +13507,7 @@ JinwangSong HongyingZanZhengzhou University SunLiChina Academy of Information and Communications Technology - DeyiXiongTianjin University + DeyiXiongTianjin University 190-210 The rapid development of Chinese large language models (LLMs) poses big challenges for efficient LLM evaluation. While current initiatives have introduced new benchmarks or evaluation platforms for assessing Chinese LLMs, many of these focus primarily on capabilities, usually overlooking potential alignment and safety issues. To address this gap, we introduce OpenEval, an evaluation testbed that benchmarks Chinese LLMs across capability, alignment and safety. For capability assessment, we include 12 benchmark datasets to evaluate Chinese LLMs from 4 sub-dimensions: NLP tasks, disciplinary knowledge, commonsense reasoning and mathematical reasoning. For alignment assessment, OpenEval contains 7 datasets that examines the bias, offensiveness and illegalness in the outputs yielded by Chinese LLMs. To evaluate safety, especially anticipated risks (e.g., power-seeking, self-awareness) of advanced LLMs, we include 6 datasets. In addition to these benchmarks, we have implemented a phased public evaluation and benchmark update strategy to ensure that OpenEval is in line with the development of Chinese LLMs or even able to provide cutting-edge benchmark datasets to guide the development of Chinese LLMs. In our first public evaluation, we have tested a range of Chinese LLMs, spanning from 7B to 72B parameters, including both open-source and proprietary models. Evaluation results indicate that while Chinese LLMs have shown impressive performance in certain tasks, more attention should be directed towards broader aspects such as commonsense reasoning, alignment, and safety. 
2024.acl-demos.19 @@ -13558,7 +13558,7 @@ HanghaoWu JiajieZhangNortheastern University XuHanTsinghua University, Tsinghua University - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University MaosongSun 247-257 Evaluation is pivotal for honing Large Language Models (LLMs), pinpointing their capabilities and guiding enhancements. The rapid development of LLMs calls for a lightweight and easy-to-use framework for swift evaluation deployment. However, due to the various implementation details to consider, developing a comprehensive evaluation platform is never easy. Existing platforms are often complex and poorly modularized, hindering seamless incorporation into researcher’s workflows. This paper introduces UltraEval, a user-friendly evaluation framework characterized by lightweight, comprehensiveness, modularity, and efficiency. We identify and reimplement three core components of model evaluation (models, data, and metrics). The resulting composability allows for the free combination of different models, tasks, prompts, and metrics within a unified evaluation workflow. Additionally, UltraEval supports diverse models owing to a unified HTTP service and provides sufficient inference acceleration. @@ -13569,9 +13569,9 @@ <fixed-case>P</fixed-case>y<fixed-case>F</fixed-case>oma: a Python finite-state compiler module MansHuldenUniversity of Colorado at Boulder - MichaelGinnUniversity of Colorado at Boulder + MichaelGinnUniversity of Colorado at Boulder MiikkaSilfverbergUniversity of British Columbia - MichaelHammondUniversity of Arizona + MichaelHammondUniversity of Arizona 258-265 We describe PyFoma, an open-source Python module for constructing weighted and unweighted finite-state transducers and automata from regular expressions, string rewriting rules, right-linear grammars, or low-level state/transition manipulation. A large variety of standard algorithms for working with finite-state machines is included, with a particular focus on the needs of linguistic and NLP applications. The data structures and code in the module are designed for legibility to allow for potential use in teaching the theory and algorithms associated with finite-state machines. 2024.acl-demos.24 @@ -13589,7 +13589,7 @@ KaihuaZhu SiliangXu ShizheDiaoHong Kong University of Science and Technology - TongZhangUIUC + TongZhangUIUC 266-277 The proliferation of fake news poses a significant threat not only by disseminating misleading information but also by undermining the very foundations of democracy. The recent advance of generative artificial intelligence has further exacerbated the challenge of distinguishing genuine news from fabricated stories. In response to this challenge, we introduce VeraCT Scan, a novel retrieval-augmented system for fake news detection. This system operates by extracting the core facts from a given piece of news and subsequently conducting an internet-wide search to identify corroborating or conflicting reports. Then sources’ credibility is leveraged for information verification. Besides determining the veracity of news, we also provide transparent evidence and reasoning to support its conclusions, resulting in the interpretability and trust in the results. In addition to GPT-4 Turbo, Llama-2 13B is also fine-tuned for news content understanding, information verification, and reasoning. Both implementations have demonstrated state-of-the-art accuracy in the realm of fake news detection. 
2024.acl-demos.25 @@ -13599,7 +13599,7 @@ string2string: A Modern Python Library for String-to-String Algorithms MiracSuzgunStanford University - StuartShieberHarvard University + StuartShieberHarvard University DanJurafskyStanford University 278-285 We introduce **string2string**, an open-source library that offers a comprehensive suite of efficient algorithms for a broad range of string-to-string problems. It includes traditional algorithmic solutions as well as recent advanced neural approaches to tackle various problems in string alignment, distance measurement, lexical and semantic search, and similarity analysis, along with several helpful visualization tools and metrics to facilitate the interpretation and analysis of these methods. Notable algorithms featured in the library include the Smith-Waterman algorithm for pairwise local alignment, the Hirschberg algorithm for global alignment, the Wagner-Fischer algorithm for edit distance, BARTScore and BERTScore for similarity analysis, the Knuth-Morris-Pratt algorithm for lexical search, and Faiss for semantic search. In addition, it wraps existing efficient and widely-used implementations of certain frameworks and metrics, such as sacreBLEU and ROUGE. Overall, the library aims to provide extensive coverage and increased flexibility in comparison to existing libraries for strings. It can be used for many downstream applications, tasks, and problems in natural-language processing, bioinformatics, and computational social sciences. It is implemented in Python, easily installable via pip, and accessible through a simple API. Source code, documentation, and tutorials are all available on our GitHub page: https://github.com/stanfordnlp/string2string* Documentation: https://string2string.readthedocs.io/en/latest/* GitHub page: https://github.com/stanfordnlp/string2string* Short video: https://drive.google.com/file/d/1IT-pBACDVUoEHewk__5Pz5mU5oAMq5k_/view?usp=sharing @@ -13634,13 +13634,13 @@ ChenhuiShenNational University of Singapore Yew KenChia XingxuanLi - JianyuWangAlibaba DAMO Academy + JianyuWangAlibaba DAMO Academy QingyuTannational university of singaore, National University of Singapore LiyingCheng GuanzhengChen - YueDengSchool of Computer Science and Engineering, Nanyang Technological University + YueDengSchool of Computer Science and Engineering, Nanyang Technological University SenYangThe Chinese University of Hong Kong - ChaoqunLiu + ChaoqunLiu HangZhang LidongBingAlibaba Group 294-304 @@ -13667,7 +13667,7 @@ LeiZang JiaotuanWang ChenyiZhuang - JinjieGu + JinjieGu 315-325 Automatic Chinese classical poetry generation has attracted much research interest, but achieving effective control over format and content simultaneously remains challenging. Traditional systems usually accept keywords as user inputs, resulting in limited control over content. Large language models (LLMs) improve content control by allowing unrestricted user instructions, but the token-by-token generation process frequently makes format errors. Motivated by this, we propose CharPoet, a Chinese classical poetry generation system based on token-free LLM, which provides effective control over both format and content. Our token-free architecture generates in a character-by-character manner, enabling precise control over the number of characters.
Pruned from existing token-based LLMs, CharPoet inherits their pretrained capabilities and can generate poetry following instructions like “Write me a poem for my mother’s birthday.” CharPoet achieves format accuracy above 0.96, outperforming Jiuge-GPT-2 (0.91) and GPT-4 (0.38). In terms of content quality, CharPoet surpasses traditional systems including Jiuge, and is comparable to other LLMs. Our system is open source and available at https://modelscope.cn/models/CharPoet/CharPoet. A video demonstration of CharPoet is available at https://youtu.be/voZ25qEp3Dc. 2024.acl-demos.30 @@ -13677,9 +13677,9 @@ <fixed-case>ITAKE</fixed-case>: Interactive Unstructured Text Annotation and Knowledge Extraction System with <fixed-case>LLM</fixed-case>s and <fixed-case>M</fixed-case>odel<fixed-case>O</fixed-case>ps JiaheSong - HongxinDing - ZhiyuanWang - YongxinXu + HongxinDing + ZhiyuanWang + YongxinXu YashaWang JunfengZhaoPeking University 326-334 @@ -13698,7 +13698,7 @@ YugeTu PengkaiLiCentral South University LeiShi - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University MaosongSun 335-345 Despite advancements in Large Language Models (LLMs) and Large Multimodal Models (LMMs), their integration into language-grounded, human-like embodied agents remains incomplete, hindering complex real-life task performance in 3D environments. Existing integrations often feature limited open-sourcing, challenging collective progress in this field. We introduce LEGENT, an open, scalable platform for developing embodied agents using LLMs and LMMs. LEGENT offers a dual approach: a rich 3D environment with interactive, communicable, and actionable agents, paired with a user-friendly interface, and a sophisticated data generation pipeline utilizing advanced algorithms to exploit supervision from simulated worlds at scale. In our experiments, an embryonic vision-language-action model trained on LEGENT-generated data surpasses GPT-4V in embodied tasks, showcasing promising generalization capabilities. The demo video is available at the following link https://video.legent.ai. @@ -13709,8 +13709,8 @@ Variationist: Exploring Multifaceted Variation and Bias in Written Language Data AlanRamponiFondazione Bruno Kessler - CamillaCasulaUniversity of Trento and Fondazione Bruno Kessler - StefanoMenini + CamillaCasulaUniversity of Trento and Fondazione Bruno Kessler + StefanoMenini 346-354 Exploring and understanding language data is a fundamental stage in all areas dealing with human language. It allows NLP practitioners to uncover quality concerns and harmful biases in data before training, and helps linguists and social scientists to gain insight into language use and human behavior. Yet, there is currently a lack of a unified, customizable tool to seamlessly inspect and visualize language variation and bias across multiple variables, language units, and diverse metrics that go beyond descriptive statistics. In this paper, we introduce Variationist, a highly-modular, extensible, and task-agnostic tool that fills this gap. Variationist handles at once a potentially unlimited combination of variable types and semantics across diversity and association metrics with regards to the language unit of choice, and orchestrates the creation of up to five-dimensional interactive charts for over 30 variable type-semantics combinations.
Through our case studies on computational dialectology, human label variation, and text generation, we show how Variationist enables researchers from different disciplines to effortlessly answer specific research questions or unveil undesired associations in language data. A Python library, code, documentation, and tutorials are made publicly available to the research community. 2024.acl-demos.33 @@ -13719,10 +13719,10 @@ An <fixed-case>LLM</fixed-case>-based Knowledge Synthesis and Scientific Reasoning Framework for Biomedical Discovery - OskarWysocki + OskarWysocki Magdalena.wysocka@cruk.manchester.ac.ukMagdalena.wysocka@cruk.manchester.ac.ukNA - DaniloCarvalhoUniversity of Manchester - AlexBogatu + DaniloCarvalhoUniversity of Manchester + AlexBogatu Danilo.miranda@idiap.chDanilo.miranda@idiap.chNA Maxime.delmas@idiap.chMaxime.delmas@idiap.chNA Harriet.unsworth@cruk.manchester.ac.ukHarriet.unsworth@cruk.manchester.ac.ukNA @@ -13769,7 +13769,7 @@ XiaoxueCheng GeyangGuoGeorgia Institute of Technology HanPeng - BowenZhengRenmin University of China + BowenZhengRenmin University of China YiruTang YingqianMin YushuoChen @@ -13782,7 +13782,7 @@ JunyiLi KunZhouRenmin University of China XinZhaoRenmin University of China - Ji-RongWenRenmin University of China + Ji-RongWenRenmin University of China 388-399 To facilitate the research on large language models (LLMs), this paper presents a comprehensive and unified library, LLMBox, to ease the development, use, and evaluation of LLMs. This library is featured with three main merits: (1) a unified data interface that supports the flexible implementation of various training strategies, (2) a comprehensive evaluation that covers extensive tasks, datasets, and models, and (3) more practical consideration, especially on user-friendliness and efficiency. With our library, users can easily reproduce existing methods, train new models, and conduct comprehensive performance comparisons. To rigorously test LLMBox, we conduct extensive experiments in a diverse coverage of evaluation settings, and experimental results demonstrate the effectiveness and efficiency of our library in supporting various implementations related to LLMs. The detailed introduction and usage guidance can be found at https://github.com/RUCAIBox/LLMBox. 2024.acl-demos.37 @@ -13791,8 +13791,8 @@ <fixed-case>L</fixed-case>lama<fixed-case>F</fixed-case>actory: Unified Efficient Fine-Tuning of 100+ Language Models - YaoweiZheng - RichongZhang + YaoweiZheng + RichongZhang JunhaoZhang YanhanYe ZheyanLuo @@ -13847,7 +13847,7 @@ Topic Modeling for Short Texts with Large Language Models TomokiDoi MasaruIsonuma - HitomiYanakathe University of Tokyo + HitomiYanakathe University of Tokyo 21-33 As conventional topic models rely on word co-occurrence to infer latent topics, topic modeling for short texts has been a long-standing challenge. Large Language Models (LLMs) can potentially overcome this challenge by contextually learning the meanings of words via pretraining. In this paper, we study two approaches to using LLMs for topic modeling: parallel prompting and sequential prompting. Input length limitations prevent LLMs from processing many texts at once. However, an arbitrary number of texts can be handled by LLMs by splitting the texts into smaller subsets and processing them in parallel or sequentially. Our experimental results demonstrate that our methods can identify more coherent topics than existing ones while maintaining the diversity of the induced topics. 
Furthermore, we found that the inferred topics cover the input texts to some extent, while hallucinated topics are hardly generated. 2024.acl-srw.3 @@ -13870,7 +13870,7 @@ <fixed-case>B</fixed-case>ias<fixed-case>DPO</fixed-case>: Mitigating Bias in Language Models through Direct Preference Optimization - AhmedAllam + AhmedAllam 42-50 Large Language Models (LLMs) have become pivotal in advancing natural language processing, yet their potential to perpetuate biases poses significant concerns. This paper introduces a new framework employing Direct Preference Optimization (DPO) to mitigate gender, racial, and religious biases in LLM-generated English text. By developing a loss function that favors less biased over biased completions, our approach cultivates a preference for respectful and non-discriminatory language in LLMs. We also contribute a manually designed dataset for training LLMs to recognize and correct biases. This dataset encompasses a diverse range of prompts paired with both biased and unbiased completions. Implementing this approach on the Microsoft Phi-2 model, we demonstrate substantial reductions in biased outputs as our model outperforms the baseline model on almost all bias benchmarks. Our model also achieves better performance compared to other open-source models on most benchmarks. By reducing biases in the language generated by the model, our study marks a significant step towards developing more ethical and socially responsible LLMs. We publicly release BiasDPO dataset on HuggingFace. 2024.acl-srw.7 @@ -13901,7 +13901,7 @@ Demystifying Instruction Mixing for Fine-tuning Large Language Models - RenxiWang + RenxiWang HaonanLi MinghaoWu YuxiaWang @@ -13916,7 +13916,7 @@ Fine-Tuning <fixed-case>ASR</fixed-case> models for Very Low-Resource Languages: A Study on Mvskoke - JuliaMainzinger + JuliaMainzinger Gina-AnneLevowUniversity of Washington and University of Washington 76-82 Recent advancements in multilingual models for automatic speech recognition (ASR) have been able to achieve a high accuracy for languages with extremely limited resources. This study examines ASR modeling for the Mvskoke language, an indigenous language of America. The parameter efficiency of adapter training is contrasted with training entire models, and it is demonstrated how performance varies with different amounts of data. Additionally, the models are evaluated with trigram language model decoding, and the outputs are compared across different types of speech recordings. Results show that training an adapter is both parameter efficient and gives higher accuracy for a relatively small amount of data. @@ -13970,8 +13970,8 @@ Narratives at Conflict: Computational Analysis of News Framing in Multilingual Disinformation Campaigns - AntoninaSinelnik - DirkHovyBocconi University + AntoninaSinelnik + DirkHovyBocconi University 131-143 Any report frames issues to favor a particular interpretation by highlighting or excluding certain aspects of a story. Despite the widespread use of framing in disinformation, framing properties and detection methods remain underexplored outside the English-speaking world. We explore how multilingual framing of the same issue differs systematically. We use eight years of Russia-backed disinformation campaigns, spanning 8k news articles in 4 languages targeting 15 countries. We find that disinformation campaigns consistently and intentionally favor specific framing, depending on the target language of the audience. 
We further discover how Russian-language articles consistently highlight selected frames depending on the region of the media coverage. We find that the two most prominent models for automatic frame analysis underperform and show high disagreement, highlighting the need for further research. 2024.acl-srw.21 @@ -13981,7 +13981,7 @@ Assessing In-context Learning and Fine-tuning for Topic Classification of <fixed-case>G</fixed-case>erman Web Data JulianSchelbUniversität Konstanz - RobertoUlloa + RobertoUlloa AndreasSpitzUniversität Konstanz 144-158 Researchers in the political and social sciences often rely on classification models to analyze trends in information consumption by examining browsing histories of millions of webpages. Automated scalable methods are necessary due to the impracticality of manual labeling. In this paper, we model the detection of topic-related content as a binary classification task and compare the accuracy of fine-tuned pre-trained encoder models against in-context learning strategies. Using only a few hundred annotated data points per topic, we detect content related to three German policies in a database of scraped webpages. We compare multilingual and monolingual models, as well as zero and few-shot approaches, and investigate the impact of negative sampling strategies and the combination of URL & content-based features. Our results show that a small sample of annotated data is sufficient to train an effective classifier. Fine-tuning encoder-based models yields better results than in-context learning. Classifiers using both URL & content-based features perform best, while using URLs alone provides adequate results when content is unavailable. @@ -14003,9 +14003,9 @@ Exploring the Effectiveness and Consistency of Task Selection in Intermediate-Task Transfer Learning - Pin-JieLin + Pin-JieLin MiaoranZhangSaarland University - MariusMosbachMcGill University and Mila - Quebec Artificial Intelligence Institute + MariusMosbachMcGill University and Mila - Quebec Artificial Intelligence Institute DietrichKlakowSaarland University 170-185 Identifying beneficial tasks to transfer from is a critical step toward successful intermediate-task transfer learning. In this work, we experiment with 130 source-target task combinations and demonstrate that the transfer performance exhibits severe variance across different source tasks and training seeds, highlighting the crucial role of intermediate-task selection in a broader context. We compare four representative task selection methods in a unified setup, focusing on their effectiveness and consistency. Compared to embedding-free methods and text embeddings, task embeddings constructed from fine-tuned weights can better estimate task transferability by improving task prediction scores from 2.59% to 3.96%. Despite their strong performance, we observe that the task embeddings do not consistently demonstrate superiority for tasks requiring reasoning abilities. Furthermore, we introduce a novel method that measures pairwise token similarity using maximum inner product search, leading to the highest performance in task prediction. Our findings suggest that token-wise similarity is better predictive for predicting transferability compared to averaging weights. @@ -14015,7 +14015,7 @@ Does the structure of textual content have an impact on language models for automatic summarization? 
- EveSauvage + EveSauvage SabrinaCampanoEDF R&D LydiaOuali CyrilGrouinCNRS @@ -14028,8 +14028,8 @@ Action Inference for Destination Prediction in Vision-and-Language Navigation AnirudhKondapallyHonda R&D Co., Ltd. - KentaroYamadaHonda R&D Co., Ltd. - HitomiYanakathe University of Tokyo + KentaroYamadaHonda R&D Co., Ltd. + HitomiYanakathe University of Tokyo 192-199 Vision-and-Language Navigation (VLN) encompasses interacting with autonomous vehicles using language and visual input from the perspective of mobility.Most of the previous work in this field focuses on spatial reasoning and the semantic grounding of visual information.However, reasoning based on the actions of pedestrians in the scene is not much considered.In this study, we provide a VLN dataset for destination prediction with action inference to investigate the extent to which current VLN models perform action inference.We introduce a crowd-sourcing process to construct a dataset for this task in two steps: (1) collecting beliefs about the next action for a pedestrian and (2) annotating the destination considering the pedestrian’s next action.Our benchmarking results of the models on destination prediction lead us to believe that the models can learn to reason about the effect of the action and the next action on the destination to a certain extent.However, there is still much scope for improvement. 2024.acl-srw.26 @@ -14039,7 +14039,7 @@ A Computational Analysis and Exploration of Linguistic Borrowings in <fixed-case>F</fixed-case>rench Rap Lyrics LucasZurbuchen - RobVoigtNorthwestern University + RobVoigtNorthwestern University 200-208 In France, linguistic borrowings in the relatively conservative French language are an important site of cultural debate, and rap in particular is a hotspot for borrowings. In this work, we use computational methods to understand the factors that affect the prominence and prevalence of a borrowing. To do so, we manually annotate a lexicon of over 700 borrowings occurring in this context (including key aspects for each borrowing such as origin and semantic class). We analyze the prevalence of these borrowings in a newly collected corpus of over 8000 French rap song lyrics and find that there are increases in the proportion of linguistic borrowings, interjections, and Niger-Congo borrowings while terms related to the arts are decreasing in prevalence. We release our code and data to facilitate further research in this area and discuss potential future directions. 2024.acl-srw.27 @@ -14049,7 +14049,7 @@ On Improving Repository-Level Code <fixed-case>QA</fixed-case> for Large Language Models JanStrich - FlorianSchneiderUniversität Hamburg + FlorianSchneiderUniversität Hamburg IrinaNikishina ChrisBiemannU Hamburg 209-244 @@ -14062,7 +14062,7 @@ Compromesso! <fixed-case>I</fixed-case>talian Many-Shot Jailbreaks undermine the safety of Large Language Models FabioPernisi DirkHovy - PaulRöttger + PaulRöttger 245-251 As diverse linguistic communities and users adopt Large Language Models (LLMs), assessing their safety across languages becomes critical. Despite ongoing efforts to align these models with safe and ethical guidelines, they can still be induced into unsafe behavior with jailbreaking, a technique in which models are prompted to act outside their operational guidelines. What research has been conducted on these vulnerabilities was predominantly on English, limiting the understanding of LLM behavior in other languages. 
We address this gap by investigating Many-Shot Jailbreaking (MSJ) in Italian, underscoring the importance of understanding LLM behavior in different languages. We base our analysis on a newly created Italian dataset to identify unique safety vulnerabilities in 4 families of open-source LLMs.We find that the models exhibit unsafe behaviors even with minimal exposure to harmful prompts, and–more alarmingly–this tendency rapidly escalates with more demonstrations. 2024.acl-srw.29 @@ -14073,7 +14073,7 @@ <fixed-case>V</fixed-case>i<fixed-case>M</fixed-case>ed<fixed-case>AQA</fixed-case>: A <fixed-case>V</fixed-case>ietnamese Medical Abstractive Question-Answering Dataset and Findings of Large Language Model Minh-NamTran Phu-VinhNguyen - LongNguyenHo Chi Minh city University of Science, Vietnam National University + LongNguyenHo Chi Minh city University of Science, Vietnam National University DienDinh 252-260 Question answering involves creating answers to questions. With the growth of large language models, the ability of question-answering systems has dramatically improved. However, there is a lack of Vietnamese abstractive question-answering datasets, especially in the medical domain. Therefore, this research aims to mitigate this gap by introducing ViMedAQA. This **Vi**etnamese **Med**ical **A**bstractive **Q**uestion-**A**nswering dataset covers four topics in the Vietnamese medical domain, including body parts, disease, drugs and medicine. Additionally, the empirical results on the proposed dataset examine the capability of the large language models in the Vietnamese medical domain, including reasoning, memorizing and awareness of essential information. @@ -14086,7 +14086,7 @@ YikunWang RuiZheng HaomingLi - QiZhangFudan University + QiZhangFudan University TaoGuiFudan University FeiLiuEmory University 261-272 @@ -14110,7 +14110,7 @@ <fixed-case>H</fixed-case>omophone2<fixed-case>V</fixed-case>ec: Embedding Space Analysis for Empirical Evaluation of Phonological and Semantic Similarity SophieWuMcGill University AnitaZhengMcGill University - JoeyChuangMcGill University + JoeyChuangMcGill University 287-292 This paper introduces a novel method for empirically evaluating the relationship between the phonological and semantic similarity of linguistic units using embedding spaces. Chinese character homophones are used as a proof-of-concept. We employ cosine similarity as a proxy for semantic similarity between characters, and compare relationships between phonologically-related characters and baseline characters (chosen as similar-frequency characters). We show there is a strongly statistically significant positive semantic relationship among different Chinese characters at varying levels of sound-sharing. We also perform some basic probing using t-SNE and UMAP visualizations, and indicate directions for future applications of this method. 2024.acl-srw.34 @@ -14130,8 +14130,8 @@ Can <fixed-case>LLM</fixed-case>s Augment Low-Resource Reading Comprehension Datasets? Opportunities and Challenges VinaySamuel - HoudaAynaou - ArijitChowdhuryAmazon + HoudaAynaou + ArijitChowdhuryAmazon KarthikVenkat Ramanan AmanChadhaAmazon 307-317 @@ -14160,7 +14160,7 @@ <fixed-case>C</fixed-case>o<fixed-case>V</fixed-case>o<fixed-case>S</fixed-case>witch: Machine Translation of Synthetic Code-Switched Text Based on Intonation Units - YeeunKang + YeeunKang 345-357 Multilingual code-switching research is often hindered by the lack and linguistically biased status of available datasets. 
To expand language representation, we synthesize code-switching data by replacing intonation units detected through PSST, a speech segmentation model fine-tuned from OpenAI’s Whisper, using a speech-to-text translation dataset, CoVoST 2. With our dataset, CoVoSwitch, spanning 13 languages, we evaluate the code-switching translation performance of two multilingual translation models, M2M-100 418M and NLLB-200 600M. We reveal that the inclusion of code-switching units results in higher translation performance than monolingual settings and that models are better at code-switching translation into English than non-English. Further, low-resource languages gain most from integration of code-switched units when translating into English but much less when translating into non-English. Translations into low-resource languages also perform worse than even raw code-switched inputs. We find that systems excel at copying English tokens but struggle with non-English tokens, that the off-target problem in monolingual settings is also relevant in code-switching settings, and that models hallucinate in code-switching translation by introducing words absent in both of the original source sentences. CoVoSwitch and code are available at https://github.com/sophiayk20/covoswitch. 2024.acl-srw.40 @@ -14181,9 +14181,9 @@ Improving Sentence Embeddings with Automatic Generation of Training Data Using Few-shot Examples - SomaSato - HayatoTsukagoshi - RyoheiSasanoNagoya University + SomaSato + HayatoTsukagoshi + RyoheiSasanoNagoya University KoichiTakedaNagoya University 378-389 Decoder-based large language models (LLMs) have shown high performance on many tasks in natural language processing. This is also true for sentence embedding learning, where a decoder-based model, PromptEOL, has achieved the best performance on semantic textual similarity (STS) tasks. However, PromptEOL requires a manually annotated natural language inference (NLI) dataset for fine-tuning.We aim to improve sentence embeddings without using large manually annotated datasets by automatically generating an NLI dataset with an LLM and using it for fine-tuning of PromptEOL. To achieve this, we explore methods of data generation suitable for sentence embedding learning in this study. Specifically, we will focus on automatic dataset generation through few-shot learning and explore the appropriate methods to leverage few-shot examples. Experimental results on the STS tasks demonstrate that our approach outperforms existing models in settings without large manually annotated datasets. 
@@ -14278,7 +14278,7 @@ ShanshanZhong ShanghuaGaoHarvard University ZhongzhanHuangSun Yat-Sen University - WushaoWenSUN YAT-SEN UNIVERSITY + WushaoWenSUN YAT-SEN UNIVERSITY MarinkaZitnikHarvard University PanZhouSingapore Management University 494-505 @@ -14298,7 +14298,7 @@ Automatically Suggesting Diverse Example Sentences for <fixed-case>L</fixed-case>2 <fixed-case>J</fixed-case>apanese Learners Using Pre-Trained Language Models EnricoBenedetti - AkikoAizawaNII, Tokyo Institute of Technology + AkikoAizawaNII, Tokyo Institute of Technology FlorianBoudinUniversity of Nantes 517-534 Providing example sentences that are diverse and aligned with learners’ proficiency levels is essential for fostering effective language acquisition.This study examines the use of Pre-trained Language Models (PLMs) to produce example sentences targeting L2 Japanese learners.We utilize PLMs in two ways: as quality scoring components in a retrieval system that draws from a newly curated corpus of Japanese sentences, and as direct sentence generators using zero-shot learning.We evaluate the quality of sentences by considering multiple aspects such as difficulty, diversity, and naturalness, with a panel of raters consisting of learners of Japanese, native speakers – and GPT-4.Our findings suggest that there is inherent disagreement among participants on the ratings of sentence qualities, except for difficulty. Despite that, the retrieval approach was preferred by all evaluators, especially for beginner and advanced target proficiency, while the generative approaches received lower scores on average.Even so, our experiments highlight the potential for using PLMs to enhance the adaptability of sentence suggestion systems and therefore improve the language learning journey. @@ -14317,7 +14317,7 @@ Plot Retrieval as an Assessment of Abstract Semantic Association - ShichengXu + ShichengXu LiangPangInstitute of Computing Technology, Chinese Academy of Sciences JiangnanLiWeChat, Tencent Inc. MoYuWeChat AI, Tencent diff --git a/data/xml/2024.blackboxnlp.xml b/data/xml/2024.blackboxnlp.xml index 31fd76593a..5df7e9ca11 100644 --- a/data/xml/2024.blackboxnlp.xml +++ b/data/xml/2024.blackboxnlp.xml @@ -25,9 +25,9 @@ Optimal and efficient text counterfactuals using Graph Neural Networks - DimitrisLymperopoulosNational Technical University of Athens - MariaLymperaiou - GiorgosFilandrianosNational Technical University of Athens + DimitrisLymperopoulosNational Technical University of Athens + MariaLymperaiou + GiorgosFilandrianosNational Technical University of Athens GiorgosStamouNational Technical University of Athens 1-14 As NLP models become increasingly integral to decision-making processes, the need for explainability and interpretability has become paramount. In this work, we propose a framework that achieves the aforementioned by generating semantically edited inputs, known as counterfactual interventions, which change the model prediction, thus providing a form of counterfactual explanations for the model. We frame the search for optimal counterfactual interventions as a graph assignment problem and employ a GNN to solve it, thus achieving high efficiency. We test our framework on two NLP tasks - binary sentiment classification and topic classification - and show that the generated edits are contrastive, fluent and minimal, while the whole process remains significantly faster than other state-of-the-art counterfactual editors. 
@@ -80,12 +80,12 @@ <fixed-case>LLM</fixed-case> Internal States Reveal Hallucination Risk Faced With a Query - ZiweiJiHong Kong University of Science and Technology + ZiweiJiHong Kong University of Science and Technology DelongChenHong Kong University of Science and Technology - EtsukoIshiiAmazon - SamuelCahyawijaya + EtsukoIshiiAmazon + SamuelCahyawijaya YejinBang - BryanWilie + BryanWilie PascaleFungHKUST 88-104 The hallucination problem of Large Language Models (LLMs) significantly limits their reliability and trustworthiness. Humans have a self-awareness process that allows us to recognize what we don’t know when faced with queries. Inspired by this, our paper investigates whether LLMs can estimate their own hallucination risk before response generation. We analyze the internal mechanisms of LLMs broadly both in terms of training data sources and across 15 diverse Natural Language Generation (NLG) tasks, spanning over 700 datasets. Our empirical analysis reveals two key insights: (1) LLM internal states indicate whether they have seen the query in training data or not; and (2) LLM internal states show they are likely to hallucinate or not regarding the query. Our study explores particular neurons, activation layers, and tokens that play a crucial role in the LLM perception of uncertainty and hallucination risk. By a probing estimator, we leverage LLM self-assessment, achieving an average hallucination estimation accuracy of 84.32% at run time. @@ -96,8 +96,8 @@ Enhancing adversarial robustness in Natural Language Inference using explanations AlexandrosKoulakos - MariaLymperaiou - GiorgosFilandrianosNational Technical University of Athens + MariaLymperaiou + GiorgosFilandrianosNational Technical University of Athens GiorgosStamouNational Technical University of Athens 105-117 The surge of state-of-the-art transformer-based models has undoubtedly pushed the limits of NLP model performance, excelling in a variety of tasks. We cast the spotlight on the underexplored task of Natural Language Inference (NLI), since models trained on popular well-suited datasets are susceptible to adversarial attacks, allowing subtle input interventions to mislead the model. In this work, we validate the usage of natural language explanation as a model-agnostic defence strategy through extensive experimentation: only by fine-tuning a classifier on the explanation rather than premise-hypothesis inputs, robustness under various adversarial attacks is achieved in comparison to explanation-free baselines. Moreover, since there is no standard strategy for testing the semantic validity of the generated explanations, we research the correlation of widely used language generation metrics with human perception, in order for them to serve as a proxy towards robust NLI models. Our approach is resource-efficient and reproducible without significant computational limitations. @@ -110,7 +110,7 @@ SeraphinaGoldfarb-Tarrant PedroRodriguezMeta FAIR JaneDwivedi-YuMeta AI - PatrickLewis + PatrickLewis 118-139 Dense retrievers compress source documents into (possibly lossy) vector representations, yet there is little analysis of what information is lost versus preserved, and how it affects downstream tasks. We conduct the first analysis of the information captured by dense retrievers compared to the language models they are based on (e.g., BERT versus Contriever). We use 25 MultiBert checkpoints as randomized initialisations to train MultiContrievers, a set of 25 contriever models. 
We test whether specific pieces of information—such as gender and occupation—can be extracted from contriever vectors of wikipedia-like documents. We measure this extractability via information theoretic probing. We then examine the relationship of extractability to performance and gender bias, as well as the sensitivity of these results to many random initialisations and data shuffles. We find that (1) contriever models have significantly increased extractability, but extractability usually correlates poorly with benchmark performance 2) gender bias is present, but is not caused by the contriever representations 3) there is high sensitivity to both random initialisation and to data shuffle, suggesting that future retrieval research should test across a wider spread of both. 2024.blackboxnlp-1.8 @@ -119,13 +119,13 @@ Can We Statically Locate Knowledge in Large Language Models? Financial Domain and Toxicity Reduction Case Studies - JordiArmengol-EstapéUniversity of Edinburgh + JordiArmengol-EstapéUniversity of Edinburgh LingyuLiBloomberg SebastianGehrmannBloomberg AchintyaGopal - David SRosenbergBloomberg + David SRosenbergBloomberg Gideon S.Mann - MarkDredzeDepartment of Computer Science, Whiting School of Engineering + MarkDredzeDepartment of Computer Science, Whiting School of Engineering 140-176 Current large language model (LLM) evaluations rely on benchmarks to assess model capabilities and their encoded knowledge. However, these evaluations cannot reveal where a model encodes its knowledge, and thus little is known about which weights contain specific information. We propose a method to statically (without forward or backward passes) locate topical knowledge in the weight space of an LLM, building on a prior insight that parameters can be decoded into interpretable tokens. If parameters can be mapped into the embedding space, it should be possible to directly search for knowledge via embedding similarity. We study the validity of this assumption across several LLMs for a variety of concepts in the financial domain and a toxicity detection setup. Our analysis yields an improved understanding of the promises and limitations of static knowledge location in real-world scenarios. 2024.blackboxnlp-1.9 @@ -146,7 +146,7 @@ Enhancing Question Answering on Charts Through Effective Pre-training Tasks AshimGupta VivekGuptaArizona State University - ShuoZhang + ShuoZhang YujieHeBloomberg L.P. NingZhangBloomberg ShalinShahBloomberg @@ -170,8 +170,8 @@ Transformers Learn Transition Dynamics when Trained to Predict <fixed-case>M</fixed-case>arkov Decision Processes YuxiChen SuweiMa - TonyDearColumbia University - XuChen + TonyDearColumbia University + XuChen 207-216 Language models have displayed a wide array of capabilities, but the reason for their performance remains a topic of heated debate and investigation. Do these models simply recite the observed training data, or are they able to abstract away surface statistics and learn the underlying processes from which the data was generated? To investigate this question, we explore the capabilities of a GPT model in the context of Markov Decision Processes (MDPs), where the underlying transition dynamics and policies are not directly observed. The model is trained to predict the next state or action without any initial knowledge of the MDPs or the players’ policies. Despite this, we present evidence that the model develops emergent representations of the underlying parameters governing the MDPs.
2024.blackboxnlp-1.13 @@ -181,8 +181,8 @@ On the alignment of <fixed-case>LM</fixed-case> language generation and human language comprehension Lena SophiaBolligerUniversity of Zurich - PatrickHallerUniversity of Zurich - Lena AnnJägerUniversity of Zurich and Universität Potsdam + PatrickHallerUniversity of Zurich + Lena AnnJägerUniversity of Zurich and Universität Potsdam 217-231 Previous research on the predictive power (PP) of surprisal and entropy has focused on determining which language models (LMs) generate estimates with the highest PP on reading times, and examining for which populations the PP is strongest. In this study, we leverage eye movement data on texts that were generated using a range of decoding strategies with different LMs. We then extract the transition scores that reflect the models’ production rather than comprehension effort. This allows us to investigate the alignment of LM language production and human language comprehension. Our findings reveal that there are differences in the strength of the alignment between reading behavior and certain LM decoding strategies and that this alignment further reflects different stages of language understanding (early, late, or global processes). Although we find lower PP of transition-based measures compared to surprisal and entropy for most decoding strategies, our results provide valuable insights into which decoding strategies impose less processing effort for readers. Our code is available via https://github.com/DiLi-Lab/LM-human-alignment. 2024.blackboxnlp-1.14 @@ -193,7 +193,7 @@ An Adversarial Example for Direct Logit Attribution: Memory Management in <fixed-case>GELU</fixed-case>-4<fixed-case>L</fixed-case> JettJaniakAI Safety Camp CanRager - JamesDao + JamesDao Yeu-TongLau 232-237 Prior work suggests that language models manage the limited bandwidth of the residual stream through a “memory management” mechanism, where certain attention heads and MLP layers clear residual stream directions set by earlier layers. Our study provides concrete evidence for this erasure phenomenon in a 4-layer transformer, identifying heads that consistently remove the output of earlier heads. We further demonstrate that direct logit attribution (DLA), a common technique for interpreting the output of intermediate transformer layers, can show misleading results by not accounting for erasure. @@ -203,9 +203,9 @@ Uncovering Syllable Constituents in the Self-Attention-Based Speech Representations of Whisper - ErfanA ShamsUniversity College Dublin - IonaGessinger - JulieCarson-BerndsenUniversity College Dublin + ErfanA ShamsUniversity College Dublin + IonaGessinger + JulieCarson-BerndsenUniversity College Dublin 238-247 As intuitive units of speech, syllables have been widely studied in linguistics. A syllable can be defined as a three-constituent unit with a vocalic centre surrounded by two (in some languages optional) consonant clusters. Syllables are also used to design automatic speech recognition (ASR) models. The significance of knowledge-driven syllable-based tokenisation in ASR over data-driven byte-pair encoding has often been debated. However, the emergence of transformer-based ASR models employing self-attention (SA) overshadowed this debate. These models learn the nuances of speech from large corpora without prior knowledge of the domain; yet, they are not interpretable by design. Consequently, it is not clear if the recent performance improvements are related to the extraction of human-interpretable knowledge. 
We probe such models for syllable constituents and use an SA head pruning method to assess the relevance of the SA weights. We also investigate the role of vowel identification in syllable constituent probing. Our findings show that the general features of syllable constituents are extracted in the earlier layers of the model and the syllable-related features mostly depend on the temporal knowledge incorporated in specific SA heads rather than on vowel identification. 2024.blackboxnlp-1.16 @@ -215,8 +215,8 @@ Recurrent Neural Networks Learn to Store and Generate Sequences using Non-Linear Representations RóbertCsordásStanford University - ChristopherPottsStanford University - Christopher DManningComputer Science Department, Stanford University + ChristopherPottsStanford University + Christopher DManningComputer Science Department, Stanford University AtticusGeigerPr(Ai)²R Group 248-262 The Linear Representation Hypothesis (LRH) states that neural networks learn to encode concepts as directions in activation space, and a strong version of the LRH states that models learn only such encodings. In this paper, we present a counterexample to this strong LRH: when trained to repeat an input token sequence, gated recurrent neural networks (RNNs) learn to represent the token at each position with a particular order of magnitude, rather than a direction. These representations have layered features that are impossible to locate in distinct linear subspaces. To show this, we train interventions to predict and manipulate tokens by learning the scaling factor corresponding to each sequence position. These interventions indicate that the smallest RNNs find only this magnitude-based solution, while larger RNNs have linear representations. These findings strongly indicate that interpretability research should not be confined by the LRH. @@ -226,11 +226,11 @@ Log Probabilities Are a Reliable Estimate of Semantic Plausibility in Base and Instruction-Tuned Language Models - CarinaKauf - EmmanueleChersoniThe Hong Kong Polytechnic University - AlessandroLenciUniversity of Pisa + CarinaKauf + EmmanueleChersoniThe Hong Kong Polytechnic University + AlessandroLenciUniversity of Pisa EvelinaFedorenkoMassachusetts Institute of Technology - Anna AIvanovaGeorgia Institute of Technology + Anna AIvanovaGeorgia Institute of Technology 263-277 Semantic plausibility (e.g. knowing that “the actor won the award” is more likely than “the actor won the battle”) serves as an effective proxy for general world knowledge. Language models (LMs) capture vast amounts of world knowledge by learning distributional patterns in text, accessible via log probabilities (LogProbs) they assign to plausible vs. implausible outputs. The new generation of instruction-tuned LMs can now also provide explicit estimates of plausibility via prompting. Here, we evaluate the effectiveness of LogProbs and basic prompting to measure semantic plausibility, both in single-sentence minimal pairs (Experiment 1) and short context-dependent scenarios (Experiment 2). 
We find that (i) in both base and instruction-tuned LMs, LogProbs offers a more reliable measure of semantic plausibility than direct zero-shot prompting, which yields inconsistent and often poor results; (ii) instruction-tuning generally does not alter the sensitivity of LogProbs to semantic plausibility (although sometimes decreases it); (iii) across models, context mostly modulates LogProbs in expected ways, as measured by three novel metrics of context-sensitive plausibility and their match to explicit human plausibility judgments. We conclude that, even in the era of prompt-based evaluations, LogProbs constitute a useful metric of semantic plausibility, both in base and instruction-tuned LMs. 2024.blackboxnlp-1.18 @@ -270,7 +270,7 @@ How Language Models Prioritize Contextual Grammatical Cues? HamidrezaAmirzadeh AfraAlishahiTilburg University - HoseinMohebbi + HoseinMohebbi 315-336 Transformer-based language models have shown an excellent ability to effectively capture and utilize contextual information. Although various analysis techniques have been used to quantify and trace the contribution of single contextual cues to a target task such as subject-verb agreement or coreference resolution, scenarios in which multiple relevant cues are available in the context remain underexplored.In this paper, we investigate how language models handle gender agreement when multiple gender cue words are present, each capable of independently disambiguating a target gender pronoun. We analyze two widely used Transformer-based models: BERT, an encoder-based, and GPT-2, a decoder-based model.Our analysis employs two complementary approaches: context mixing analysis, which tracks information flow within the model, and a variant of activation patching, which measures the impact of cues on the model’s prediction. We find that BERT tends to prioritize the first cue in the context to form both the target word representations and the model’s prediction, while GPT-2 relies more on the final cue. Our findings reveal striking differences in how encoder-based and decoder-based models prioritize and use contextual information for their predictions. 2024.blackboxnlp-1.21 @@ -293,11 +293,11 @@ <fixed-case>W</fixed-case>ell<fixed-case>D</fixed-case>unn: On the Robustness and Explainability of Language Models and Large Language Models in Identifying Wellness Dimensions SeyedaliMohammadiUniversity of Maryland, Baltimore County - EdwardRaffUniversity of Maryland, Baltimore County and Booz Allen Hamilton + EdwardRaffUniversity of Maryland, Baltimore County and Booz Allen Hamilton JinendraMalekar VedantPalitIndian Institute of Technology, Kharagpur - FrancisFerraroUniversity of Maryland, Baltimore County - ManasGaurUniversity of Maryland Baltimore County + FrancisFerraroUniversity of Maryland, Baltimore County + ManasGaurUniversity of Maryland Baltimore County 364-388 Language Models (LMs) are being proposed for mental health applications where the heightened risk of adverse outcomes means predictive performance may not be a sufficient litmus test of a model’s utility in clinical practice. A model that can be trusted for practice should have a correspondence between explanation and clinical determination, yet no prior research has examined the attention fidelity of these models and their effect on ground truth explanations. We introduce an evaluation design that focuses on the robustness and explainability of LMs in identifying Wellness Dimensions (WDs). 
We focus on two existing mental health and well-being datasets: (a) Multi-label Classification-based MultiWD, and (b) WellXplain for evaluating attention mechanism veracity against expert-labeled explanations. The labels are based on Halbert Dunn’s theory of wellness, which gives grounding to our evaluation. We reveal four surprising results about LMs/LLMs: (1) Despite their human-like capabilities, GPT-3.5/4 lag behind RoBERTa, and MedAlpaca, a fine-tuned LLM on WellXplain fails to deliver any remarkable improvements in performance or explanations. (2) Re-examining LMs’ predictions based on a confidence-oriented loss function reveals a significant performance drop. (3) Across all LMs/LLMs, the alignment between attention and explanations remains low, with LLMs scoring a dismal 0.0. (4) Most mental health-specific LMs/LLMs overlook domain-specific knowledge and undervalue explanations, causing these discrepancies. This study highlights the need for further research into their consistency and explanations in mental health and well-being. 2024.blackboxnlp-1.23 @@ -329,7 +329,7 @@ Pruning for Protection: Increasing Jailbreak Resistance in Aligned <fixed-case>LLM</fixed-case>s Without Fine-Tuning AdibHasanMassachusetts Institute of Technology IleanaRugina - AlexWang + AlexWang 417-430 This paper investigates the impact of model compression on the way Large Language Models (LLMs) process prompts, particularly concerning jailbreak resistance. We show that moderate WANDA pruning can enhance resistance to jailbreaking attacks without fine-tuning, while maintaining performance on standard benchmarks. To systematically evaluate this safety enhancement, we introduce a dataset of 225 harmful tasks across five categories. Our analysis of LLaMA-2 Chat, Vicuna 1.3, and Mistral Instruct v0.2 reveals that pruning benefits correlate with initial model safety levels. We interpret these results by examining changes in attention patterns and perplexity shifts, demonstrating that pruned models exhibit sharper attention and increased sensitivity to artificial jailbreak constructs. We extend our evaluation to the AdvBench harmful behavior tasks and the GCG attack method. We find that LLaMA-2 is much safer on AdvBench prompts than on our dataset when evaluated with manual jailbreak attempts, and that pruning is effective against both automated attacks and manual jailbreaking on Advbench. 2024.blackboxnlp-1.26 @@ -339,7 +339,7 @@ <fixed-case>I</fixed-case>v<fixed-case>RA</fixed-case>: A Framework to Enhance Attention-Based Explanations for Language Models with Interpretability-Driven Training SeanXie - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College SaeedHassanpourDartmouth College 431-451 Attention has long served as a foundational technique for generating explanations. With the recent developments made in Explainable AI (XAI), the multi-faceted nature of interpretability has become more apparent. Can attention, as an explanation method, be adapted to meet the diverse needs that our expanded understanding of interpretability demands? In this work, we aim to address this question by introducing IvRA, a framework designed to directly train a language model’s attention distribution through regularization to produce attribution explanations that align with interpretability criteria such as simulatability, faithfulness, and consistency. 
Our extensive experimental analysis demonstrates that IvRA outperforms existing methods in guiding language models to generate explanations that are simulatable, faithful, and consistent, in tandem with their predictions. Furthermore, we perform ablation studies to verify the robustness of IvRA across various experimental settings and to shed light on the interactions among different interpretability criteria. @@ -350,7 +350,7 @@ Counterfactuals As a Means for Evaluating Faithfulness of Attribution Methods in Autoregressive Language Models SepehrKamahi - YadollahYaghoobzadehUniversity of Tehran + YadollahYaghoobzadehUniversity of Tehran 452-468 Despite the widespread adoption of autoregressive language models, explainability evaluation research has predominantly focused on span infilling and masked language models. Evaluating the faithfulness of an explanation method—how accurately it explains the inner workings and decision-making of the model—is challenging because it is difficult to separate the model from its explanation. Most faithfulness evaluation techniques corrupt or remove input tokens deemed important by a particular attribution (feature importance) method and observe the resulting change in the model’s output. However, for autoregressive language models, this approach creates out-of-distribution inputs due to their next-token prediction training objective. In this study, we propose a technique that leverages counterfactual generation to evaluate the faithfulness of attribution methods for autoregressive language models. Our technique generates fluent, in-distribution counterfactuals, making the evaluation protocol more reliable. 2024.blackboxnlp-1.28 @@ -381,10 +381,10 @@ Toward the Evaluation of Large Language Models Considering Score Variance across Instruction Templates YusukeSakaiNara Institute of Science and Technology, Japan - AdamNohejlNara Institute of Science and Technology, Japan + AdamNohejlNara Institute of Science and Technology, Japan JiangnanHang - HidetakaKamigaitoNara Institute of Science and Technology - TaroWatanabeNara Institute of Science and Technology, Japan + HidetakaKamigaitoNara Institute of Science and Technology + TaroWatanabeNara Institute of Science and Technology, Japan 499-529 The natural language understanding (NLU) performance of large language models (LLMs) has been evaluated across various tasks and datasets. The existing evaluation methods, however, do not take into account the variance in scores due to differences in prompts, which leads to unfair evaluation and comparison of NLU performance. Moreover, evaluation designed for specific prompts is inappropriate for instruction tuning, which aims to perform well with any prompt. It is therefore necessary to find a way to measure NLU performance in a fair manner, considering score variance between different instruction templates. In this study, we provide English and Japanese cross-lingual datasets for evaluating the NLU performance of LLMs, which include multiple instruction templates for fair evaluation of each task, along with regular expressions to constrain the output format. Furthermore, we propose the Sharpe score as an evaluation metric that takes into account the variance in scores between templates. Comprehensive analysis of English and Japanese LLMs reveals that the high variance among templates has a significant impact on the fair evaluation of LLMs. 
2024.blackboxnlp-1.31 @@ -394,7 +394,7 @@ Accelerating Sparse Autoencoder Training via Layer-Wise Transfer Learning in Large Language Models DavideGhilardi - FedericoBelotti + FedericoBelotti MarcoMolinariLSE.AI JaehyukLim 530-550 @@ -407,7 +407,7 @@ Wrapper Boxes for Faithful Attribution of Model Predictions to Training Data YihengSu Junyi JessyLiUniversity of Texas, Austin - MatthewLeaseAmazon and University of Texas at Austin + MatthewLeaseAmazon and University of Texas at Austin 551-576 Can we preserve the accuracy of neural models while also providing faithful explanations of model decisions to training data? We propose a “wrapper box” pipeline: training a neural model as usual and then using its learned feature representation in classic, interpretable models to perform prediction. Across seven language models of varying sizes, including four large language models (LLMs), two datasets at different scales, three classic models, and four evaluation metrics, we first show that the predictive performance of wrapper classic models is largely comparable to the original neural models. Because classic models are transparent, each model decision is determined by a known set of training examples that can be directly shown to users. Our pipeline thus preserves the predictive performance of neural language models while faithfully attributing classic model decisions to training data. Among other use cases, such attribution enables model decisions to be contested based on responsible training instances. Compared to prior work, our approach achieves higher coverage and correctness in identifying which training data to remove to change a model decision. To reproduce findings, our source code is online at: https://github.com/SamSoup/WrapperBox. 2024.blackboxnlp-1.33 @@ -416,8 +416,8 @@ Multi-property Steering of Large Language Models with Dynamic Activation Composition - DanielScalenaUniversity of Milan - Bicocca and University of Groningen - GabrieleSartiUniversity of Groningen + DanielScalenaUniversity of Milan - Bicocca and University of Groningen + GabrieleSartiUniversity of Groningen MalvinaNissimUniversity of Groningen 577-603 Activation steering methods were shown to be effective in conditioning language model generation by additively intervening over models’ intermediate representations. However, the evaluation of these techniques has so far been limited to single conditioning properties and synthetic settings. In this work, we conduct a comprehensive evaluation of various activation steering strategies, highlighting the property-dependent nature of optimal parameters to ensure a robust effect throughout generation. To address this issue, we propose Dynamic Activation Composition, an information-theoretic approach to modulate the steering intensity of one or more properties throughout generation. Our experiments on multi-property steering show that our method successfully maintains high conditioning while minimizing the impact of conditioning on generation fluency. @@ -429,7 +429,7 @@ Probing Language Models on Their Knowledge Source ZineddineTighidet JialiMeiBNP Paribas - BenjaminPiwowarskiCNRS / ISIR, Sorbonne Université and CNRS + BenjaminPiwowarskiCNRS / ISIR, Sorbonne Université and CNRS PatrickGallinariCriteo AI Lab and Sorbonne Universite 604-614 Large Language Models (LLMs) often encounter conflicts between their learned, internal (parametric knowledge, PK) and external knowledge provided during inference (contextual knowledge, CK). 
Understanding how LLMs prioritize one knowledge source over the other remains a challenge. In this paper, we propose a novel probing framework to explore the mechanisms governing the selection between PK and CK in LLMs. Using controlled prompts designed to contradict the model’s PK, we demonstrate that specific model activations are indicative of the knowledge source employed. We evaluate this framework on various LLMs of different sizes and demonstrate that mid-layer activations, particularly those related to relations in the input, are crucial in predicting knowledge source selection, paving the way for more reliable models capable of handling knowledge conflicts effectively.
 
 

diff --git a/data/xml/2024.conll.xml b/data/xml/2024.conll.xml
index acfd6244b3..0eaecb7ba7 100644
--- a/data/xml/2024.conll.xml
+++ b/data/xml/2024.conll.xml
@@ -21,8 +21,8 @@
 
 Words That Stick: Using Keyword Cohesion to Improve Text Segmentation
 AmitMaraj
- MiguelVargas MartinOntario Tech University
- MasoudMakrehchiOntario Tech University
+ MiguelVargas MartinOntario Tech University
+ MasoudMakrehchiOntario Tech University
 1-9
 Text Segmentation (TS) is the idea of segmenting bodies of text into coherent blocks, mostly defined by the topics each segment contains. Historically, techniques in this area have been unsupervised, with more success recently coming from supervised methods instead. Although these approaches see better performance, they require training data and upfront training time. We propose a new method called Coherence, where we use strong sentence embeddings to pull representational keywords as the main constructor of sentences when comparing them to one another. Additionally, we include a storage of previously found keywords for the purposes of creating a more accurate segment representation instead of just the immediate sentence in question. With our system, we show improved results over current state-of-the-art unsupervised techniques when analyzed using Pk and WindowDiff scores. Because it is unsupervised, Coherence requires no fine-tuning.
 2024.conll-1.1
 
 

@@ -31,7 +31,7 @@

 Investigating large language models for their competence in extracting grammatically sound sentences from transcribed noisy utterances
- AlinaWróblewska
+ AlinaWróblewska
 10-23
 Selectively processing noisy utterances while effectively disregarding speech-specific elements poses no considerable challenge for humans, as they exhibit remarkable cognitive abilities to separate semantically significant content from speech-specific noise (i.e. filled pauses, disfluencies, and restarts). These abilities may be driven by mechanisms based on acquired grammatical rules that compose abstract syntactic-semantic structures within utterances. Segments without syntactic and semantic significance are consistently disregarded in these structures. The structures, in tandem with lexis, likely underpin language comprehension and thus facilitate effective communication. In our study, grounded in linguistically motivated experiments, we investigate whether large language models (LLMs) can effectively perform analogical speech comprehension tasks. In particular, we examine the ability of LLMs to extract well-structured utterances from transcriptions of noisy dialogues. We conduct two evaluation experiments in the Polish language scenario, using a dataset presumably unfamiliar to LLMs to mitigate the risk of data contamination.
Our results show that not all extracted utterances are correctly structured, indicating that either LLMs do not fully acquire syntactic-semantic rules or they acquire them but cannot apply them effectively. We conclude that the ability of LLMs to comprehend noisy utterances is still relatively superficial compared to human proficiency in processing them. 2024.conll-1.2 @@ -45,7 +45,7 @@ FarhadMoghimifarMonash University SurajSharma Yuan-FangLiMonash University and Oracle - WeiqingWang + WeiqingWang RezaHafMonash University 24-35 Sociocultural norms serve as guiding principles for personal conduct in social interactions within a particular society or culture. The study of norm discovery has seen significant development over the last few years, with various interesting approaches. However, it is difficult to adopt these approaches to discover norms in a new culture, as they rely either on human annotations or real-world dialogue contents. This paper presents a robust automatic norm discovery pipeline, which utilizes the cultural knowledge of GPT-3.5 Turbo (ChatGPT) along with several social factors. By using these social factors and ChatGPT, our pipeline avoids the use of human dialogues that tend to be limited to specific scenarios, as well as the use of human annotations that make it difficult and costly to enlarge the dataset. The resulting database - Multi-cultural Norm Base (MNB) - covers 6 distinct cultures, with over 150k sociocultural norm statements in total. A state-of-the-art Large Language Model (LLM), Llama 3, fine-tuned with our proposed dataset, shows remarkable results on various downstream tasks, outperforming models fine-tuned on other datasets significantly. @@ -65,11 +65,11 @@ Global-Pruner: A Stable and Efficient Pruner for Retraining-Free Pruning of Encoder-Based Language Models - GuangzhenYao - YuehanWang - HuiXu - LongZhang - MiaoQIMiaoQI + GuangzhenYao + YuehanWang + HuiXu + LongZhang + MiaoQIMiaoQI 46-55 Large language models (LLMs) have achieved significant success in complex tasks across various domains, but they come with high computational costs and inference latency issues. Pruning, as an effective method, can significantly reduce inference costs. However, current pruning algorithms for encoder-based language models often focus on locally optimal solutions, neglecting a comprehensive exploration of the global solution space. This oversight can lead to instability in the solution process, thereby affecting the overall performance of the model. To address these challenges, we propose a structured pruning algorithm named G-Pruner (Global Pruner), comprising two integral components: PPOM (Proximal Policy Optimization Mask) and CG²MT (Conjugate Gradient Squared Mask Tuning), utilizing a global optimization strategy. This strategy not only eliminates the need for retraining but also ensures the algorithm’s stability and adaptability to environmental changes, effectively addressing the issue of focusing solely on immediate optima while neglecting long-term effects. This method is evaluated on the GLUE and SQuAD benchmarks using BERTBASE and DistilBERT models. The experimental results indicate that without any retraining, G-Pruner achieves significant accuracy improvements on the SQuAD_{2.0} task with a FLOPs constraint of 60%, demonstrating a 6.02% increase in F1 score compared with baseline algorithms. 
2024.conll-1.5 @@ -78,8 +78,8 @@ Transformer verbatim in-context retrieval across time and scale - KristijanArmeniJohns Hopkins University - MarkoPranjićJozef Stefan Institute and Jozef Stefan International Postgraduate School + KristijanArmeniJohns Hopkins University + MarkoPranjićJozef Stefan Institute and Jozef Stefan International Postgraduate School SenjaPollak 56-68 To predict upcoming text, language models must in some cases retrieve in-context information verbatim. In this report, we investigated how the ability of language models to retrieve arbitrary in-context nouns developed during training (across time) and as language models trained on the same dataset increase in size (across scale). We then asked whether learning of in-context retrieval correlates with learning of more challenging zero-shot benchmarks. Furthermore, inspired by semantic effects in human short-term memory, we evaluated the retrieval with respect to a major semantic component of target nouns, namely whether they denote a concrete or abstract entity, as rated by humans. We show that verbatim in-context retrieval developed in a sudden transition early in the training process, after about 1% of the training tokens. This was observed across model sizes (from 14M and up to 12B parameters), and the transition occurred slightly later for the two smallest models. We further found that the development of verbatim in-context retrieval is positively correlated with the learning of zero-shot benchmarks. Around the transition point, all models showed the advantage of retrieving concrete nouns as opposed to abstract nouns. In all but two smallest models, the advantage dissipated away toward the end of training. @@ -95,7 +95,7 @@ TimoSchickFacebook ZhengbaoJiangSchool of Computer Science, Carnegie Mellon University MariaLomeliMeta - PatrickLewis + PatrickLewis GautierIzacard EdouardGraveFacebook SebastianRiedelGoogle and University College London @@ -122,8 +122,8 @@ Critical Questions Generation: Motivation and Challenges - BlancaCalvo FiguerasUniversidad del País Vasco - RodrigoAgerriUniversity of the Basque Country + BlancaCalvo FiguerasUniversidad del País Vasco + RodrigoAgerriUniversity of the Basque Country 105-116 The development of Large Language Models (LLMs) has brought impressive performances on mitigation strategies against misinformation, such as counterargument generation. However, LLMs are still seriously hindered by outdated knowledge and by their tendency to generate hallucinated content. In order to circumvent these issues, we propose a new task, namely, Critical Questions Generation, consisting of processing an argumentative text to generate the critical questions (CQs) raised by it.In argumentation theory CQs are tools designed to lay bare the blind spots of an argument by pointing at the information it could be missing.Thus, instead of trying to deploy LLMs to produce knowledgeable and relevant counterarguments, we use them to question arguments, without requiring any external knowledge.Research on CQs Generation using LLMs requires a reference dataset for large scale experimentation. Thus, in this work we investigate two complementary methods to create such a resource: (i) instantiating CQs templates as defined by Walton’s argumentation theory and (ii), using LLMs as CQs generators. By doing so, we contribute with a procedure to establish what is a valid CQ and conclude that, while LLMs are reasonable CQ generators, they still have a wide margin for improvement in this task. 
2024.conll-1.9 @@ -143,7 +143,7 @@ Causal <fixed-case>ATE</fixed-case> Mitigates Unintended Bias in Controlled Text Generation RahulMadhavanIndian Institute of Management, Ahmedabad, Indian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology and Indian Institute of Science, Bangalore - KahiniWadhawan + KahiniWadhawan 130-142 We study attribute control in language models through the method of Causal Average Treatment Effect (Causal ATE). Existing methodsfor the attribute control task in Language Models(LMs) check for the co-occurrence of words in a sentence with the attribute of interest, and control for them. However, spurious correlation of the words with the attribute in the training dataset, can cause models to hallucinate the presence of the attribute when presented with the spurious correlate during inference. We show that the simple perturbation-based method of Causal ATE removes this unintended effect. Specifically, we ground it in the problem of toxicity mitigation, where a significant challenge lies in the inadvertent bias that often emerges towards protected groups post detoxification. We show that this unintended bias can be solved by the use of the Causal ATE metric. We provide experimental validations for our claims and release our code (anonymously) here: [github.com/causalate-mitigates-bias](https://github.com/causalate-mitigates-bias/causal-ate-mitigates-bias). 2024.conll-1.11 @@ -154,7 +154,7 @@ On Functional Competence of <fixed-case>LLM</fixed-case>s for Linguistic Disambiguation RaihanKibria Sheikh Intiser UddinDipta - Muhammad AbdullahAdnanBangladesh University of Engineering and Technology + Muhammad AbdullahAdnanBangladesh University of Engineering and Technology 143-160 We study some Large Language Models to explore their deficiencies in resolving sense ambiguities. In this connection, we evaluate their performance on well-known word sense disambiguation datasets. Word Sense Disambiguation (WSD) has been a long-standing NLP problem, which has given rise to many evaluation datasets and models over the decades. Recently the emergence of Large Language Models (LLM) raises much hope in improving accuracy. In this work, we evaluate word sense disambiguation capabilities of four LLMs: OpenAI’s ChatGPT-3.5, Mistral’s 7b parameter model, Meta’s Llama 70b, and Google’s Gemini Pro. We evaluate many well-established datasets containing a variety of texts and senses on these. After observing the performances of some datasets, we selectively study some failure cases and identify the reasons for failures. We explore human judgments that would correct these failures. Our findings suggest that many failure cases are related to a lack of world knowledge and the reasoning to amalgamate this knowledge rather than the lack of linguistic knowledge. We categorize the judgments so that the next generation of LLMs can improve by incorporating deeper world knowledge and reasoning. We conclude that word sense disambiguation could serve as a guide for probing the reasoning power of LLMs to measure their functional competency. We also list the accuracy of these datasets. We find that on many occasions, accuracy drops to below 70%, which is much less than that of well-performing existing models. 
2024.conll-1.12 @@ -194,7 +194,7 @@ <fixed-case>T</fixed-case>p<fixed-case>T</fixed-case>-<fixed-case>ADE</fixed-case>: Transformer Based Two-Phase <fixed-case>ADE</fixed-case> Extraction SuryamukhiKuchibhotlaIndian Institute of Technology, Hyderabad, Dhirubhai Ambani Institute Of Information and Communication Technology - ManishSingh + ManishSingh 209-218 Extracting adverse reactions to medications or treatments is a crucial activity in the biomedical domain. The task involves identifying mentions of drugs and their adverse effects/events in raw text, which is challenging due to the unstructured nature of clinical narratives. In this paper, we propose TpT-ADE, a novel joint two-phase transformer model combined with natural language processing (NLP) techniques, to identify adverse events (AEs) caused by drugs. In the first phase of TpT-ADE, entities are extracted and are grounded with their standard terms using the Unified Medical Language System (UMLS) knowledge base. In the second phase, entity and relation classification is performed to determine the presence of a relationship between the drug and AE pairs. TpT-ADE also identifies the intensity of AE entities by constructing a parts-of-speech (POS) embedding model. Unlike previous approaches that use complex classifiers, TpT-ADE employs a shallow neural network and yet outperforms the state-of-the-art methods on the standard ADE corpus. 2024.conll-1.16 @@ -204,8 +204,8 @@ The Effect of Surprisal on Reading Times in Information Seeking and Repeated Reading KerenGruteke Klein - YoavMeiri - OmerShubi + YoavMeiri + OmerShubi YevgeniBerzak 219-230 The effect of surprisal on processing difficulty has been a central topic of investigation in psycholinguistics. Here, we use eyetracking data to examine three language processing regimes that are common in daily life but have not been addressed with respect to this question: information seeking, repeated processing, and the combination of the two. Using standard regime-agnostic surprisal estimates we find that the prediction of surprisal theory regarding the presence of a linear effect of surprisal on processing times, extends to these regimes. However, when using surprisal estimates from regime-specific contexts that match the contexts and tasks given to humans, we find that in information seeking, such estimates do not improve the predictive power of processing times compared to standard surprisals. Further, regime-specific contexts yield near zero surprisal estimates with no predictive power for processing times in repeated reading. These findings point to misalignments of task and memory representations between humans and current language models, and question the extent to which such models can be used for estimating cognitively relevant quantities. We further discuss theoretical challenges posed by these results. @@ -241,7 +241,7 @@ Nguyen QuangVinh Thanh-DoNguyen Vinh VanNguyenVietnam National University Hanoi - Nam Khac-HoaiBuiViettel Group + Nam Khac-HoaiBuiViettel Group 259-268 This study introduces Simple Reasoning with Code (SiRC), a novel instruction fine-tuning method for solving mathematical reasoning problems, particularly effective for Vietnamese, which is considered a low-resource language. Specifically, solving mathematical problems requires strategic and logical reasoning, which remains challenging in this research area. This paper presents a simple yet effective instruction fine-tuning method for mathematical reasoning. 
Unlike previous approaches, our proposed method effectively combines chain-of-thought reasoning with code transfer methods without requiring a sophisticated inference procedure. Furthermore, we focus on exploiting small open-source large language models (LLMs) for the Vietnamese language. In this regard, we first introduce a trainable Vietnamese math reasoning dataset, which is named ViMath-InstructCode. The proposed dataset is then used for fine-tuning open-source LLMs (e.g., less than 10 billion parameters). Experiments conducted on our custom ViMath-Bench dataset, the largest benchmarking dataset focusing on Vietnamese mathematical problems, indicate the promising results of our proposed method. Our source code and dataset are available for further exploitation. 2024.conll-1.20 @@ -251,9 +251,9 @@ Generalizations across filler-gap dependencies in neural language models KatherineHowittUniversity of Maryland, College Park - SathvikNair + SathvikNair AllisonDodsUniversity of Maryland, College Park - Robert MelvinHopkins + Robert MelvinHopkins 269-279 Humans develop their grammars by making structural generalizations from finite input. We ask how filler-gap dependencies (FGDs), which share a structural generalization despite diverse surface forms, might arise from the input. We explicitly control the input to a neural language model (NLM) to uncover whether the model posits a shared representation for FGDs. We show that while NLMs do have success differentiating grammatical from ungrammatical FGDs, they rely on superficial properties of the input, rather than on a shared generalization. Our work highlights the need for specific linguistic inductive biases to model language acquisition. 2024.conll-1.21 @@ -262,12 +262,12 @@ Of Models and Men: Probing Neural Networks for Agreement Attraction with Psycholinguistic Data - MaximBazhukovHigher School of Economics + MaximBazhukovHigher School of Economics EkaterinaVoloshinaGöteborg University and Chalmers University of Technology SergeyPletenev ArsenyAnisimov OlegSerikovKing Abdullah University of Science and Technology - SvetlanaToldovaHigher School of Economics + SvetlanaToldovaHigher School of Economics 280-290 Interpretability studies have played an important role in the field of NLP. They focus on the problems of how models encode information or, for instance, whether linguistic capabilities allow them to prefer grammatical sentences to ungrammatical. Recently, several studies examined whether the models demonstrate patterns similar to humans and whether they are sensitive to the phenomena of interference like humans’ grammaticality judgements, including the phenomenon of agreement attraction.In this paper, we probe BERT and GPT models on the syntactic phenomenon of agreement attraction in Russian using the psycholinguistic data with syncretism. Working on the language with syncretism between some plural and singular forms allows us to differentiate between the effects of the surface form and of the underlying grammatical feature. Thus we can further investigate models’ sensitivity to this phenomenon and examine if the patterns of their behaviour are similar to human patterns. Moreover, we suggest a new way of comparing models’ and humans’ responses via statistical testing. We show that there are some similarities between models’ and humans’ results, while GPT is somewhat more aligned with human responses than BERT. 
Finally, preliminary results suggest that surface form syncretism influences attraction, perhaps more so than grammatical form syncretism.
 2024.conll-1.22
 
 

@@ -278,7 +278,7 @@

 Is Structure Dependence Shaped for Efficient Communication?: A Case Study on Coordination
 KoheiKajikawaThe University of Tokyo
 YusukeKubota
- YoheiOsekiUniversity of Tokyo
+ YoheiOsekiUniversity of Tokyo
 291-302
 Natural language exhibits various universal properties. But why do these universals exist? One explanation is that they arise from functional pressures to achieve efficient communication, a view which attributes cross-linguistic properties to domain-general cognitive abilities. This hypothesis has successfully addressed some syntactic universal properties such as compositionality and Greenbergian word order universals. However, more abstract syntactic universals have not been explored from the perspective of efficient communication. Among such universals, the most notable one is structure dependence, that is, grammar-internal operations crucially depend on hierarchical representations. This property has traditionally been taken to be central to natural language and to involve domain-specific knowledge irreducible to communicative efficiency. In this paper, we challenge the conventional view by investigating whether structure dependence realizes efficient communication, focusing on coordinate structures. We design three types of artificial languages: (i) one with a structure-dependent reduction operation, which is similar to natural language, (ii) one without any reduction operations, and (iii) one with a linear (rather than structure-dependent) reduction operation. We quantify the communicative efficiency of these languages. The results demonstrate that the language with the structure-dependent reduction operation is significantly more communicatively efficient than the counterfactual languages. This suggests that the existence of structure-dependent properties can be explained from the perspective of efficient communication.
 2024.conll-1.23
 
 

@@ -287,8 +287,8 @@

 Large Language Model Recall Uncertainty is Modulated by the Fan Effect
- JesseRobertsTennessee Technological University
- KyleMooreVanderbilt University
+ JesseRobertsTennessee Technological University
+ KyleMooreVanderbilt University
 DouglasFisherVanderbilt University and Vanderbilt University
 OseremhenEwaleifoh
 ThaoPham
@@ -302,7 +302,7 @@

 Continuous Attentive Multimodal Prompt Tuning for Few-Shot Multimodal Sarcasm Detection
 SoumyadeepJana
 AnimeshDey
- Ranbir SinghSanasamIndian Institute of Technology, Guwahati, Dhirubhai Ambani Institute Of Information and Communication Technology
+ Ranbir SinghSanasamIndian Institute of Technology, Guwahati, Dhirubhai Ambani Institute Of Information and Communication Technology
 314-326
 With the steep rise in multimodal content on social media, multimodal sarcasm detection has gained widespread attention from research communities. Existing studies depend on large-scale data, which is challenging to obtain and expensive to annotate. Thus, investigating this problem in a few-shot scenario is required. Overtly complex multimodal models are prone to overfitting on in-domain data, which hampers their performance on out-of-distribution (OOD) data. To address these issues, we propose the Continuous Attentive Multimodal Prompt Tuning model (CAMP), which leverages the prompt tuning paradigm to handle few-shot multimodal sarcasm detection.
To overcome the siloed learning process of continuous prompt tokens, we design a novel, continuous multimodal attentive prompt where the continuous tokens intricately engage with both image and text tokens, enabling the assimilation of knowledge from different input modalities. Experimental results indicate that our method outperforms other multimodal baseline methods in the few-shot setting and OOD scenarios.
 2024.conll-1.25
 
 

@@ -322,7 +322,7 @@

 <fixed-case>T</fixed-case>ext2<fixed-case>A</fixed-case>fford: Probing Object Affordance Prediction abilities of Language Models solely from Text
- SayantanAdak
+ SayantanAdak
 DaivikAgrawal
 AnimeshMukherjeeIndian Institute of Technology Kharagpur
 SomakAdityaIndian Institute of Technology Kharagpur
@@ -336,7 +336,7 @@

 How Are Metaphors Processed by Language Models? The Case of Analogies
 JoanneBoisson
 AsahiUshio
- HsuvasBorkakotyCardiff University
+ HsuvasBorkakotyCardiff University
 KiamehrRezaee
 DimosthenisAntypas
 ZaraSiddique
@@ -350,8 +350,8 @@

 Further Compressing Distilled Language Models via Frequency-aware Partial Sparse Coding of Embeddings
- KohkiTamuraThe University of Tokyo
- NaokiYoshinagaInstitute of Industrial Science, the University of Tokyo
+ KohkiTamuraThe University of Tokyo
+ NaokiYoshinagaInstitute of Industrial Science, the University of Tokyo
 MasatoNeishi
 388-399
 Although pre-trained language models (PLMs) are effective for natural language understanding (NLU) tasks, they demand a huge computational resource, thus preventing us from deploying them on edge devices. Researchers have therefore applied compression techniques for neural networks, such as pruning, quantization, and knowledge distillation, to the PLMs. Although these generic techniques can reduce the number of internal parameters of hidden layers in the PLMs, the embedding layers tied to the tokenizer are hard to compress, occupying a non-negligible portion of the compressed model. In this study, aiming to further compress PLMs reduced by the generic techniques, we exploit frequency-aware sparse coding to compress the embedding layers of the PLMs fine-tuned to downstream tasks. To minimize the impact of the compression on the accuracy, we retain the embeddings of common tokens as they are and use them to reconstruct embeddings of rare tokens by locally linear mapping. Experimental results on the GLUE and JGLUE benchmarks for language understanding in English and Japanese confirmed that our method can further compress the fine-tuned DistilBERT models while maintaining accuracy.
@@ -361,7 +361,7 @@ Translating Across Cultures: <fixed-case>LLM</fixed-case>s for Intralingual Cultural Adaptation - PushpdeepSinghTata Consultancy Services Limited, India + PushpdeepSinghTata Consultancy Services Limited, India MayurPatidarTata Consultancy Services Limited, India LovekeshVig 400-418 @@ -373,12 +373,12 @@ Explaining the Hardest Errors of Contextual Embedding Based Classifiers Claudio Moisés Valiense DeAndradeUniversidade Federal de Minas Gerais, Universidade Federal de Minas Gerais - WashingtonCunhaUniversidade Federal de Minas Gerais and Universidade Federal de Minas Gerais + WashingtonCunhaUniversidade Federal de Minas Gerais and Universidade Federal de Minas Gerais GuilhermeFonseca Ana Clara SouzaPagano Luana De CastroSantos - Adriana SilvinaPaganoUniversidade Federal de Minas Gerais, Universidade Federal de Minas Gerais - Leonardo Chaves Dutra DaRochaUniversidade Federal de São João del-Rei + Adriana SilvinaPaganoUniversidade Federal de Minas Gerais, Universidade Federal de Minas Gerais + Leonardo Chaves Dutra DaRochaUniversidade Federal de São João del-Rei Marcos AndréGonçalvesUniversidade Federal de Minas Gerais, Universidade Federal de Minas Gerais 419-434 We seek to explain the causes of the misclassification of the most challenging documents, namely those that no classifier using state-of-the-art, very semantically-separable contextual embedding representations managed to predict accurately. To do so, we propose a taxonomy of incorrect predictions, which we used to perform qualitative human evaluation. We posed two (research) questions, considering three sentiment datasets in two different domains – movie and product reviews. Evaluators with two different backgrounds evaluated documents by comparing the predominant sentiment assigned by the model to the label in the gold dataset in order to decide on a likely misclassification reason. Based on a high inter-evaluator agreement (81.7%), we observed significant differences between the product and movie review domains, such as the prevalence of ambivalence in product reviews and sarcasm in movie reviews. Our analysis also revealed an unexpectedly high rate of incorrect labeling in the gold dataset (up to 33%) and a significant amount of incorrect prediction by the model due to a series of linguistic phenomena (including amplified words, contrastive markers, comparative sentences, and references to world knowledge). Overall, our taxonomy and methodology allow us to explain between 80%-85% of the errors with high confidence (agreement) – enabling us to point out where future efforts to improve models should be concentrated. @@ -390,7 +390,7 @@ A Multimodal Large Language Model “Foresees” Objects Based on Verb Information but Not Gender ShuqiWangThe Chinese University of Hong Kong XufengDuan - ZhenguangCai + ZhenguangCai 435-441 This study employs the classical psycholinguistics paradigm, the visual world eye-tracking paradigm (VWP), to explore the predictive capabilities of LLAVA, a multimodal large language model (MLLM), and compare them with human anticipatory gaze behaviors. Specifically, we examine the attention weight distributions of LLAVA when presented with visual displays and English sentences containing verb and gender cues. Our findings reveal that LLAVA, like humans, can predictively attend to objects relevant to verbs, but fails to demonstrate gender-based anticipatory attention. 
Layer-wise analysis indicates that the middle layers of the model are more related to predictive attention than the early or late layers. This study is pioneering in applying psycholinguistic paradigms to compare the multimodal predictive attention of humans and MLLMs, revealing both similarities and differences between them. 2024.conll-1.32 @@ -402,11 +402,11 @@ ZhiweiLiuSalesforce AI Research WeiranYaoSalesForce.com JianguoZhangSalesForce AI Research - ZuxinLiuSalesforce AI Research - LiangweiYang + ZuxinLiuSalesforce AI Research + LiangweiYang RitheshR NSalesForce.com TianLanSalesForce - MingZhuSalesForce.com + MingZhuSalesForce.com JuntaoTanRutgers University ShirleyKokaneSalesForce.com Thai QuocHoangSalesforce Research @@ -424,7 +424,7 @@ Image-conditioned human language comprehension and psychometric benchmarking of visual language models Subha NawerPushpita - Roger P.LevyMassachusetts Institute of Technology + Roger P.LevyMassachusetts Institute of Technology 447-457 Large language model (LLM)s’ next-word predictions have shown impressive performance in capturing human expectations during real-time language comprehension. This finding has enabled a line of research on psychometric benchmarking of LLMs against human language-comprehension data in order to reverse-engineer humans’ linguistic subjective probability distributions and representations. However, to date, this work has exclusively involved unimodal (language-only) comprehension data, whereas much human language use takes place in rich multimodal contexts. Here we extend psychometric benchmarking to visual language models (VLMs). We develop a novel experimental paradigm, \textit{Image-Conditioned Maze Reading}, in which participants first view an image and then read a text describing an image within the Maze paradigm, yielding word-by-word reaction-time measures with high signal-to-noise ratio and good localization of expectation-driven language processing effects. We find a large facilitatory effect of correct image context on language comprehension, not only for words such as concrete nouns that are directly grounded in the image but even for ungrounded words in the image descriptions. Furthermore, we find that VLM surprisal captures most to all of this effect. We use these findings to benchmark a range of VLMs, showing that models with lower perplexity generally have better psychometric performance, but that among the best VLMs tested perplexity and psychometric performance dissociate. Overall, our work offers new possibilities for connecting psycholinguistics with multimodal LLMs for both scientific and engineering goals. 2024.conll-1.34 @@ -437,7 +437,7 @@ KamalaSreepada Ruolan LeslieFamularo SharonGoldwaterUniversity of Edinburgh - NaomiFeldmanUniversity of Maryland, College Park + NaomiFeldmanUniversity of Maryland, College Park 458-463 State of the art models in automatic speech recognition have shown remarkable improvements due to modern self-supervised (SSL) transformer-based architectures such as wav2vec 2.0 (Baevski et al., 2020). However, how these models encode phonetic information is still not well understood. We explore whether SSL speech models display a linguistic property that characterizes human speech perception: language specificity. We show that while wav2vec 2.0 displays an overall language specificity effect when tested on Hindi vs. English, it does not resemble human speech perception when tested on finer-grained differences in Hindi speech contrasts. 
2024.conll-1.35 @@ -446,8 +446,8 @@ One-Vs-Rest Neural Network <fixed-case>E</fixed-case>nglish Grapheme Segmentation: A Linguistic Perspective - SamuelRose - NinaDethlefsUniversity of Hull + SamuelRose + NinaDethlefsUniversity of Hull C.Kambhampati 464-469 Grapheme-to-Phoneme (G2P) correspondences form foundational frameworks of tasks such as text-to-speech (TTS) synthesis or automatic speech recognition. The G2P process involves taking words in their written form and generating their pronunciation. In this paper, we critique the status quo definition of a grapheme, currently a forced alignment process relating a single character to either a phoneme or a blank unit, that underlies the majority of modern approaches. We develop a linguistically-motivated redefinition from simple concepts such as vowel and consonant count and word length and offer a proof-of-concept implementation based on a multi-binary neural classification task. Our model achieves state-of-the-art results with a 31.86% Word Error Rate on a standard benchmark, while generating linguistically meaningful grapheme segmentations. @@ -457,9 +457,9 @@ <fixed-case>C</fixed-case>rowd<fixed-case>C</fixed-case>ounter: A benchmark type-specific multi-target counterspeech dataset - PunyajoySaha + PunyajoySaha AbhilashDatta - AbhikJanaIIT Bhubaneswar + AbhikJanaIIT Bhubaneswar AnimeshMukherjeeIndian Institute of Technology Kharagpur 470-488 Counterspeech presents a viable alternative to banning or suspending users for hate speech while upholding freedom of expression. However, writing effective counterspeech is challenging for moderators/users. Hence, developing suggestion tools for writing counterspeech is the need of the hour. One critical challenge in developing such a tool is the lack of quality and diversity of the responses in the existing datasets. Hence, we introduce a new dataset - CrowdCounter containing 3,425 hate speech-counterspeech pairs spanning six different counterspeech types (empathy, humor, questioning, warning, shaming, contradiction), which is the first of its kind. The design of our annotation platform itself encourages annotators to write type-specific, non-redundant and high-quality counterspeech. We evaluate two frameworks for generating counterspeech responses - vanilla and type-controlled prompts - across four large language models. In terms of metrics, we evaluate the responses using relevance, diversity and quality. We observe that Flan-T5 is the best model in the vanilla framework across different models. Type-specific prompts enhance the relevance of the responses, although they might reduce the language quality. DialoGPT proves to be the best at following the instructions and generating the type-specific counterspeech accurately. @@ -469,7 +469,7 @@ Solving the Challenge Set without Solving the Task: On <fixed-case>W</fixed-case>inograd Schemas as a Test of Pronominal Coreference Resolution - IanPoradaMcGill University + IanPoradaMcGill University Jackie CKCheungMcGill University, Mila Research Institute and Microsoft 489-506 Challenge sets such as the Winograd Schema Challenge (WSC) are used to benchmark systems’ ability to resolve ambiguities in natural language. If one assumes as in existing work that solving a given challenge set is at least as difficult as solving some more general task, then high performance on the challenge set should indicate high performance on the general task overall. However, we show empirically that this assumption of difficulty does not always hold. 
In particular, we demonstrate that despite the strong performance of prompted language models (LMs) on the WSC and its variants, these same modeling techniques perform relatively poorly at resolving certain pronominal ambiguities attested in OntoNotes and related datasets that are perceived to be easier. Motivated by these findings, we propose a method for ensembling a prompted LM with a supervised, task-specific system that is overall more accurate at resolving pronominal coreference across datasets. Finally, we emphasize that datasets involving the same linguistic phenomenon draw on distinct, but overlapping, capabilities, and evaluating on any one dataset alone does not provide a complete picture of a system’s overall capability. @@ -480,7 +480,7 @@ Advancing <fixed-case>A</fixed-case>rabic Sentiment Analysis: <fixed-case>A</fixed-case>r<fixed-case>S</fixed-case>en Benchmark and the Improved Fuzzy Deep Hybrid Network YangFangHuaibei Normal University - ChengXu + ChengXu ShuhaoGuan NanYanGeorgia Institute of Technology YukeMeiWuhu Institute of Technology diff --git a/data/xml/2024.customnlp4u.xml b/data/xml/2024.customnlp4u.xml index f3929c41ac..eb3908ae09 100644 --- a/data/xml/2024.customnlp4u.xml +++ b/data/xml/2024.customnlp4u.xml @@ -29,9 +29,9 @@ Navigate Complex Physical Worlds via Geometrically Constrained <fixed-case>LLM</fixed-case> - YongqiangHuang + YongqiangHuang WentaoYe - LiyaoLiZhejiang University + LiyaoLiZhejiang University JunboZhaoZhejiang University 1-11 This study investigates the potential of Large Language Models (LLMs) for reconstructing and understanding the physical world based solely on textual knowledge. It explores the impact of model performance on spatial understanding abilities by introducing a set of geometric conventions and developing a workflow based on multi-layer graphs and multi-agent systems. The study examines how LLMs achieve multi-step and multi-objective geometric inference in a spatial environment, using unified geometric conventions and a graph-driven framework. A genetic algorithm, inspired by large-scale model knowledge, is employed to solve geometric constraint problems, enhancing the spatial reasoning capabilities of LLMs. This work innovatively explores the feasibility of using text-based LLMs as builders of the physical world and designs a workflow to enhance their spatial comprehension and construction capabilities. @@ -41,10 +41,10 @@ Empowering <fixed-case>AAC</fixed-case> Users: A Systematic Integration of Personal Narratives with Conversational <fixed-case>AI</fixed-case> - SayantanPalState University of New York at Buffalo + SayantanPalState University of New York at Buffalo SouvikDasState University of New York at Buffalo RohiniSrihariState University of New York at Buffalo - JeffHigginborhamState University of New York at Buffalo + JeffHigginborhamState University of New York at Buffalo JennaBizoviState University of New York at Buffalo 12-25 Communication barriers have long posed challenges for users of Alternate and Augmentative Communication (AAC). In AAC, effective conversational aids are not solely about harnessing Artificial Intelligence (AI) capabilities but more about ensuring these technologies resonate deeply with AAC user’s unique communication challenges. We aim to bridge the gap between generic outputs and genuine human interactions by integrating advanced Conversational AI with personal narratives. 
While existing solutions offer generic responses, a considerable gap in tailoring outputs reflecting an AAC user’s intent must be addressed. Thus, we propose to create a custom conversational dataset centered on the experiences and words of a primary AAC user to fine-tune advanced language models. Additionally, we employ a Retrieval-Augmented Generation (RAG) method, drawing context from a summarized version of authored content by the AAC user. This combination ensures that responses are contextually relevant and deeply personal. Preliminary evaluations underscore its transformative potential, with automated metrics and human assessments showcasing significantly enhanced response quality. @@ -68,7 +68,7 @@ Less is Fed More: Sparsity Reduces Feature Distortion in Federated Learning Abhinav SukumarRao AashiqMuhamed - HarshitaDiddeeCarnegie Mellon University + HarshitaDiddeeCarnegie Mellon University 37-46 Our work studies Multilingual Federated Learning (FL), a decentralized paradigm that, although promising, grapples with issues such as client drift and suboptimal generalization in diverse, multilingual settings. We highlight limitations in existing approaches to generalize across both actively participating and inactive client language pairs. To mitigate these challenges, we introduce FedSparseNet, which incorporates sparse-network training, and LoRA, based on Low-Rank Adaptation. These approaches maintain the model’s fidelity to its pretraining distribution, thereby ensuring robust performance on both seen and unseen language pairs, while simultaneously enhancing communication efficiency by selectively transmitting trainable parameters. Our empirical evaluations demonstrate that FedSparseNet outperforms conventional FL models on both seen and unseen clients, while LoRA shows remarkable improvements in unseen client performance. Additionally, we propose the Continuous Relative Robustness Metric, a novel metric to uniformly assess a model’s performance across diverse language pairs. We open-source our code for reproducibility on GitHub. 2024.customnlp4u-1.4 @@ -79,12 +79,12 @@ Understanding Players as if They Are Talking to the Game in a Customized Language: A Pilot Study TianzeWang MaryamHonarijahromiMicrosoft Xbox (king) - StylianiKatsarou - OlgaMikheevaKTH Royal Institute of Technology, Stockholm, Sweden - TheodorosPanagiotakopoulosKing + StylianiKatsarou + OlgaMikheevaKTH Royal Institute of Technology, Stockholm, Sweden + TheodorosPanagiotakopoulosKing OlegSmirnovMicrosoft Gaming - LeleCaoMicrosoft (ABK) - SaharAsadi + LeleCaoMicrosoft (ABK) + SaharAsadi 47-52 This pilot study explores the application of language models (LMs) to model game event sequences, treating them as a customized natural language. We investigate a popular mobile game, transforming raw event data into textual sequences and pretraining a Longformer model on this data. Our approach captures the rich and nuanced interactions within game sessions, effectively identifying meaningful player segments. The results demonstrate the potential of self-supervised LMs in enhancing game design and personalization without relying on ground-truth labels. 
2024.customnlp4u-1.5
 
 

@@ -94,8 +94,8 @@

 <fixed-case>L</fixed-case>3<fixed-case>M</fixed-case>asking: Multi-task Fine-tuning for Language Models by Leveraging Lessons Learned from Vanilla Models
 YusukeKimuraDoshisha University
- TakahiroKomamizuNagoya University
- KenjiHatanoDoshisha University
+ TakahiroKomamizuNagoya University
+ KenjiHatanoDoshisha University
 53-62
 When distributional differences exist between pre-training and fine-tuning data, language models (LMs) may perform poorly on downstream tasks. Recent studies have reported that multi-task learning of the downstream task and the masked language modeling (MLM) task during the fine-tuning phase improves the performance of the downstream task. Typical MLM tasks (e.g., random token masking (RTM)) tend not to consider tokens corresponding to the knowledge already acquired during the pre-training phase; therefore, LMs may not notice important clues or may not effectively acquire linguistic knowledge of the task or domain. To overcome this limitation, we propose a new masking strategy for the MLM task, called L3Masking, that leverages lessons (specifically, token-wise likelihood in a context) learned from the vanilla language model to be fine-tuned. L3Masking actively masks tokens with low likelihood on the vanilla model. Experimental evaluations on text classification tasks in different domains confirm that a multi-task text classification method with L3Masking performs task adaptation more effectively than one with RTM. These results suggest the usefulness of assigning a preference to the tokens to be learned for task or domain adaptation.
 2024.customnlp4u-1.6
 
 

@@ -106,7 +106,7 @@

 Grounded Language Agent for Product Search via Intelligent Web Interactions
 MoghisFereidouni
 AdibMosharrofUniversity of Kentucky
- A.b.SiddiqueUniversity of Kentucky
+ A.b.SiddiqueUniversity of Kentucky
 63-75
 Recent research has focused on developing agents powered by large language models (LLMs) to accomplish complex high-level user intents. However, employing LLMs with billions of parameters (e.g., GPT-4) may incur substantial costs on top of handcrafting extensive prompts. To address this, we introduce a Grounded Language Agent for Intelligent Web Interactions, named GLAINTEL. GLAINTEL employs Flan-T5 as its backbone and is flexible in training in various settings: unsupervised learning, supervised learning, and unsupervised domain adaptation. Specifically, we tackle both the challenge of learning without human demonstrations and the opportunity to leverage human demonstrations effectively when those are available. Additionally, we explore unsupervised domain adaptation for cases where demonstrations are limited to a specific domain. Experimental evaluations across diverse setups demonstrate the effectiveness of GLAINTEL in unsupervised settings, outperforming in-context learning-based approaches that employ larger models with up to 540 billion parameters. Surprisingly, behavioral cloning-based methods that straightforwardly use human demonstrations do not outperform unsupervised variants of GLAINTEL. Additionally, we show that combining human demonstrations with reinforcement learning-based training yields results comparable to methods utilizing GPT-4.
The code is available at: https://github.com/MultifacetedNLP/Web-Agents-Unsupervised
 2024.customnlp4u-1.7
 
 

 <fixed-case>A</fixed-case>dapt<fixed-case>E</fixed-case>val: Evaluating Large Language Models on Domain Adaptation for Text Summarization
- AnumAfzalTechnische Universität München
+ AnumAfzalTechnische Universität München
 RibinChalumattuETHZ - ETH Zurich
- FlorianMatthesTechnische Universität München
+ FlorianMatthesTechnische Universität München
 LauraMascarell
 76-85
 Despite the advances in the abstractive summarization task using Large Language Models (LLM), there is a lack of research that assesses their abilities to easily adapt to different domains. We evaluate the domain adaptation abilities of a wide range of LLMs on the summarization task across various domains in both fine-tuning and in-context learning settings. We also present AdaptEval, the first domain adaptation evaluation suite. AdaptEval includes a domain benchmark and a set of metrics to facilitate the analysis of domain adaptation. Our results demonstrate that LLMs exhibit comparable performance in the in-context learning setting, regardless of their parameter scale.
 
 
 
 IreneWangNA
 Bo-RuLuUniversity of Washington
 PrithvirajAmmanabroluUniversity of California, San Diego
- Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
+ Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
 86-112
 Teams can outperform individuals; could adding AI teammates further bolster performance of teams solving problems collaboratively? Collaborative problem solving (CPS) research commonly studies teams with two agents (human-human or human-AI), but team research literature finds that, for complex tasks, larger teams are more effective. Progress in studying collaboration with more than two agents, through textual records of team interactions, is hindered by a major data challenge: available CPS corpora are predominantly dyadic, and adapting pre-existing CPS tasks to more agents is non-trivial. We address this data challenge by developing a CPS task generator, CPS-TaskForge, that can produce environments for studying CPS under a wide array of conditions, and releasing a CPS task design checklist grounded in the theoretical PISA 2015 CPS framework to help facilitate the development of CPS corpora with more agents. CPS-TaskForge takes the form of a resource management (tower defense) game, and different CPS tasks can be studied by manipulating game design parameters. We conduct a case study with groups of 3–4 humans to validate production of diverse natural language CPS communication in a game instance produced by CPS-TaskForge. We discuss opportunities for advancing research in CPS (both with human-only and human-AI teams) using different task configurations. We release all data and code.
 2024.customnlp4u-1.9
 
 

@@ -172,7 +172,7 @@

 Trustful <fixed-case>LLM</fixed-case>s: Customizing and Grounding Text Generation with knowledge bases and Dual Decoders
 XiaofengZhu
- Jaya KrishnaMandivarapuMicrosoft
+ Jaya KrishnaMandivarapuMicrosoft
 156-166
 Although people are impressed by the content generation skills of large language models, the use of LLMs, such as ChatGPT, is limited by the domain grounding of the content. The correctness and groundedness of the generated content need to be based on a verified context, such as results from Retrieval-Augmented Generation (RAG).
One important issue when adapting LLMs to a customized domain is that the generated responses are often incomplete, or the additions are not verified and may even be hallucinated. Prior studies on hallucination detection have focused on evaluation metrics, which are not easily adaptable to dynamic domains and can be vulnerable to attacks like jail-breaking. In this work, we propose 1) a post-processing algorithm of leveraging knowledge triplets in RAG context to correct hallucinations and 2) a dual-decoder model that fuses RAG context to guide the generation process. 2024.customnlp4u-1.13 @@ -195,7 +195,7 @@ Learning to Adapt Large Language Models to One-Shot In-Context Intent Classification on Unseen Domains JoongboShinLG AI Research - YoubinAhnLG Corporation + YoubinAhnLG Corporation SeungpilWonLG Corporation Stanley JungkyuChoiLanguage Lab, LG AI Research 182-197 @@ -262,7 +262,7 @@ “Let’s Argue Both Sides”: Argument Generation Can Force Small Models to Utilize Previously Inaccessible Reasoning Capabilities KavehEskandari Miandoab - VasanthSarathyTufts University + VasanthSarathyTufts University 269-283 Large Language Models (LLMs), despite achieving state-of-the-art results in a number of evaluation tasks, struggle to maintain their performance when logical reasoning is strictly required to correctly infer a prediction. In this work, we propose Argument Generation as a method of forcing models to utilize their reasoning capabilities when other approaches such as chain-of-thought reasoning prove insufficient. Our method involves the generation of arguments for each possible inference result, and asking the end model to rank the generated arguments. We show that Argument Generation can serve as an appropriate substitute for zero-shot prompting techniques without the requirement to add layers of complexity. Furthermore, we argue that knowledge-probing techniques such as chain-of-thought reasoning and Argument Generation are only useful when further reasoning is required to infer a prediction, making them auxiliary to more common zero-shot approaches. Finally, we demonstrate that our approach forces larger gains in smaller language models, showcasing a complex relationship between model size and prompting methods in foundation models. 2024.customnlp4u-1.20 @@ -280,7 +280,7 @@ Tak YeonLeeKorea Advanced Institute of Science & Technology HwajungHong JuhoKimKorea Advanced Institute of Science and Technology - So-YeonAhnKorea Advanced Institute of Science & Technology + So-YeonAhnKorea Advanced Institute of Science & Technology AliceOhKorea Advanced Institute of Science and Technology 284-293 In the context of English as a Foreign Language (EFL) writing education, LLM-as-a-tutor can assist students by providing real-time feedback on their essays. However, challenges arise in assessing LLM-as-a-tutor due to differing standards between educational and general use cases. To bridge this gap, we integrate pedagogical principles to assess student-LLM interaction. First, we explore how LLMs can function as English tutors, providing effective essay feedback tailored to students. Second, we propose three criteria to evaluate LLM-as-a-tutor specifically designed for EFL writing education, emphasizing pedagogical aspects. In this process, EFL experts evaluate the feedback from LLM-as-a-tutor regarding (1) quality and (2) characteristics. On the other hand, EFL learners assess their (3) learning outcomes from interaction with LLM-as-a-tutor. 
This approach lays the groundwork for developing LLMs-as-a-tutor tailored to the needs of EFL learners, advancing the effectiveness of writing education in this context. @@ -305,7 +305,7 @@ Adapting <fixed-case>LLM</fixed-case> Predictions in In-Context Learning with Data Priors - JavierChiyah-Garcia + JavierChiyah-Garcia PrasoonGoyalAmazon MichaelJohnstonAmazon RezaGhanadanUniversity of Maryland, College Park diff --git a/data/xml/2024.emnlp.xml b/data/xml/2024.emnlp.xml index 9b3a297b9e..e145c818d1 100644 --- a/data/xml/2024.emnlp.xml +++ b/data/xml/2024.emnlp.xml @@ -23,7 +23,7 @@ JuhwanChoi YeonghwaKim SeungukYu - JungMinYunChung-Ang University + JungMinYunChung-Ang University YoungBinKimChung-Ang University 1-14 Although pre-trained language models have exhibited great flexibility and versatility with prompt-based few-shot learning, they suffer from the extensive parameter size and limited applicability for inference. Recent studies have suggested that PLMs be used as dataset generators and a tiny task-specific model be trained to achieve efficient inference. However, their applicability to various domains is limited because they tend to generate domain-specific datasets. In this work, we propose a novel approach to universal domain generalization that generates a dataset regardless of the target domain. This allows for generalization of the tiny task model to any domain that shares the label space, thus enhancing the real-world applicability of the dataset generation paradigm. Our experiments indicate that the proposed method accomplishes generalizability across various domains while using a parameter set that is orders of magnitude smaller than PLMs. @@ -35,8 +35,8 @@ Multi-News+: Cost-efficient Dataset Cleansing via <fixed-case>LLM</fixed-case>-based Data Annotation JuhwanChoi - JungMinYunChung-Ang University - KyohoonJinChung-Ang University + JungMinYunChung-Ang University + KyohoonJinChung-Ang University YoungBinKimChung-Ang University 15-29 The quality of the dataset is crucial for ensuring optimal performance and reliability of downstream task models. However, datasets often contain noisy data inadvertently included during the construction process. Numerous attempts have been made to correct this issue through human annotators. However, hiring and managing human annotators is expensive and time-consuming. As an alternative, recent studies are exploring the use of large language models (LLMs) for data annotation.In this study, we present a case study that extends the application of LLM-based data annotation to enhance the quality of existing datasets through a cleansing strategy. Specifically, we leverage approaches such as chain-of-thought and majority voting to imitate human annotation and classify unrelated documents from the Multi-News dataset, which is widely used for the multi-document summarization task. Through our proposed cleansing method, we introduce an enhanced Multi-News+. By employing LLMs for data cleansing, we demonstrate an efficient and effective approach to improving dataset quality without relying on expensive human annotation efforts. 
@@ -47,7 +47,7 @@ <fixed-case>FIZZ</fixed-case>: Factual Inconsistency Detection by Zoom-in Summary and Zoom-out Document JoonhoYangChung-Ang University - SeunghyunYoonAdobe Research + SeunghyunYoonAdobe Research ByeongJeongKimChung-Ang University HwanheeLeeChung-Ang University 30-45 @@ -59,7 +59,7 @@ Prompts have evil twins - RimonMelamed + RimonMelamed Lucas HurleyMcCabeGeorge Washington University and LMI TanayWakhareMassachusetts Institute of Technology YejinKimGeorge Washington University @@ -76,7 +76,7 @@ Table Question Answering for Low-resourced <fixed-case>I</fixed-case>ndic Languages VaishaliPal - EvangelosKanoulasUniversity of Amsterdam and University of Amsterdam + EvangelosKanoulasUniversity of Amsterdam and University of Amsterdam AndrewYatesUniversity of Amsterdam Maartende RijkeUniversity of Amsterdam 75-92 @@ -89,7 +89,7 @@ <fixed-case>I</fixed-case>mage<fixed-case>I</fixed-case>n<fixed-case>W</fixed-case>ords: Unlocking Hyper-Detailed Image Descriptions - RoopalGargResearch, Google + RoopalGargResearch, Google AndreaBurnsGoogle DeepMind BurcuKaragol AyanGoogle DeepMind YonatanBittonGoogle @@ -111,10 +111,10 @@ ZhiqiangHuSingapore University of Technology and Design LeiWangSalesForce YangWang - DehengYeTencent and Tencent + DehengYeTencent and Tencent PeilinZhaoTencent AI Lab - Ee-PengLimSingapore Management University - HuiXiongHong Kong University of Science and Technology + Ee-PengLimSingapore Management University + HuiXiongHong Kong University of Science and Technology HaoWangThe Hong Kong University of Science and Technology (Guangzhou) 128-145 This paper explores the open research problem of understanding the social behaviors of LLM-based agents. Using Avalon as a testbed, we employ system prompts to guide LLM agents in gameplay. While previous studies have touched on gameplay with LLM agents, research on their social behaviors is lacking. We propose a novel framework, tailored for Avalon, features a multi-agent system facilitating efficient communication and interaction. We evaluate its performance based on game success and analyze LLM agents’ social behaviors. Results affirm the framework’s effectiveness in creating adaptive agents and suggest LLM-based agents’ potential in navigating dynamic social interactions. By examining collaboration and confrontation behaviors, we offer insights into this field’s research and applications. @@ -127,7 +127,7 @@ XiangyuZhang HexinLiu KaishuaiXuHong Kong Polytechnic University - QiquanZhangNational University of Singapore + QiquanZhangNational University of Singapore DaijiaoLiu BeenaAhmedUniversity of New South Wales JulienEppsUniversity of New South Wales @@ -142,9 +142,9 @@ XiangyuZhang DaijiaoLiu HexinLiu - QiquanZhangNational University of Singapore + QiquanZhangNational University of Singapore HanyuMeng - Leibny PaolaGarcia Perera + Leibny PaolaGarcia Perera EngSiongChngNanyang Technological University LinaYaoUniversity of New South Wales and CSIRO’s Data61 159-171 @@ -156,8 +156,8 @@ Hateful Word in Context Classification SanneHoekenUniversität Bielefeld - SinaZarrießBielefeld University - ÖzgeAlacamBielefeld University + SinaZarrießBielefeld University + ÖzgeAlacamBielefeld University 172-186 Hate speech detection is a prevalent research field, yet it remains underexplored at the level of word meaning. This is significant, as terms used to convey hate often involve non-standard or novel usages which might be overlooked by commonly leveraged LMs trained on general language use. 
In this paper, we introduce the Hateful Word in Context Classification (HateWiC) task and present a dataset of ~4000 WiC-instances, each labeled by three annotators. Our analyses and computational exploration focus on the interplay between the subjective nature (context-dependent connotations) and the descriptive nature (as described in dictionary definitions) of hateful word senses. HateWiC annotations confirm that hatefulness of a word in context does not always derive from the sense definition alone. We explore the prediction of both majority and individual annotator labels, and we experiment with modeling context- and sense-based inputs. Our findings indicate that including definitions proves effective overall, yet not in cases where hateful connotations vary. Conversely, including annotator demographics becomes more important for mitigating performance drop in subjective hate prediction. 2024.emnlp-main.10 @@ -166,9 +166,9 @@ Eyes Don’t Lie: Subjective Hate Annotation and Detection with Gaze - ÖzgeAlacamBielefeld University + ÖzgeAlacamBielefeld University SanneHoekenUniversität Bielefeld - SinaZarrießBielefeld University + SinaZarrießBielefeld University 187-205 Hate speech is a complex and subjective phenomenon. In this paper, we present a dataset (GAZE4HATE) that provides gaze data collected in a hate speech annotation experiment. We study whether the gaze of an annotator provides predictors of their subjective hatefulness rating, and how gaze features can improve Hate Speech Detection (HSD). We conduct experiments on statistical modeling of subjective hate ratings and gaze and analyze to what extent rationales derived from hate speech models correspond to human gaze and explanations in our data. Finally, we introduce MEANION, a first gaze-integrated HSD model. Our experiments show that particular gaze features like dwell time or fixation counts systematically correlate with annotators’ subjective hate ratings and improve predictions of text-only hate speech models. 2024.emnlp-main.11 @@ -182,7 +182,7 @@ JosephShtok SivanDovehWeizmann Institute of Science LeonidKarlinskyIBM Research AI - AssafArbelleInternational Business Machines + AssafArbelleInternational Business Machines 206-212 Language models struggle with handling numerical data and performing arithmetic operations. We hypothesize that this limitation can be partially attributed to non-intuitive textual numbers representation. When a digit is read or generated by a causal language model it does not know its place value (e.g. thousands vs. hundreds) until the entire number is processed. To address this issue, we propose a simple adjustment to how numbers are represented by including the count of digits before each number. For instance, instead of “42”, we suggest using “2:42” as the new format. This approach, which we term NumeroLogic, offers an added advantage in number generation by serving as a Chain of Thought (CoT). By requiring the model to consider the number of digits first, it enhances the reasoning process before generating the actual number. We use arithmetic tasks to demonstrate the effectiveness of the NumeroLogic formatting. We further demonstrate NumeroLogic applicability to general natural language modeling, improving language understanding performance in the MMLU benchmark. 
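The NumeroLogic format described above is mechanical enough to reproduce with a regular expression. A minimal sketch of the stated format, not the authors' implementation:

```python
import re

def to_numerologic(text: str) -> str:
    """Prefix each run of digits with its digit count: '42' -> '2:42'.

    This mirrors the format described above, so a causal LM sees the
    place-value information before reading the digits. (Decimals are
    treated as two separate digit runs in this simple sketch.)
    """
    return re.sub(r"\d+", lambda m: f"{len(m.group())}:{m.group()}", text)

assert to_numerologic("12 + 345 = 357") == "2:12 + 3:345 = 3:357"
```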
2024.emnlp-main.12 @@ -197,7 +197,7 @@ PragyanBanerjee SimraShahidUniversity of Virginia, Charlottesville and Adobe Systems SumitBhatiaAdobe Systems - KokilJaidkaNational University of Singapore + KokilJaidkaNational University of Singapore 213-227 Existing debiasing techniques are typically training-based or require access to the model’s internals and output distributions, so they are inaccessible to end-users looking to adapt LLM outputs for their particular needs. In this study, we examine whether structured prompting techniques can offer opportunities for fair text generation. We evaluate a comprehensive end-user-focused iterative framework of debiasing that applies System 2 thinking processes for prompts to induce logical, reflective, and critical text generation, with single, multi-step, instruction, and role-based variants. By systematically evaluating many LLMs across many datasets and different prompting strategies, we show that the more complex System 2-based Implicative Prompts significantly improve over other techniques demonstrating lower mean bias in the outputs with competitive performance on the downstream tasks. Our work offers research directions for the design and the potential of end-user-focused evaluative frameworks for LLM use. 2024.emnlp-main.13 @@ -210,7 +210,7 @@ TianyiLi PavlosVougiouklisHuawei Technologies Ltd. MarkSteedmanUniversity of Edinburgh - Jeff Z.PanUniversity of Edinburgh, University of Edinburgh + Jeff Z.PanUniversity of Edinburgh, University of Edinburgh 228-236 Identifying and understanding user intents is a pivotal task for E-Commerce. Despite its essential role in product recommendation and business user profiling analysis, intent understanding has not been consistently defined or accurately benchmarked. In this paper, we focus on predicative user intents as “how a customer uses a product”, and pose intent understanding as a natural language reasoning task, independent of product ontologies. We identify two weaknesses of FolkScope, the SOTA E-Commerce Intent Knowledge Graph: category-rigidity and property-ambiguity. They limit its ability to strongly align user intents with products having the most desirable property, and to recommend useful products across diverse categories. Following these observations, we introduce a Product Recovery Benchmark featuring a novel evaluation framework and an example dataset. We further validate the above FolkScope weaknesses on this benchmark. Our code and dataset are available at https://github.com/stayones/Usgae-Centric-Intent-Understanding. 
2024.emnlp-main.14 @@ -257,9 +257,9 @@ XinmengHuang ShuoLi MengxinYu - MatteoSesiaUniversity of Southern California + MatteoSesiaUniversity of Southern California HamedHassaniUniversity of Pennsylvania, University of Pennsylvania and University of Pennsylvania - InsupLeeUniversity of Pennsylvania + InsupLeeUniversity of Pennsylvania OsbertBastaniUniversity of Pennsylvania EdgarDobribanThe Wharton School, University of Pennsylvania 284-312 @@ -271,7 +271,7 @@ <fixed-case>R</fixed-case>o<fixed-case>TB</fixed-case>ench: A Multi-Level Benchmark for Evaluating the Robustness of Large Language Models in Tool Learning - JunjieYe + JunjieYe YilongWu SongyangGaoShanghai Artificial Intelligence Laboratory CaishuangHuang @@ -291,10 +291,10 @@ Learning Planning-based Reasoning by Trajectories Collection and Process Reward Synthesizing - FangkaiJiao + FangkaiJiao ChengweiQinNanyang Technological University ZhengyuanLiuI2R - Nancy F.Chen + Nancy F.Chen ShafiqJotySalesForce.com and Nanyang Technological University 334-350 Large Language Models (LLMs) have demonstrated significant potential in handling complex reasoning tasks through step-by-step rationale generation. However, recent studies have raised concerns regarding the hallucination and flaws in their reasoning process. Substantial efforts are being made to improve the reliability and faithfulness of the generated rationales. Some approaches model reasoning as planning, while others focus on annotating for process supervision. Nevertheless, the planning-based search process often results in high latency due to the frequent assessment of intermediate reasoning states and the extensive exploration space. Additionally, supervising the reasoning process with human annotation is costly and challenging to scale for LLM training. To address these issues, in this paper, we propose a framework to learn planning-based reasoning through Direct Preference Optimization (DPO) on collected trajectories, which are ranked according to synthesized process rewards. Our results on challenging logical reasoning benchmarks demonstrate the effectiveness of our learning framework, showing that our 7B model can surpass the strong counterparts like GPT-3.5-Turbo. @@ -306,7 +306,7 @@ Scaling Properties of Speech Language Models SantiagoCuervo - RicardMarxerUniversity of Toulon + RicardMarxerUniversity of Toulon 351-361 Speech Language Models (SLMs) aim to learn language from raw audio, without textual resources. Despite significant advances, our current models exhibit weak syntax and semantic abilities. However, if the scaling properties of neural language models hold for the speech modality, these abilities will improve as the amount of compute used for training increases. In this paper, we use models of this scaling behavior to estimate the scale at which our current methods will yield a SLM with the English proficiency of text-based Large Language Models (LLMs). We establish a strong correlation between pre-training loss and downstream syntactic and semantic performance in SLMs and LLMs, which results in predictable scaling of linguistic performance. We show that the linguistic performance of SLMs scales up to three orders of magnitude more slowly than that of text-based LLMs. Additionally, we study the benefits of synthetic data designed to boost semantic understanding and the effects of coarser speech tokenization. 
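The scaling analysis in the speech language model entry above amounts to fitting power laws of the form loss = a * C^b on log-log axes and extrapolating. A minimal numpy sketch with synthetic points (all numbers invented for illustration):

```python
import numpy as np

# Synthetic (compute, loss) points; all values invented for illustration.
compute = np.array([1e18, 1e19, 1e20, 1e21])
loss = 50.0 * compute ** -0.05

# A power law loss = a * C^b is linear in log-log space.
b, log_a = np.polyfit(np.log(compute), np.log(loss), 1)
a = np.exp(log_a)
print(f"fit: loss ~= {a:.1f} * C^{b:.3f}")  # recovers b = -0.05

# Extrapolate the compute needed to reach a target loss under this fit.
target_loss = 5.0
print(f"compute for loss {target_loss}: {(target_loss / a) ** (1 / b):.2e}")
```

The paper's claim that SLM performance scales orders of magnitude more slowly than text LLMs corresponds to a much smaller |b| in fits of this shape.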
2024.emnlp-main.21 @@ -315,7 +315,7 @@ “We Demand Justice!”: Towards Social Context Grounding of Political Texts - RajkumarPujariPurdue University + RajkumarPujariPurdue University ChengfeiWuPurdue University DanGoldwasserPurdue University and Purdue University 362-372 @@ -344,7 +344,7 @@ MiaoranZhangSaarland University BarryHaddowUniversity of Edinburgh XiaoyuShenAmazon - DietrichKlakowSaarland University + DietrichKlakowSaarland University 388-409 Traditionally, success in multilingual machine translation can be attributed to three key factors in training data: large volume, diverse translation directions, and high quality. In the current practice of fine-tuning large language models (LLMs) for translation, we revisit the importance of these factors. We find that LLMs display strong translation capability after being fine-tuned on as few as 32 parallel sentences and that fine-tuning on a single translation direction enables translation in multiple directions. However, the choice of direction is critical: fine-tuning LLMs with only English on the target side can lead to task misinterpretation, which hinders translation into non-English languages. Problems also arise when noisy synthetic data is placed on the target side, especially when the target language is well-represented in LLM pre-training. Yet interestingly, synthesized data in an under-represented language has a less pronounced effect. Our findings suggest that when adapting LLMs to translation, the requirement on data quantity can be eased but careful considerations are still crucial to prevent an LLM from exploiting unintended data biases. 2024.emnlp-main.24 @@ -354,12 +354,12 @@ Consolidating Ranking and Relevance Predictions of Large Language Models through Post-Processing LeYanGoogle - ZhenQinGoogle - HongleiZhuangGoogle Research - RolfJagermanGoogle + ZhenQinGoogle + HongleiZhuangGoogle Research + RolfJagermanGoogle XuanhuiWangGoogle - MichaelBenderskyGoogle - HarrieOosterhuis + MichaelBenderskyGoogle + HarrieOosterhuis 410-423 The powerful generative abilities of large language models (LLMs) show potential in generating relevance labels for search applications. Previous work has found that directly asking about relevancy, such as "*How relevant is document A to query Q?*”, results in suboptimal ranking. Instead, the pairwise-ranking prompting (PRP) approach produces promising ranking performance through asking about pairwise comparisons, e.g., "*Is document A more relevant than document B to query Q?*”. Thus, while LLMs are effective at their ranking ability, this is not reflected in their relevance label generation.In this work, we propose a post-processing method to consolidate the relevance labels generated by an LLM with its powerful ranking abilities. Our method takes both LLM generated relevance labels and pairwise preferences. The labels are then altered to satisfy the pairwise preferences of the LLM, while staying as close to the original values as possible. Our experimental results indicate that our approach effectively balances label accuracy and ranking performance. Thereby, our work shows it is possible to combine both the ranking and labeling abilities of LLMs through post-processing. 
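The post-processing in the ranking-consolidation entry above alters LLM relevance labels to satisfy the model's pairwise preferences while staying as close to the original values as possible; when the preferences form a total order, that is an isotonic projection. A sketch using pool-adjacent-violators, as a plausible reconstruction rather than the paper's exact algorithm:

```python
def consolidate(labels, ranking):
    """Project relevance labels onto a ranking, minimally in L2.

    labels:  dict doc_id -> original LLM relevance score
    ranking: doc ids from most to least relevant (e.g., from pairwise prompts)

    Pool-adjacent-violators enforces non-increasing scores along the
    ranking while keeping them as close to the originals as possible.
    """
    blocks = []  # each block is [sum_of_scores, count]
    for doc in ranking:
        blocks.append([labels[doc], 1])
        # Merge while an earlier block's mean is below a later block's mean.
        while len(blocks) > 1 and blocks[-2][0] * blocks[-1][1] < blocks[-1][0] * blocks[-2][1]:
            s, c = blocks.pop()
            blocks[-1][0] += s
            blocks[-1][1] += c
    fitted, i = {}, 0
    for s, c in blocks:
        for _ in range(c):
            fitted[ranking[i]] = s / c
            i += 1
    return fitted

# The LLM ranked A above B but labeled B higher; the projection averages them.
print(consolidate({"A": 0.2, "B": 0.9}, ["A", "B"]))  # {'A': 0.55, 'B': 0.55}
```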
2024.emnlp-main.25
@@ -375,7 +375,7 @@
 JiaLiu
 ZujieWen
 WenqiangLeiSichuan University
- Tat-SengChuaNational University of Singapore
+ Tat-SengChuaNational University of Singapore
 424-444
 We investigate non-collaborative dialogue agents, which are expected to engage in strategic conversations with diverse users to secure a mutual agreement that leans favorably towards the system’s objectives. This poses two main challenges for existing dialogue agents: 1) the inability to integrate user-specific characteristics into strategic planning, and 2) the difficulty of training strategic planners that generalize to diverse users. To address these challenges, we propose TRIP to enhance the capability for tailored strategic planning, incorporating a user-aware strategic planning module and a population-based training paradigm. Through experiments on benchmark non-collaborative dialogue tasks, we demonstrate the effectiveness of TRIP in catering to diverse users.
 2024.emnlp-main.26
@@ -384,12 +384,12 @@
 Impeding <fixed-case>LLM</fixed-case>-assisted Cheating in Introductory Programming Assignments via Adversarial Perturbation
- Saiful IslamSalimUniversity of Arizona
+ Saiful IslamSalimUniversity of Arizona
 Rubin YuchanYangUniversity of Arizona
 AlexanderCooper
 SuryashreeRay
 SaumyaDebrayUniversity of Arizona
- SazzadurRahamanUniversity of Arizona
+ SazzadurRahamanUniversity of Arizona
 445-463
 While large language model (LLM)-based programming assistants such as CoPilot and ChatGPT can help improve the productivity of professional software developers, they can also facilitate cheating in introductory computer programming courses. Assuming instructors have limited control over the industrial-strength models, this paper investigates the baseline performance of 5 widely used LLMs on a collection of introductory programming problems, examines adversarial perturbations to degrade their performance, and describes the results of a user study aimed at measuring the efficacy of such perturbations in hindering actual code generation for introductory programming assignments. The user study suggests that i) the perturbations combined reduced the average correctness score by 77%, and ii) the drop in correctness caused by these perturbations depended on their detectability.
 2024.emnlp-main.27
@@ -406,8 +406,8 @@
 XiaofengZhaoHuawei Technologies Ltd.
 MahongXia
 ZhangLi
- BoxingChenHuawei Technologies Ltd.
- HaoYang
+ BoxingChenHuawei Technologies Ltd.
+ HaoYang
 BeiLiMeituan
 TongXiaoNortheastern University
 JingBoZhuNortheastern University
@@ -432,8 +432,8 @@
 <fixed-case>E</fixed-case>mph<fixed-case>A</fixed-case>ssess : a Prosodic Benchmark on Assessing Emphasis Transfer in Speech-to-Speech Models
 Maureende Seyssel
 AntonyD’Avirro
- AdinaWilliamsFAIR (Meta Platforms Inc.)
- EmmanuelDupouxEHESS
+ AdinaWilliamsFAIR (Meta Platforms Inc.)
+ EmmanuelDupouxEHESS
 495-507
 We introduce EmphAssess, a prosodic benchmark designed to evaluate the capability of speech-to-speech models to encode and reproduce prosodic emphasis. We apply this to two tasks: speech resynthesis and speech-to-speech translation. In both cases, the benchmark evaluates the ability of the model to encode emphasis in the speech input and accurately reproduce it in the output, potentially across a change of speaker and language. As part of the evaluation pipeline, we introduce EmphaClass, a new model that classifies emphasis at the frame or word level.
2024.emnlp-main.30 @@ -443,11 +443,11 @@ On Fake News Detection with <fixed-case>LLM</fixed-case> Enhanced Semantics Mining - XiaoxiaoMaMacquarie University - YuchenZhang + XiaoxiaoMaMacquarie University + YuchenZhang KaizeDingNorthwestern University and Arizona State University - JianYangMacquarie University - JiaWuMacquarie University + JianYangMacquarie University + JiaWuMacquarie University HaoFanWuhan University 508-521 Large language models (LLMs) have emerged as valuable tools for enhancing textual features in various text-related tasks. Despite their superiority in capturing the lexical semantics between tokens for text analysis, our preliminary study on two popular LLMs, i.e., ChatGPT and Llama2, showcases that simply applying the news embeddings from LLMs is ineffective for fake news detection. Such embeddings only encapsulate the language styles between tokens. Meanwhile, the high-level semantics among named entities and topics, which reveal the deviating patterns of fake news, have been ignored. Therefore, we propose a topic model together with a set of specially designed prompts to extract topics and real entities from LLMs and model the relations among news, entities, and topics as a heterogeneous graph to facilitate investigating news semantics. We then propose a Generalized Page-Rank model and a consistent learning criteria for mining the local and global semantics centered on each news piece through the adaptive propagation of features across the graph. Our model shows superior performance on five benchmark datasets over seven baseline methods and the efficacy of the key ingredients has been thoroughly validated. @@ -457,9 +457,9 @@ On Sensitivity of Learning with Limited Labelled Data to the Effects of Randomness: Impact of Interactions and Systematic Choices - BranislavPecherKempelen Institute of Intelligent Technologies, Brno University of Technology and Kempelen Institute of Intelligent Technologies - IvanSrbaKempelen Institute of Intelligent Technologies - MariaBielikovaKempelen Institute of Intelligent Technologies + BranislavPecherKempelen Institute of Intelligent Technologies, Brno University of Technology and Kempelen Institute of Intelligent Technologies + IvanSrbaKempelen Institute of Intelligent Technologies + MariaBielikovaKempelen Institute of Intelligent Technologies 522-556 While learning with limited labelled data can effectively deal with a lack of labels, it is also sensitive to the effects of uncontrolled randomness introduced by so-called randomness factors (i.e., non-deterministic decisions such as choice or order of samples). We propose and formalise a method to systematically investigate the effects of individual randomness factors while taking the interactions (dependence) between them into consideration. To this end, our method mitigates the effects of other factors while observing how the performance varies across multiple runs. 
Applying our method to multiple randomness factors across in-context learning and fine-tuning approaches on 7 representative text classification tasks and meta-learning on 3 tasks, we show that: 1) disregarding interactions between randomness factors in existing works led to inconsistent findings due to incorrect attribution of the effects of randomness factors, such as disproving the consistent sensitivity of in-context learning to sample order even with random sample selection; and 2) besides mutual interactions, the effects of randomness factors, especially sample order, are also dependent on more systematic choices unexplored in existing works, such as the number of classes, samples per class, or choice of prompt format.
 2024.emnlp-main.32
@@ -496,17 +496,17 @@
 ShizheDiaoHong Kong University of Science and Technology
 JianmengLiu
 JipengZhang
- RuiPanUniversity of Illinois at Urbana-Champaign
+ RuiPanUniversity of Illinois at Urbana-Champaign
 HaoxiangWangNVIDIA
- WenbinHu
+ WenbinHu
 HanningZhang
 HanzeDongSalesForce
 RenjiePi
- HanZhaoUniversity of Illinois, Urbana Champaign
+ HanZhaoUniversity of Illinois, Urbana Champaign
 NanJiangUniversity of Illinois at Urbana-Champaign
 HengJiUniversity of Illinois, Urbana-Champaign
- YuanYaoThe Hong Kong University of Science and Technology
- TongZhangUIUC
+ YuanYaoThe Hong Kong University of Science and Technology
+ TongZhangUIUC
 580-606
 LLMs acquire a wide range of abilities during pre-training, but aligning LLMs under Reinforcement Learning with Human Feedback (RLHF) can lead to forgetting pretrained abilities, which is also known as the alignment tax. To investigate alignment tax, we conducted experiments with existing RLHF algorithms using OpenLLaMA-3B, which revealed a pronounced alignment tax in NLP tasks. However, the various techniques to mitigate forgetting are often at odds with RLHF performance, leading to an alignment-forgetting trade-off. In this paper, we show that model averaging, which simply interpolates between pre- and post-RLHF model weights, surprisingly achieves the strongest alignment-forgetting Pareto front among a wide range of competing methods. To understand its effectiveness, we offer theoretical insights into model averaging, revealing that it enhances the performance Pareto front by increasing feature diversity on the layers where tasks share overlapped feature spaces. Empirical evidence corroborates our analysis by showing the benefits of averaging low-level transformer layers. Building on this analysis and the observation that averaging different layers of the transformer leads to significantly different alignment-forgetting trade-offs, we propose Heterogeneous Model Averaging (HMA), which heterogeneously searches for combination ratios across model layers. HMA seeks to maximize alignment performance while incurring minimal alignment tax. Moreover, we validate HMA’s performance across a range of RLHF algorithms over OpenLLaMA-3B and further extend our findings to Mistral-7B, evaluated by an open-sourced preference model and GPT-4. Code available here.
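The model-averaging baseline above is a per-tensor interpolation between the pre- and post-RLHF checkpoints. A sketch over PyTorch state dicts, with HMA's layer-wise ratios reduced to an optional callback (the function names are illustrative, not the paper's code):

```python
import torch

def average_models(pre_sd: dict, post_sd: dict, ratio=0.5, per_layer_ratio=None):
    """Interpolate two state dicts: w = (1 - r) * pre + r * post.

    per_layer_ratio, if given, maps a parameter name to its own r; a
    layer-dependent schedule of this kind is the knob HMA searches over.
    """
    merged = {}
    for name, w_pre in pre_sd.items():
        r = per_layer_ratio(name) if per_layer_ratio else ratio
        merged[name] = (1.0 - r) * w_pre + r * post_sd[name]
    return merged

# Toy usage: two one-tensor "checkpoints".
pre = {"layer.weight": torch.zeros(2, 2)}
post = {"layer.weight": torch.ones(2, 2)}
print(average_models(pre, post, ratio=0.3)["layer.weight"])  # all entries 0.3
```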
2024.emnlp-main.35 @@ -516,11 +516,11 @@ Evaluating Readability and Faithfulness of Concept-based Explanations MengLi - HaoranJinUniversity of Science and Technology of China + HaoranJinUniversity of Science and Technology of China RuixuanHuang ZhihaoXu - DefuLianUniversity of Science and Technology of China - ZijiaLinKuaishou Technology + DefuLianUniversity of Science and Technology of China + ZijiaLinKuaishou Technology DiZhangKuaishou Technology XitingWangRenmin University of China 607-625 @@ -534,7 +534,7 @@ ZhengyuanLiuI2R Stella XinYin GeyuLinInstitute of Infocomm Research, A*STAR - Nancy F.Chen + Nancy F.Chen 626-642 Intelligent Tutoring Systems (ITSs) can provide personalized and self-paced learning experience. The emergence of large language models (LLMs) further enables better human-machine interaction, and facilitates the development of conversational ITSs in various disciplines such as math and language learning. In dialogic teaching, recognizing and adapting to individual characteristics can significantly enhance student engagement and learning efficiency. However, characterizing and simulating student’s persona remain challenging in training and evaluating conversational ITSs. In this work, we propose a framework to construct profiles of different student groups by refining and integrating both cognitive and noncognitive aspects, and leverage LLMs for personality-aware student simulation in a language learning scenario. We further enhance the framework with multi-aspect validation, and conduct extensive analysis from both teacher and student perspectives. Our experimental results show that state-of-the-art LLMs can produce diverse student responses according to the given language ability and personality traits, and trigger teacher’s adaptive scaffolding strategies. 2024.emnlp-main.37 @@ -543,7 +543,7 @@ <fixed-case>MSI</fixed-case>-Agent: Incorporating Multi-Scale Insight into Embodied Agents for Superior Planning and Decision-Making - DayuanFu + DayuanFu BiqingQiShanghai Artificial Intelligence Laboratory YihuaiGaoStanford University CheJiang @@ -557,9 +557,9 @@ <fixed-case>C</fixed-case>o<fixed-case>C</fixed-case>o<fixed-case>L</fixed-case>o<fixed-case>F</fixed-case>a: A Dataset of News Comments with Common Logical Fallacies Written by <fixed-case>LLM</fixed-case>-Assisted Crowds - Min-HsuanYeh + Min-HsuanYeh RuyuanWanPennsylvania State University - Ting-Hao KennethHuangPennsylvania State University + Ting-Hao KennethHuangPennsylvania State University 660-677 Detecting logical fallacies in texts can help users spot argument flaws, but automating this detection is not easy. Manually annotating fallacies in large-scale, real-world text data to create datasets for developing and validating detection models is costly. This paper introduces CoCoLoFa, the largest known logical fallacy dataset, containing 7,706 comments for 648 news articles, with each comment labeled for fallacy presence and type. We recruited 143 crowd workers to write comments embodying specific fallacy types (e.g., slippery slope) in response to news articles. Recognizing the complexity of this writing task, we built an LLM-powered assistant into the workers’ interface to aid in drafting and refining their comments. Experts rated the writing quality and labeling validity of CoCoLoFa as high and reliable. BERT-based models fine-tuned using CoCoLoFa achieved the highest fallacy detection (F1=0.86) and classification (F1=0.87) performance on its test set, outperforming the state-of-the-art LLMs. 
Our work shows that combining crowdsourcing and LLMs enables us to more effectively construct datasets for complex linguistic phenomena that crowd workers find challenging to produce on their own. 2024.emnlp-main.39 @@ -574,7 +574,7 @@ HaoranZhang AlecAlameddineKensho Technologies OmriUzanBen Gurion University of the Negev - YuvalPinterBen-Gurion University of the Negev + YuvalPinterBen-Gurion University of the Negev ChrisTannerMassachusetts Institute of Technology and Kensho 678-702 Tokenization is a foundational step in natural language processing (NLP) tasks, bridging raw text and language models. Existing tokenization approaches like Byte-Pair Encoding (BPE) originate from the field of data compression, and it has been suggested that the effectiveness of BPE stems from its ability to condense text into a relatively small number of tokens. We test the hypothesis that fewer tokens lead to better downstream performance by introducing PathPiece, a new tokenizer that segments a document’s text into the minimum number of tokens for a given vocabulary. Through extensive experimentation we find this hypothesis not to be the case, casting doubt on the understanding of the reasons for effective tokenization. To examine which other factors play a role, we evaluate design decisions across all three phases of tokenization: pre-tokenization, vocabulary construction, and segmentation, offering new insights into the design of effective tokenizers. Specifically, we illustrate the importance of pre-tokenization and the benefits of using BPE to initialize vocabulary construction. We train 64 language models with varying tokenization, ranging in size from 350M to 2.4B parameters, all of which are made publicly available. @@ -615,7 +615,7 @@ HaoyuanWu HaishengZheng ZhuolunHe - BeiYuDepartment of Computer Science and Engineering, The Chinese University of Hong Kong + BeiYuDepartment of Computer Science and Engineering, The Chinese University of Hong Kong 737-749 Large language models (LLMs) have demonstrated considerable proficiency in general natural language processing (NLP) tasks. Instruction tuning, a successful paradigm, enhances the ability of LLMs to follow natural language instructions and exhibit robust generalization across general tasks. However, these models often encounter performance limitations across multiple tasks due to constrained model capacity. Expanding this capacity during the instruction tuning phase poses significant challenges. To address this issue, we introduce parameter-efficient sparsity crafting (PESC), which crafts dense models into sparse models using the mixture-of-experts (MoE) architecture. PESC integrates adapters into the MoE layers of sparse models, differentiating experts without altering the individual weights within these layers. This method significantly reduces computational costs and GPU memory requirements, facilitating model capacity expansion through a minimal parameter increase when guaranteeing the quality of approximation in function space compared to original sparse upcycling. Our empirical evaluation demonstrates the effectiveness of the PESC method. Using PESC during instruction tuning, our best sparse model outperforms other sparse and dense models and exhibits superior general capabilities compared to GPT-3.5.Our code is available at https://github.com/wuhy68/Parameter-Efficient-MoE. 
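PESC, as summarized above, differentiates MoE experts by giving each one a small trainable adapter while the underlying FFN weights stay shared. A minimal PyTorch sketch of that structure (module names are illustrative, not the released code):

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Bottleneck adapter: down-project, nonlinearity, up-project, residual."""

    def __init__(self, dim: int, bottleneck: int = 16):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))

class AdapterExpert(nn.Module):
    """One MoE expert = the shared, frozen FFN plus its own adapter."""

    def __init__(self, shared_ffn: nn.Module, dim: int):
        super().__init__()
        self.ffn = shared_ffn        # shared weights, frozen below
        self.adapter = Adapter(dim)  # the only per-expert parameters

    def forward(self, x):
        return self.adapter(self.ffn(x))

dim = 32
shared = nn.Linear(dim, dim)
for p in shared.parameters():
    p.requires_grad = False  # the dense model's weights stay fixed

experts = nn.ModuleList([AdapterExpert(shared, dim) for _ in range(4)])
trainable = sum(p.numel() for p in experts.parameters() if p.requires_grad)
print(trainable)  # only the adapters' parameters are trained
```

The point of the construction is the parameter count printed at the end: expanding from one FFN to four experts adds only the adapter weights, which is what makes the capacity expansion cheap.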
2024.emnlp-main.43 @@ -627,9 +627,9 @@ ShihaoCai KeqinBao HangyuGuoAlibaba Group - JizhiZhangUniversity of Science and Technology of China + JizhiZhangUniversity of Science and Technology of China JunSongAlibaba Group - BoZhengAlibaba Group + BoZhengAlibaba Group 750-766 Large language models have seen widespread adoption in math problem-solving, yet for geometry problems, which often necessitate visual aids even for humans, the most advanced multi-modal models still struggle to effectively utilize image information. High-quality data is crucial for enhancing the geometric capabilities of multi-modal models, yet existing open-source datasets and related efforts are either too challenging for direct model learning or suffer from misalignment between text and images. To overcome this issue, we introduce a novel pipeline that leverages GPT-4 and GPT-4V to generate relatively basic geometry problems with aligned text and images, facilitating model learning. We have produced a dataset of 4.9K geometry problems and combined it with 19K open-source data to form our GeoGPT4V dataset. Experimental results demonstrate that the GeoGPT4V dataset significantly improves the geometry performance of various models on the MathVista and MathVision benchmarks. The code is available at https://anonymous.4open.science/r/GeoGPT4V-08B2. 2024.emnlp-main.44 @@ -638,11 +638,11 @@ <fixed-case>D</fixed-case>y<fixed-case>V</fixed-case>o: Dynamic Vocabularies for Learned Sparse Retrieval with Entities - ThongNguyen - ShubhamChatterjeeMissouri University of Science and Technology - SeanMacAvaneyUniversity of Glasgow + ThongNguyen + ShubhamChatterjeeMissouri University of Science and Technology + SeanMacAvaneyUniversity of Glasgow IainMackie - JeffDaltonUniversity of Edinburgh + JeffDaltonUniversity of Edinburgh AndrewYatesUniversity of Amsterdam 767-783 Learned Sparse Retrieval (LSR) models use vocabularies from pre-trained transformers, which often split entities into nonsensical fragments. Splitting entities diminishes retrieval accuracy and limits the model’s ability to incorporate up-to-date world knowledge not included in the training data. In this work, we enhance the LSR vocabulary with Wikipedia concepts and entities, enabling the model to resolve ambiguities more effectively and stay current with evolving knowledge. Central to our approach is a Dynamic Vocabulary (DyVo) head, which leverages existing entity embeddings and an entity retrieval component that identifies entities relevant to a query or document. We use the DyVo head to generate entity weights, which are then merged with word piece weights to create joint representations for efficient indexing and retrieval using an inverted index. In experiments across three entity-rich document ranking datasets, the resulting DyVo model substantially outperforms several state-of-the-art baselines. @@ -667,7 +667,7 @@ <fixed-case>L</fixed-case>ong<fixed-case>E</fixed-case>mbed: Extending Embedding Models for Long Context Retrieval DaweiZhu - LiangWangMicrosoft Research + LiangWangMicrosoft Research NanYangMicrosoft Research Asia YifanSong WenhaoWu @@ -683,7 +683,7 @@ Making Large Language Models Better Reasoners with Orchestrated Streaming Experiences XiangyangLiu JunliangHe - XipengQiuFudan University + XipengQiuFudan University 817-838 Large language models (LLMs) can perform complex reasoning by generating intermediate reasoning steps using chain-of-thought prompting under zero-shot or few-shot settings. 
However, zero-shot prompting always encounters low performance, and the superior performance of few-shot prompting hinges on the manual-crafting of task-specific demonstrations one by one. In this paper, we present **RoSE** (**R**easoning with **O**rchestrated **S**treaming **E**xperiences), a general framework for solving reasoning tasks that can self-improve as it answers various reasoning questions. To enable RoSE, we describe an architecture that extends an LLM to store all answered reasoning questions and their reasoning steps in a streaming experience pool and orchestrate helpful questions from the pool to assist itself in answering new questions. To set up a question-aware orchestration mechanism, RoSE first calculates the similarity of each question in the pool with the question to be answered. Since the solution to each question in the experience pool is not always correct, RoSE will sort the questions according to their similarity with the question to be answered, and then uniformly divide them into multiple buckets. It finally extracts one question from each bucket to make the extracted questions more diverse. To make the extracted questions help RoSE answer new questions as much as possible, we introduce two other attributes of uncertainty and complexity for each question. RoSE will preferentially select the questions with low uncertainty and high complexity from each bucket. We evaluate the versatility of RoSE in various complex reasoning tasks and LLMs, such as arithmetic and commonsense reasoning, and find that it can achieve excellent performance without any labeled data and pre-set unlabeled data. 2024.emnlp-main.48 @@ -692,7 +692,7 @@ Overcome Noise and Bias: Segmentation-Aided Multi-Granularity Denoising and Debiasing for Enhanced Quarduples Extraction in Dialogue - XianlongLuo + XianlongLuo MengYangSUN YAT-SEN UNIVERSITY, School of Computer Science and Engineering YihaoWangSun Yat-Sen University 839-856 @@ -715,8 +715,8 @@ In-context Contrastive Learning for Event Causality Identification LiangChao - WeiXiangCentral China Normal University - BangWangHuazhong University of Science and Technology + WeiXiangCentral China Normal University + BangWangHuazhong University of Science and Technology 868-881 Event Causality Identification (ECI) aims at determining the existence of a causal relation between two events. Although recent prompt learning-based approaches have shown promising improvements on the ECI task, their performance are often subject to the delicate design of multiple prompts and the positive correlations between the main task and derivate tasks. The in-context learning paradigm provides explicit guidance for label prediction in the prompt learning paradigm, alleviating its reliance on complex prompts and derivative tasks. However, it does not distinguish between positive and negative demonstrations for analogy learning. Motivated from such considerations, this paper proposes an **I**n-**C**ontext **C**ontrastive **L**earning (ICCL) model that utilizes contrastive learning to enhance the effectiveness of both positive and negative demonstrations. Additionally, we apply contrastive learning to event pairs to better facilitate event causality identification. Our ICCL is evaluated on the widely used corpora, including the EventStoryLine and Causal-TimeBank, and results show significant performance improvements over the state-of-the-art algorithms. 
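RoSE's question-aware orchestration (entry 2024.emnlp-main.48 above) is straightforward to sketch: sort the experience pool by similarity to the new question, split it into buckets, and take the lowest-uncertainty, highest-complexity item from each. A minimal sketch with placeholder scoring functions:

```python
def orchestrate(pool, similarity, n_buckets=4):
    """Pick diverse, helpful demonstrations from an experience pool.

    pool:       list of dicts with 'question', 'uncertainty', 'complexity'
    similarity: function scoring a stored question against the new query
    """
    ranked = sorted(pool, key=lambda q: similarity(q["question"]), reverse=True)
    size = max(1, len(ranked) // n_buckets)
    picks = []
    for start in range(0, len(ranked), size):
        bucket = ranked[start : start + size]
        # Within a bucket, prefer low uncertainty, then high complexity.
        picks.append(min(bucket, key=lambda q: (q["uncertainty"], -q["complexity"])))
        if len(picks) == n_buckets:
            break
    return picks
```

Taking one item per bucket is what enforces diversity across similarity levels, while the per-bucket key encodes the paper's preference for reliable yet informative demonstrations.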
2024.emnlp-main.51 @@ -752,14 +752,14 @@ Large Language Models for Data Annotation and Synthesis: A Survey ZhenTan DaweiLi - SongWangUniversity of Virginia + SongWangUniversity of Virginia AlimohammadBeigi BohanJiangArizona State University - AmritaBhattacharjeeArizona State University - MansoorehKaramiMicrosoft + AmritaBhattacharjeeArizona State University + MansoorehKaramiMicrosoft JundongLiUniversity of Virginia - LuChengUniversity of Illinois at Chicago - HuanLiuArizona State University + LuChengUniversity of Illinois at Chicago + HuanLiuArizona State University 930-957 Data annotation and synthesis generally refers to the labeling or generating of raw data with relevant information, which could be used for improving the efficacy of machine learning models. The process, however, is labor-intensive and costly. The emergence of advanced Large Language Models (LLMs), exemplified by GPT-4, presents an unprecedented opportunity to automate the complicated process of data annotation and synthesis. While existing surveys have extensively covered LLM architecture, training, and general applications, we uniquely focus on their specific utility for data annotation. This survey contributes to three core aspects: LLM-Based Annotation Generation, LLM-Generated Annotations Assessment, and LLM-Generated Annotations Utilization. Furthermore, this survey includes an in-depth taxonomy of data types that LLMs can annotate, a comprehensive review of learning strategies for models utilizing LLM-generated annotations, and a detailed discussion of the primary challenges and limitations associated with using LLMs for data annotation and synthesis. Serving as a key guide, this survey aims to assist researchers and practitioners in exploring the potential of the latest LLMs for data annotation, thereby fostering future advancements in this critical field. 2024.emnlp-main.54 @@ -796,10 +796,10 @@ <fixed-case>R</fixed-case>ose<fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case>: Row and Column-wise Sparse Low-rank Adaptation of Pre-trained Language Model for Knowledge Editing and Fine-tuning - HaoyuWangState University of New York at Albany + HaoyuWangState University of New York at Albany TianciLiu RuiruiLi - Monica XiaoChengAmazon + Monica XiaoChengAmazon TuoZhaoGeorgia Institute of Technology JingGaoPurdue University 996-1008 @@ -810,14 +810,14 @@ <fixed-case>B</fixed-case>lend<fixed-case>F</fixed-case>ilter: Advancing Retrieval-Augmented Large Language Models via Query Generation Blending and Knowledge Filtering - HaoyuWangState University of New York at Albany + HaoyuWangState University of New York at Albany RuiruiLi HaomingJiangAmazon JinjinTian ZhengyangWangAmazon ChenLuo XianfengTangAmazon - Monica XiaoChengAmazon + Monica XiaoChengAmazon TuoZhaoGeorgia Institute of Technology JingGaoPurdue University 1009-1025 @@ -828,10 +828,10 @@ <fixed-case>HEART</fixed-case>-felt Narratives: Tracing Empathy and Narrative Style in Personal Stories with <fixed-case>LLM</fixed-case>s - JocelynShen + JocelynShen JoelMire - Hae WonPark - CynthiaBreazeal + Hae WonPark + CynthiaBreazeal MaartenSap 1026-1046 Empathy serves as a cornerstone in enabling prosocial behaviors, and can be evoked through sharing of personal experiences in stories. While empathy is influenced by narrative content, intuitively, people respond to the way a story is told as well, through narrative style. Yet the relationship between empathy and narrative style is not fully understood. 
In this work, we empirically examine and quantify this relationship between style and empathy using LLMs and large-scale crowdsourcing studies. We introduce a novel, theory-based taxonomy, HEART (Human Empathy and Narrative Taxonomy) that delineates elements of narrative style that can lead to empathy with the narrator of a story. We establish the performance of LLMs in extracting narrative elements from HEART, showing that prompting with our taxonomy leads to reasonable, human-level annotations beyond what prior lexicon-based methods can do. To show empirical use of our taxonomy, we collect a dataset of empathy judgments of stories via a large-scale crowdsourcing study with N=2,624 participants. We show that narrative elements extracted via LLMs, in particular, vividness of emotions and plot volume, can elucidate the pathways by which narrative style cultivates empathy towards personal stories. Our work suggests that such models can be used for narrative analyses that lead to human-centered social and behavioral insights. @@ -844,8 +844,8 @@ JunruLuTencent Youtu Lab JiazhengLiKing’s College London, University of London SiyuAn - MengZhaoTencent Youtu Lab - YulanHeKing’s College London, University of London + MengZhaoTencent Youtu Lab + YulanHeKing’s College London, University of London DiYin XingSunTencent YouTu Lab 1047-1067 @@ -857,8 +857,8 @@ Bridging Cultures in the Kitchen: A Framework and Benchmark for Cross-Cultural Recipe Retrieval TianyiHu - MariaMaistroUniversity of Copenhagen - DanielHershcovichUniversity of Copenhagen + MariaMaistroUniversity of Copenhagen + DanielHershcovichUniversity of Copenhagen 1068-1080 The cross-cultural adaptation of recipes is an important application of identifying and bridging cultural differences in language. The challenge lies in retaining the essence of the original recipe while also aligning with the writing and dietary habits of the target culture. Information Retrieval (IR) offers a way to address the challenge because it retrieves results from the culinary practices of the target culture while maintaining relevance to the original recipe. We introduce a novel task about cross-cultural recipe retrieval and present a unique Chinese-English cross-cultural recipe retrieval benchmark. Our benchmark is manually annotated under limited resource, utilizing various retrieval models to generate a pool of candidate results for manual annotation. The dataset provides retrieval samples that are culturally adapted but textually diverse, presenting greater challenges. We propose CARROT, a plug-and-play cultural-aware recipe information retrieval framework that incorporates cultural-aware query rewriting and re-ranking methods and evaluate it both on our benchmark and intuitive human judgments. The results show that our framework significantly enhances the preservation of the original recipe and its cultural appropriateness for the target culture. We believe these insights will significantly contribute to future research on cultural adaptation. 
2024.emnlp-main.61 @@ -873,7 +873,7 @@ KangyuZhu HaoranLi HongtuZhuUniversity of North Carolina at Chapel Hill - YunLiUniversity of North Carolina at Chapel Hill + YunLiUniversity of North Carolina at Chapel Hill GangLi LinjunZhangRutgers University HuaxiuYaoDepartment of Computer Science, University of North Carolina at Chapel Hill @@ -888,8 +888,8 @@ YuanLinational university of singaore, National University of Singapore BingqiaoLuonational university of singaore, National University of Singapore QianWang - NuoChen - XuLiuNational University of Singapore + NuoChen + XuLiuNational University of Singapore BingshengHeNational University of Singapore 1094-1106 The utilization of Large Language Models (LLMs) in financial trading has primarily been concentrated within the stock market, aiding in economic and financial decisions. Yet, the unique opportunities presented by the cryptocurrency market, noted for its on-chain data’s transparency and the critical influence of off-chain signals like news, remain largely untapped by LLMs. This work aims to bridge the gap by developing an LLM-based trading agent, CryptoTrade, which uniquely combines the analysis of on-chain and off-chain data. This approach leverages the transparency and immutability of on-chain data, as well as the timeliness and influence of off-chain signals, providing a comprehensive overview of the cryptocurrency market. CryptoTrade incorporates a reflective mechanism specifically engineered to refine its daily trading decisions by analyzing the outcomes of prior trading decisions. This research makes two significant contributions. Firstly, it broadens the applicability of LLMs to the domain of cryptocurrency trading. Secondly, it establishes a benchmark for cryptocurrency trading strategies. Through extensive experiments, CryptoTrade has demonstrated superior performance in maximizing returns compared to time-series baselines, but not compared to traditional trading signals, across various cryptocurrencies and market conditions. Our code and data are available at https://github.com/Xtra-Computing/CryptoTrade @@ -900,17 +900,17 @@ A Survey on In-context Learning QingxiuDong - LeiLiUniversity of Hong Kong + LeiLiUniversity of Hong Kong DamaiDai CeZhengPeking University JingyuanMa RuiLi - HemingXia + HemingXia JingjingXu ZhiyongWuShanghai Artificial Intelligence Laboratory BaobaoChangPeking University XuSun - LeiLiSchool of Computer Science, Carnegie Mellon University + LeiLiSchool of Computer Science, Carnegie Mellon University ZhifangSuiPeking University 1107-1128 With the increasing capabilities of large language models (LLMs), in-context learning (ICL) has emerged as a new paradigm for natural language processing (NLP), where LLMs make predictions based on contexts augmented with a few examples. It has been a significant trend to explore ICL to evaluate and extrapolate the ability of LLMs. In this paper, we aim to survey and summarize the progress and challenges of ICL. We first present a formal definition of ICL and clarify its correlation to related studies. Then, we organize and discuss advanced techniques, including training strategies, prompt designing strategies, and related analysis. Additionally, we explore various ICL application scenarios, such as data engineering and knowledge updating. Finally, we address the challenges of ICL and suggest potential directions for further research. We hope that our work can encourage more research on uncovering how ICL works and improving ICL. 
@@ -925,7 +925,7 @@ FeiyuGaoAlibaba Group ZiruiShaoZhejiang University ZhiYuZhejiang University - JiajunBuZhejiang University + JiajunBuZhejiang University QiZhengAlibaba Group CongYaoAlibaba Group 1129-1142 @@ -938,7 +938,7 @@ <fixed-case>AMR</fixed-case>-Evol: Adaptive Modular Response Evolution Elicits Better Knowledge Distillation for Large Language Models in Code Generation ZiyangLuoNational University of Singapore, 01.ai and Hong Kong Baptist University XinLiAlibaba Group - HongzhanLin + HongzhanLin JingMaHong Kong Baptist University LidongBingAlibaba Group 1143-1166 @@ -979,8 +979,8 @@ <fixed-case>LLM</fixed-case>s Are Zero-Shot Context-Aware Simultaneous Translators - RomanKoshkin - KatsuhitoSudohNara Women’s University + RomanKoshkin + KatsuhitoSudohNara Women’s University SatoshiNakamuraThe Chinese University of Hong Kong 1192-1207 The advent of transformers has fueled progress in machine translation. More recently large language models (LLMs) have come to the spotlight thanks to their generality and strong performance in a wide range of language tasks, including translation. Here we show that open-source LLMs perform on par with or better than some state-of-the-art baselines in simultaneous machine translation (SiMT) tasks, zero-shot. We also demonstrate that injection of minimal background information, which is easy with an LLM, brings further performance gains, especially on challenging technical subject-matter. This highlights LLMs’ potential for building next generation of massively multilingual, context-aware and terminologically accurate SiMT systems that require no resource-intensive training or fine-tuning. @@ -990,12 +990,12 @@ <fixed-case>A</fixed-case>gent<fixed-case>R</fixed-case>eview: Exploring Peer Review Dynamics with <fixed-case>LLM</fixed-case> Agents - YiqiaoJin + YiqiaoJin QinlinZhaoResearch, Microsoft and University of Science and Technology of China - YiyangWang + YiyangWang HaoChenCMU, Carnegie Mellon University KaijieZhu - YijiaXiao + YijiaXiao JindongWangMicrosoft Research 1208-1226 Peer review is fundamental to the integrity and advancement of scientific publication. Traditional methods of peer review analyses often rely on exploration and statistics of existing peer review data, which do not adequately address the multivariate nature of the process, account for the latent variables, and are further constrained by privacy concerns due to the sensitive nature of the data. We introduce AgentReview, the first large language model (LLM) based peer review simulation framework, which effectively disentangles the impacts of multiple latent factors and addresses the privacy issue. Our study reveals significant insights, including a notable 37.1% variation in paper decisions due to reviewers’ biases, supported by sociological theories such as the social influence theory, altruism fatigue, and authority bias. We believe that this study could offer valuable insights to improve the design of peer review mechanisms. 
@@ -1007,11 +1007,11 @@ <fixed-case>C</fixed-case>hat<fixed-case>R</fixed-case>etriever: Adapting Large Language Models for Generalized and Robust Conversational Dense Retrieval KelongMao ChenlongDengRenmin University of China - HaonanChen - FengranMo - ZhengLiu - TetsuyaSakaiNAVER and Waseda University - ZhichengDouRenmin University of China + HaonanChen + FengranMo + ZhengLiu + TetsuyaSakaiNAVER and Waseda University + ZhichengDouRenmin University of China 1227-1240 Conversational search requires accurate interpretation of user intent from complex multi-turn contexts. This paper presents ChatRetriever, which inherits the strong generalization capability of large language models to robustly represent complex conversational sessions for dense retrieval. To achieve this, we propose a simple and effective dual-learning approach that adapts LLM for retrieval via contrastive learning while enhancing the complex session understanding through masked instruction tuning on high-quality conversational instruction tuning data. Extensive experiments on five conversational search benchmarks demonstrate that ChatRetriever significantly outperforms existing conversational dense retrievers, achieving state-of-the-art performance on par with LLM-based rewriting approaches. Furthermore, ChatRetriever exhibits superior robustness in handling diverse conversational contexts. Our work highlights the potential of adapting LLMs for retrieval with complex inputs like conversational search sessions and proposes an effective approach to advance this research direction. 2024.emnlp-main.71 @@ -1020,10 +1020,10 @@ Fairer Preferences Elicit Improved Human-Aligned Large Language Model Judgments - HanZhou - XingchenWanGoogle + HanZhou + XingchenWanGoogle YinhongLiu - NigelCollierUniversity of Cambridge + NigelCollierUniversity of Cambridge IvanVulićUniversity of Cambridge and PolyAI Limited AnnaKorhonenUniversity of Cambridge 1241-1252 @@ -1036,7 +1036,7 @@ Learning Interpretable Legal Case Retrieval via Knowledge-Guided Case Reformulation ChenlongDengRenmin University of China KelongMao - ZhichengDouRenmin University of China + ZhichengDouRenmin University of China 1253-1265 Legal case retrieval for sourcing similar cases is critical in upholding judicial fairness. Different from general web search, legal case retrieval involves processing lengthy, complex, and highly specialized legal documents. Existing methods in this domain often overlook the incorporation of legal expert knowledge, which is crucial for accurately understanding and modeling legal cases, leading to unsatisfactory retrieval performance. This paper introduces KELLER, a legal knowledge-guided case reformulation approach based on large language models (LLMs) for effective and interpretable legal case retrieval. By incorporating professional legal knowledge about crimes and law articles, we enable large language models to accurately reformulate the original legal case into concise sub-facts of crimes, which contain the essential information of the case. Extensive experiments on two legal case retrieval benchmarks demonstrate superior retrieval performance and robustness on complex legal case queries of KELLER over existing methods. 
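KELLER's reformulate-then-retrieve pipeline, as described above, can be sketched as two steps: an LLM compresses the case into crime sub-facts, and a retriever ranks candidates against them. Here `ask_llm` is a hypothetical client and a toy token-overlap scorer stands in for the real retriever:

```python
def ask_llm(prompt: str) -> str:
    """Hypothetical stand-in for any chat-completion client."""
    raise NotImplementedError

def reformulate(case_text: str) -> list[str]:
    """Compress a lengthy case into short crime sub-facts, one per line."""
    prompt = (
        "Using knowledge of crimes and law articles, rewrite the legal case "
        "below as a short list of crime sub-facts, one per line.\n\n" + case_text
    )
    return [line.strip() for line in ask_llm(prompt).splitlines() if line.strip()]

def retrieve(sub_facts: list[str], corpus: dict[str, str], k: int = 5) -> list[str]:
    """Rank candidate cases by token overlap with the sub-facts.

    The toy lexical scorer is a placeholder; any BM25 or dense index
    could be dropped in here instead.
    """
    query = set(" ".join(sub_facts).lower().split())
    scores = {cid: len(query & set(text.lower().split())) for cid, text in corpus.items()}
    return sorted(scores, key=scores.get, reverse=True)[:k]
```

The interpretability claim rests on the intermediate sub-facts: they are short, human-readable, and can be inspected independently of the retriever that consumes them.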
2024.emnlp-main.73 @@ -1045,7 +1045,7 @@ Effective Demonstration Annotation for In-Context Learning via Language Model-Based Determinantal Point Process - PengWang + PengWang XiaobinWang ChaoLouShanghaiTech University ShengyuMao @@ -1086,11 +1086,11 @@ Mitigating Language Bias of <fixed-case>LMM</fixed-case>s in Social Intelligence Understanding with Virtual Counterfactual Calibration - PengChen + PengChen Xiao-YuGuoUniversity of Adelaide Yuan-FangLiMonash University - XiaowangZhangTianjin University, China - ZhiyongFengTianjin University + XiaowangZhangTianjin University, China + ZhiyongFengTianjin University 1300-1310 2024.emnlp-main.77 chen-etal-2024-mitigating @@ -1102,8 +1102,8 @@ YuanzheHuUniversity of California, San Diego TianyuPang YefanZhouDartmouth College - PuRenLawrence Berkeley National Lab - YaoqingYangDartmouth College + PuRenLawrence Berkeley National Lab + YaoqingYangDartmouth College 1311-1331 Recent advances in foundation models have emphasized the need to align pre-trained models with specialized domains using small, curated datasets. Studies on these foundation models underscore the importance of low-data training and fine-tuning. This topic, well-known in natural language processing (NLP), has also gained increasing attention in the emerging field of scientific machine learning (SciML). To address the limitations of low-data training and fine-tuning, we draw inspiration from Heavy-Tailed Self-Regularization (HT-SR) theory, analyzing the shape of empirical spectral densities (ESDs) and revealing an imbalance in training quality across different model layers. To mitigate this issue, we adapt a recently proposed layer-wise learning rate scheduler, TempBalance, which effectively balances training quality across layers and enhances low-data training and fine-tuning for both NLP and SciML tasks. Notably, TempBalance demonstrates increasing performance gains as the amount of available tuning data decreases. Comparative analyses further highlight the effectiveness of TempBalance and its adaptability as an “add-on” method for improving model performance. 2024.emnlp-main.78 @@ -1128,7 +1128,7 @@ Large Language Models as Foundations for Next-Gen Dense Retrieval: A Comprehensive Empirical Assessment KunLuo MinghaoQin - ZhengLiu + ZhengLiu ShitaoXiao JunZhaoInstitute of automation, Chinese academy of science KangLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences @@ -1140,7 +1140,7 @@ A New Pipeline for Knowledge Graph Reasoning Enhanced by Large Language Models Without Fine-Tuning - ZhongwuChen + ZhongwuChen LongBaiInstitute of Computing Technology, Chinese Academy of Sciences ZixuanLiInstitute of Computing Technology, Chinese Academy of Sciences ZhenHuangNational University of Defense Technology @@ -1159,9 +1159,9 @@ Zhi-YuanChen ShiqiShenWechat, Tencent GuangyaoShenTencent - GongZhiWeChat - XuChenRenmin University of China - YankaiLinRenmin University of China + GongZhiWeChat + XuChenRenmin University of China + YankaiLinRenmin University of China 1382-1400 Recently, tool use with LLMs has become one of the primary research topics as it can help LLM generate truthful and helpful responses. Existing studies on tool use with LLMs primarily focus on enhancing the tool-calling ability of LLMs. In practice, like chat assistants, LLMs are also required to align with human values in the context of tool use. 
Specifically, LLMs should refuse to answer unsafe tool use relevant instructions and insecure tool responses to ensure their reliability and harmlessness. At the same time, LLMs should demonstrate autonomy in tool use to reduce the costs associated with tool calling. To tackle this issue, we first introduce the principle that LLMs should follow in tool use scenarios: H2A. The goal of H2A is to align LLMs with **helpfulness**, **harmlessness**, and **autonomy**. In addition, we propose ToolAlign, a dataset comprising instruction-tuning data and preference data to align LLMs with the H2A principle for tool use. Based on ToolAlign, we develop LLMs by supervised fine-tuning and preference learning, and experimental results demonstrate that the LLMs exhibit remarkable tool-calling capabilities, while also refusing to engage with harmful content, and displaying a high degree of autonomy in tool utilization. The code and datasets are available at: https://github.com/zhiyuanc2001/ToolAlign. 2024.emnlp-main.82 @@ -1177,7 +1177,7 @@ JieZhou YunqiBa JieCaiModelBest - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University MaosongSun 1401-1418 The performance of Large Language Models (LLMs) is substantially influenced by the pretraining corpus, which consists of vast quantities of unsupervised data processed by the models. Despite its critical role in model performance, ensuring the quality of this data is challenging due to its sheer volume and the absence of sample-level quality annotations and enhancements. In this paper, we introduce DecorateLM, a data engineering method designed to refine the pretraining corpus through data rating, tagging and editing. Specifically, DecorateLM rates texts against quality criteria, tags texts with hierarchical labels, and edits texts into a more formalized format. Due to the massive size of the pretraining corpus, adopting an LLM for decorating the entire corpus is less efficient. Therefore, to balance performance with efficiency, we curate a meticulously annotated training corpus for DecorateLM using a large language model and distill data engineering expertise into a compact 1.2 billion parameter small language model (SLM). We then apply DecorateLM to enhance 100 billion tokens of the training corpus, selecting 45 billion tokens that exemplify high quality and diversity for the further training of another 1.2 billion parameter LLM. Our results demonstrate that employing such high-quality data can significantly boost model performance, showcasing a powerful approach to enhance the quality of the pretraining corpus. @@ -1188,12 +1188,12 @@ Lookback Lens: Detecting and Mitigating Contextual Hallucinations in Large Language Models Using Only Attention Maps - Yung-SungChuangMassachusetts Institute of Technology + Yung-SungChuangMassachusetts Institute of Technology LinluQiu Cheng-YuHsiehUniversity of Washington RanjayKrishnaDepartment of Computer Science YoonKimMassachusetts Institute of Technology - James R.GlassMassachusetts Institute of Technology + James R.GlassMassachusetts Institute of Technology 1419-1436 When asked to summarize articles or answer questions given a passage, large language models (LLMs) can hallucinate details and respond with unsubstantiated answers that are inaccurate with respect to the input context. This paper describes a simple approach for detecting such **contextual hallucinations**. 
We hypothesize that contextual hallucinations are related to the extent to which an LLM attends to information in the provided context versus its own generations. Based on this intuition, we propose a simple hallucination detection model whose input features are given by the ratio of attention weights on the context versus newly generated tokens (for each attention head). We find that a linear classifier based on these _lookback ratio_ features is as effective as a richer detector that utilizes the entire hidden states of an LLM or a text-based entailment model. The lookback ratio-based detector—**Lookback Lens**—is found to transfer across tasks and even models, allowing a detector that is trained on a 7B model to be applied (without retraining) to a larger 13B model. We further apply this detector to mitigate contextual hallucinations, and find that a simple classifier-guided decoding approach is able to reduce the amount of hallucination, for example by 9.6% in the XSum summarization task. 2024.emnlp-main.84 @@ -1203,17 +1203,17 @@ Controllable Preference Optimization: Toward Controllable Multi-Objective Alignment - YijuGuo + YijuGuo GanquCui LifanYuanUniversity of Illinois at Urbana-Champaign NingDingTsinghua University, Tsinghua University - ZexuSun - BowenSun + ZexuSun + BowenSun HuiminChenTsinghua University, Tsinghua University - RuobingXie - JieZhou - YankaiLinRenmin University of China - ZhiyuanLiuTsinghua University + RuobingXie + JieZhou + YankaiLinRenmin University of China + ZhiyuanLiuTsinghua University MaosongSun 1437-1454 Alignment in artificial intelligence pursues the consistency between model responses and human preferences as well as values. In practice, the multifaceted nature of human preferences inadvertently introduces what is known as the ”alignment tax”–a compromise where enhancements in alignment within one objective (e.g., harmlessness) can diminish performance in others (e.g., helpfulness). However, existing alignment techniques are mostly unidirectional, leading to suboptimal trade-offs and poor flexibility over various objectives. To navigate this challenge, we argue the prominence of grounding LLMs with evident preferences. We introduce controllable preference optimization (CPO), which explicitly specifies preference scores for different objectives, thereby guiding the model to generate responses that meet the requirements. Our experimental analysis reveals that the aligned models can provide responses that match various preferences among the ”3H” (helpfulness, honesty, harmlessness) desiderata. Furthermore, by introducing diverse data and alignment goals, we surpass baseline methods in aligning with single objectives, hence mitigating the impact of the alignment tax and achieving improvements in multi-objective alignment. 
@@ -1224,7 +1224,7 @@


 Mitigating Matthew Effect: Multi-Hypergraph Boosted Multi-Interest Self-Supervised Learning for Conversational Recommendation
- YongsenZhengNanyang Technological University
+ YongsenZhengNanyang Technological University
 RuilinXu
 GuohuaWangSouth China Agricultural University
 LiangLinSUN YAT-SEN UNIVERSITY
@@ -1238,9 +1238,9 @@

 Advancing Event Causality Identification via Heuristic Semantic Dependency Inquiry Network
 HaoranLi
- QiangGaoComplex Laboratory of New Finance and Economics, Southwestern University of Finance and Economics
- HongmeiWu
- LiHuangSouthwestern University of Finance and Economics
+ QiangGaoComplex Laboratory of New Finance and Economics, Southwestern University of Finance and Economics
+ HongmeiWu
+ LiHuangSouthwestern University of Finance and Economics
 1467-1478
 2024.emnlp-main.87
 li-etal-2024-advancing-event
@@ -1250,7 +1250,7 @@
 Exploring Union and Intersection of Visual Regions for Generating Questions, Answers, and Distractors
 WenjianDing
 YaoZhangNankai University
- JunWang
+ JunWang
 AdamJatowt
 ZhengluYangNankai University
 1479-1489
@@ -1276,7 +1276,7 @@
 Tracking the perspectives of interacting language models
 HaydenHelmNomic AI and Helivan Research
 BrandonDuderstadtNomic AI
- YoungserParkJohns Hopkins University
+ YoungserParkJohns Hopkins University
 CareyPriebeJohns Hopkins University
 1508-1519
 Large language models (LLMs) are capable of producing high quality information at unprecedented rates. As these models continue to entrench themselves in society, the content they produce will become increasingly pervasive in databases that are, in turn, incorporated into the pre-training data, fine-tuning data, retrieval data, etc. of other language models. In this paper we formalize the idea of a communication network of LLMs and introduce a method for representing the perspective of individual models within a collection of LLMs. Given these tools we systematically study information diffusion in the communication network of LLMs in various simulated settings.
@@ -1286,9 +1286,9 @@

 <fixed-case>MAR</fixed-case>: Matching-Augmented Reasoning for Enhancing Visual-based Entity Question Answering
- ZhengxuanZhang
- YinWu
- YuyuLuo
+ ZhengxuanZhang
+ YinWu
+ YuyuLuo
 NanTangHong Kong University of Science and Technology
 1520-1530
 A multimodal large language model (MLLM) may struggle with answering visual-based (personal) entity questions (VEQA), such as “who is A?” or “who is A that B is talking to?” for various reasons, e.g., the absence of the name of A in the caption or the inability of MLLMs to recognize A, particularly for less common entities. Furthermore, even if the MLLM can identify A, it may refrain from answering due to privacy concerns. In this paper, we introduce a novel method called Matching-Augmented Reasoning (MAR) to enhance VEQA. Given a collection of visual objects with captions, MAR preprocesses each object individually, identifying faces, names, and their alignments within the object. It encodes this information and stores their vector representations in vector databases. When handling VEQA, MAR retrieves matching faces and names and organizes these entities into a matching graph. MAR then derives the answer to the query by reasoning over this matching graph. Extensive experiments show that MAR significantly improves VEQA compared with the state-of-the-art methods using MLLMs.
@@ -1301,7 +1301,7 @@
 ZheYangPeking University
 YichangZhangAlibaba Group
 TianyuLiu
- JianYangAlibaba Group
+ JianYangAlibaba Group
 JunyangLin
 ChangZhou
 ZhifangSuiPeking University
@@ -1318,7 +1318,7 @@
 XiutianZhaoUniversity of Edinburgh
 WenhaoWu
 XunWangMicrosoft
- KeWangHuawei Technologies Ltd.
+ KeWangHuawei Technologies Ltd.
 ChengLiHuawei Technologies Ltd.
 WeiPengHuawei Technologies Ltd.
 SujianLiPeking University
@@ -1330,9 +1330,9 @@

 Standardize: Aligning Language Models with Expert-Defined Standards for Content Generation
- Joseph MarvinImperialUniversity of Bath
+ Joseph MarvinImperialUniversity of Bath
 GailForey
- HarishTayyar MadabushiUniversity of Bath
+ HarishTayyar MadabushiUniversity of Bath
 1573-1594
 Domain experts across engineering, healthcare, and education follow strict standards for producing quality content such as technical manuals, medication instructions, and children’s reading materials. However, current works in controllable text generation have yet to explore using these standards as references for control. Towards this end, we introduce Standardize, a retrieval-style in-context learning-based framework to guide large language models to align with expert-defined standards. Focusing on English language standards in the education domain as a use case, we consider the Common European Framework of Reference for Languages (CEFR) and Common Core Standards (CCS) for the task of open-ended content generation. Our findings show that models can gain 45% to 100% increase in precise accuracy across open and commercial LLMs evaluated, demonstrating that the use of knowledge artifacts extracted from standards and integrating them in the generation process can effectively guide models to produce better standard-aligned content.
 2024.emnlp-main.94
@@ -1343,12 +1343,12 @@

 Cross-domain <fixed-case>NER</fixed-case> with Generated Task-Oriented Knowledge: An Empirical Study from Information Density Perspective
- ZhihaoZhang
+ ZhihaoZhang
 Sophia Yat MeiLeeHong Kong Polytechnic University
 JunshuangWu
 DongZhang
 ShoushanLi
- ErikCambriaNanyang Technological University
+ ErikCambriaNanyang Technological University
 GuodongZhouSoochow University, China
 1595-1609
 Cross-domain Named Entity Recognition (CDNER) is crucial for Knowledge Graph (KG) construction and natural language processing (NLP), enabling learning from source to target domains with limited data. Previous studies often rely on manually collected entity-relevant sentences from the web or attempt to bridge the gap between tokens and entity labels across domains. These approaches are time-consuming and inefficient, as these data are often weakly correlated with the target task and require extensive pre-training. To address these issues, we propose automatically generating task-oriented knowledge (GTOK) using large language models (LLMs), focusing on the reasoning process of entity extraction. Then, we employ task-oriented pre-training (TOPT) to facilitate domain adaptation. Additionally, current cross-domain NER methods often lack explicit explanations for their effectiveness. Therefore, we introduce the concept of information density to better evaluate the model’s effectiveness before performing entity recognition. We conduct systematic experiments and analyses to demonstrate the effectiveness of our proposed approach and the validity of using information density for model evaluation.
@@ -1362,10 +1362,10 @@ ChengshuaiZhaoArizona State University RahaMoraffahWorcester Polytechnic Institute YifanLi - SongWangUniversity of Virginia + SongWangUniversity of Virginia JundongLiUniversity of Virginia TianlongChen - HuanLiuArizona State University + HuanLiuArizona State University 1610-1626 Retrieval-Augmented Generative (RAG) models enhance Large Language Models (LLMs) by integrating external knowledge bases, improving their performance in applications like fact-checking and information searching. In this paper, we demonstrate a security threat where adversaries can exploit the openness of these knowledge bases by injecting deceptive content into the retrieval database, intentionally changing the model’s behavior. This threat is critical as it mirrors real-world usage scenarios where RAG systems interact with publicly accessible knowledge bases, such as web scrapings and user-contributed data pools. To be more realistic, we target a realistic setting where the adversary has no knowledge of users’ queries, knowledge base data, and the LLM parameters. We demonstrate that it is possible to exploit the model successfully through crafted content uploads with access to the retriever. Our findings emphasize an urgent need for security measures in the design and deployment of RAG systems to prevent potential manipulation and ensure the integrity of machine-generated content. 2024.emnlp-main.96 @@ -1384,10 +1384,10 @@ <fixed-case>SHIELD</fixed-case>: Evaluation and Defense Strategies for Copyright Compliance in <fixed-case>LLM</fixed-case> Text Generation - XiaozeLiu + XiaozeLiu TingSun TianyangXuPurdue University - FeijieWu + FeijieWu CunxiangWang XiaoqianWangPurdue University JingGaoPurdue University @@ -1404,7 +1404,7 @@ JiayuanRao HaoningWu ChangLiuShanghai Jiaotong University - YanfengWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University WeidiXieShanghai Jiaotong University 1671-1685 Soccer is a globally popular sport with a vast audience, in this paper, we consider constructing an automatic soccer game commentary model to improve the audiences’ viewing experience. In general, we make the following contributions: *First*, observing the prevalent video-text misalignment in existing datasets, we manually annotate timestamps for 49 matches, establishing a more robust benchmark for soccer game commentary generation, termed as *SN-Caption-test-align*; *Second*, we propose a multi-modal temporal alignment pipeline to automatically correct and filter the existing dataset at scale, creating a higher-quality soccer game commentary dataset for training, denoted as *MatchTime*; *Third*, based on our curated dataset, we train an automatic commentary generation model, named **MatchVoice**. Extensive experiments and ablation studies have demonstrated the effectiveness of our alignment pipeline, and training model on the curated datasets achieves state-of-the-art performance for commentary generation, showcasing that better alignment can lead to significant performance improvements in downstream tasks. 
@@ -1415,10 +1415,10 @@ Rethinking Token Reduction for State Space Models ZhengZhanNortheastern University - YushuWuNortheastern University - ZhenglunKongHarvard Medical School, Harvard University - ChangdiYang - YifanGongNortheastern University + YushuWuNortheastern University + ZhenglunKongHarvard Medical School, Harvard University + ChangdiYang + YifanGongNortheastern University XuanShen XueLinNortheastern University PuZhao @@ -1437,7 +1437,7 @@ WeimingLuZhejiang University JianShaoZhejiang University YongfengHuang - HengChangTsinghua University, Tsinghua University + HengChangTsinghua University, Tsinghua University YuetingZhuang 1698-1710 Recent progress with LLM-based agents has shown promising results across various tasks. However, their use in answering questions from knowledge bases remains largely unexplored. Implementing a KBQA system using traditional methods is challenging due to the shortage of task-specific training data and the complexity of creating task-focused model structures. In this paper, we present Triad, a unified framework that utilizes an LLM-based agent with multiple roles for KBQA tasks. The agent is assigned three roles to tackle different KBQA subtasks: agent as a generalist for mastering various subtasks, as a decision maker for the selection of candidates, and as an advisor for answering questions with knowledge. Our KBQA framework is executed in four phases, involving the collaboration of the agent’s multiple roles. We evaluated the performance of our framework using three benchmark datasets, and the results show that our framework outperforms state-of-the-art systems on the LC-QuAD and YAGO-QA benchmarks, yielding F1 scores of 11.8% and 20.7%, respectively. @@ -1461,7 +1461,7 @@ Event Causality Identification with Synthetic Control HaoyuWangUniversity of Pennsylvania FengzeLiuUniversity of Pennsylvania, University of Pennsylvania - JiayaoZhangUniversity of Pennsylvania + JiayaoZhangUniversity of Pennsylvania DanRothUniversity of Pennsylvania KyleRichardsonAllen Institute for Artificial Intelligence 1725-1737 @@ -1475,12 +1475,12 @@ ChangMa HaitengZhao LinZheng - JiayiXinUniversity of Pennsylvania, University of Pennsylvania + JiayiXinUniversity of Pennsylvania, University of Pennsylvania QintongLi - LijunWu + LijunWu ZhihongDeng Yang YoungLuUniversity of Waterloo - QiLiuUniversity of Hong Kong + QiLiuUniversity of Hong Kong ShengWang LingpengKongDepartment of Computer Science, The University of Hong Kong 1738-1767 @@ -1493,7 +1493,7 @@ <fixed-case>HELPD</fixed-case>: Mitigating Hallucination of <fixed-case>LVLM</fixed-case>s by Hierarchical Feedback Learning with Vision-enhanced Penalty Decoding FanYuanNanjing University of Aeronautics and Astronautics ChiQinMicrosoft - XiaogangXu + XiaogangXu PijiLiNanjing University of Aeronautics and Astronautics 1768-1785 Large Vision-Language Models (LVLMs) have shown remarkable performance on many visual-language tasks. However, these models still suffer from multimodal hallucination, which means the generation of objects or content that violates the images. Many existing work detects hallucination by directly judging whether an object exists in an image, overlooking the association between the object and semantics. To address this issue, we propose Hierarchical Feedback Learning with Vision-enhanced Penalty Decoding (HELPD). This framework incorporates hallucination feedback at both object and sentence semantic levels. 
Remarkably, even with a marginal degree of training, this approach can alleviate over 15% of hallucination. Simultaneously, HELPD penalizes the output logits according to the image attention window to avoid being overly affected by generated text. HELPD can be seamlessly integrated with any LVLMs. Our experiments demonstrate that the proposed framework yields favorable results across multiple hallucination benchmarks. It effectively mitigates hallucination for different LVLMs and concurrently improves their text generation quality. @@ -1505,8 +1505,8 @@ <fixed-case>T</fixed-case>op<fixed-case>V</fixed-case>iew<fixed-case>RS</fixed-case>: Vision-Language Models as Top-View Spatial Reasoners ChengzuLi CaiqiZhang - HanZhou - NigelCollierUniversity of Cambridge + HanZhou + NigelCollierUniversity of Cambridge AnnaKorhonenUniversity of Cambridge IvanVulićUniversity of Cambridge and PolyAI Limited 1786-1807 @@ -1520,7 +1520,7 @@ YiboWang XiangjueDongTexas A&M University - College Station JamesCaverleeGoogle and Texas A&M University - College Station - Philip S.YuUniversity of Illinois, Chicago + Philip S.YuUniversity of Illinois, Chicago 1808-1825 Language models can be manipulated by adversarial attacks, which introduce subtle perturbations to input data. While recent attack methods can achieve a relatively high attack success rate (ASR), we’ve observed that the generated adversarial examples have a different data distribution compared with the original examples. Specifically, these adversarial examples exhibit reduced confidence levels and greater divergence from the training data distribution. Consequently, they are easy to detect using straightforward detection methods, diminishing the efficacy of such attacks. To address this issue, we propose a Distribution-Aware Adversarial Attack (DA^3) method. DA^3 considers the distribution shifts of adversarial examples to improve attacks’ effectiveness under detection methods. We further design a novel evaluation metric, the Non-detectable Attack Success Rate (NASR), which integrates both ASR and detectability for the attack task. We conduct experiments on four widely used datasets to validate the attack effectiveness and transferability of adversarial examples generated by DA^3 against both the white-box BERT-base and RoBERTa-base models and the black-box LLaMA2-7b model. 2024.emnlp-main.107 @@ -1531,7 +1531,7 @@ Evaluating Psychological Safety of Large Language Models XingxuanLi - YutongLi + YutongLi LinQiu ShafiqJotySalesForce.com and Nanyang Technological University LidongBingAlibaba Group @@ -1547,8 +1547,8 @@ LianxiWangGuangdong University of Foreign Studies YubenWu XinfengLiao - YujiaTian - JunyangZhong + YujiaTian + JunyangZhong 1844-1856 Sentiment classification (SC) often suffers from low-resource challenges such as domain-specific contexts, imbalanced label distributions, and few-shot scenarios. The potential of the diffusion language model (LM) for textual data augmentation (DA) remains unexplored, moreover, textual DA methods struggle to balance the diversity and consistency of new samples. Most DA methods either perform logical modifications or rephrase less important tokens in the original sequence with the language model. In the context of SC, strong emotional tokens could act critically on the sentiment of the whole sequence. 
Therefore, contrary to rephrasing less important context, we propose DiffusionCLS to leverage a diffusion LM to capture in-domain knowledge and generate pseudo samples by reconstructing strong label-related tokens. This approach ensures a balance between consistency and diversity, avoiding the introduction of noise and augmenting crucial features of datasets. DiffusionCLS also comprises a Noise-Resistant Training objective to help the model generalize. Experiments demonstrate the effectiveness of our method in various low-resource scenarios including domain-specific and domain-general problems. Ablation studies confirm the effectiveness of our framework’s modules, and visualization studies highlight optimal deployment conditions, reinforcing our conclusions. 2024.emnlp-main.109 @@ -1571,7 +1571,7 @@ <fixed-case>P</fixed-case>s<fixed-case>F</fixed-case>uture: A Pseudo-Future-based Zero-Shot Adaptive Policy for Simultaneous Machine Translation LiboZhao - JingLiThe Hong Kong Polytechnic University + JingLiThe Hong Kong Polytechnic University ZiqianZengSouth China University of Technology 1869-1881 Simultaneous Machine Translation (SiMT) requires target tokens to be generated in real-time as streaming source tokens are consumed. Traditional approaches to SiMT typically require sophisticated architectures and extensive parameter configurations for training adaptive read/write policies, which in turn demand considerable computational power and memory. We propose PsFuture, the first zero-shot adaptive read/write policy for SiMT, enabling the translation model to independently determine read/write actions without the necessity for additional training. Furthermore, we introduce a novel training strategy, Prefix-to-Full (P2F), specifically tailored to adjust offline translation models for SiMT applications, exploiting the advantages of the bidirectional attention mechanism inherent in offline models. Experiments across multiple benchmarks demonstrate that our zero-shot policy attains performance on par with strong baselines and the P2F method can further enhance performance, achieving an outstanding trade-off between translation quality and latency. @@ -1584,9 +1584,9 @@ LiangZhang AnwenHuAlibaba Group HaiyangXu - MingYan + MingYan YichenXu - QinJinRenmin University of China + QinJinRenmin University of China JiZhangAlibaba Group FeiHuangAlibaba Group 1882-1898 @@ -1599,7 +1599,7 @@ Do We Need Language-Specific Fact-Checking Models? The Case of <fixed-case>C</fixed-case>hinese CaiqiZhang ZhijiangGuoUniversity of Cambridge - AndreasVlachosUniversity of Cambridge + AndreasVlachosUniversity of Cambridge 1899-1914 This paper investigates the potential benefits of language-specific fact-checking models, focusing on the case of Chinese using CHEF dataset. To better reflect real-world fact-checking, we first develop a novel Chinese document-level evidence retriever, achieving state-of-the-art performance. We then demonstrate the limitations of translation-based methods and multilingual language models, highlighting the need for language-specific systems. To better analyze token-level biases in different systems, we construct an adversarial dataset based on the CHEF dataset, where each instance has a large word overlap with the original one but holds the opposite veracity label. 
Experimental results on the CHEF dataset and our adversarial dataset show that our proposed method outperforms translation-based methods and multilingual language models and is more robust toward biases, emphasizing the importance of language-specific fact-checking systems. 2024.emnlp-main.113 @@ -1611,9 +1611,9 @@ ZhiyuanLi DongnanLiu ChaoyiZhangThe University of Sydney, University of Sydney - HengWangSony R&D and University of Sydney, University of Sydney + HengWangSony R&D and University of Sydney, University of Sydney TengfeiXue - WeidongCaiThe University of Sydney + WeidongCaiThe University of Sydney 1915-1929 Recent advancements in Vision-Language (VL) research have sparked new benchmarks for complex visual reasoning, challenging models’ advanced reasoning ability. Traditional Vision-Language models (VLMs) perform well in visual perception tasks while struggling with complex reasoning scenarios. Conversely, Large Language Models (LLMs) demonstrate robust text reasoning capabilities; however, they lack visual acuity. To bridge this gap, we propose **C**omplex **V**isual **R**easoning **L**arge **L**anguage **M**odels (**CVR-LLM**), capitalizing on VLMs’ visual perception proficiency and LLMs’ extensive reasoning capability. Unlike recent multimodal large language models (MLLMs) that require a projection layer, our approach transforms images into detailed, context-aware descriptions using an iterative self-refinement loop and leverages LLMs’ text knowledge for accurate predictions without extra training. We also introduce a novel multi-modal in-context learning (ICL) methodology to enhance LLMs’ contextual understanding and reasoning. Additionally, we introduce Chain-of-Comparison (CoC), a step-by-step comparison technique enabling contrasting various aspects of predictions. Our CVR-LLM presents the first comprehensive study across a wide array of complex visual reasoning tasks and achieves SOTA performance among all. 2024.emnlp-main.114 @@ -1623,12 +1623,12 @@ <fixed-case>CMD</fixed-case>: a framework for Context-aware Model self-Detoxification - ZechengTangSoochow University + ZechengTangSoochow University KeyanZhou - JuntaoLiSoochow University, China + JuntaoLiSoochow University, China YuyangDing PinzhengWang - YanBowen + YanBowen RenjieHua MinZhangHarbin Institute of Technology, Shenzhen 1930-1949 @@ -1642,8 +1642,8 @@ XiaomengHu YimingZhang RuPeng - HaozheZhang - ChenweiWuDuke University + HaozheZhang + ChenweiWuDuke University GangChen JunboZhaoZhejiang University 1950-1959 @@ -1654,14 +1654,14 @@ <fixed-case>TCS</fixed-case>inger: Zero-Shot Singing Voice Synthesis with Style Transfer and Multi-Level Style Control - YuZhang + YuZhang ZiyueJiang RuiqiLi ChanghaoPanZhejiang University JinzhengHeZhejiang University RongjieHuangFAIR ChuxinWang - ZhouZhaoZhejiang University and Zhejiang University + ZhouZhaoZhejiang University and Zhejiang University 1960-1975 Zero-shot singing voice synthesis (SVS) with style transfer and style control aims to generate high-quality singing voices with unseen timbres and styles (including singing method, emotion, rhythm, technique, and pronunciation) from audio and text prompts. However, the multifaceted nature of singing styles poses a significant challenge for effective modeling, transfer, and control. Furthermore, current SVS models often fail to generate singing voices rich in stylistic nuances for unseen singers. 
To address these challenges, we introduce TCSinger, the first zero-shot SVS model for style transfer across cross-lingual speech and singing styles, along with multi-level style control. Specifically, TCSinger proposes three primary modules: 1) the clustering style encoder employs a clustering vector quantization model to stably condense style information into a compact latent space; 2) the Style and Duration Language Model (S&D-LM) concurrently predicts style information and phoneme duration, which benefits both; 3) the style adaptive decoder uses a novel mel-style adaptive normalization method to generate singing voices with enhanced details. Experimental results show that TCSinger outperforms all baseline models in synthesis quality, singer similarity, and style controllability across various tasks, including zero-shot style transfer, multi-level style control, cross-lingual style transfer, and speech-to-singing style transfer.
 2024.emnlp-main.117


@@ -1671,10 +1671,10 @@

 Be Helpful but Don’t Talk too Much - Enhancing Helpfulness in Conversations through Relevance in Multi-Turn Emotional Support
- JunlinLi
- BoPeng
+ JunlinLi
+ BoPeng
 Yu-YinHsuHong Kong Polytechnic University
- Chu-RenHuang
+ Chu-RenHuang
 1976-1988
 For a conversation to help and support, speakers should maintain an “effect-effort” tradeoff. As outlined in the gist of the “Cognitive Relevance Principle”, helpful speakers should optimize the “cognitive relevance” through maximizing the “cognitive effects” and minimizing the “processing effort” imposed on listeners. Although preference learning methods have given rise to a boon of studies in pursuit of “effect-optimization”, none have delved into the critical “effort-optimization” to fully cultivate the awareness of “optimal relevance” into the cognition of conversation agents. To address this gap, we integrate the “Cognitive Relevance Principle” into emotional support agents in the environment of multi-turn conversation. The results demonstrate a significant and robust improvement against the baseline systems with respect to response quality, human-likeness and supportiveness. This study offers compelling evidence for the effectiveness of the “Relevance Principle” in generating human-like, helpful, and harmless emotional support conversations. The source code will be available at https://github.com/CN-Eyetk/VLESA-ORL.git
 2024.emnlp-main.118


@@ -1686,11 +1686,11 @@
 Hyuhng JoonKimSeoul National University
 YounaKimSeoul National University
 CheonbokParkNAVER
- JunyeobKimSeoul National University
- ChoonghyunParkSeoul National University
+ JunyeobKimSeoul National University
+ ChoonghyunParkSeoul National University
 Kang MinYooNAVER
 Sang-gooLeeSeoul National University
- TaeukKimHanyang University
+ TaeukKimHanyang University
 1989-2007
 In interactions between users and language model agents, user utterances frequently exhibit ellipsis (omission of words or phrases) or imprecision (lack of exactness) to prioritize efficiency. This can lead to varying interpretations of the same input based on different assumptions or background knowledge. It is thus crucial for agents to adeptly handle the inherent ambiguity in queries to ensure reliability. However, even state-of-the-art large language models (LLMs) still face challenges in such scenarios, primarily due to the following hurdles: (1) LLMs are not explicitly trained to deal with ambiguous utterances; (2) the degree of ambiguity perceived by the LLMs may vary depending on the possessed knowledge.
To address these issues, we propose Alignment with Perceived Ambiguity (APA), a novel pipeline that aligns LLMs to manage ambiguous queries by leveraging their own assessment of ambiguity (i.e., perceived ambiguity). Experimental results on question-answering datasets demonstrate that APA empowers LLMs to explicitly detect and manage ambiguous queries while retaining the ability to answer clear questions. Furthermore, our finding proves that APA excels beyond training with gold-standard labels, especially in out-of-distribution scenarios. The data and code are available at https://github.com/heyjoonkim/APA. 2024.emnlp-main.119 @@ -1699,10 +1699,10 @@ Tag-grounded Visual Instruction Tuning with Retrieval Augmentation - DaiqingQi + DaiqingQi HandongZhaoAdobe Systems ZijunWeiAdobe Systems - ShengLiUniversity of Virginia, Charlottesville + ShengLiUniversity of Virginia, Charlottesville 2008-2026 Despite recent advances in the general visual instruction-following ability of Multimodal Large Language Models (MLLMs), they still struggle with critical problems when required to provide a precise and detailed response to a visual instruction: (1) failure to identify novel objects or entities, (2) mention of non-existent objects, and (3) neglect of object’s attributed details. Intuitive solutions include improving the size and quality of data or using larger foundation models. They show effectiveness in mitigating these issues, but at an expensive cost of collecting a vast amount of new data and introducing a significantly larger model. Standing at the intersection of these approaches, we examine the three object-oriented problems from the perspective of the image-to-text mapping process by the multimodal connector. In this paper, we first identify the limitations of multimodal connectors stemming from insufficient training data. Driven by this, we propose to enhance the mapping with retrieval-augmented tag tokens, which contain rich object-aware information such as object names and attributes. With our Tag-grounded visual instruction tuning with retrieval Augmentation (TUNA), we outperform baselines that share the same language model and training data on 12 benchmarks. Furthermore, we show the zero-shot capability of TUNA when provided with specific datastores. 2024.emnlp-main.120 @@ -1712,7 +1712,7 @@ <fixed-case>GL</fixed-case>a<fixed-case>PE</fixed-case>: Gold Label-agnostic Prompt Evaluation for Large Language Models XuanchangZhang - ZhuoshengZhangShanghai Jiao Tong University + ZhuoshengZhangShanghai Jiao Tong University HaiZhaoShanghai Jiao Tong University 2027-2039 Despite the rapid progress of large language models (LLMs), their task performance remains sensitive to prompt design. Recent studies have explored leveraging the LLM itself as an optimizer to identify optimal prompts that maximize task accuracy. However, when evaluating prompts, such approaches heavily rely on elusive manually annotated gold labels to calculate task accuracy for each candidate prompt, which hinders its generality. To overcome the limitation, this work proposes GLaPE, a gold label-agnostic prompt evaluation method to alleviate dependence on gold labels. GLaPE is composed of two critical aspects: self-consistency evaluation of a single prompt and mutual-consistency refinement across multiple prompts. Experimental results on 8 widely-recognized reasoning tasks demonstrate that GLaPE can produce more effective prompts, achieving performance comparable to those derived from manually annotated gold labels. 
Analysis shows that GLaPE provides reliable evaluations aligned with accuracy, even in the absence of gold labels. Code is publicly available at **Anonymous**. @@ -1722,8 +1722,8 @@ Decoding the Echoes of Vision from f<fixed-case>MRI</fixed-case>: Memory Disentangling for Past Semantic Information - RunzeXia - CongchiYin + RunzeXia + CongchiYin PijiLiNanjing University of Aeronautics and Astronautics 2040-2052 2024.emnlp-main.122 @@ -1732,14 +1732,14 @@ Optimizing Code Retrieval: High-Quality and Scalable Dataset Annotation through Large Language Models - RuiLi + RuiLi QiLiuUniversity of Science and Technology of China LiyangHe - ZhengZhang - HaoZhang + ZhengZhang + HaoZhang ShengyuYe JunyuLuUniversity of Science and Technology of China - ZhenyaHuangUniversity of Science and Technology of China + ZhenyaHuangUniversity of Science and Technology of China 2053-2065 Code retrieval aims to identify code from extensive codebases that semantically aligns with a given query code snippet. Collecting a broad and high-quality set of query and code pairs is crucial to the success of this task. However, existing data collection methods struggle to effectively balance scalability and annotation quality. In this paper, we first analyze the factors influencing the quality of function annotations generated by Large Language Models (LLMs). We find that the invocation of intra-repository functions and third-party APIs plays a significant role. Building on this insight, we propose a novel annotation method that enhances the annotation context by incorporating the content of functions called within the repository and information on third-party API functionalities. Additionally, we integrate LLMs with a novel sorting method to address the multi-level function call relationships within repositories. Furthermore, by applying our proposed method across a range of repositories, we have developed the Query4Code dataset. The quality of this synthesized dataset is validated through both model training and human evaluation, demonstrating high-quality annotations. Moreover, cost analysis confirms the scalability of our annotation method. 2024.emnlp-main.123 @@ -1748,7 +1748,7 @@ Towards Difficulty-Agnostic Efficient Transfer Learning for Vision-Language Models - YongjinYangKAIST + YongjinYangKAIST JongwooKoKorea Advanced Institute of Science and Technology Se-YoungYunKAIST 2066-2085 @@ -1774,7 +1774,7 @@ An Inversion Attack Against Obfuscated Embedding Matrix in Language Model Inference - YuLin + YuLin QizhiZhangByteDance Inc. QuanweiCaiByteDance Inc. JueHong @@ -1790,15 +1790,15 @@ <fixed-case>V</fixed-case>ideo<fixed-case>S</fixed-case>core: Building Automatic Metrics to Simulate Fine-grained Human Feedback for Video Generation - XuanHe - DongfuJiang + XuanHe + DongfuJiang GeZhang MaxKu AchintSoni - ShermanSiu + ShermanSiu HaonanChen AbhranilChandra - ZiyanJiangAmazon + ZiyanJiangAmazon AaranArulrajMicrosoft and University of Waterloo KaiWang Quy DucDo @@ -1821,10 +1821,10 @@ WenxuanWang YiliuYang YouliangYuanThe Chinese University of Hong Kong-Shenzhen - Jen-tseHuang + Jen-tseHuang PinjiaHeThe Chinese University of Hong Kong, Shenzhen WenxiangJiaoTencent AI Lab - MichaelLyuThe Chinese University of Hong Kong + MichaelLyuThe Chinese University of Hong Kong 2124-2155 We introduce LogicAsker, a novel approach for evaluating and enhancing the logical reasoning capabilities of large language models (LLMs) such as ChatGPT and GPT-4. 
Despite LLMs’ prowess in tasks like writing assistance, code generation, and machine translation, assessing their ability to reason has been challenging. Traditional evaluations often prioritize accuracy on downstream tasks over direct assessments of reasoning processes. LogicAsker addresses this gap by employing a set of atomic reasoning skills grounded in propositional and predicate logic to systematically examine and improve the reasoning prowess of LLMs. Our methodology reveals significant gaps in LLMs’ learning of logical rules, with identified reasoning failures ranging from 29% to 90% across different models. Moreover, we leverage these findings to construct targeted demonstration examples and fine-tune data, notably enhancing logical reasoning in models like GPT-4o by up to 5%. To our knowledge, this is the first effort to utilize test case outcomes to effectively refine LLMs’ formal reasoning capabilities. We make our code, data, and results publicly available (https://github.com/yxwan123/LogicAsker) to facilitate further research and replication of our findings.
 2024.emnlp-main.128
@@ -1833,10 +1833,10 @@

 Integrating Structural Semantic Knowledge for Enhanced Information Extraction Pre-training
- XiaoyangYiNankai University
- YuruBao
- JianZhangNankai University
- YifangQin
+ XiaoyangYiNankai University
+ YuruBao
+ JianZhangNankai University
+ YifangQin
 FaxinLinNankai University
 2156-2171
 Information Extraction (IE), aiming to extract structured information from unstructured natural language texts, can significantly benefit from pre-trained language models. However, existing pre-training methods solely focus on exploiting the textual knowledge, relying extensively on annotated large-scale datasets, which is labor-intensive and thus limits the scalability and versatility of the resulting models. To address these issues, we propose SKIE, a novel pre-training framework tailored for IE that integrates structural semantic knowledge via contrastive learning, effectively alleviating the annotation burden. Specifically, SKIE utilizes Abstract Meaning Representation (AMR) as a low-cost supervision source to boost model performance without human intervention. By enhancing the topology of AMR graphs, SKIE derives high-quality cohesive subgraphs as additional training samples, providing diverse multi-level structural semantic knowledge. Furthermore, SKIE refines the graph encoder to better capture cohesive information and edge relation information, thereby improving the pre-training efficacy. Extensive experimental results demonstrate that SKIE outperforms state-of-the-art baselines across multiple IE tasks and showcases exceptional performance in few-shot and zero-shot settings.
@@ -1848,7 +1848,7 @@

 <fixed-case>F</fixed-case>use<fixed-case>G</fixed-case>en: <fixed-case>PLM</fixed-case> Fusion for Data-generation based Zero-shot Learning
 TianyuanZou
 YangLiuTsinghua University, Tsinghua University
- PengLiTsinghua University
+ PengLiTsinghua University
 JianqingZhang
 JingjingLiuTsinghua University
 Ya-QinZhangAIR, Tsinghua University
@@ -1862,7 +1862,7 @@

 <fixed-case>I</fixed-case> Need Help! Evaluating <fixed-case>LLM</fixed-case>’s Ability to Ask for Users’ Support: A Case Study on Text-to-<fixed-case>SQL</fixed-case> Generation
 Cheng-KuangWuAppier
- Zhi RuiTamAppier
+ Zhi RuiTamAppier
 Chao-ChungWuAppier
 Chieh-YenLinAppier Inc.
Hung-yiLeeNational Taiwan University @@ -1875,8 +1875,8 @@ Oddballs and Misfits: Detecting Implicit Abuse in Which Identity Groups are Depicted as Deviating from the Norm - MichaelWiegandUniversität Vienna - JosefRuppenhoferFernuniversität Gesamthochschule Hagen + MichaelWiegandUniversität Vienna + JosefRuppenhoferFernuniversität Gesamthochschule Hagen 2200-2218 We address the task of detecting abusive sentences in which identity groups are depicted as deviating from the norm (e.g. Gays sprinkle flour over their gardens for good luck). These abusive utterances need not be stereotypes or negative in sentiment. We introduce the first dataset for this task. It is created via crowdsourcing and includes 7 identity groups. We also report on classification experiments. 2024.emnlp-main.132 @@ -1887,7 +1887,7 @@ By My Eyes: Grounding Multimodal Large Language Models with Sensor Data via Visual Prompting HyungjunYoon Biniyam AschalewTolera - TaesikGongUlsan National Institute of Science and Technology + TaesikGongUlsan National Institute of Science and Technology KiminLeeKorea Advanced Institute of Science & Technology Sung-JuLeeKorea Advanced Institute of Science & Technology 2219-2241 @@ -1899,7 +1899,7 @@ Prefixing Attention Sinks can Mitigate Activation Outliers for Large Language Model Quantization - SeungwooSon + SeungwooSon WonpyoParkGoogle WoohyunHanGoogle DeepMind KyuyeunKimGoogle @@ -1912,12 +1912,12 @@ <fixed-case>CHIQ</fixed-case>: Contextual History Enhancement for Improving Query Rewriting in Conversational Search - FengranMo + FengranMo AbbasGhaddarHuawei Technologies Ltd. KelongMao MehdiRezagholizadeh - BoxingChenHuawei Technologies Ltd. - QunLiuHuawei Noah’s Ark Lab + BoxingChenHuawei Technologies Ltd. + QunLiuHuawei Noah’s Ark Lab Jian-YunNieUniversity of Montreal 2253-2268 In this paper, we study how open-source large language models (LLMs) can be effectively deployed for improving query rewriting in conversational search, especially for ambiguous queries. We introduce CHIQ, a two-step method that leverages the capabilities of LLMs to resolve ambiguities in the conversation history before query rewriting. This approach contrasts with prior studies that predominantly use closed-source LLMs to directly generate search queries from conversation history. We demonstrate on five well-established benchmarks that CHIQ leads to state-of-the-art results across most settings, showing highly competitive performances with systems leveraging closed-source LLMs. Our study provides a first step towards leveraging open-source LLMs in conversational search, as a competitive alternative to the prevailing reliance on commercial LLMs. Data, models, and source code will be publicly available upon acceptance at https://github.com/fengranMark/CHIQ. 
@@ -1927,8 +1927,8 @@ Towards Low-Resource Harmful Meme Detection with <fixed-case>LMM</fixed-case> Agents - JianzhaoHuangBeijing University of Posts and Telecommunications - HongzhanLin + JianzhaoHuangBeijing University of Posts and Telecommunications + HongzhanLin LiuZiyan ZiyangLuoNational University of Singapore, 01.ai and Hong Kong Baptist University GuangChen @@ -1941,10 +1941,10 @@ <fixed-case>VIVA</fixed-case>: A Benchmark for Vision-Grounded Decision-Making with Human Values - ZheHuBaidu - YixiaoRen - JingLiThe Hong Kong Polytechnic University - YuYinCase Western Reserve University + ZheHuBaidu + YixiaoRen + JingLiThe Hong Kong Polytechnic University + YuYinCase Western Reserve University 2294-2311 This paper introduces VIVA, a benchmark for VIsion-grounded decision-making driven by human VAlues. While most large vision-language models (VLMs) focus on physical-level skills, our work is the first to examine their multimodal capabilities in leveraging human values to make decisions under a vision-depicted situation. VIVA contains 1,062 images depicting diverse real-world situations and the manually annotated decisions grounded in them. Given an image there, the model should select the most appropriate action to address the situation and provide the relevant human values and reason underlying the decision. Extensive experiments based on VIVA show the limitation of VLMs in using human values to make multimodal decisions. Further analyses indicate the potential benefits of exploiting action consequences and predicted human values. 2024.emnlp-main.137 @@ -1954,11 +1954,11 @@ Direct Multi-Turn Preference Optimization for Language Agents - WentaoShi + WentaoShi MengqiYuan JunkangWuUniversity of Science and Technology of China - QifanWangMeta AI - FuliFengUniversity of Science and Technology of China + QifanWangMeta AI + FuliFengUniversity of Science and Technology of China 2312-2324 Adapting Large Language Models (LLMs) for agent tasks is critical in developing language agents. Direct Preference Optimization (DPO) is a promising technique for this adaptation with the alleviation of compounding errors, offering a means to directly optimize Reinforcement Learning (RL) objectives. However, applying DPO to multi-turn tasks presents challenges due to the inability to cancel the partition function. Overcoming this obstacle involves making the partition function independent of the current state and addressing length disparities between preferred and dis-preferred trajectories. In this light, we replace the policy constraint with the state-action occupancy measure constraint in the RL objective and add length normalization to the Bradley-Terry model, yielding a novel loss function named DMPO for multi-turn agent tasks with theoretical explanations. Extensive experiments on three multi-turn agent task datasets confirm the effectiveness and superiority of the DMPO loss. 2024.emnlp-main.138 @@ -1967,7 +1967,7 @@ Self-Refine Instruction-Tuning for Aligning Reasoning in Language Models - LeonardoRanaldi + LeonardoRanaldi AndreFreitasIdiap Research Institute and University of Manchester 2325-2347 The alignment of reasoning abilities between smaller and larger Language Models are largely conducted via supervised fine-tuning using demonstrations generated from robust Large Language Models (LLMs). 
Although these approaches deliver more performant models, they do not show sufficiently strong generalization ability as the training only relies on the provided demonstrations. In this paper, we propose the Self-refine Instruction-tuning method that elicits Smaller Language Models to self-improve their abilities. Our approach is based on a two-stage process, where reasoning abilities are first transferred between LLMs and Small Language Models (SLMs) via Instruction-tuning on synthetic demonstrations provided by LLMs, and then the instructed models self-improve their abilities through preference optimization strategies. In particular, the second phase operates refinement heuristics based on Direct Preference Optimization, where the SLMs are elicited to deliver a series of reasoning paths by automatically sampling the generated responses and providing rewards using ground truths from the LLMs. Results obtained on commonsense and math reasoning tasks show that this approach consistently outperforms Instruction-tuning in both in-domain and out-domain scenarios, aligning the reasoning abilities of Smaller and Larger language models.
@@ -2003,9 +2003,9 @@
 ChenghaoPeng
 JiaqingLiangFudan University
 ZhixuLi
- YanghuaXiaoFudan University
+ YanghuaXiaoFudan University
 LiqianWen
- ZulongChen
+ ZulongChen
 2371-2389
 Web scraping is a powerful technique that extracts data from websites, enabling automated data collection, enhancing data analysis capabilities, and minimizing manual data entry efforts. Existing wrapper-based methods suffer from limited adaptability and scalability when faced with a new website, while language agents, empowered by large language models (LLMs), exhibit poor reusability in diverse web environments. In this work, we introduce the paradigm of generating web scrapers with LLMs and propose AutoScraper, a two-stage framework that can handle diverse and changing web environments more efficiently. AutoScraper leverages the hierarchical structure of HTML and similarity across different web pages for generating web scrapers. Besides, we propose a new executability metric for better measuring the performance of web scraper generation tasks. We conduct comprehensive experiments with multiple LLMs and demonstrate the effectiveness of our framework. Our work is now open-source.
 2024.emnlp-main.141
@@ -2019,7 +2019,7 @@

 ShaharKatzComputer Science Department, Technion-Israel Institute of Technology
 YonatanBelinkovTechnion, Technion
 MorGevaTel Aviv University and Google Research
- LiorWolfTel Aviv University, Tel Aviv University and Tel Aviv University
+ LiorWolfTel Aviv University, Tel Aviv University and Tel Aviv University
 2390-2422
 Understanding how Transformer-based Language Models (LMs) learn and recall information is a key goal of the deep learning community. Recent interpretability methods project weights and hidden states obtained from the forward pass to the models’ vocabularies, helping to uncover how information flows within LMs. In this work, we extend this methodology to LMs’ backward pass and gradients. We first prove that a gradient matrix can be cast as a low-rank linear combination of its forward and backward passes’ inputs. We then develop methods to project these gradients into vocabulary items and explore the mechanics of how new information is stored in the LMs’ neurons.
2024.emnlp-main.142
@@ -2035,7 +2035,7 @@

 MinseoKimYonsei University
 SeungjuHanSeoul National University
 AshkanYousefpour
- JackHesselSamaya AI
+ JackHesselSamaya AI
 YoungjaeYuYonsei University
 2423-2451
 Visual arguments, often used in advertising or social causes, rely on images to persuade viewers to do or believe something. Understanding these arguments requires selective vision: only specific visual stimuli within an image are relevant to the argument, and relevance can only be understood within the context of a broader argumentative structure. While visual arguments are readily appreciated by human audiences, we ask: are today’s AI capable of similar understanding? We present VisArgs, a dataset of 1,611 images annotated with 5,112 visual premises (with regions), 5,574 commonsense premises, and reasoning trees connecting them into structured arguments. We propose three tasks for evaluating visual argument understanding: premise localization, premise identification, and conclusion deduction. Experiments show that 1) machines struggle to capture visual cues: GPT-4-O achieved 78.5% accuracy, while humans reached 98.0%. Models also performed 19.5% worse when distinguishing between irrelevant objects within the image compared to external objects. 2) Providing relevant visual premises improved model performance significantly.
@@ -2062,12 +2062,12 @@

 Reusing Transferable Weight Increments for Low-resource Style Generation
- ChunzhenJin
+ ChunzhenJin
 EliotHuangLeibowitz AI Org and Prismer AI Inc
- HengChangTsinghua University, Tsinghua University
- YaqiWangNortheastern University
+ HengChangTsinghua University, Tsinghua University
+ YaqiWangNortheastern University
 PengCaoNortheastern University
- OsmarZaianeUniversity of Alberta
+ OsmarZaianeUniversity of Alberta
 2470-2488
 Text style transfer (TST) is crucial in natural language processing, aiming to endow text with a new style without altering its meaning. In real-world scenarios, not all styles have abundant resources. This work introduces TWIST (reusing Transferable Weight Increments for Style Text generation), a novel framework to mitigate data scarcity by utilizing style features in weight increments to transfer low-resource styles effectively. During target style learning, we derive knowledge via a specially designed weight pool and initialize the parameters for the unseen style. To enhance the effectiveness of merging, the target style weight increments are often merged from multiple source style weight increments through singular vectors. Considering the diversity of styles, we also designed a multi-key memory network that simultaneously focuses on task- and instance-level information to derive the most relevant weight increments. Results from multiple style transfer datasets show that TWIST demonstrates remarkable performance across different backbones, achieving particularly effective results in low-resource scenarios.
 2024.emnlp-main.145
@@ -2091,7 +2091,7 @@
 Seemingly Plausible Distractors in Multi-Hop Reasoning: Are Large Language Models Attentive Readers?
 NeeladriBhuiya
 ViktorSchlegelImperial College London
- StefanWinklerNational University of Singapore
+ StefanWinklerNational University of Singapore
 2514-2528
 State-of-the-art Large Language Models (LLMs) are accredited with an increasing number of different capabilities, ranging from reading comprehension over advanced mathematical and reasoning skills to possessing scientific knowledge.
In this paper we focus on multi-hop reasoning—the ability to identify and integrate information from multiple textual sources. Given the concerns with the presence of simplifying cues in existing multi-hop reasoning benchmarks, which allow models to circumvent the reasoning requirement, we set out to investigate whether LLMs are prone to exploiting such simplifying cues. We find evidence that they indeed circumvent the requirement to perform multi-hop reasoning, but they do so in more subtle ways than what was reported about their fine-tuned pre-trained language model (PLM) predecessors. We propose a challenging multi-hop reasoning benchmark by generating seemingly plausible multi-hop reasoning chains that ultimately lead to incorrect answers. We evaluate multiple open and proprietary state-of-the-art LLMs and show that their multi-hop reasoning performance is affected, as indicated by up to 45% relative decrease in F1 score when presented with such seemingly plausible alternatives. We also find that—while LLMs tend to ignore misleading lexical cues—misleading reasoning paths indeed present a significant challenge. The code and data are made available at https://github.com/zawedcvg/Are-Large-Language-Models-Attentive-Readers
 2024.emnlp-main.147
@@ -2116,7 +2116,7 @@

 <fixed-case>LEM</fixed-case>o<fixed-case>E</fixed-case>: Advanced Mixture of Experts Adaptor for Lifelong Model Editing of Large Language Models
- RenzhiWang
+ RenzhiWang
 PijiLiNanjing University of Aeronautics and Astronautics
 2551-2575
 Large language models (LLMs) require continual knowledge updates to stay abreast of the ever-changing world facts, prompting the formulation of lifelong model editing task. While recent years have witnessed the development of various techniques for single and batch editing, these methods either fail to apply or perform sub-optimally when faced with lifelong editing. In this paper, we introduce LEMoE, an advanced Mixture of Experts (MoE) adaptor for lifelong model editing. We first analyze the factors influencing the effectiveness of conventional MoE adaptor in lifelong editing, including catastrophic forgetting, inconsistent routing and order sensitivity. Based on these insights, we propose a tailored module insertion method to achieve lifelong editing, incorporating a novel KV anchor routing to enhance routing consistency between training and inference stage, along with a concise yet effective clustering-based editing order planning. Experimental results demonstrate the effectiveness of our method in lifelong editing, surpassing previous model editing techniques while maintaining outstanding performance in batch editing task. Our code will be available.
@@ -2127,9 +2127,9 @@
 Collaborative Performance Prediction for Large Language Models
 QiyuanZhangCity University of Hong Kong
- FuyuanLyuMcGill University, McGill University
+ FuyuanLyuMcGill University, McGill University
 XueLiuMcGill University
- ChenMaCity University of Hong Kong
+ ChenMaCity University of Hong Kong
 2576-2596
 Comprehensively understanding and accurately predicting the performance of large language models across diverse downstream tasks has emerged as a pivotal challenge in NLP research. The pioneering scaling law on downstream works demonstrated intrinsic similarities within model families and utilized such similarities for performance prediction. However, they tend to overlook the similarities between model families and only consider design factors listed in the original scaling law.
To overcome these limitations, we introduce a novel framework, Collaborative Performance Prediction (CPP), which significantly enhances prediction accuracy by leveraging the historical performance of various models on downstream tasks and other design factors for both model and task. We also collect collaborative data sourced from online platforms containing both historical performance and additional design factors. With the support of the collaborative data, CPP not only surpasses traditional scaling laws in predicting the performance of scaled LLMs but also facilitates a detailed analysis of factor importance, an area previously overlooked.
2024.emnlp-main.150
@@ -2138,8 +2138,8 @@
Surveying the Dead Minds: Historical-Psychological Text Analysis with Contextualized Construct Representation (<fixed-case>CCR</fixed-case>) for Classical <fixed-case>C</fixed-case>hinese
- YuqiChen
- SixuanLiXiaoying Technology Company
+ YuqiChen
+ SixuanLiXiaoying Technology Company
YingLi
MohammadAtariUniversity of Massachusetts at Amherst
2597-2615
@@ -2155,7 +2155,7 @@
XintingHuangTencent AI Lab
LeyangCui
XiaojunQuanSUN YAT-SEN UNIVERSITY
- WeiBiTencent AI Lab
+ WeiBiTencent AI Lab
ShumingShiTencent AI Lab
2616-2633
While large language models (LLMs) have demonstrated exceptional performance across various tasks following human alignment, they may still generate responses that sound plausible but contradict factual knowledge, a phenomenon known as hallucination. In this paper, we demonstrate the feasibility of mitigating hallucinations by verifying and minimizing the inconsistency between external knowledge present in the alignment data and the intrinsic knowledge embedded within foundation LLMs. Specifically, we propose a novel approach called Knowledge Consistent Alignment (KCA), which employs a well-aligned LLM to automatically formulate assessments based on external knowledge to evaluate the knowledge boundaries of foundation LLMs. To address knowledge inconsistencies in the alignment data, KCA implements several specific strategies to deal with these data instances. We demonstrate the superior efficacy of KCA in reducing hallucinations across six benchmarks, utilizing foundation LLMs of varying backbones and scales. This confirms the effectiveness of mitigating hallucinations by reducing knowledge inconsistency. Our code, model weights, and data are openly accessible at https://github.com/fanqiwan/KCA.
@@ -2165,10 +2165,10 @@
<fixed-case>QUITE</fixed-case>: Quantifying Uncertainty in Natural Language Text in <fixed-case>B</fixed-case>ayesian Reasoning Scenarios
- Timo PierreSchraderUniversität Augsburg and Robert Bosch GmbH, Bosch
+ Timo PierreSchraderUniversität Augsburg and Robert Bosch GmbH, Bosch
LukasLangeRobert Bosch GmbH, Bosch
SimonRazniewskiTechnische Universität Dresden
- AnnemarieFriedrichUniversity of Augsburg
+ AnnemarieFriedrichUniversity of Augsburg
2634-2652
Reasoning is key to many decision making processes. It requires consolidating a set of rule-like premises that are often associated with degrees of uncertainty and observations to draw conclusions. In this work, we address both the case where premises are specified as numeric probabilistic rules and situations in which humans state their estimates using words expressing degrees of certainty.
Existing probabilistic reasoning datasets simplify the task, e.g., by requiring the model to only rank textual alternatives, by including only binary random variables, or by making use of a limited set of templates that result in less varied text. In this work, we present QUITE, a question answering dataset of real-world Bayesian reasoning scenarios with categorical random variables and complex relationships. QUITE provides high-quality natural language verbalizations of premises together with evidence statements and expects the answer to a question in the form of an estimated probability. We conduct an extensive set of experiments, finding that logic-based models outperform out-of-the-box large language models on all reasoning types (causal, evidential, and explaining-away). Our results provide evidence that neuro-symbolic models are a promising direction for improving complex reasoning. We release QUITE and code for training and experiments on Github.
2024.emnlp-main.153
@@ -2179,7 +2179,7 @@
<fixed-case>A</fixed-case>frican or <fixed-case>E</fixed-case>uropean Swallow? Benchmarking Large Vision-Language Models for Fine-Grained Object Classification
GregorGeigleBayerische Julius-Maximilians-Universität Würzburg
- RaduTimofteBayerische Julius-Maximilians-Universität Würzburg
+ RaduTimofteBayerische Julius-Maximilians-Universität Würzburg
GoranGlavašJulius-Maximilians-Universität Würzburg
2653-2669
Recent Large Vision-Language Models (LVLMs) demonstrate impressive abilities on numerous image understanding and reasoning tasks. The task of fine-grained object classification (e.g., distinction between animal species), however, has been probed insufficiently, despite its downstream importance. We fill this evaluation gap by creating FOCI (Fine-grained Object ClassIfication), a difficult multiple-choice benchmark for fine-grained object classification, from existing object classification datasets: (1) multiple-choice avoids ambiguous answers associated with casting classification as open-ended QA task; (2) we retain classification difficulty by mining negative labels with a CLIP model. FOCI complements five popular classification datasets with four domain-specific subsets from ImageNet-21k. We benchmark 12 public LVLMs on FOCI and show that it tests for a complementary skill to established image understanding and reasoning benchmarks. Crucially, CLIP models exhibit dramatically better performance than LVLMs. Since the image encoders of LVLMs come from these CLIP models, this points to inadequate alignment for fine-grained object distinction between the encoder and the LLM and warrants (pre)training data with more fine-grained annotation. We release our code at ANONYMIZED.
@@ -2205,7 +2205,7 @@
To Word Senses and Beyond: Inducing Concepts with Contextualized Language Models
BastienLiétard
- PascalDenisINRIA
+ PascalDenisINRIA
MikaelaKellerUniversité de Lille
2684-2696
Polysemy and synonymy are two crucial interrelated facets of lexical ambiguity. While both phenomena are widely documented in lexical resources and have been studied extensively in NLP, leading to dedicated systems, they are often considered independently in practical problems. While many tasks dealing with polysemy (e.g. Word Sense Disambiguation or Induction) highlight the role of word’s senses, the study of synonymy is rooted in the study of concepts, i.e. meanings shared across the lexicon.
In this paper, we introduce Concept Induction, the unsupervised task of learning a soft clustering among words that defines a set of concepts directly from data. This task generalizes Word Sense Induction. We propose a bi-level approach to Concept Induction that leverages both a local lemma-centric view and a global cross-lexicon view to induce concepts. We evaluate the obtained clustering on SemCor’s annotated data and obtain good performance (BCubed F1 above 0.60). We find that the local and the global levels are mutually beneficial to induce concepts and also senses in our setting. Finally, we create static embeddings representing our induced concepts and use them on the Word-in-Context task, obtaining competitive performance with the State-of-the-Art.
@@ -2228,7 +2228,7 @@
An Electoral Approach to Diversify <fixed-case>LLM</fixed-case>-based Multi-Agent Collective Decision-Making
XiutianZhaoUniversity of Edinburgh
- KeWangHuawei Technologies Ltd.
+ KeWangHuawei Technologies Ltd.
WeiPengHuawei Technologies Ltd.
2712-2727
Modern large language models (LLMs) have exhibited cooperative synergy on complex task-solving, and collective decision-making (CDM) is a pivotal component in LLM-based multi-agent collaboration frameworks. Our survey on 52 recent such systems uncovers a severe lack of diversity, with a heavy reliance on dictatorial and plurality voting for CDM. Through the lens of social choice theory, we scrutinize widely-adopted CDM methods and identify their limitations. To enrich the current landscape of LLM-based CDM, we present GEDI, an electoral CDM module that incorporates various ordinal preferential voting mechanisms. Our empirical case study across three benchmarks shows that the integration of certain CDM methods can markedly improve the reasoning capabilities and robustness of some leading LLMs, all without requiring intricate system designs. Additionally, we find that some CDM mechanisms generate positive synergies even with as few as three agents. The voting-based methods also demonstrate robustness against single points of failure, as well as diversity in terms of hit-rate@k and subject-wise impacts.
@@ -2240,7 +2240,7 @@
Does Object Grounding Really Reduce Hallucination of Large Vision-Language Models?
GregorGeigleBayerische Julius-Maximilians-Universität Würzburg
- RaduTimofteBayerische Julius-Maximilians-Universität Würzburg
+ RaduTimofteBayerische Julius-Maximilians-Universität Würzburg
GoranGlavašJulius-Maximilians-Universität Würzburg
2728-2742
Large vision-language models (LVLMs) have recently dramatically pushed the state of the art in image captioning and many image understanding tasks (e.g., visual question answering). LVLMs, however, often hallucinate and produce captions that mention concepts that cannot be found in the image. These hallucinations erode the trustworthiness of LVLMs and are arguably among the main obstacles to their ubiquitous adoption. Recent work suggests that addition of grounding objectives—those that explicitly align image regions or objects to text spans—reduces the amount of LVLM hallucination.
Although intuitive, this claim is not empirically justified as the reduction effects have been established, we argue, with flawed evaluation protocols that (i) rely on data (i.e., MSCOCO) that has been extensively used in LVLM training and (ii) measure hallucination via question answering rather than open-ended caption generation. In this work, in contrast, we offer the first systematic analysis of the effect of fine-grained object grounding on LVLM hallucination under an evaluation protocol that more realistically captures LVLM hallucination in open generation. Our extensive experiments over three backbone LLMs reveal that grounding objectives have little to no effect on object hallucination in open caption generation.
@@ -2254,9 +2254,9 @@
DongfangLiHarbin Institute of Technology
XinshuoHu
XinpingZhao
- YibinChen
- BaotianHuHarbin Institute of Technology, Shenzhen
- MinZhangHarbin Institute of Technology
+ YibinChen
+ BaotianHuHarbin Institute of Technology, Shenzhen
+ MinZhangHarbin Institute of Technology
2743-2757
Recent studies have explored the working mechanisms of In-Context Learning (ICL). However, they mainly focus on classification and simple generation tasks, limiting their broader application to more complex generation tasks in practice. To address this gap, we investigate the impact of demonstrations on token representations within the practical alignment tasks. We find that the transformer embeds the task function learned from demonstrations into the separator token representation, which plays an important role in the generation of prior response tokens. Once the prior response tokens are determined, the demonstrations become redundant. Motivated by this finding, we propose an efficient Progressive In-Context Alignment (PICA) method consisting of two stages. In the first few-shot stage, the model generates several prior response tokens via standard ICL while concurrently extracting the ICL vector that stores the task function from the separator token representation. In the following zero-shot stage, this ICL vector guides the model to generate responses without further demonstrations. Extensive experiments demonstrate that our PICA not only surpasses vanilla ICL but also achieves comparable performance to other alignment tuning methods. The proposed training-free method reduces the time cost (e.g., 5.45×) with improved alignment performance (e.g., 6.57+). Consequently, our work highlights the application of ICL for alignment and calls for a deeper understanding of ICL for complex generations. The code will be available at https://github.com/HITsz-TMG/PICA.
2024.emnlp-main.160
@@ -2267,14 +2267,14 @@
<fixed-case>M</fixed-case>o<fixed-case>DULA</fixed-case>: Mixture of Domain-Specific and Universal <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> for Multi-Task Learning
YufeiMa
ZihanLiang
- HuangyuDai
- BenChenAlibaba Group
- DehongGaoNorthwest Polytechnical University Xi’an
+ HuangyuDai
+ BenChenAlibaba Group
+ DehongGaoNorthwest Polytechnical University Xi’an
ZhuoranRan
WangZihan
LinboJin
WenJiang
- GuannanZhang
+ GuannanZhang
XiaoyanCai
LibinYang
2758-2770
@@ -2289,7 +2289,7 @@
JingyangChen
JunchenShen
ZijieZhaiEast China Normal University
- PingLiSouthwest Petroleum University
+ PingLiSouthwest Petroleum University
JieZhangFudan University
KaiZhang
2771-2783
@@ -2306,11 +2306,11 @@
YihanChen
HangqiLi
HanYueZhejiang University
- ShengyuZhangZhejiang University
- HuaiyongDouZhejiang University
- JunchiYanShanghai Jiao Tong University
+ ShengyuZhangZhejiang University
+ HuaiyongDouZhejiang University
+ JunchiYanShanghai Jiao Tong University
ZeminLiuZhejiang University
- YongquanZhangZhejiang University
+ YongquanZhangZhejiang University
FeiWuZhejiang University
2784-2801
Philology, the study of ancient manuscripts, demands years of professional training in extensive knowledge memorization and manual textual retrieval. Although these requirements align closely with the strengths of recent successful Large Language Models (LLMs), the scarcity of high-quality, specialized training data has hindered direct applications. To bridge this gap, we curated the PhiloCorpus-ZH, a rich collection of ancient Chinese texts spanning a millennium with 30 diverse topics, including firsthand folk copies. This corpus facilitated the development of PhiloGPT, the first LLM tailored for discovering ancient Chinese manuscripts. To effectively tackle complex philological tasks like restoration, attribution, and linguistic analysis, we introduced the PhiloCoP framework. Modeled on the analytical patterns of philologists, PhiloCoP enhances LLM’s handling of historical linguistic peculiarities such as phonetic loans, polysemy, and syntactic inversions. We further integrated these tasks into the PhiloBenchmark, establishing a new standard for evaluating ancient Chinese LLMs addressing philology tasks. Deploying PhiloGPT in practical scenarios has enabled Dunhuang specialists to resolve philology tasks, such as identifying duplication of copied text and assisting archaeologists with text completion, demonstrating its potential in real-world applications.
@@ -2349,8 +2349,8 @@
Evaluating Large Language Models via Linguistic Profiling
AlessioMiaschiInstitute for Computational Linguistics “A. Zampolli” (CNR-ILC), Pisa
- FeliceDell’OrlettaIstituto di Linguistica Computazionale “A. Zampolli” (ILC)
- GiuliaVenturiInstitute for Computational Linguistics “A. Zampolli” (ILC-CNR)
+ FeliceDell’OrlettaIstituto di Linguistica Computazionale “A. Zampolli” (ILC)
+ GiuliaVenturiInstitute for Computational Linguistics “A. Zampolli” (ILC-CNR)
2835-2848
Large Language Models (LLMs) undergo extensive evaluation against various benchmarks collected in established leaderboards to assess their performance across multiple tasks. However, to the best of our knowledge, there is a lack of comprehensive studies evaluating these models’ linguistic abilities independent of specific tasks. In this paper, we introduce a novel evaluation methodology designed to test LLMs’ sentence generation abilities under specific linguistic constraints.
Drawing on the ‘linguistic profiling’ approach, we rigorously investigate the extent to which five LLMs of varying sizes, tested in both zero- and few-shot scenarios, effectively adhere to (morpho)syntactic constraints. Our findings shed light on the linguistic proficiency of LLMs, revealing both their capabilities and limitations in generating linguistically-constrained sentences.
2024.emnlp-main.166
@@ -2360,7 +2360,7 @@
With Ears to See and Eyes to Hear: Sound Symbolism Experiments with Multimodal Large Language Models
- TylerLoakman
+ TylerLoakman
YuchengLi
ChenghuaLinUniversity of Manchester
2849-2867
@@ -2375,8 +2375,8 @@
ShulinCaoTsinghua University, Tsinghua University
LinmeiHuBeijing Institute of Technology
LingFengTsinghua University, Tsinghua University
- LeiHouTsinghua University, Tsinghua University
- JuanziLi
+ LeiHouTsinghua University, Tsinghua University
+ JuanziLi
2868-2882
Program induction (PI) has become a promising paradigm for using knowledge bases (KBs) to help large language models (LLMs) answer complex knowledge-intensive questions. Nonetheless, PI typically relies on a large number of parallel question-program pairs to make the LLM aware of the schema of a given KB, and is thus challenging for many low-resourced KBs that lack annotated data. To this end, we propose KB-Plugin, a plug-and-play framework that enables LLMs to induce programs over any low-resourced KB. Firstly, KB-Plugin adopts self-supervised learning to encode the detailed schema information of a given KB into a pluggable module, namely schema plugin. Secondly, KB-Plugin utilizes abundant annotated data from a rich-resourced KB to train another pluggable module, namely PI plugin, which can help the LLM extract question-relevant schema information from the schema plugin of any KB and utilize the information to induce programs over this KB. Experiments show that KB-Plugin outperforms SoTA low-resourced PI methods with 25x smaller backbone LLM on both large-scale and domain-specific KBs, and even approaches the performance of supervised methods.
2024.emnlp-main.168
@@ -2387,7 +2387,7 @@
Understanding Higher-Order Correlations Among Semantic Components in Embeddings
MomoseOyamaKyoto University, Kyoto University
HiroakiYamagiwaKyoto University, Kyoto University
- HidetoshiShimodairaKyoto University and RIKEN
+ HidetoshiShimodairaKyoto University and RIKEN
2883-2899
Independent Component Analysis (ICA) offers interpretable semantic components of embeddings. While ICA theory assumes that embeddings can be linearly decomposed into independent components, real-world data often do not satisfy this assumption. Consequently, non-independencies remain between the estimated components, which ICA cannot eliminate. We quantified these non-independencies using higher-order correlations and demonstrated that when the higher-order correlation between two components is large, it indicates a strong semantic association between them, along with many words sharing common meanings with both components. The entire structure of non-independencies was visualized using a maximum spanning tree of semantic components. These findings provide deeper insights into embeddings through ICA.
2024.emnlp-main.169 @@ -2397,16 +2397,16 @@ <fixed-case>DGLF</fixed-case>: A Dual Graph-based Learning Framework for Multi-modal Sarcasm Detection ZhihongZhuTencent - KefanShen - ZhaorunChen + KefanShen + ZhaorunChen YunyanZhangJarvis Research Center, Tencent YouTu Lab YuyanChen XiaoqiJiaoHuazhong University of Science and Technology ZhongweiWan ShaorongXieShanghai University WeiLiuShanghai University - XianWuTencent - YefengZhengWestlake University + XianWuTencent + YefengZhengWestlake University 2900-2912 2024.emnlp-main.170 zhu-etal-2024-dglf @@ -2415,10 +2415,10 @@ Evaluating <fixed-case>D</fixed-case>-<fixed-case>MERIT</fixed-case> of Partial-annotation on Information Retrieval RoyiRassinResearch, Google and Bar-Ilan University - YaronFairsteinAmazon + YaronFairsteinAmazon OrenKalinskyAmazon GuyKushilevitzAmazon - NachshonCohenAmazon + NachshonCohenAmazon AlexanderLibovAmazon YoavGoldbergBar-Ilan University, Allen Institute for Artificial Intelligence and Bar Ilan University 2913-2932 @@ -2429,9 +2429,9 @@ Verification and Refinement of Natural Language Explanations through <fixed-case>LLM</fixed-case>-Symbolic Theorem Proving - XinQuanUniversity of Manchester + XinQuanUniversity of Manchester MarcoValentino - Louise A.DennisUniversity of Manchester, University of Manchester + Louise A.DennisUniversity of Manchester, University of Manchester AndreFreitasIdiap Research Institute and University of Manchester 2933-2958 Natural language explanations represent a proxy for evaluating explanation-based and multi-step Natural Language Inference (NLI) models. However, assessing the validity of explanations for NLI is challenging as it typically involves the crowd-sourcing of apposite datasets, a process that is time-consuming and prone to logical errors. To address existing limitations, this paper investigates the verification and refinement of natural language explanations through the integration of Large Language Models (LLMs) and Theorem Provers (TPs). Specifically, we present a neuro-symbolic framework, named Explanation-Refiner, that integrates TPs with LLMs to generate and formalise explanatory sentences and suggest potential inference strategies for NLI. In turn, the TP is employed to provide formal guarantees on the logical validity of the explanations and to generate feedback for subsequent improvements. We demonstrate how Explanation-Refiner can be jointly used to evaluate explanatory reasoning, autoformalisation, and error correction mechanisms of state-of-the-art LLMs as well as to automatically enhance the quality of explanations of variable complexity in different domains. @@ -2447,10 +2447,10 @@ MianqiuHuang RundongShi LinsenGuo - ChongPeng + ChongPeng PengYan YaqianZhouFudan University, Tsinghua University - XipengQiuFudan University + XipengQiuFudan University 2959-2979 Large language models optimized with techniques like RLHF have achieved good alignment in being helpful and harmless. However, post-alignment, these language models often exhibit overconfidence, where the expressed confidence does not accurately calibrate with their correctness rate. In this paper, we decompose the language model confidence into the Uncertainty about the question and the Fidelity to the answer generated by language models. Then, we propose a plug-and-play method, UF Calibration, to estimate the confidence of language models. Our method has shown good calibration performance by conducting experiments with 6 RLHF-LMs on four MCQA datasets. 
Moreover, we propose two novel metrics, IPR and CE, to evaluate the calibration of the model, and we have conducted a detailed discussion on Truly Well-Calibrated Confidence for large language models. Our method could serve as a strong baseline, and we hope that this work will provide some insights into the model confidence calibration. 2024.emnlp-main.173 @@ -2475,7 +2475,7 @@ How Hard is this Test Set? <fixed-case>NLI</fixed-case> Characterization by Exploiting Training Dynamics AdrianCosmaUniversitatea Nationala de Stiinta si Tehnologie POLITEHNICA Bucuresti StefanRusetiUniversity Politehnica of Bucharest - MihaiDascaluUniversity Politehnica of Bucharest + MihaiDascaluUniversity Politehnica of Bucharest CorneliaCarageaUniversity of Illinois, Chicago 2990-3001 Natural Language Inference (NLI) evaluation is crucial for assessing language understanding models; however, popular datasets suffer from systematic spurious correlations that artificially inflate actual model performance. To address this, we propose a method for the automated creation of a challenging test set without relying on the manual construction of artificial and unrealistic examples. We categorize the test set of popular NLI datasets into three difficulty levels by leveraging methods that exploit training dynamics. This categorization significantly reduces spurious correlation measures, with examples labeled as having the highest difficulty showing markedly decreased performance and encompassing more realistic and diverse linguistic phenomena. When our characterization method is applied to the training set, models trained with only a fraction of the data achieve comparable performance to those trained on the full dataset, surpassing other dataset characterization techniques. Our research addresses limitations in NLI dataset construction, providing a more authentic evaluation of model performance with implications for diverse NLU applications. @@ -2486,7 +2486,7 @@ Zero-shot Cross-Lingual Transfer for Synthetic Data Generation in Grammatical Error Detection Gaetan LopezLatoucheUbisoft - Marc-AndréCarbonneauUbisoft + Marc-AndréCarbonneauUbisoft BenjaminSwansonUbisoft 3002-3016 Grammatical Error Detection (GED) methods rely heavily on human annotated error corpora. However, these annotations are unavailable in many low-resource languages. In this paper, we investigate GED in this context. Leveraging the zero-shot cross-lingual transfer capabilities of multilingual pre-trained language models, we train a model using data from a diverse set of languages to generate synthetic errors in other languages. These synthetic error corpora are then used to train a GED model. Specifically we propose a two-stage fine-tuning pipeline where the GED model is first fine-tuned on multilingual synthetic data from target languages followed by fine-tuning on human-annotated GED corpora from source languages. This approach outperforms current state-of-the-art annotation-free GED methods. We also analyse the errors produced by our method and other strong baselines, finding that our approach produces errors that are more diverse and more similar to human errors. 
@@ -2511,9 +2511,9 @@ DongfangLiHarbin Institute of Technology YanZhong BorenHu - YibinChen - BaotianHuHarbin Institute of Technology, Shenzhen - MinZhangHarbin Institute of Technology + YibinChen + BaotianHuHarbin Institute of Technology, Shenzhen + MinZhangHarbin Institute of Technology 3027-3041 Recent studies in Retrieval-Augmented Generation (RAG) have investigated extracting evidence from retrieved passages to reduce computational costs and enhance the final RAG performance, yet it remains challenging. Existing methods heavily rely on heuristic-based augmentation, encountering several issues: (1) Poor generalization due to hand-crafted context filtering; (2) Semantics deficiency due to rule-based context chunking; (3) Skewed length due to sentence-wise filter learning. To address these issues, we propose a model-based evidence extraction learning framework, SEER, optimizing a vanilla model as an evidence extractor with desired properties through self-aligned learning. Extensive experiments show that our method largely improves the final RAG performance, enhances the faithfulness, helpfulness, and conciseness of the extracted evidence, and reduces the evidence length by 9.25 times. The code will be available at https://github.com/HITsz-TMG/SEER. 2024.emnlp-main.178 @@ -2523,7 +2523,7 @@ On the Role of Context in Reading Time Prediction AndreasOpedalDepartment of Computer Science, ETHZ - ETH Zurich - EleanorChodroffUniversity of Zurich and University of York + EleanorChodroffUniversity of Zurich and University of York RyanCotterellSwiss Federal Institute of Technology EthanWilcoxETHZ - ETH Zurich 3042-3058 @@ -2541,9 +2541,9 @@ JianzhuBaoHarbin Institute of Technology FangquanLinAlibaba Group ChengYangAlibaba Group - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology RuifengXuHarbin Institute of Technology - WotaoYinAlibaba Group US + WotaoYinAlibaba Group US 3059-3077 Despite the remarkable progress made by large language models in mathematical reasoning, interactive theorem proving in formal logic still remains a prominent challenge. Previous methods resort to neural models for proofstep generation and search. However, they suffer from exploring possible proofsteps empirically in a large search space. Moreover, they directly use a less rigorous informal proof for proofstep generation, neglecting the incomplete reasoning within. In this paper, we propose BC-Prover, a backward chaining framework guided by pseudo steps. Specifically, BC-Prover prioritizes pseudo steps to proofstep generation. The pseudo steps boost the proof construction in two aspects: (1) Backward Chaining that decomposes the proof into sub-goals for goal-oriented exploration. (2) Step Planning that makes a fine-grained planning to bridge the gap between informal and formal proofs. Experiments on the miniF2F benchmark show significant performance gains by our framework over the state-of-the-art approaches. Our framework is also compatible with existing provers and further improves their performance with the backward chaining technique. 
2024.emnlp-main.180 @@ -2555,9 +2555,9 @@ From Insights to Actions: The Impact of Interpretability and Analysis Research on <fixed-case>NLP</fixed-case> MariusMosbachMcGill University and Mila - Quebec Artificial Intelligence Institute - VagrantGautamSaarland University + VagrantGautamSaarland University TomásVergara Browne - DietrichKlakowSaarland University + DietrichKlakowSaarland University MorGevaTel Aviv University and Google Research 3078-3105 Interpretability and analysis (IA) research is a growing subfield within NLP with the goal of developing a deeper understanding of the behavior or inner workings of NLP systems and methods. Despite growing interest in the subfield, a criticism of this work is that it lacks actionable insights and therefore has little impact on NLP. In this paper, we seek to quantify the impact of IA research on the broader field of NLP. We approach this with a mixed-methods analysis of: (1) a citation graph of 185K+ papers built from all papers published at ACL and EMNLP conferences from 2018 to 2023, and their references and citations, and (2) a survey of 138 members of the NLP community. Our quantitative results show that IA work is well-cited outside of IA, and central in the NLP citation graph. Through qualitative analysis of survey responses and manual annotation of 556 papers, we find that NLP researchers build on findings from IA work and perceive it as important for progress in NLP, multiple subfields, and rely on its findings and terminology for their own work. Many novel methods are proposed based on IA findings and highly influenced by them, but highly influential non-IA work cites IA findings without being driven by them. We end by summarizing what is missing in IA work today and provide a call to action, to pave the way for a more impactful future of IA research. @@ -2572,7 +2572,7 @@ JingwuXiao ShuohuanWang YuSun - HuaWu + HuaWu 3106-3125 The integration of visual and textual information represents a promising direction in the advancement of language models. In this paper, we explore the dual modality of language—both visual and textual—within an autoregressive framework, pre-trained on both document images and texts. Our method employs a multimodal training strategy, utilizing visual data through next patch prediction with a regression head and/or textual data through next token prediction with a classification head. We focus on understanding the interaction between these two modalities and their combined impact on model performance. Our extensive evaluation across a wide range of benchmarks shows that incorporating both visual and textual data significantly improves the performance of pixel-based language models. Remarkably, we find that a unidirectional pixel-based model trained solely on visual data can achieve comparable results to state-of-the-art bidirectional models on several language understanding tasks. This work uncovers the untapped potential of integrating visual and textual modalities for more effective language modeling. We release our code, data, and model checkpoints at https://github.com/ernie-research/pixelgpt. 2024.emnlp-main.182 @@ -2586,7 +2586,7 @@ ShuohuanWang YuSun QiweiPeng - HuaWu + HuaWu 3126-3150 Amidst the rapid advancements in generative language models, the investigation of how training data shapes the performance of GPT models is still emerging. This paper presents GPTfluence, a novel approach that leverages a featurized simulation to assess the impact of training examples on the training dynamics of GPT models. 
Our approach not only traces the influence of individual training instances on performance trajectories, such as loss and other key metrics, on targeted test points but also enables a comprehensive comparison with existing methods across various training scenarios in GPT models, ranging from 14 million to 2.8 billion parameters, across a range of downstream tasks. Contrary to earlier methods that struggle with generalization to new data, GPTfluence introduces a parameterized simulation of training dynamics, demonstrating robust generalization capabilities to unseen training data. This adaptability is evident across both fine-tuning and instruction-tuning scenarios, spanning tasks in natural language understanding and generation. We make our code and data publicly available at https://github.com/ernie-research/gptfluence. 2024.emnlp-main.183 @@ -2595,9 +2595,9 @@ Understanding “Democratization” in <fixed-case>NLP</fixed-case> and <fixed-case>ML</fixed-case> Research - ArjunSubramonianUniversity of California, Los Angeles - VagrantGautamSaarland University - DietrichKlakowSaarland University + ArjunSubramonianUniversity of California, Los Angeles + VagrantGautamSaarland University + DietrichKlakowSaarland University ZeerakTalatMohamed bin Zayed University of Artificial Intelligence 3151-3166 Recent improvements in natural language processing (NLP) and machine learning (ML) and increased mainstream adoption have led to researchers frequently discussing the “democratization” of artificial intelligence. In this paper, we seek to clarify how democratization is understood in NLP and ML publications, through large-scale mixed-methods analyses of papers using the keyword “democra*” published in NLP and adjacent venues. We find that democratization is most frequently used to convey (ease of) access to or use of technologies, without meaningfully engaging with theories of democratization, while research using other invocations of “democra*” tends to be grounded in theories of deliberation and debate. Based on our findings, we call for researchers to enrich their use of the term democratization with appropriate theory, towards democratic technologies beyond superficial access. @@ -2607,7 +2607,7 @@ <fixed-case>D</fixed-case>oc<fixed-case>KD</fixed-case>: Knowledge Distillation from <fixed-case>LLM</fixed-case>s for Open-World Document Understanding Models - SungnyunKimAmazon and Korea Advanced Institute of Science and Technology + SungnyunKimAmazon and Korea Advanced Institute of Science and Technology HaofuLiaoAmazon SrikarAppalarajuAmazon PengTangAmazon @@ -2615,7 +2615,7 @@ Ravi KumarSatzodaAmazon R.ManmathaAmazon VijayMahadevanAmazon - StefanoSoattoAmazon Web Services and UCLA Computer Science Department, University of California, Los Angeles + StefanoSoattoAmazon Web Services and UCLA Computer Science Department, University of California, Los Angeles 3167-3193 Visual document understanding (VDU) is a challenging task that involves understanding documents across various modalities (text and image) and layouts (forms, tables, etc.). This study aims to enhance generalizability of small VDU models by distilling knowledge from LLMs. We identify that directly prompting LLMs often fails to generate informative and useful data. In response, we present a new framework (called DocKD) that enriches the data generation process by integrating external document knowledge. 
Specifically, we provide an LLM with various document elements like key-value pairs, layouts, and descriptions, to elicit open-ended answers. Our experiments show that DocKD produces high-quality document annotations and surpasses the direct knowledge distillation approach that does not leverage external document knowledge. Moreover, student VDU models trained solely with DocKD-generated data are not only comparable to those trained with human-annotated data on in-domain tasks but also significantly exceed them on out-of-domain tasks.
2024.emnlp-main.185
@@ -2624,7 +2624,7 @@
Cross-lingual Transfer for Automatic Question Generation by Learning Interrogative Structures in Target Languages
- SeonjeongHwangPohang University of Science and Technology
+ SeonjeongHwangPohang University of Science and Technology
YunsuKimaiXplain, Inc.
GaryLee
3194-3208
@@ -2665,7 +2665,7 @@
Improving Multi-party Dialogue Generation via Topic and Rhetorical Coherence
YaxinFan
PeifengLiSoochow University, China
- QiaomingZhuSoochow University
+ QiaomingZhuSoochow University
3240-3253
Previous studies on multi-party dialogue generation predominantly concentrated on modeling the reply-to structure of dialogue histories, always overlooking the coherence between generated responses and target utterances. To address this issue, we propose a Reinforcement Learning approach emphasizing both Topic and Rhetorical Coherence (RL-TRC). In particular, the topic- and rhetorical-coherence tasks are designed to enhance the model’s perception of coherence with the target utterance. Subsequently, an agent is employed to learn a coherence policy, which guides the generation of responses that are topically and rhetorically aligned with the target utterance. Furthermore, three discourse-aware rewards are developed to assess the coherence between the generated response and the target utterance, with the objective of optimizing the policy. The experimental results and in-depth analyses on two popular datasets demonstrate that our RL-TRC significantly outperforms the state-of-the-art baselines, particularly in generating responses that are more coherent with the target utterances.
2024.emnlp-main.189
@@ -2676,7 +2676,7 @@
<fixed-case>SEEKR</fixed-case>: Selective Attention-Guided Knowledge Retention for Continual Learning of Large Language Models
JinghanHeInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
HaiyunGuoInstitute of automation, Chinese Academy of Sciences
- KuanZhuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
+ KuanZhuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
ZihanZhao
MingTangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
JinqiaoWangInstitute of Automation, Chinese Academy of Sciences
@@ -2689,7 +2689,7 @@
Neuron-Level Knowledge Attribution in Large Language Models
ZepingYu
- SophiaAnaniadouUniversity of Manchester
+ SophiaAnaniadouUniversity of Manchester
3267-3280
Identifying important neurons for final predictions is essential for understanding the mechanisms of large language models. Due to computational constraints, current attribution techniques struggle to operate at neuron level. In this paper, we propose a static method for pinpointing significant neurons. Compared to seven other methods, our approach demonstrates superior performance across three metrics.
Additionally, since most static methods typically only identify “value neurons” directly contributing to the final prediction, we propose a method for identifying “query neurons” which activate these “value neurons”. Finally, we apply our methods to analyze six types of knowledge across both attention and feed-forward network (FFN) layers. Our method and analysis are helpful for understanding the mechanisms of knowledge storage and set the stage for future research in knowledge editing. The code is available on https://github.com/zepingyu0512/neuron-attribution. 2024.emnlp-main.191 @@ -2699,7 +2699,7 @@ How do Large Language Models Learn In-Context? Query and Key Matrices of In-Context Heads are Two Towers for Metric Learning ZepingYu - SophiaAnaniadouUniversity of Manchester + SophiaAnaniadouUniversity of Manchester 3281-3292 We investigate the mechanism of in-context learning (ICL) on sentence classification tasks with semantically-unrelated labels (“foo”/“bar”). We find intervening in only 1% heads (named “in-context heads”) significantly affects ICL accuracy from 87.6% to 24.4%. To understand this phenomenon, we analyze the value-output vectors in these heads and discover that the vectors at each label position contain substantial information about the corresponding labels. Furthermore, we observe that the prediction shift from “foo” to “bar” is due to the respective reduction and increase in these heads’ attention scores at “foo” and “bar” positions. Therefore, we propose a hypothesis for ICL: in in-context heads, the value-output matrices extract label features, while the query-key matrices compute the similarity between the features at the last position and those at each label position. The query and key matrices can be considered as two towers that learn the similarity metric between the last position’s features and each demonstration at label positions. Using this hypothesis, we explain the majority label bias and recency bias in ICL and propose two methods to reduce these biases by 22% and 17%, respectively. 2024.emnlp-main.192 @@ -2709,7 +2709,7 @@ Interpreting Arithmetic Mechanism in Large Language Models through Comparative Neuron Analysis ZepingYu - SophiaAnaniadouUniversity of Manchester + SophiaAnaniadouUniversity of Manchester 3293-3306 We find arithmetic ability resides within a limited number of attention heads, with each head specializing in distinct operations. To delve into the reason, we introduce the Comparative Neuron Analysis (CNA) method, which identifies an internal logic chain consisting of four distinct stages from input to prediction: feature enhancing with shallow FFN neurons, feature transferring by shallow attention layers, feature predicting by arithmetic heads, and prediction enhancing among deep FFN neurons. Moreover, we identify the human-interpretable FFN neurons within both feature-enhancing and feature-predicting stages. These findings lead us to investigate the mechanism of LoRA, revealing that it enhances prediction probabilities by amplifying the coefficient scores of FFN neurons related to predictions. Finally, we apply our method in model pruning for arithmetic tasks and model editing for reducing gender bias. Code is on https://github.com/zepingyu0512/arithmetic-mechanism. 
2024.emnlp-main.193
@@ -2718,10 +2718,10 @@
Pixology: Probing the Linguistic and Visual Capabilities of Pixel-based Language Models
- KushalTatariyaKU Leuven
- VladimirAraujoKU Leuven
- ThomasBauwensKU Leuven
- Miryamde LhoneuxKU Leuven
+ KushalTatariyaKU Leuven
+ VladimirAraujoKU Leuven
+ ThomasBauwensKU Leuven
+ Miryamde LhoneuxKU Leuven
3307-3320
Pixel-based language models have emerged as a compelling alternative to subword-based language modelling, particularly because they can represent virtually any script. PIXEL, a canonical example of such a model, is a vision transformer that has been pre-trained on rendered text. While PIXEL has shown promising cross-script transfer abilities and robustness to orthographic perturbations, it falls short of outperforming monolingual subword counterparts like BERT in most other contexts. This discrepancy raises questions about the amount of linguistic knowledge learnt by these models and whether their performance in language tasks stems more from their visual capabilities than their linguistic ones. To explore this, we probe PIXEL using a variety of linguistic and visual tasks to assess its position on the vision-to-language spectrum. Our findings reveal a substantial gap between the model’s visual and linguistic understanding. The lower layers of PIXEL predominantly capture superficial visual features, whereas the higher layers gradually learn more syntactic and semantic abstractions. Additionally, we examine variants of PIXEL trained with different text rendering strategies, discovering that introducing certain orthographic constraints at the input level can facilitate earlier learning of surface-level features. With this study, we hope to provide insights that aid the further development of pixel-based language models.
2024.emnlp-main.194
@@ -2730,10 +2730,10 @@
<fixed-case>G</fixed-case>old<fixed-case>C</fixed-case>oin: Grounding Large Language Models in Privacy Laws via Contextual Integrity Theory
- WeiFanHong Kong University of Science and Technology
- HaoranLi
+ WeiFanHong Kong University of Science and Technology
+ HaoranLi
ZheyeDengDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology
- WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology
+ WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology
YangqiuSongThe Hong Kong University of Science and Technology
3321-3343
Privacy issues arise prominently during the inappropriate transmission of information between entities. Existing research primarily studies privacy by exploring various privacy attacks, defenses, and evaluations within narrowly predefined patterns, while neglecting that privacy is not an isolated, context-free concept limited to traditionally sensitive data (e.g., social security numbers), but intertwined with intricate social contexts that complicate the identification and analysis of potential privacy violations. The advent of Large Language Models (LLMs) offers unprecedented opportunities for incorporating the nuanced scenarios outlined in privacy laws to tackle these complex privacy issues. However, the scarcity of open-source relevant case studies restricts the efficiency of LLMs in aligning with specific legal statutes. To address this challenge, we introduce a novel framework, GoldCoin, designed to efficiently ground LLMs in privacy laws for judicial assessment of privacy violations.
Our framework leverages the theory of contextual integrity as a bridge, creating numerous synthetic scenarios grounded in relevant privacy statutes (e.g., HIPAA), to assist LLMs in comprehending the complex contexts for identifying privacy risks in the real world. Extensive experimental results demonstrate that GoldCoin markedly enhances LLMs’ capabilities in recognizing privacy risks across real court cases, surpassing the baselines on different judicial tasks.
@@ -2745,12 +2745,12 @@
Noise, Novels, Numbers. A Framework for Detecting and Categorizing Noise in <fixed-case>D</fixed-case>anish and <fixed-case>N</fixed-case>orwegian Literature
- AliAl-Laith
- DanielHershcovichUniversity of Copenhagen
- JensBjerring-HansenCopenhagen University
+ AliAl-Laith
+ DanielHershcovichUniversity of Copenhagen
+ JensBjerring-HansenCopenhagen University
Jakob IngemannParby
AlexanderConroy
- Timothy RTangherliniUniversity of California, Berkeley
+ Timothy RTangherliniUniversity of California, Berkeley
3344-3354
We present a framework for detecting and categorizing noise in literary texts, demonstrated through its application to Danish and Norwegian literature from the late 19th century. Noise, understood as “aberrant sonic behaviour,” is not only an auditory phenomenon but also a cultural construct tied to the processes of civilization and urbanization. We begin by utilizing topic modeling techniques to identify noise-related documents, followed by fine-tuning BERT-based language models trained on Danish and Norwegian texts to analyze a corpus of over 800 novels. We identify and track the prevalence of noise in these texts, offering insights into the literary perceptions of noise during the Scandinavian “Modern Breakthrough” period (1870-1899). Our contributions include the development of a comprehensive dataset annotated for noise-related segments and their categorization into human-made, non-human-made, and musical noises. This study illustrates the framework’s potential for enhancing the understanding of the relationship between noise and its literary representations, providing a deeper appreciation of the auditory elements in literary works, including as sources for cultural history.
2024.emnlp-main.196
@@ -2775,9 +2775,9 @@
Fine-Grained Prediction of Reading Comprehension from Eye Movements
- OmerShubi
- YoavMeiri
- Cfir AvrahamHadar
+ OmerShubi
+ YoavMeiri
+ Cfir AvrahamHadar
YevgeniBerzakTechnion - Israel Institute of Technology, Technion
3372-3391
Can human reading comprehension be assessed from eye movements in reading? In this work, we address this longstanding question using large-scale eyetracking data. We focus on a cardinal and largely unaddressed variant of this question: predicting reading comprehension of a single participant for a single question from their eye movements over a single paragraph. We tackle this task using a battery of recent models from the literature, and three new multimodal language models. We evaluate the models in two different reading regimes: ordinary reading and information seeking, and examine their generalization to new textual items, new participants, and the combination of both. The evaluations suggest that the task is highly challenging, and highlight the importance of benchmarking against a strong text-only baseline. While in some cases eye movements provide improvements over such a baseline, they tend to be small.
This could be due to limitations of current modelling approaches, limitations of the data, or because eye movement behavior does not sufficiently pertain to fine-grained aspects of reading comprehension processes. Our study provides an infrastructure for making further progress on this question.
@@ -2791,12 +2791,12 @@
ZiyuanZhuang
ZhiyangZhang
SitaoCheng
- FangkaiYangMicrosoft
+ FangkaiYangMicrosoft
JiaLiuNanjing University
ShujianHuangNanjing University
- QingweiLinMicrosoft Research
+ QingweiLinMicrosoft Research
SaravanRajmohanMicrosoft
- DongmeiZhangMicrosoft and Microsoft
+ DongmeiZhangMicrosoft and Microsoft
QiZhang
3392-3411
Retrieval-augmented generation (RAG) methods encounter difficulties when addressing complex questions like multi-hop queries. While iterative retrieval methods improve performance by gathering additional information, current approaches often rely on multiple calls of large language models (LLMs). In this paper, we introduce EfficientRAG, an efficient retriever for multi-hop question answering. EfficientRAG iteratively generates new queries without the need for LLM calls at each iteration and filters out irrelevant information. Experimental results demonstrate that EfficientRAG surpasses existing RAG methods on three open-domain multi-hop question-answering datasets. The code is available in [aka.ms/efficientrag](https://github.com/NIL-zhuang/EfficientRAG-official).
@@ -2806,7 +2806,7 @@
Unsupervised Human Preference Learning
- SumukShashidhar
+ SumukShashidhar
AbhinavChinta
VaibhavSahai
Dilek HakkaniTurUniversity of Illinois at Urbana-Champaign
@@ -2822,10 +2822,10 @@
Is Safer Better? The Impact of Guardrails on the Argumentative Strength of <fixed-case>LLM</fixed-case>s in Hate Speech Countering
HelenaBonaldiFondazione Bruno Kessler and University of Trento
GretaDamo
- Nicolás BenjamínOcampo
- ElenaCabrioUniversité Côte d’Azur
+ Nicolás BenjamínOcampo
+ ElenaCabrioUniversité Côte d’Azur
SerenaVillataCNRS
- MarcoGueriniFondazione Bruno Kessler
+ MarcoGueriniFondazione Bruno Kessler
3446-3463
The potential effectiveness of counterspeech as a hate speech mitigation strategy is attracting increasing interest in the NLG research community, particularly towards the task of automatically producing it. However, automatically generated responses often lack the argumentative richness which characterises expert-produced counterspeech. In this work, we focus on two aspects of counterspeech generation to produce more cogent responses. First, by investigating the tension between helpfulness and harmlessness of LLMs, we test whether the presence of safety guardrails hinders the quality of the generations. Secondly, we assess whether attacking a specific component of the hate speech results in a more effective argumentative strategy to fight online hate. By conducting an extensive human and automatic evaluation, we show how the presence of safety guardrails can be detrimental also to a task that inherently aims at fostering positive social interactions. Moreover, our results show that attacking a specific component of the hate speech, and in particular its implicit negative stereotype and its hateful parts, leads to higher-quality generations.
2024.emnlp-main.201
@@ -2844,9 +2844,9 @@
<fixed-case>LLM</fixed-case>4<fixed-case>D</fixed-case>ecompile: Decompiling Binary Code with Large Language Models
- HanzhuoTanHong Kong Polytechnic University
+ HanzhuoTanHong Kong Polytechnic University
QiLuo
- JingLiThe Hong Kong Polytechnic University
+ JingLiThe Hong Kong Polytechnic University
YuqunZhang
3473-3487
Decompilation aims to convert binary code to high-level source code, but traditional tools like Ghidra often produce results that are difficult to read and execute. Motivated by the advancements in Large Language Models (LLMs), we propose LLM4Decompile, the first and largest open-source LLM series (1.3B to 33B) trained to decompile binary code. We optimize the LLM training process and introduce the LLM4Decompile-End models to decompile binary directly. The resulting models significantly outperform GPT-4o and Ghidra on the HumanEval and ExeBench benchmarks by over 100% in terms of re-executability rate. Additionally, we improve the standard refinement approach to fine-tune the LLM4Decompile-Ref models, enabling them to effectively refine the decompiled code from Ghidra and achieve a further 16.2% improvement over the LLM4Decompile-End. LLM4Decompile demonstrates the potential of LLMs to revolutionize binary code decompilation, delivering remarkable improvements in readability and executability while complementing conventional tools for optimal results.
@@ -2872,10 +2872,10 @@
YikeWu
YiHuangChina Mobile Research Institute
NanHuSoutheast University
- YunchengHua
+ YunchengHua
GuilinQi
JiaoyanChen
- Jeff Z.PanUniversity of Edinburgh, University of Edinburgh
+ Jeff Z.PanUniversity of Edinburgh, University of Edinburgh
3501-3520
Recent studies have explored the use of Large Language Models (LLMs) with Retrieval Augmented Generation (RAG) for Knowledge Graph Question Answering (KGQA). They typically require rewriting retrieved subgraphs into natural language formats comprehensible to LLMs. However, when tackling complex questions, the knowledge rewritten by existing methods may include irrelevant information, omit crucial details, or fail to align with the question’s semantics. To address them, we propose a novel rewriting method CoTKR, Chain-of-Thought Enhanced Knowledge Rewriting, for generating reasoning traces and corresponding knowledge in an interleaved manner, thereby mitigating the limitations of single-step knowledge rewriting. Additionally, to bridge the preference gap between the knowledge rewriter and the question answering (QA) model, we propose a training strategy PAQAF, Preference Alignment from Question Answering Feedback, for leveraging feedback from the QA model to further optimize the knowledge rewriter. We conduct experiments using various LLMs across several KGQA benchmarks. Experimental results demonstrate that, compared with previous knowledge rewriting methods, CoTKR generates the most beneficial knowledge representation for QA models, which significantly improves the performance of LLMs in KGQA.
2024.emnlp-main.205
@@ -2888,9 +2888,9 @@
<fixed-case>MTLS</fixed-case>: Making Texts into Linguistic Symbols
WenlongFei
XiaohuaWangHefei University of Technology
- MinHuHefei University of Technology
- QingyuZhang
- HongboLi
+ MinHuHefei University of Technology
+ QingyuZhang
+ HongboLi
3521-3535
In linguistics, all languages can be considered as symbolic systems, with each language relying on symbolic processes to associate specific symbols with meanings.
In the same language, there is a fixed correspondence between linguistic symbol and meaning. In different languages, universal meanings follow varying rules of symbolization in one-to-one correspondence with symbols. Most work overlooks the properties of languages as symbol systems. In this paper, we shift the focus to the symbolic properties and introduce MTLS: a pre-training method to improve the multilingual capability of models by Making Texts into Linguistic Symbols. Initially, we replace the vocabulary in pre-trained language models by mapping relations between linguistic symbols and semantics. Subsequently, universal semantics within the symbolic system serve as bridges, linking symbols from different languages to the embedding space of the model, thereby enabling the model to process linguistic symbols. To evaluate the effectiveness of MTLS, we conducted experiments on multilingual tasks using BERT and RoBERTa, respectively, as the backbone. The results indicate that despite having just over 12,000 pieces of English data in pre-training, the improvement that MTLS brings to multilingual capabilities is remarkably significant. 2024.emnlp-main.206 @@ -2899,10 +2899,10 @@ <fixed-case>D</fixed-case>2<fixed-case>R</fixed-case>: Dual-Branch Dynamic Routing Network for Multimodal Sentiment Detection - YifanChen - KuntaoLi - WeixingMaiSouth China Normal University - QiaofengWu + YifanChen + KuntaoLi + WeixingMaiSouth China Normal University + QiaofengWu YunXue FenghuanLiGuangdong University of Technology 3536-3547 @@ -2912,12 +2912,12 @@ A Generic Method for Fine-grained Category Discovery in Natural Language Texts - ChangTian - Matthew B.BlaschkoKU Leuven + ChangTian + Matthew B.BlaschkoKU Leuven WenpengYinPennsylvania State University MingzheXingZhongguancun Laboratory - YinliangYueZhongguancun Laboratory - Marie-FrancineMoensKU Leuven, KU Leuven + YinliangYueZhongguancun Laboratory + Marie-FrancineMoensKU Leuven, KU Leuven 3548-3566 Fine-grained category discovery using only coarse-grained supervision is a cost-effective yet challenging task. Previous training methods focus on aligning query samples with positive samples and distancing them from negatives. They often neglect intra-category and inter-category semantic similarities of fine-grained categories when navigating sample distributions in the embedding space. Furthermore, some evaluation techniques that rely on pre-collected test samples are inadequate for real-time applications. To address these shortcomings, we introduce a method that successfully detects fine-grained clusters of semantically similar texts guided by a novel objective function. The method uses semantic similarities in a logarithmic space to guide sample distributions in the Euclidean space and to form distinct clusters that represent fine-grained categories. We also propose a centroid inference mechanism to support real-time applications. The efficacy of the method is both theoretically justified and empirically confirmed on three benchmark tasks. The proposed objective function is integrated in multiple contrastive learning based neural models. Its results surpass existing state-of-the-art approaches in terms of Accuracy, Adjusted Rand Index and Normalized Mutual Information of the detected fine-grained categories. Code and data are publicly available at https://github.com/changtianluckyforever/F-grained-STAR. 
2024.emnlp-main.208 @@ -2927,7 +2927,7 @@ Toxicity Detection is <fixed-case>NOT</fixed-case> all you Need: Measuring the Gaps to Supporting Volunteer Content Moderators through a User-Centric Method Yang TristaCaoUniversity of Maryland, College Park - Lovely-FrancesDomingoUniversity of Maryland, College Park + Lovely-FrancesDomingoUniversity of Maryland, College Park SarahGilbertCornell University Michelle L.MazurekUniversity of Maryland, College Park KatieShiltonUniversity of Maryland, College Park @@ -2940,11 +2940,11 @@ A User-Centric Multi-Intent Benchmark for Evaluating Large Language Models - JiayinWang - FengranMo - WeizhiMaTsinghua University - PeijieSun - MinZhangTsinghua University, Tsinghua University + JiayinWang + FengranMo + WeizhiMaTsinghua University + PeijieSun + MinZhangTsinghua University, Tsinghua University Jian-YunNieUniversity of Montreal 3588-3612 Large language models (LLMs) are essential tools that users employ across various scenarios, so evaluating their performance and guiding users in selecting the suitable service is important. Although many benchmarks exist, they mainly focus on specific predefined model abilities, such as world knowledge, reasoning, etc. Based on these ability scores, it is hard for users to determine which LLM best suits their particular needs. To address these issues, we propose to evaluate LLMs from a user-centric perspective and design this benchmark to measure their efficacy in satisfying user needs under distinct intents. Firstly, we collect 1,846 real-world use cases from a user study with 712 participants from 23 countries. This first-hand data helps us understand actual user intents and needs in LLM interactions, forming the User Reported Scenarios (URS) dataset, which is categorized with six types of user intents. Secondly, based on this authentic dataset, we benchmark 10 LLM services with GPT-4-as-Judge. Thirdly, we show that benchmark scores align well with human preference in both real-world experience and pair-wise annotations, achieving Pearson correlations of 0.95 and 0.94, respectively. This alignment confirms that the URS dataset and our evaluation method establish an effective user-centric benchmark. The dataset, code, and process data are publicly available at https://github.com/Alice1998/URS. @@ -2978,7 +2978,7 @@ <fixed-case>VGB</fixed-case>ench: Evaluating Large Language Models on Vector Graphics Understanding and Generation - BochengZou + BochengZou MuCaiDepartment of Computer Science, University of Wisconsin, Madison JianruiZhang Yong JaeLeeDepartment of Computer Sciences, University of Wisconsin - Madison and Cruise @@ -2993,12 +2993,12 @@ What do Large Language Models Need for Machine Translation Evaluation? ShenbinQian - ArchchanaSindhujan + ArchchanaSindhujan MinnieKabra - DipteshKanojiaUniversity of Surrey + DipteshKanojiaUniversity of Surrey ConstantinOrasanUniversity of Surrey - TharinduRanasingheLancaster University - FredBlainTilburg University + TharinduRanasingheLancaster University + FredBlainTilburg University 3660-3674 Leveraging large language models (LLMs) for various natural language processing tasks has led to superlative claims about their performance. For the evaluation of machine translation (MT), existing research shows that LLMs are able to achieve results comparable to fine-tuned multilingual pre-trained language models. 
In this paper, we explore what translation information, such as the source, reference, translation errors and annotation guidelines, is needed for LLMs to evaluate MT quality. In addition, we investigate prompting techniques such as zero-shot, Chain of Thought (CoT) and few-shot prompting for eight language pairs covering high-, medium- and low-resource languages, leveraging varying LLM variants. Our findings indicate the importance of reference translations for an LLM-based evaluation. While larger models do not necessarily fare better, they tend to benefit more from CoT prompting than smaller models. We also observe that LLMs do not always provide a numerical score when generating evaluations, which poses a question on their reliability for the task. Our work presents a comprehensive analysis for resource-constrained and training-less LLM-based evaluation of machine translation. We release the accrued prompt templates, code and data publicly for reproducibility.
2024.emnlp-main.214
@@ -3019,7 +3019,7 @@

External Knowledge-Driven Argument Mining: Leveraging Attention-Enhanced Multi-Network Models
DebelaGemechu
- ChrisReedUniversity of Dundee
+ ChrisReedUniversity of Dundee
3688-3709
Argument mining (AM) involves the identification of argument relations (AR) between Argumentative Discourse Units (ADUs). The essence of ARs among ADUs is context-dependent and lies in maintaining a coherent flow of ideas, often centered around the relations between discussed entities, topics, themes or concepts. However, these relations are not always explicitly stated; rather, they are inferred from implicit chains of reasoning connecting the concepts addressed in the ADUs. While humans can infer such background knowledge, machines face challenges when the contextual cues are not explicitly provided. This paper leverages external resources, including WordNet, ConceptNet, and Wikipedia to identify semantic paths (knowledge paths) connecting the concepts discussed in the ADUs to obtain the implicit chains of reasoning. To effectively leverage these paths for AR prediction, we propose attention-based Multi-Network architectures. Various architectures are evaluated on the external resources, and the Wikipedia-based configuration attains F-scores of 0.85, 0.84, 0.70, and 0.87, respectively, on four diverse datasets, showing strong performance over the baselines.
2024.emnlp-main.216
@@ -3035,7 +3035,7 @@
KevinMoore
SeanQuickUniversity of Iowa
JohnathanMelvinUniversity of Iowa
- PadminiSrinivasanUniversity of Iowa
+ PadminiSrinivasanUniversity of Iowa
Mihailis E.DiamantisUniversity of Iowa
RishabNithyanandUniversity of Iowa
3710-3722
@@ -3046,19 +3046,19 @@

<fixed-case>M</fixed-case><tex-math>^2</tex-math><fixed-case>PT</fixed-case>: Multimodal Prompt Tuning for Zero-shot Instruction Learning
- TaowenWangRochester Institute of Technology and Rochester Institute of Technology
+ TaowenWangRochester Institute of Technology and Rochester Institute of Technology
YiyangLiu
James ChenhaoLiangU. S. Naval Research Laboratory and Rochester Institute of Technology
- JunhanZhaoHarvard Medical School, Purdue University, Harvard University, Cornell University, Shanghai Jiaotong University, Pison Technology and Bosch
- YimingCuiByteDance inc.
+ JunhanZhaoHarvard Medical School, Purdue University, Harvard University, Cornell University, Shanghai Jiaotong University, Pison Technology and Bosch
+ YimingCuiByteDance inc.
YuningMaoMeta ShaoliangNieMeta Inc JiahaoLiuMeituan - FuliFengUniversity of Science and Technology of China - ZenglinXuFudan University - ChengHanUniversity of Missouri - Kansas City + FuliFengUniversity of Science and Technology of China + ZenglinXuFudan University + ChengHanUniversity of Missouri - Kansas City LifuHuangVirginia Tech - QifanWangMeta AI + QifanWangMeta AI DongfangLiuRochester Institute of Technology 3723-3740 Multimodal Large Language Models (MLLMs) demonstrate remarkable performance across a wide range of domains, with increasing emphasis on enhancing their zero-shot generalization capabilities for unseen tasks across various modalities. Instruction tuning has emerged as an effective strategy for achieving zero-shot generalization by finetuning pretrained models on diverse multimodal tasks. As the scale of MLLMs continues to grow, parameter-efficient finetuning becomes increasingly critical. However, most existing parameter-efficient approaches focus only on single modalities and often overlook the multimodal characteristics during finetuning. In this work, we introduce a novel Multimodal Prompt Tuning (M^2PT) approach for efficient instruction tuning of MLLMs. M^2PT effectively integrates visual and textual prompts into the vision encoder and language processor respectively during finetuning, facilitating the extraction and alignment of features across modalities. Empirical results on various multimodal evaluation datasets demonstrate the superior performance of our approach compared to several state-of-the-art baselines. A comprehensive set of ablation studies validates the effectiveness of our prompt design and the efficiency of our approach. @@ -3082,7 +3082,7 @@ Incubating Text Classifiers Following User Instruction with Nothing but <fixed-case>LLM</fixed-case> LetianPeng - ZilongWangUniversity of California, San Diego + ZilongWangUniversity of California, San Diego JingboShangUniversity of California, San Diego 3753-3766 In this paper, we aim to generate text classification data given arbitrary class definitions (i.e., user instruction), so one can train a text classifier without any human annotation or raw corpus. Recent advances in large language models (LLMs) lead to pioneer attempts to individually generate texts for each class via prompting. In this paper, we propose Incubator, the first framework that can handle complicated and even mutually dependent classes (e.g., "TED Talk given by Educator" and "Other"). Specifically, our Incubator is a fine-tuned LLM that takes the instruction of all class definitions as input, and in each inference, it can jointly generate one sample for every class. First, we tune Incubator on the instruction-to-data mappings that we obtained from classification datasets and descriptions on Hugging Face together with in-context augmentation by GPT-4. To emphasize the uniformity and diversity in generations, we refine Incubator by fine-tuning with the cluster centers of semantic textual embeddings of the generated samples. We compare Incubator on various classification tasks with strong baselines such as direct LLM-based inference and training data generation by prompt engineering. 
Experiments show Incubator is able to (1) outperform previous methods on traditional benchmarks, (2) take label interdependency and user preference into consideration, and (3) enable logical text mining by incubating multiple classifiers @@ -3092,11 +3092,11 @@ <fixed-case>PTD</fixed-case>-<fixed-case>SQL</fixed-case>: Partitioning and Targeted Drilling with <fixed-case>LLM</fixed-case>s in Text-to-<fixed-case>SQL</fixed-case> - RuilinLuo + RuilinLuo LiyuanWang BinghuaiLinTencent ZichengLin - YujiuYangGraduate School at Shenzhen,Tsinghua University + YujiuYangGraduate School at Shenzhen,Tsinghua University 3767-3799 Large Language Models (LLMs) have emerged as powerful tools for Text-to-SQL tasks, exhibiting remarkable reasoning capabilities. Different from tasks such as math word problem and commonsense reasoning, SQL solutions have a relatively fixed pattern. This facilitates the investigation of whether LLMs can benefit from categorical thinking, mirroring how humans acquire knowledge through inductive reasoning based on comparable examples. In this study, we propose that employing query group partitioning allows LLMs to focus on learning the thought processes specific to a single problem type, consequently enhancing their reasoning abilities across diverse difficulty levels and problem categories. Our experiments reveal that multiple advanced LLMs, when equipped with PTD-SQL, can either surpass or match previous state-of-the-art (SOTA) methods on the Spider and BIRD datasets. Intriguingly, models with varying initial performances have exhibited significant improvements mainly at the boundary of their capabilities after targeted drilling, suggesting a parallel with human progress. Code is available at https://github.com/lrlbbzl/PTD-SQL. 2024.emnlp-main.221 @@ -3107,7 +3107,7 @@ Conditional and Modal Reasoning in Large Language Models - Wesley H.HollidayUniversity of California, Berkeley + Wesley H.HollidayUniversity of California, Berkeley MatthewMandelkernNew York University Cedegao E.ZhangMassachusetts Institute of Technology 3800-3821 @@ -3128,7 +3128,7 @@ DongliangXu QingYang HongtaoLiuDu Xiaoman Financial - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 3822-3836 Teaching large language models (LLMs) to generate text with citations to evidence sources can mitigate hallucinations and enhance verifiability in information-seeking systems. However, improving this capability requires high-quality attribution data, which is costly and labor-intensive. Inspired by recent advances in self-improvement that enhance LLMs without manual annotation, we present START, a Self-Taught AttRibuTion framework for iteratively improving the attribution capability of LLMs. First, to prevent models from stagnating due to initially insufficient supervision signals, START leverages the model to self-construct synthetic training data for warming up. To further self-improve the model’s attribution ability, START iteratively utilizes fine-grained preference supervision signals constructed from its sampled responses to encourage robust, comprehensive, and attributable generation. Experiments on three open-domain question-answering datasets, covering long-form QA and multi-step reasoning, demonstrate significant performance gains of 25.13% on average without relying on human annotations and more advanced models. Further analysis reveals that START excels in aggregating information across multiple sources. 
2024.emnlp-main.223

<fixed-case>A</fixed-case>lign<fixed-case>C</fixed-case>ap: Aligning Speech Emotion Captioning to Human Preferences
- ZiqiLiangUniversity of Science and Technology of China
+ ZiqiLiangUniversity of Science and Technology of China
HaoxiangShi
- HanhuiChen
+ HanhuiChen
3837-3846
Speech Emotion Captioning (SEC) has gradually become an active research task. The emotional content conveyed through human speech is often complex, and classifying it into fixed categories may not be enough to fully capture speech emotions. Describing speech emotions through natural language may be a more effective approach. However, existing SEC methods often produce hallucinations and lose generalization on unseen speech. To overcome these problems, we propose AlignCap, which aligns speech emotion captioning to human preferences based on a large language model (LLM) with two properties: 1) Speech-Text Alignment, which minimizes the divergence between the LLM’s response prediction distributions for speech and text inputs using knowledge distillation (KD) Regularization. 2) Human Preference Alignment, where we design Preference Optimization (PO) Regularization to eliminate factuality and faithfulness hallucinations. We also extract emotional clues as a prompt for enriching fine-grained information under KD-Regularization. Experiments demonstrate that AlignCap presents stronger performance than other state-of-the-art methods on the zero-shot SEC task.
2024.emnlp-main.224
@@ -3192,7 +3192,7 @@
YuelinZouColumbia University
LijieHuKAUST
ZiqianZengSouth China University of Technology
- DiWangKAUST
+ DiWangKAUST
HaiqinYangInternational Digital Economy Academy (IDEA)
3933-3941
Fine-tuning-based unlearning methods prevail for erasing targeted harmful, sensitive, or copyrighted information within large language models while preserving overall capabilities. However, the true effectiveness of the methods is unclear. In this paper, we delve into the limitations of fine-tuning-based unlearning through activation patching and parameter restoration experiments. Our findings reveal that these methods alter the model’s knowledge retrieval process, rather than genuinely erasing the problematic knowledge embedded in the model parameters. Furthermore, behavioral tests demonstrate that the unlearning mechanisms inevitably impact the global behavior of the models, affecting unrelated knowledge or capabilities. Our work advocates the development of more resilient unlearning techniques for truly erasing knowledge.
@@ -3219,11 +3219,11 @@

Where is the signal in tokenization space?
- RenatoGehUCLA Computer Science Department, University of California, Los Angeles
+ RenatoGehUCLA Computer Science Department, University of California, Los Angeles
HonghuaZhangUniversity of California, Los Angeles
KareemAhmedUniversity of California, Irvine
BenjieWangUniversity of California, Los Angeles
- GuyVan Den BroeckUniversity of California, Los Angeles
+ GuyVan Den BroeckUniversity of California, Los Angeles
3966-3979
Large Language Models (LLMs) are typically shipped with tokenizers that *deterministically* encode text into so-called *canonical* token sequences, to which the LLMs assign probability values. One common assumption is that the probability of a piece of text is the probability of its canonical token sequence. However, the tokenization of a string is not unique: e.g., the Llama2 tokenizer encodes ‘Tokens‘ as ‘[Tok,ens]‘, but ‘[Tok,en,s]‘ also represents the same text. In this paper, we study non-canonical tokenizations. We prove that, given a string, it is computationally hard to find the most likely tokenization for an autoregressive LLM, as well as to compute the marginal probability over all possible tokenizations. We then show how the marginal is, in most cases, indistinguishable from the canonical probability. Surprisingly, we then empirically demonstrate the existence of a significant amount of signal hidden within tokenization space. Notably, by simply aggregating the probabilities of non-canonical tokenizations, we achieve improvements across a range of LLM evaluation benchmarks for a variety of architectures, including transformers and state space models.
2024.emnlp-main.230

Private Language Models via Truncated Laplacian Mechanism
- TianhaoHuang
+ TianhaoHuang
TaoYang
IvanHabernalRuhr-Universität Bochum
LijieHuKAUST
- DiWangKAUST
+ DiWangKAUST
3980-3993
Recently it has been shown that deep learning models for NLP tasks are prone to attacks that can even reconstruct the verbatim training texts. To prevent privacy leakage, researchers have investigated word-level perturbations, relying on the formal guarantees of differential privacy (DP) in the embedding space. However, many existing approaches either achieve unsatisfactory performance in the high privacy regime when using the Laplacian or Gaussian mechanism, or resort to weaker relaxations of DP that are inferior to the canonical DP in terms of privacy strength. This raises the question of whether a new method for private word embedding can be designed to overcome these limitations. In this paper, we propose a novel private embedding method called the high dimensional truncated Laplacian mechanism. Specifically, we introduce a non-trivial extension of the truncated Laplacian mechanism, which was previously only investigated in one-dimensional space cases. Theoretically, we show that our method has a lower variance compared to the previous private word embedding methods. To further validate its effectiveness, we conduct comprehensive experiments on private embedding and downstream tasks using three datasets. Remarkably, even in the high privacy regime, our approach only incurs a slight decrease in utility compared to the non-private scenario.
2024.emnlp-main.231
@@ -3254,8 +3254,8 @@

Consistent Autoformalization for Constructing Mathematical Libraries
- LanZhangUniversity of Manchester
- XinQuanUniversity of Manchester
+ LanZhangUniversity of Manchester
+ XinQuanUniversity of Manchester
AndreFreitasIdiap Research Institute and University of Manchester
4020-4033
Autoformalization is the task of automatically translating mathematical content written in natural language to a formal language expression. The growing language interpretation capabilities of Large Language Models (LLMs), including in formal languages, are lowering the barriers for autoformalization. However, LLMs alone are not capable of consistently and reliably delivering autoformalization, in particular as the complexity and specialization of the target domain grows. As the field evolves into the direction of systematically applying autoformalization towards large mathematical libraries, the need to improve syntactic, terminological and semantic control increases. This paper proposes the coordinated use of three mechanisms, most-similar retrieval augmented generation (MS-RAG), denoising steps, and auto-correction with syntax error feedback (Auto-SEF) to improve autoformalization quality. The empirical analysis, across different models, demonstrates that these mechanisms can deliver autoformalization results which are syntactically, terminologically and semantically more consistent. These mechanisms can be applied across different LLMs and have been shown to deliver improved results across different model types.
@@ -3270,7 +3270,7 @@
YufeiTao
AdamHiatt
ErikHaake
- Antonie J.JetterPortland State University
+ Antonie J.JetterPortland State University
AmeetaAgrawalPortland State University
4034-4058
Large language models (LLMs) have demonstrated remarkable progress in leveraging diverse knowledge sources. This study investigates how nine widely used LLMs allocate knowledge between local context and global parameters when answering open-ended questions in knowledge-consistent scenarios. We introduce a novel dataset, WikiAtomic, and systematically vary context sizes to analyze how LLMs prioritize and utilize the provided information and their parametric knowledge in knowledge-consistent scenarios. Additionally, we also study their tendency to hallucinate under varying context sizes. Our findings reveal consistent patterns across models, including a consistent reliance on both contextual (around 70%) and parametric (around 30%) knowledge, and a decrease in hallucinations with increasing context. These insights highlight the importance of more effective context organization and developing models that use input more deterministically for robust performance.
@@ -3291,9 +3291,9 @@

When Is Multilinguality a Curse? Language Modeling for 250 High- and Low-Resource Languages
Tyler A.ChangGoogle and University of California, San Diego
- CatherineArnett
+ CatherineArnett
ZhuowenTuUniversity of California, San Diego
- BenBergenUniversity of California, San Diego
+ BenBergenUniversity of California, San Diego
4074-4096
Multilingual language models are widely used to extend NLP systems to low-resource languages. However, concrete evidence for the effects of multilinguality on language modeling performance in individual languages remains scarce. Here, we pre-train over 10,000 monolingual and multilingual language models for over 250 languages, including multiple language families that are under-studied in NLP.
We assess how language modeling performance in each language varies as a function of (1) monolingual dataset size, (2) added multilingual dataset size, (3) linguistic similarity of the added languages, and (4) model size (up to 45M parameters). We find that in moderation, adding multilingual data improves low-resource language modeling performance, similar to increasing low-resource dataset sizes by up to 33%. Improvements depend on the syntactic similarity of the added multilingual data, with marginal additional effects of vocabulary overlap. However, high-resource languages consistently perform worse in multilingual pre-training scenarios. As dataset sizes increase, adding multilingual data begins to hurt performance for both low-resource and high-resource languages, likely due to limited model capacity (the “curse of multilinguality”). These results suggest that massively multilingual pre-training may not be optimal for any languages involved, but that more targeted models can significantly improve performance. 2024.emnlp-main.236 @@ -3307,7 +3307,7 @@ YinongHe JianingYang YinpeiDaiUniversity of Michigan - Ann Arbor - JoyceChaiUniversity of Michigan + JoyceChaiUniversity of Michigan 4097-4114 In real-world scenarios, it is desirable for embodied agents to have the ability to leverage human language to gain explicit or implicit knowledge for learning tasks. Despite recent progress, most previous approaches adopt simple low-level instructions as language inputs, which may not reflect natural human communication. We expect human language to be informative (i.e., providing feedback on agents’ past behaviors and offering guidance on achieving their future goals) and diverse (i.e., encompassing a wide range of expressions and style nuances). To enable flexibility of language use in teaching agents tasks, this paper studies different types of language inputs in facilitating reinforcement learning (RL) embodied agents. More specifically, we examine how different levels of language informativeness and diversity impact agent learning and inference. Our empirical results based on four RL benchmarks demonstrate that agents trained with diverse and informative language feedback can achieve enhanced generalization and fast adaptation to new tasks. These findings highlight the pivotal role of language use in teaching embodied agents new tasks in an open world. 2024.emnlp-main.237 @@ -3321,7 +3321,7 @@ SnehaKuduguntaDepartment of Computer Science and Google DeepMind RominaStella SunipaDevGoogle - JasmijnBastingsGoogle DeepMind + JasmijnBastingsGoogle DeepMind 4115-4124 Translation systems, including foundation models capable of translation, can produce errors that result in gender mistranslation, and such errors can be especially harmful. To measure the extent of such potential harms when translating into and out of English, we introduce a dataset, MiTTenS, covering 26 languages from a variety of language families and scripts, including several traditionally under-represented in digital resources. The dataset is constructed with handcrafted passages that target known failure patterns, longer synthetically generated passages, and natural passages sourced from multiple domains. We demonstrate the usefulness of the dataset by evaluating both neural machine translation systems and foundation models, and show that all systems exhibit gender mistranslation and potential harm, even in high resource languages. 
2024.emnlp-main.238

Teaching <fixed-case>LLM</fixed-case>s to Abstain across Languages via Multilingual Feedback
- ShangbinFengUniversity of Washington
+ ShangbinFengUniversity of Washington
WeijiaShi
YikeWang
WenxuanDingUniversity of Texas at Austin
Shuyue StellaLiDepartment of Computer Science, University of Washington
VidhishaBalachandranResearch, Microsoft
SunayanaSitaramMicrosoft
- YuliaTsvetkovDepartment of Computer Science, University of Washington
+ YuliaTsvetkovDepartment of Computer Science, University of Washington
4125-4150
Multilingual LLMs often have knowledge disparities across languages, with larger gaps in under-resourced languages. Teaching LLMs to abstain in the face of knowledge gaps is thus a promising strategy to mitigate hallucinations in multilingual settings. However, previous studies on LLM abstention primarily focus on English; we find that directly applying existing solutions beyond English results in up to 20.5% performance gaps between high and low-resource languages, potentially due to LLMs’ drop in calibration and reasoning beyond a few resource-rich languages. To this end, we propose strategies to enhance LLM abstention by learning from multilingual feedback, where LLMs self-reflect on proposed answers in one language by generating multiple feedback items in related languages: we show that this helps identifying the knowledge gaps across diverse languages, cultures, and communities. Extensive experiments demonstrate that our multilingual feedback approach outperforms various strong baselines, achieving up to 9.2% improvement for low-resource languages across three black-box and open models on three datasets, featuring open-book, closed-book, and commonsense QA. Further analysis reveals that multilingual feedback is both an effective and a more equitable abstain strategy to serve diverse language speakers, and cultural factors have great impact on language selection and LLM abstention behavior, highlighting future directions for multilingual and multi-cultural reliable language modeling.
2024.emnlp-main.239

Modular Pluralism: Pluralistic Alignment via Multi-<fixed-case>LLM</fixed-case> Collaboration
- ShangbinFengUniversity of Washington
- TaylorSorensenUniversity of Washington and Brigham Young University
+ ShangbinFengUniversity of Washington
+ TaylorSorensenUniversity of Washington and Brigham Young University
YuhanLiu
JillianFisherUniversity of Washington
Chan YoungPark
YejinChoiDepartment of Computer Science, University of Washington
- YuliaTsvetkovDepartment of Computer Science, University of Washington
+ YuliaTsvetkovDepartment of Computer Science, University of Washington
4151-4171
While existing alignment paradigms have been integral in developing large language models (LLMs), LLMs often learn an averaged human preference and struggle to model diverse preferences across cultures, demographics, and communities. We propose Modular Pluralism, a modular framework based on multi-LLM collaboration for pluralistic alignment: it “plugs into” a base LLM a pool of smaller but specialized community LMs, where models collaborate in distinct modes to flexibly support three modes of pluralism: Overton, steerable, and distributional. Modular Pluralism is uniquely compatible with black-box LLMs and offers the modular control of adding new community LMs for previously underrepresented communities.
We evaluate Modular Pluralism with six tasks and four datasets featuring questions/instructions with value-laden and perspective-informed responses. Extensive experiments demonstrate that Modular Pluralism advances the three pluralism objectives across six black-box and open-source LLMs. Further analysis reveals that LLMs are generally faithful to the inputs from smaller community LLMs, allowing seamless patching by adding a new community LM to better cover previously underrepresented communities. 2024.emnlp-main.240 @@ -3380,7 +3380,7 @@ WentingZhaoCornell University GeGaoCornell University ClaireCardieCornell University - Alexander MRushCornell University and School of Engineering and Applied Sciences, Harvard University + Alexander MRushCornell University and School of Engineering and Applied Sciences, Harvard University 4207-4220 When seeking information from unfamiliar documents, users frequently pose questions that cannot be answered by the documents. While existing large language models (LLMs) identify these unanswerable questions, they do not assist users in reformulating their questions, thereby reducing their overall utility. We curate CouldAsk, an evaluation benchmark composed of existing and new datasets for document-grounded question answering, specifically designed to study reformulating unanswerable questions. We evaluate state-of-the-art open-source and proprietary LLMs on CouldAsk. The results demonstrate the limited capabilities of these models in reformulating questions. Specifically, GPT-4 and Llama2-7B successfully reformulate questions only 26% and 12% of the time, respectively. Error analysis shows that 62% of the unsuccessful reformulations stem from the models merely rephrasing the questions or even generating identical questions. We publicly release the benchmark and the code to reproduce the experiments. 2024.emnlp-main.242 @@ -3403,7 +3403,7 @@ Hidden Persuaders: <fixed-case>LLM</fixed-case>s’ Political Leaning and Their Influence on Voters YujinPotterUniversity of California, Berkeley - ShiyangLai + ShiyangLai JunsolKim JamesEvans DawnSongUniversity of California Berkeley @@ -3417,7 +3417,7 @@ <fixed-case>SOUL</fixed-case>: Unlocking the Power of Second-Order Optimization for <fixed-case>LLM</fixed-case> Unlearning JinghanJia YihuaZhang - YimengZhangMichigan State University + YimengZhangMichigan State University JianchengLiuMichigan State University BharatRunwalMichigan State University JamesDiffenderferLawrence Livermore National Labs @@ -3432,9 +3432,9 @@ When Reasoning Meets Information Aggregation: A Case Study with Sports Narratives - YebowenHuUniversity of Central Florida + YebowenHuUniversity of Central Florida KaiqiangSongTencent AI Lab - SangwooChoCapital One + SangwooChoCapital One XiaoyangWangTencent AI Lab WenlinYaoTencent AI Lab HassanForooshUniversity of Central Florida @@ -3451,7 +3451,7 @@ Vu TrongKim MichaelKrumdickKensho VarshiniReddy - FranckDernoncourtAdobe Systems + FranckDernoncourtAdobe Systems Viet DacLaiAdobe Systems 4309-4333 FActScore has gained popularity as a metric to estimate the factuality of long-form texts generated by Large Language Models (LLMs) in English. However, there has not been any work in studying the behavior of FActScore in other languages. This paper studies the limitations of each component in the four-component pipeline of FActScore in the multilingual setting. We introduce a new dataset for FActScore on texts generated by strong multilingual LLMs. 
Our evaluation shows that LLMs exhibit distinct behaviors in both fact extraction and fact scoring tasks. No LLM produces consistent and reliable FActScore across languages of varying levels of resources. We also find that the knowledge source plays an important role in the quality of the estimated FActScore. Using Wikipedia as the knowledge source may hinder the true FActScore of long-form text due to its limited coverage in medium- and low-resource languages. We also incorporate 3 mitigations to our knowledge source that ultimately improve FActScore estimation across all languages. @@ -3470,7 +3470,7 @@ SeanWelleckCarnegie Mellon University GrahamNeubigCarnegie Mellon University MoontaeLeeUniversity of Illinois, Chicago - KyungjaeLee + KyungjaeLee MinjoonSeoTwelve Labs and Korea Advanced Institute of Science and Technology 4334-4353 Proprietary LMs such as GPT-4 are often employed to assess the quality of responses from various LMs. However, concerns including transparency, controllability, and affordability strongly motivate the development of open-source LMs specialized in evaluations. On the other hand, existing open evaluator LMs exhibit critical shortcomings: 1) they issue scores that significantly diverge from those assigned by humans, and 2) they lack the flexibility to perform both direct assessment and pairwise ranking, the two most prevalent forms of assessment. Additionally, they do not possess the ability to evaluate based on custom evaluation criteria, focusing instead on general attributes like helpfulness and harmlessness. To address these issues, we introduce Prometheus 2, a more powerful evaluator LM than its predecessor that closely mirrors human and GPT-4 judgements. Moreover, it is capable of processing both direct assessment and pair-wise ranking formats grouped with a user-defined evaluation criteria. On four direct assessment benchmarks and four pairwise ranking benchmarks, Prometheus 2 scores the highest correlation and agreement with humans and proprietary LM judges among all tested open evaluator LMs. Our models, code, and data are all publicly available. @@ -3487,7 +3487,7 @@ PengQiOrby AI YumoXuAWS AI Labs JenyuanWangAmazon - LanLiu + LanLiu William YangWangUC Santa Barbara BonanMinAmazon and Tufts University VittorioCastelliAmazon @@ -3500,11 +3500,11 @@ <fixed-case>P</fixed-case>rompt<fixed-case>R</fixed-case>eps: Prompting Large Language Models to Generate Dense and Sparse Representations for Zero-Shot Document Retrieval - ShengyaoZhuangCSIRO + ShengyaoZhuangCSIRO XueguangMa BevanKoopmanCSIRO and University of Queensland JimmyLinUniversity of Waterloo - GuidoZucconUniversity of Queensland + GuidoZucconUniversity of Queensland 4375-4391 Utilizing large language models (LLMs) for zero-shot document ranking is done in one of two ways: (1) prompt-based re-ranking methods, which require no further training but are only feasible for re-ranking a handful of candidate documents due to computational costs; and (2) unsupervised contrastive trained dense retrieval methods, which can retrieve relevant documents from the entire corpus but require a large amount of paired text data for contrastive training.In this paper, we propose PromptReps, which combines the advantages of both categories: no need for training and the ability to retrieve from the whole corpus. Our method only requires prompts to guide an LLM to generate query and document representations for effective document retrieval. 
Specifically, we prompt the LLMs to represent a given text using a single word, and then use the last token’s hidden states and the corresponding logits associated with the prediction of the next token to construct a hybrid document retrieval system. The retrieval system harnesses both dense text embedding and sparse bag-of-words representations given by the LLM. Our experimental evaluation on the MSMARCO, TREC deep learning and BEIR zero-shot document retrieval datasets illustrates that this simple prompt-based LLM retrieval method can achieve a similar or higher retrieval effectiveness than state-of-the-art LLM embedding methods that are trained with large amounts of unsupervised data, especially when using a larger LLM.
2024.emnlp-main.250
@@ -3517,10 +3517,10 @@
AnuoluwapoAremu
DianaAbagyanUniversity of Washington
HilaGonenUniversity of Washington
- David IfeoluwaAdelani
+ David IfeoluwaAdelani
DaudAbolade
- Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
- YuliaTsvetkovDepartment of Computer Science, University of Washington
+ Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
+ YuliaTsvetkovDepartment of Computer Science, University of Washington
4392-4409
Yoruba—an African language with roughly 47 million speakers—encompasses a continuum with several dialects. Recent efforts to develop NLP technologies for African languages have focused on their standard dialects, resulting in disparities for dialects and varieties for which there are little to no resources or tools. We take steps towards bridging this gap by introducing a new high-quality parallel text and speech corpus; YORULECT across three domains and four regional yoruba dialects. To develop this corpus, we engaged native speakers, traveling to communities where these dialects are spoken, to collect text and speech data. Using our newly created corpus, we conducted extensive experiments on (text) machine translation, automatic speech recognition, and speech-to-text translation. Our results reveal substantial performance disparities between standard yoruba and the other dialects across all tasks. However, we also show that with dialect-adaptive finetuning, we are able to narrow this gap. We believe our dataset and experimental analysis will contribute greatly to developing NLP tools for Yoruba and its dialects, and potentially for other African languages, by improving our understanding of existing challenges and offering a high-quality dataset for further development. We will release YORULECT dataset and models publicly under an open license.
2024.emnlp-main.251
@@ -3533,7 +3533,7 @@
Ju-SeungByun
JiyunChun
JihyungKilAdobe Research
- AndrewPerraultOhio State University
+ AndrewPerraultOhio State University
4410-4430
Large Multimodal Models (LMMs) excel at comprehending human instructions and demonstrate remarkable results across a broad spectrum of tasks. Reinforcement Learning from Human Feedback (RLHF) and AI Feedback (RLAIF) further refine LLMs by aligning them with specific preferences. These methods primarily use ranking-based feedback for entire generations. With advanced AI models (Teacher), such as GPT-4 and Claude 3 Opus, we can request various types of detailed feedback that are expensive for humans to provide. We propose a two-stage algorithm ARES that Alternates REinforcement Learning (RL) and Supervised Fine-Tuning (SFT). First, we ask the Teacher to score how much each sentence contributes to solving the problem in a Chain-of-Thought (CoT).
This sentence-level feedback allows us to consider individual valuable segments, providing more granular rewards for the RL procedure. Second, we ask the Teacher to correct wrong reasoning after the RL stage. The RL procedure requires substantial hyperparameter tuning and often generates errors such as repetitive words and incomplete sentences. With correction feedback, we stabilize the RL fine-tuned model through SFT. We conduct experiments on the multi-modal datasets ScienceQA and A-OKVQA to demonstrate the effectiveness of our proposal. The ARES rationale achieves around 70% win rate compared to baseline models judged by GPT-4o. Additionally, we observe that the improved rationale reasoning leads to a 2.5% increase in inference answer accuracy on average for the multi-modal datasets.
2024.emnlp-main.252
@@ -3557,9 +3557,9 @@
YuweiFangSnap Inc.
WilliMenapaceSnap Inc.
AliaksandrSiarohinSnap Inc. and Snap Inc.
- Tsai-ShienChenUniversity of California, Merced
+ Tsai-ShienChenUniversity of California, Merced
Kuan-ChiehWangSnap Inc.
- IvanSkorokhodovSnap Inc.
+ IvanSkorokhodovSnap Inc.
GrahamNeubigCarnegie Mellon University
SergeyTulyakovSnap Inc.
4444-4456
@@ -3572,8 +3572,8 @@

<fixed-case>F</fixed-case><tex-math>^2</tex-math><fixed-case>RL</fixed-case>: Factuality and Faithfulness Reinforcement Learning Framework for Claim-Guided Evidence-Supported Counterspeech Generation
HaiyangWangNational University of Defense Technology
YuchenPan
- XinSong
- XuechenZhao
+ XinSong
+ XuechenZhao
MinghaoHuCenter of Information Research, AMS
BinZhouNational University of Defense Technology
4457-4470
@@ -3584,8 +3584,8 @@

Deciphering Rumors: A Multi-Task Learning Approach with Intent-aware Hierarchical Contrastive Learning
- ChangYang
- PengZhangTianjin University
+ ChangYang
+ PengZhangTianjin University
HuiGao
JingZhang
4471-4483
@@ -3598,8 +3598,8 @@

Visual Prompting in <fixed-case>LLM</fixed-case>s for Enhancing Emotion Recognition
QixuanZhangAustralian National University
ZhifengWang
- DylanZhang
- WenjiaNiu
+ DylanZhang
+ WenjiaNiu
SabrinaCaldwellAustralian National University
TomGedeon
YangLiu
@@ -3613,9 +3613,9 @@

<fixed-case>IDEAW</fixed-case>: Robust Neural Audio Watermarking with Invertible Dual-Embedding
PengchengLi
- XulongZhangPing An Technology (Shenzhen) Co., Ltd.
- JingXiaoPingan Group
- JianzongWangPingan Technology
+ XulongZhangPing An Technology (Shenzhen) Co., Ltd.
+ JingXiaoPingan Group
+ JianzongWangPingan Technology
4500-4511
The audio watermarking technique embeds messages into audio and accurately extracts messages from the watermarked audio. Traditional methods develop algorithms based on expert experience to embed watermarks into the time-domain or transform-domain of signals. With the development of deep neural networks, deep learning-based neural audio watermarking has emerged. Compared to traditional algorithms, neural audio watermarking achieves better robustness by considering various attacks during training. However, current neural watermarking methods suffer from low capacity and unsatisfactory imperceptibility. Additionally, the issue of watermark locating, which is extremely important and even more pronounced in neural audio watermarking, has not been adequately studied. In this paper, we design a dual-embedding watermarking model for efficient locating. We also consider the impact of the attack layer on the invertible neural network in robustness training, improving the model to enhance both its reasonableness and stability.
Experiments show that the proposed model, IDEAW, can withstand various attacks with higher capacity and more efficient locating ability compared to existing methods.
2024.emnlp-main.258
@@ -3628,8 +3628,8 @@
Yen-HaoHuang
Tsu-KengLiao
Didier FernandoSalazar Estrada
- RetnaniLatifah
- Yi-ShinChen
+ RetnaniLatifah
+ Yi-ShinChen
4512-4522
In multi-person communications, conflicts often arise. Each individual may have their own perspective, which can differ. Additionally, commonly referenced offensive datasets frequently neglect contextual information and are primarily constructed with a focus on intended offenses. This study suggests that conflicts are pivotal in revealing a broader range of human interactions, including instances of unintended offensive language. This paper proposes a conflict-based data collection method to utilize inter-conflict cues in multi-person communications. By focusing on specific cue posts within conversation threads, our proposed approach effectively identifies relevant instances for analysis. Detailed analyses are provided to showcase that the proposed approach efficiently gathers data on subtly offensive content. The experimental results indicate that incorporating elements of conflict into data collection not only significantly enhances the comprehensiveness and accuracy of detecting offensive language but also enriches our understanding of conflict dynamics in digital communication.
2024.emnlp-main.259

Outcome-Constrained Large Language Models for Countering Hate Speech
- LingziHongUniversity of North Texas
- PengchengLuoPeking University
+ LingziHongUniversity of North Texas
+ PengchengLuoPeking University
EduardoBlancoUniversity of Arizona
- XiaoyingSong
+ XiaoyingSong
4523-4536
Automatic counterspeech generation methods have been developed to assist efforts in combating hate speech. Existing research focuses on generating counterspeech with linguistic attributes such as being polite, informative, and intent-driven. However, the real impact of counterspeech in online environments is seldom considered. This study aims to develop methods for generating counterspeech constrained by conversation outcomes and evaluate their effectiveness. We experiment with large language models (LLMs) to incorporate into the text generation process two desired conversation outcomes: low conversation incivility and non-hateful hater reentry. Specifically, we experiment with instruction prompts, LLM finetuning, and LLM reinforcement learning (RL). Evaluation results show that our methods effectively steer the generation of counterspeech toward the desired outcomes. Our analyses, however, show that there are differences in the quality and style depending on the model.
2024.emnlp-main.260
@@ -3663,8 +3663,8 @@

Adaptive Immune-based Sound-Shape Code Substitution for Adversarial <fixed-case>C</fixed-case>hinese Text Attacks
AoWangChina University of Petroleum (East China)
XinghaoYangCollege of Control Science and Engineering
- ChenLi
- Bao-diLiu
+ ChenLi
+ Bao-diLiu
WeifengLiuChina University of Petroleum (East China)
4553-4565
Adversarial textual examples reveal the vulnerability of natural language processing (NLP) models. Most existing text attack methods are designed for English text, while the robust implementation of the second popular language, i.e., Chinese with 1 billion users, is greatly underestimated.
Although several Chinese attack methods have been presented, they either directly transfer from English attacks or adopt simple greedy search to optimize the attack priority, usually leading to unnatural sentences. To address these issues, we propose an adaptive Immune-based Sound-Shape Code (ISSC) algorithm for adversarial Chinese text attacks. Firstly, we leverage the Sound-Shape code to generate natural substitutions, which comprehensively integrate multiple Chinese features. Secondly, we employ adaptive immune algorithm (IA) to determine the replacement order, which can reduce the duplication of population to improve the search ability. Extensive experimental results validate the superiority of our ISSC in producing high-quality Chinese adversarial texts. Our code and data can be found in https://github.com/nohuma/chinese-attack-issc. @@ -3677,9 +3677,9 @@ Bootstrapped Policy Learning for Task-oriented Dialogue through Goal Shaping YangyangZhaoChangsha University of Science and Technology - BenNiu - MehdiDastaniUtrecht University - ShihanWangUtrecht University + BenNiu + MehdiDastaniUtrecht University + ShihanWangUtrecht University 4566-4580 Reinforcement learning shows promise in optimizing dialogue policies, but addressing the challenge of reward sparsity remains crucial. While curriculum learning offers a practical solution by strategically training policies from simple to complex, it hinges on the assumption of a gradual increase in goal difficulty to ensure a smooth knowledge transition across varied complexities. In complex dialogue environments without intermediate goals, achieving seamless knowledge transitions becomes tricky. This paper proposes a novel Bootstrapped Policy Learning (BPL) framework, which adaptively tailors progressively challenging subgoal curriculum for each complex goal through goal shaping, ensuring a smooth knowledge transition. Goal shaping involves goal decomposition and evolution, decomposing complex goals into subgoals with solvable maximum difficulty and progressively increasing difficulty as the policy improves. Moreover, to enhance BPL’s adaptability across various environments, we explore various combinations of goal decomposition and evolution within BPL, and identify two universal curriculum patterns that remain effective across different dialogue environments, independent of specific environmental constraints. By integrating the summarized curriculum patterns, our BPL has exhibited efficacy and versatility across four publicly available datasets with different difficulty levels. 2024.emnlp-main.263 @@ -3689,7 +3689,7 @@ <fixed-case>P</fixed-case>sy<fixed-case>GUARD</fixed-case>: An Automated System for Suicide Detection and Risk Assessment in Psychological Counseling HuachuanQiuWestlake University - LizhiMaWestlake University + LizhiMaWestlake University ZhenzhongLanWestlake University 4581-4607 As awareness of mental health issues grows, online counseling support services are becoming increasingly prevalent worldwide. Detecting whether users express suicidal ideation in text-based counseling services is crucial for identifying and prioritizing at-risk individuals. However, the lack of domain-specific systems to facilitate fine-grained suicide detection and corresponding risk assessment in online counseling poses a significant challenge for automated crisis intervention aimed at suicide prevention. In this paper, we propose PsyGUARD, an automated system for detecting suicide ideation and assessing risk in psychological counseling. 
To achieve this, we first develop a detailed taxonomy for detecting suicide ideation based on foundational theories. We then curate a large-scale, high-quality dataset called PsySUICIDE for suicide detection. To evaluate the capabilities of automated systems in fine-grained suicide detection, we establish a range of baselines. Subsequently, to assist automated services in providing safe, helpful, and tailored responses for further assessment, we propose to build a suite of risk assessment frameworks. Our study not only provides an insightful analysis of the effectiveness of automated risk assessment systems based on fine-grained suicide detection but also highlights their potential to improve mental health services on online counseling platforms. Code, data, and models are available at https://github.com/qiuhuachuan/PsyGUARD.
2024.emnlp-main.264

World to Code: Multi-modal Data Generation via Self-Instructed Compositional Captioning and Filtering
- JiacongWang
+ JiacongWang
BohongWuShanghai Jiao Tong University
HaiyongJiang
ZhouXun
- XinXiaoByteDance Inc.
+ XinXiaoByteDance Inc.
HaoyuanGuoByteDance Inc.
- JunXiaoUniversity of Chinese Academy of Sciences
+ JunXiaoUniversity of Chinese Academy of Sciences
4608-4623
Recent advances in Vision-Language Models (VLMs) and the scarcity of high-quality multi-modal alignment data have inspired numerous studies on synthetic VLM data generation. The conventional norm in VLM data construction uses a mixture of specialists in caption and OCR, or stronger VLM APIs and expensive human annotation. In this paper, we present World to Code (W2C), a meticulously curated multi-modal data construction pipeline that organizes the final generation output into a Python code format. The pipeline leverages the VLM itself to extract cross-modal information via different prompts and filter the generated outputs again via a consistency filtering strategy. Experiments have demonstrated the high quality of W2C by improving various existing visual question answering and visual grounding benchmarks across different VLMs. Further analysis also demonstrates that the new code parsing ability of VLMs presents better cross-modal equivalence than the commonly used detail caption ability. Our code is available at https://github.com/foundation-multimodal-models/World2Code.
2024.emnlp-main.265
@@ -3718,7 +3718,7 @@

<fixed-case>DVD</fixed-case>: Dynamic Contrastive Decoding for Knowledge Amplification in Multi-Document Question Answering
JingJin
HoufengWangPeking University
- HaoZhangNanyang Technological University
+ HaoZhangNanyang Technological University
XiaoguangLi
ZhijiangGuoUniversity of Cambridge
4624-4637
@@ -3777,7 +3777,7 @@
ZihanNiuUniversity of Science and Technology of China
ZheyongXie
ChaoZhang
- TongXuUniversity of Science and Technology of China
+ TongXuUniversity of Science and Technology of China
YangWang
EnhongChenUniversity of Science and Technology of China
4683-4702
@@ -3791,8 +3791,8 @@

<fixed-case>C</fixed-case>o<fixed-case>E</fixed-case>vol: Constructing Better Responses for Instruction Finetuning through Multi-Agent Cooperation
RenhaoLiUniversity of Macau
- MinghuanTanShenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
- Derek F.WongUniversity of Macau
+ MinghuanTanShenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
+ Derek F.WongUniversity of Macau
MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
4703-4721
In recent years, instruction fine-tuning (IFT) on large language models (LLMs) has garnered considerable attention to enhance model performance on unseen tasks. Attempts have been made on automatic construction and effective selection for IFT data. However, we posit that previous methods have not fully harnessed the potential of LLMs for enhancing data quality. The responses within IFT data could be further enhanced by leveraging the capabilities of LLMs themselves. In this paper, we propose CoEvol, an LLM-based multi-agent cooperation framework for the improvement of responses for instructions. To effectively refine the responses, we develop an iterative framework following a _debate-advise-edit-judge_ paradigm. A two-stage multi-agent debate strategy is further devised to ensure the diversity and reliability of editing suggestions within the framework. Empirically, models equipped with CoEvol outperform competitive baselines evaluated by MT-Bench and AlpacaEval, demonstrating its effectiveness in enhancing instruction-following capabilities for LLMs.
@@ -3806,7 +3806,7 @@

A Peek into Token Bias: Large Language Models Are Not Yet Genuine Reasoners
BowenJiangUniversity of Pennsylvania
YangxinyuXie
- ZhuoqunHao
+ ZhuoqunHao
XiaomengWang
TanwiMallickArgonne National Laboratory
Weijie JSuThe Wharton School, University of Pennsylvania
@@ -3833,7 +3833,7 @@

<fixed-case>M</fixed-case>u<fixed-case>M</fixed-case>ath-Code: Combining Tool-Use Large Language Models with Multi-perspective Data Augmentation for Mathematical Reasoning
- ShuoYin
+ ShuoYin
WeihaoYou
ZhilongJiTomorrow Advancing Life
GuoqiangZhongOcean University of China
@@ -3848,9 +3848,9 @@

Seeing the Forest through the Trees: Data Leakage from Partial Transformer Gradients
- WeijunLi
- QiongkaiXuMacquarie University
- MarkDrasMacquarie University
+ WeijunLi
+ QiongkaiXuMacquarie University
+ MarkDrasMacquarie University
4786-4798
Recent studies have shown that distributed machine learning is vulnerable to gradient inversion attacks, where private training data can be reconstructed by analyzing the gradients of the models shared in training. Previous attacks established that such reconstructions are possible using gradients from all parameters in the entire models.
However, we hypothesize that most of the involved modules, or even their sub-modules, are at risk of training data leakage, and we validate such vulnerabilities in various intermediate layers of language models. Our extensive experiments reveal that gradients from a single Transformer layer, or even a single linear component with 0.54% parameters, are susceptible to training data leakage. Additionally, we show that applying differential privacy on gradients during training offers limited protection against the novel vulnerability of data disclosure. 2024.emnlp-main.275 @@ -3862,11 +3862,11 @@ <fixed-case>RWKV</fixed-case>-<fixed-case>CLIP</fixed-case>: A Robust Vision-Language Representation Learner TianchengGu KaichengYang - XiangAndeepglint + XiangAndeepglint ZiyongFengDeepGlint DongnanLiu - WeidongCaiThe University of Sydney - JiankangDengImperial College London + WeidongCaiThe University of Sydney + JiankangDengImperial College London 4799-4812 Contrastive Language-Image Pre-training (CLIP) has significantly improved performance in various vision-language tasks by expanding the dataset with image-text pairs obtained from the web. This paper further explores CLIP from the perspectives of data and model architecture. To mitigate the impact of the noise data and enhance the quality of large-scale image-text data crawled from the internet, we introduce a diverse description generation framework that can leverage Large Language Models (LLMs) to combine and refine information from web-based image-text pairs, synthetic captions, and detection tags. Additionally, we propose RWKV-CLIP, the first RWKV-driven vision-language representation learning model that combines the effective parallel training of transformers with the efficient inference of RNNs. Extensive experiments across different model scales and pre-training datasets demonstrate that RWKV-CLIP is a robust vision-language representation learner and it achieves state-of-the-art performance across multiple downstream tasks, including linear probing, zero-shot classification, and zero-shot image-text retrieval. To facilitate future research, the code and pre-trained models are released at https://github.com/deepglint/RWKV-CLIP. 2024.emnlp-main.276 @@ -3889,7 +3889,7 @@ Using Language Models to Disambiguate Lexical Choices in Translation JoshBarua SanjaySubramanianUniversity of California, Berkeley - KayoYinUniversity of California, Berkeley + KayoYinUniversity of California, Berkeley AlaneSuhrUniversity of California, Berkeley 4837-4848 In translation, a concept represented by a single word in a source language can have multiple variations in a target language. The task of lexical selection requires using context to identify which variation is most appropriate for a source text. We work with native speakers of nine languages to create DTAiLS, a dataset of 1,377 sentence pairs that exhibit cross-lingual concept variation when translating from English. We evaluate recent LLMs and neural machine translation systems on DTAiLS, with the best-performing model, GPT-4, achieving from 67 to 85% accuracy across languages. Finally, we use language models to generate English rules describing target-language concept variations. Providing weaker models with high-quality lexical rules improves accuracy substantially, in some cases reaching or outperforming GPT-4. @@ -3902,8 +3902,8 @@ How Does the Disclosure of <fixed-case>AI</fixed-case> Assistance Affect the Perceptions of Writing? 
ZhuoyanLiPurdue University ChenLiang - JingPengUniversity of Connecticut - MingYinPurdue University + JingPengUniversity of Connecticut + MingYinPurdue University 4849-4868 Recent advances in generative AI technologies like large language models have boosted the incorporation of AI assistance in writing workflows, leading to the rise of a new paradigm of human-AI co-creation in writing. To understand how people perceive writings that are produced under this paradigm, in this paper, we conduct an experimental study to understand whether and how the disclosure of the level and type of AI assistance in the writing process would affect people’s perceptions of the writing on various aspects, including their evaluation on the quality of the writing, and their ranking of different writings. Our results suggest that disclosing the AI assistance in the writing process, especially if AI has provided assistance in generating new content, decreases the average quality ratings for both argumentative essays and creative stories. This decrease in the average quality ratings often comes with an increased level of variations in different individuals’ quality evaluations of the same writing. Indeed, factors such as an individual’s writing confidence and familiarity with AI writing assistants are shown to moderate the impact of AI assistance disclosure on their writing quality evaluations. We also find that disclosing the use of AI assistance may significantly reduce the proportion of writings produced with AI’s content generation assistance among the top-ranked writings. 2024.emnlp-main.279 @@ -3912,11 +3912,11 @@ An Unsupervised Approach to Achieve Supervised-Level Explainability in Healthcare Records - JoakimEdinUniversity of Copenhagen - MariaMaistroUniversity of Copenhagen + JoakimEdinUniversity of Copenhagen + MariaMaistroUniversity of Copenhagen LarsMaaløeTechnical University of Denmark - LasseBorgholt - Jakob DrachmannHavtornCorti + LasseBorgholt + Jakob DrachmannHavtornCorti TuukkaRuotsaloLappeenranta University of Technology, University of Copenhagen and University of Helsinki 4869-4890 Electronic healthcare records are vital for patient safety as they document conditions, plans, and procedures in both free text and medical codes. Language models have significantly enhanced the processing of such records, streamlining workflows and reducing manual data entry, thereby saving healthcare providers significant resources. However, the black-box nature of these models often leaves healthcare professionals hesitant to trust them. State-of-the-art explainability methods increase model transparency but rely on human-annotated evidence spans, which are costly. In this study, we propose an approach to produce plausible and faithful explanations without needing such annotations. We demonstrate on the automated medical coding task that adversarial robustness training improves explanation plausibility and introduce AttInGrad, a new explanation method superior to previous ones. By combining both contributions in a fully unsupervised setup, we produce explanations of comparable quality, or better, to that of a supervised approach. We release our code and model weights. 
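The AttInGrad method named in the abstract above combines attention weights with input gradients to score token importance. A minimal Python sketch of that general attention-times-gradient idea follows; the model choice, head/layer averaging, and normalization are illustrative assumptions for the example, not the paper's released implementation.

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.eval()

enc = tok("Patient admitted with acute chest pain.", return_tensors="pt")
# Detach the embeddings into a leaf tensor so .grad is populated on backward().
emb = model.get_input_embeddings()(enc["input_ids"]).detach().requires_grad_(True)
out = model(inputs_embeds=emb, attention_mask=enc["attention_mask"],
            output_attentions=True)
out.logits[0].max().backward()

grad_attr = emb.grad[0].norm(dim=-1)                # gradient magnitude per token
attn_attr = out.attentions[-1][0].mean(dim=(0, 1))  # mean attention each token receives
scores = grad_attr * attn_attr                      # attention-times-gradient combination
scores = scores / scores.sum()
print(list(zip(tok.convert_ids_to_tokens(enc["input_ids"][0]), scores.tolist())))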
@@ -3942,7 +3942,7 @@ <fixed-case>EVEDIT</fixed-case>: Event-based Knowledge Editing for Deterministic Knowledge Propagation JiatengLiu PengfeiYuBoson AI and University of Illinois at Urbana-Champaign - YujiZhang + YujiZhang ShaLiUniversity of Illinois, Urbana Champaign ZixuanZhang RuhiSarikaya @@ -3957,8 +3957,8 @@ Modeling Nonnative Sentence Processing with <fixed-case>L</fixed-case>2 Language Models - TatsuyaAoyama - NathanSchneiderGeorgetown University + TatsuyaAoyama + NathanSchneiderGeorgetown University 4927-4940 We study LMs pretrained sequentially on two languages (“L2LMs”) for modeling nonnative sentence processing. In particular, we pretrain GPT2 on 6 different first languages (L1s), followed by English as the second language (L2). We examine the effect of the choice of pretraining L1 on the model’s ability to predict human reading times, evaluating on English readers from a range of L1 backgrounds. Experimental results show that, while all of the LMs’ word surprisals improve prediction of L2 reading times, especially for human L1s distant from English, there is no reliable effect of the choice of L2LM’s L1. We also evaluate the learning trajectory of a monolingual English LM: for predicting L2 as opposed to L1 reading, it peaks much earlier and immediately falls off, possibly mirroring the difference in proficiency between the native and nonnative populations. Lastly, we provide examples of L2LMs’ surprisals, which could potentially generate hypotheses about human L2 reading. 2024.emnlp-main.283 @@ -3968,9 +3968,9 @@ From the Least to the Most: Building a Plug-and-Play Visual Reasoner via Data Synthesis ChuanqiChengRenmin University of China - JianGuan - WeiWuAnt Research - RuiYanRenmin University of China + JianGuan + WeiWuAnt Research + RuiYanRenmin University of China 4941-4957 We explore multi-step reasoning in vision-language models (VLMs). The problem is challenging, as reasoning data consisting of multiple steps of visual and language processing are barely available. To overcome the challenge, we first introduce a least-to-most visual reasoning paradigm, which interleaves steps of decomposing a question into sub-questions and invoking external tools for resolving sub-questions. Based on the paradigm, we further propose a novel data synthesis approach that can automatically create questions and multi-step reasoning paths for an image in a bottom-up manner. Our approach divides the complex synthesis task into a few simple sub-tasks, and (almost entirely) relies on open-sourced models to accomplish the sub-tasks. Therefore, the entire synthesis process is reproducible and cost-efficient, and the synthesized data is quality guaranteed. With the approach, we construct 50k visual reasoning examples. Then, we develop a visual reasoner through supervised fine-tuning, which is capable of generally enhancing the reasoning abilities of a wide range of existing VLMs in a plug-and-play fashion. Extensive experiments indicate that the visual reasoner can consistently and significantly improve four VLMs on four VQA benchmarks. 2024.emnlp-main.284 @@ -3982,7 +3982,7 @@ ShadiIskander SofiaTolmachAmazon OriShapiraOriginAI - NachshonCohenAmazon + NachshonCohenAmazon ZoharKarnintii 4958-4976 Training large language models (LLMs) for external tool usage is a rapidly expanding field, with recent research focusing on generating synthetic data to address the shortage of available data. 
However, the absence of systematic data quality checks poses complications for properly training and testing models. To that end, we propose two approaches for assessing the reliability of data for training LLMs to use external tools. The first approach uses intuitive, human-defined correctness criteria. The second approach uses a model-driven assessment with in-context evaluation. We conduct a thorough evaluation of data quality on two popular benchmarks, followed by an extrinsic evaluation that showcases the impact of data quality on model performance. Our results demonstrate that models trained on high-quality data outperform those trained on unvalidated data, even when trained with a smaller quantity of data. These findings empirically support the significance of assessing and ensuring the reliability of training data for tool-using LLMs.

@@ -3996,10 +3996,10 @@
 YuangLiHuawei Technologies Ltd.
 MinZhangHuawei Technologies Ltd.
 MengxinRen
- XiaosongQiaoHuawei Technologies Ltd.
+ XiaosongQiaoHuawei Technologies Ltd.
 MiaomiaoMa
 DaimengWei
- HaoYang
+ HaoYang
 4977-4983
 Audio deepfake detection (ADD) is essential for preventing the misuse of synthetic voices that may infringe on personal rights and privacy. Recent zero-shot text-to-speech (TTS) models pose higher risks as they can clone voices with a single utterance. However, the existing ADD datasets are outdated, leading to suboptimal generalization of detection models. In this paper, we construct a new cross-domain ADD dataset comprising over 300 hours of speech data that is generated by five advanced zero-shot TTS models. To simulate real-world scenarios, we employ diverse attack methods and audio prompts from different datasets. Experiments show that, through novel attack-augmented training, the Wav2Vec2-large and Whisper-medium models achieve equal error rates of 4.1% and 6.5% respectively. Additionally, we demonstrate our models’ outstanding few-shot ADD ability by fine-tuning with just one minute of target-domain data. Nonetheless, neural codec compressors greatly affect the detection accuracy, necessitating further research. Our dataset is publicly available (https://github.com/leolya/CD-ADD).
 2024.emnlp-main.286

@@ -4008,9 +4008,9 @@
 <fixed-case>M</fixed-case>a<fixed-case>PPER</fixed-case>: Multimodal Prior-guided Parameter Efficient Tuning for Referring Expression Comprehension
- TingLiu
+ TingLiu
 ZunnanXuTSINGHUA UNIVERSITY
- YueHuNational University of Defense Technology
+ YueHuNational University of Defense Technology
 LiangtaoShi
 ZhiqiangWangIFLYTEK CO.LTD.
 QuanjunYinNational University of Defense Technology

@@ -4024,7 +4024,7 @@
 Hierarchical Deconstruction of <fixed-case>LLM</fixed-case> Reasoning: A Graph-Based Framework for Analyzing Knowledge Utilization
 MiyoungKoKorea Advanced Institute of Science and Technology
 Sue HyunParkKorea Advanced Institute of Science & Technology
- JoonsukParkUniversity of Richmond
+ JoonsukParkUniversity of Richmond
 MinjoonSeoTwelve Labs and Korea Advanced Institute of Science and Technology
 4995-5027
 Despite the advances in large language models (LLMs), how they use their knowledge for reasoning is not yet well understood. In this study, we propose a method that deconstructs complex real-world questions into a graph, representing each question as a node with predecessors of background knowledge needed to solve the question.
We develop the DepthQA dataset, deconstructing questions into three depths: (i) recalling conceptual knowledge, (ii) applying procedural knowledge, and (iii) analyzing strategic knowledge. Based on a hierarchical graph, we quantify forward discrepancy, a discrepancy in LLM performance on simpler sub-problems versus complex questions. We also measure backward discrepancy where LLMs answer complex questions but struggle with simpler ones. Our analysis shows that smaller models exhibit more discrepancies than larger models. Distinct patterns of discrepancies are observed across model capacity and possibility of training data memorization. Additionally, guiding models from simpler to complex questions through multi-turn interactions improves performance across model sizes, highlighting the importance of structured intermediate steps in knowledge reasoning. This work enhances our understanding of LLM reasoning and suggests ways to improve their problem-solving abilities. @@ -4041,7 +4041,7 @@ WenshuaiHuoHarbin Institute of Technology ChengpengFu TingLiuHarbin Institute of Technology - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 5028-5041 Large Language models (LLMs) have exhibited remarkable abilities in understanding complex texts, offering a promising path towards human-like translation performance. However, this study reveals the misalignment between the translation-specific understanding and the general understanding inside LLMs. This understanding misalignment leads to LLMs mistakenly or literally translating some complicated concepts that they accurately comprehend in the general scenarios (e.g., QA). To align the translation-specific understanding to the general one, we propose a novel translation process, DUAT (Difficult words Understanding Aligned Translation), explicitly incorporating the general understanding on the complicated content incurring inconsistent understandings to guide the translation. Specifically, DUAT performs cross-lingual interpretation for the difficult-to-translate words and enhances the translation with the generated interpretations. Furthermore, we reframe the external tools to improve DUAT in detecting difficult words and generating helpful interpretations. We conduct experiments on the self-constructed benchmark Challenge-WMT, consisting of samples that are prone to mistranslation. Human evaluation results on high-resource and low-resource language pairs indicate that DUAT significantly facilitates the understanding alignment, which improves the translation quality (up to +3.85 COMET) and reduces translation literalness by -25% ∼ -51%. 
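The DUAT process summarized above is essentially a three-step prompting pipeline: detect difficult-to-translate words, generate cross-lingual interpretations for them, then translate with those interpretations as guidance. A hedged Python sketch of such a pipeline follows; call_llm and all prompt wordings are hypothetical stand-ins, not the paper's implementation.

from typing import Callable, List

def duat_translate(source: str, src_lang: str, tgt_lang: str,
                   call_llm: Callable[[str], str]) -> str:
    # Step 1: ask the model which words in the sentence are prone to mistranslation.
    difficult = [w for w in call_llm(
        f"List the words in this {src_lang} sentence that are hard to translate "
        f"into {tgt_lang}, one per line:\n{source}").splitlines() if w.strip()]

    # Step 2: generate a cross-lingual interpretation for each difficult word in context.
    glosses: List[str] = [
        call_llm(f"In the sentence '{source}', explain in {tgt_lang} what '{w}' means.")
        for w in difficult]

    # Step 3: translate, conditioning the model on the generated interpretations.
    notes = "\n".join(glosses)
    return call_llm(
        f"Translate the following {src_lang} sentence into {tgt_lang}, using these "
        f"word interpretations as guidance:\n{notes}\n\nSentence: {source}")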
2024.emnlp-main.289

@@ -4053,7 +4053,7 @@
 MohamadBalloutUniversity of Osnabrück
 AnneDedertUniversität Osnabrück
 Nohayr MuhammadAbdelmoneim
- UlfKrumnackInstitute of Cognitive Science, Osnabrück University, Universität Osnabrück
+ UlfKrumnackInstitute of Cognitive Science, Osnabrück University, Universität Osnabrück
 GuntherHeidemann
 Kai-UweKühnberger
 5042-5059

@@ -4079,10 +4079,10 @@
 YiboWang
 WentingZhao
 ZhongfenDengUniversity of Illinois, Chicago
- ShuaiqiLiu
- RenzeLouPennsylvania State University
+ ShuaiqiLiu
+ RenzeLouPennsylvania State University
 Henry PengZouUniversity of Illinois at Chicago
- PranavNarayanan Venkit
+ PranavNarayanan Venkit
 NanZhang
 MukundSrinath
 Haoran RanranZhang

@@ -4096,24 +4096,24 @@
 CongyingXiaSalesForce.com
 ChenXingSalesForce.com
 ChengJiayangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology
- ZhaoweiWangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology
+ ZhaoweiWangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology
 YingSu
- Raj SanjayShahGeorgia Institute of Technology
+ Raj SanjayShahGeorgia Institute of Technology
 RuohaoGuoGeorgia Institute of Technology
 JingGu
 HaoranLi
 KangdaWeiTexas A&M University - College Station
- ZihaoWang
- LuChengUniversity of Illinois at Chicago
- SurangikaRanathungaMassey University
+ ZihaoWang
+ LuChengUniversity of Illinois at Chicago
+ SurangikaRanathungaMassey University
 MengFangUniversity of Liverpool and Eindhoven University of Technology
- JieFuShanghai Artificial Intelligence Laboratory
+ JieFuShanghai Artificial Intelligence Laboratory
 FeiLiuEmory University
 RuihongHuangTexas A&M University
 EduardoBlancoUniversity of Arizona
 YixinCaoFudan University
 RuiZhangPennsylvania State University
- Philip S.YuUniversity of Illinois, Chicago
+ Philip S.YuUniversity of Illinois, Chicago
 WenpengYinPennsylvania State University
 5081-5099
 Claim: This work is not advocating the use of LLMs for paper (meta-)reviewing. Instead, we present a comparative analysis to identify and distinguish LLM activities from human activities. Two research goals: i) Enable better recognition of instances when someone implicitly uses LLMs for reviewing activities; ii) Increase community awareness that LLMs, and AI in general, are currently inadequate for performing tasks that require a high level of expertise and nuanced judgment. This work is motivated by two key trends. On one hand, large language models (LLMs) have shown remarkable versatility in various generative tasks such as writing, drawing, and question answering, significantly reducing the time required for many routine tasks. On the other hand, researchers, whose work is not only time-consuming but also highly expertise-demanding, face increasing challenges as they have to spend more time reading, writing, and reviewing papers. This raises the question: how can LLMs potentially assist researchers in alleviating their heavy workload? This study focuses on the topic of LLMs as NLP Researchers, particularly examining the effectiveness of LLMs in assisting paper (meta-)reviewing and its recognizability. To address this, we constructed the ReviewCritique dataset, which includes two types of information: (i) NLP papers (initial submissions rather than camera-ready) with both human-written and LLM-generated reviews, and (ii) each review comes with “deficiency” labels and corresponding explanations for individual segments, annotated by experts.
Using ReviewCritique, this study explores two threads of research questions: (i) “LLMs as Reviewers”, how do reviews generated by LLMs compare with those written by humans in terms of quality and distinguishability? (ii) “LLMs as Metareviewers”, how effectively can LLMs identify potential issues, such as Deficient or unprofessional review segments, within individual paper reviews? To our knowledge, this is the first work to provide such a comprehensive analysis.

@@ -4123,14 +4123,14 @@
 Academics Can Contribute to Domain-Specialized Language Models
- MarkDredzeDepartment of Computer Science, Whiting School of Engineering
+ MarkDredzeDepartment of Computer Science, Whiting School of Engineering
 Genta IndraWinataCapital One
 PrabhanjanKambadur
- ShijieWuAnthropic
+ ShijieWuAnthropic
 OzanIrsoyBloomberg
 StevenLu
 VadimDabravolskiBloomberg
- David SRosenbergBloomberg
+ David SRosenbergBloomberg
 SebastianGehrmannBloomberg
 5100-5110
 Commercially available models dominate academic leaderboards. While impressive, this has concentrated research on creating and adapting general-purpose models to improve NLP leaderboard standings for large language models. However, leaderboards collect many individual tasks and general-purpose models often underperform in specialized domains; domain-specific or adapted models yield superior results. This focus on large general-purpose models excludes many academics and draws attention away from areas where they can make important contributions. We advocate for a renewed focus on developing and evaluating domain- and task-specific models, and highlight the unique role of academics in this endeavor.

@@ -4142,7 +4142,7 @@
 Beyond Reference: Evaluating High Quality Translations Better than Human References
 KeonwoongNohHanyang University
 SeokjinOhHanyang University
- WoohwanJungHanyang University
+ WoohwanJungHanyang University
 5111-5127
 In Machine Translation (MT) evaluations, the conventional approach is to compare a translated sentence against its human-created reference sentence. MT metrics provide an absolute score (e.g., from 0 to 1) to a candidate sentence based on the similarity with the reference sentence. Thus, existing MT metrics give the maximum score to the reference sentence. However, this approach overlooks the potential for a candidate sentence to exceed the reference sentence in terms of quality. In particular, recent advancements in Large Language Models (LLMs) have highlighted this issue, as LLM-generated sentences often exceed the quality of human-written sentences. To address the problem, we introduce the Residual score Metric (ResuMe), which evaluates the relative quality between reference and candidate sentences. ResuMe assigns a positive score to candidate sentences that outperform their reference sentences, and a negative score when they fall short. By adding the residual scores from ResuMe to the absolute scores from MT metrics, it becomes possible to allocate higher scores to candidate sentences than what reference sentences receive from MT metrics. Experimental results demonstrate that ResuMe enhances the alignments between MT metrics and human judgments both at the segment-level and the system-level.
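The ResuMe fusion described above amounts to adding a signed residual score, computed by comparing candidate and reference directly, onto a conventional metric's absolute score. A toy Python sketch of that arithmetic follows; both scoring callables are hypothetical stand-ins, not the paper's trained models.

def fused_score(candidate: str, reference: str, metric_score, residual_score) -> float:
    absolute = metric_score(candidate, reference)    # conventional metric, maximal at the reference
    residual = residual_score(candidate, reference)  # positive iff the candidate beats the reference
    return absolute + residual

# Toy stand-ins so the sketch runs end to end: word overlap as the "metric",
# a fake residual based on vocabulary size. A candidate judged better than the
# reference can thus score above the reference's own metric ceiling of 1.0.
overlap = lambda c, r: len(set(c.split()) & set(r.split())) / max(len(set(r.split())), 1)
toy_residual = lambda c, r: 0.1 if len(set(c.split())) > len(set(r.split())) else -0.05
print(fused_score("the quick brown fox", "the brown fox", overlap, toy_residual))  # 1.1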
2024.emnlp-main.294 @@ -4151,7 +4151,7 @@ Unveiling the Lexical Sensitivity of <fixed-case>LLM</fixed-case>s: Combinatorial Optimization for Prompt Enhancement - PengweiZhan + PengweiZhan ZhenXuInstitute of Information Engineering, CAS QianTanInstitute of Information Engineering, Chinese Academy of Sciences JieSongInstitute of Information Engineering, Chinese Academy of Sciences @@ -4164,29 +4164,29 @@ <fixed-case>SEAC</fixed-case>rowd: A Multilingual Multimodal Data Hub and Benchmark Suite for <fixed-case>S</fixed-case>outheast <fixed-case>A</fixed-case>sian Languages - HolyLoveniaAI Singapore - RahmadMahendraRoyal Melbourne Institute of Technology and Universitas Indonesia + HolyLoveniaAI Singapore + RahmadMahendraRoyal Melbourne Institute of Technology and Universitas Indonesia Salsabil MaulanaAkbarInstitut Teknologi Bandung Lester James V.MirandaAllen Institute for Artificial Intelligence - JenniferSantosoRevComm, Inc. + JenniferSantosoRevComm, Inc. ElyanahAcoAteneo de Manila University - AkhdanFadhilahTohoku University + AkhdanFadhilahTohoku University JonibekMansurov - Joseph MarvinImperialUniversity of Bath - Onno P.Kampman + Joseph MarvinImperialUniversity of Bath + Onno P.Kampman Joel Ruben AntonyMonizApple Muhammad Ravi ShulthanHabibiUniversitas Indonesia FrederikusHudi RaileyMontalanAI Singapore and Ateneo de Manila University RyanIgnatiusInstitut Teknologi Bandung - Joanito AgiliLopoUniversitas Gadjah Mada + Joanito AgiliLopoUniversitas Gadjah Mada WilliamNixon - Börje F.KarlssonBeijing Academy of Artificial Intelligence (BAAI) + Börje F.KarlssonBeijing Academy of Artificial Intelligence (BAAI) JamesJayaGeorgia Institute of Technology RyanditoDiandaru YuzeGaoA*STAR PatrickAmadeus - BinWangI2R, A*STAR + BinWangI2R, A*STAR Jan Christian BlaiseCruzMohamed bin Zayed University of Artificial Intelligence ChenxiWhitehouseUniversity of Cambridge Ivan HalimParmonanganQueensland University of Technology @@ -4197,8 +4197,8 @@ Sonny LazuardiHermawan Dan JohnVelascoSamsung Muhammad Dehan AlKautsar - Willy FitraHendriaLunit Inc. - YasminMoslemBering Lab + Willy FitraHendriaLunit Inc. + YasminMoslemBering Lab NoahFlynnAmazon Muhammad FaridAdilazuarda HaochenLiNanyang Technological University @@ -4206,15 +4206,15 @@ R.Damanhuri ShuoSun Muhammad RezaQorib - AmirbekDjanibekov + AmirbekDjanibekov Wei QiLeongAI Singapore Quyet V.Do NiklasMuennighoffStanford University, Contextual AI and Allen Institute for Artificial Intelligence - TanradaPansuwan + TanradaPansuwan Ilham FirdausiPutra YanXuThe Hong Kong University of Science and Technology Tai NgeeChiaNational University of Singapore - AyuPurwariantiInstitut Teknologi Bandung + AyuPurwariantiInstitut Teknologi Bandung SebastianRuderCohere and Google WilliamTjhiAI Singapore PeeratLimkonchotiwatAI Singapore @@ -4224,7 +4224,7 @@ RuochenZhangBrown University FajriKotoMohamed bin Zayed University of Artificial Intelligence Zheng-XinYongBrown University - SamuelCahyawijaya + SamuelCahyawijaya 5155-5203 Southeast Asia (SEA) is a region rich in linguistic diversity and cultural variety, with over 1,300 indigenous languages and a population of 671 million people. However, prevailing AI models suffer from a significant lack of representation of texts, images, and audio datasets from SEA, compromising the quality of AI models for SEA languages. Evaluating models for SEA languages is challenging due to the scarcity of high-quality datasets, compounded by the dominance of English training data, raising concerns about potential cultural misrepresentation. 
To address these challenges, through a collaborative movement, we introduce SEACrowd, a comprehensive resource center that fills the resource gap by providing standardized corpora in nearly 1,000 SEA languages across three modalities. Through our SEACrowd benchmarks, we assess the quality of AI models on 36 indigenous languages across 13 tasks, offering valuable insights into the current AI landscape in SEA. Furthermore, we propose strategies to facilitate greater AI advancements, maximizing potential utility and resource equity for the future of AI in Southeast Asia. 2024.emnlp-main.296 @@ -4235,10 +4235,10 @@ Induct-Learn: Short Phrase Prompting with Instruction Induction - Po-ChunChenNational Taiwan University + Po-ChunChenNational Taiwan University Sheng-LunWeiDepartment of computer science and informational engineering, National Taiwan University - Hen-HsenHuangInstitute of Information Science, Academia Sinica - Hsin-HsiChenNational Taiwan University + Hen-HsenHuangInstitute of Information Science, Academia Sinica + Hsin-HsiChenNational Taiwan University 5204-5231 Large Language Models (LLMs) have demonstrated capability in “instruction induction,” generating instructions from demonstrations (input-output pairs). However, existing methods often rely on large datasets or numerous examples, which is impractical and costly in real-world scenarios. In this work, we propose a low-cost, task-level framework called Induct-Learn. It induces pseudo instructions from a few demonstrations and a short phrase, adding a CoT process into existing demonstrations. When encountering new problems, the learned pseudo instructions and demonstrations with the pseudo CoT process can be combined into a prompt to guide the LLM’s problem-solving process. We validate our approach on the BBH-Induct and Evals-Induct datasets, and the results show that the Induct-Learn framework outperforms state-of-the-art methods. We also exhibit cross-model adaptability and achieve superior performance at a lower cost compared to existing methods. 2024.emnlp-main.297 @@ -4250,9 +4250,9 @@ Multi-Granularity History and Entity Similarity Learning for Temporal Knowledge Graph Reasoning ShiMingcong ChunjiangZhuUniversity of North Carolina Greensboro - DetianZhangSoochow University - ShitingWenNingboTech University - LiQingThe Hong Kong Polytechnic University, Hong Kong Polytechnic University and Hong Kong Polytechnic University + DetianZhangSoochow University + ShitingWenNingboTech University + LiQingThe Hong Kong Polytechnic University, Hong Kong Polytechnic University and Hong Kong Polytechnic University 5232-5243 2024.emnlp-main.298 mingcong-etal-2024-multi @@ -4261,9 +4261,9 @@ <fixed-case>LUQ</fixed-case>: Long-text Uncertainty Quantification for <fixed-case>LLM</fixed-case>s CaiqiZhang - FangyuLiuGoogle DeepMind - MarcoBasaldellaAmazon - NigelCollierUniversity of Cambridge + FangyuLiuGoogle DeepMind + MarcoBasaldellaAmazon + NigelCollierUniversity of Cambridge 5244-5262 Large Language Models (LLMs) have demonstrated remarkable capability in a variety of NLP tasks. However, LLMs are also prone to generate nonfactual content. Uncertainty Quantification (UQ) is pivotal in enhancing our understanding of a model’s confidence on its generation, thereby aiding in the mitigation of nonfactual outputs. Existing research on UQ predominantly targets short text generation, typically yielding brief, word-limited responses. However, real-world applications frequently necessitate much longer responses. 
Our study first highlights the limitations of current UQ methods in handling long text generation. We then introduce Luq and its two variations, a series of novel sampling-based UQ approaches specifically designed for long text. Our findings reveal that Luq outperforms existing baseline methods in correlating with the model’s factuality scores (negative coefficient of -0.85 observed for Gemini Pro). To further improve the factuality of LLM responses, we propose Luq-Ensemble, a method that ensembles responses from multiple models and selects the response with the lowest uncertainty. The ensembling method greatly improves the response factuality upon the best standalone LLM. 2024.emnlp-main.299 @@ -4289,7 +4289,7 @@ Scaling Synthetic Logical Reasoning Datasets with Context-Sensitive Declarative Grammars - DamienSileoINRIA + DamienSileoINRIA 5275-5283 Logical reasoning remains a challenge for natural language processing, but it can be improved by training language models to mimic theorem provers on procedurally generated problems. Previous work used domain-specific proof generation algorithms, which biases reasoning toward specific proof traces and limits auditability and extensibility. We present a simpler and more general declarative framework with flexible context-sensitive rules binding multiple languages (specifically, simplified English and the TPTP theorem-proving language). We construct first-order logic problems by selecting up to 32 premises and one hypothesis. We demonstrate that using semantic constraints during generation and careful English verbalization of predicates enhances logical reasoning without hurting natural English tasks. Using relatively small DeBERTa-v3 models, we achieve state-of-the-art accuracy on the FOLIO human-authored logic dataset, surpassing GPT-4 in accuracy with or without an external solver by 12%. 2024.emnlp-main.301 @@ -4300,9 +4300,9 @@ Improving Spoken Language Modeling with Phoneme Classification: A Simple Fine-tuning Approach - MaximePoliEcole Normale Supérieure – PSL - EmmanuelChemlaCNRS - EmmanuelDupouxEHESS + MaximePoliEcole Normale Supérieure – PSL + EmmanuelChemlaCNRS + EmmanuelDupouxEHESS 5284-5292 Recent progress in Spoken Language Modeling has shown that learning language directly from speech is feasible. Generating speech through a pipeline that operates at the text level typically loses nuances, intonations, and non-verbal vocalizations. Modeling directly from speech opens up the path to more natural and expressive systems. On the other hand, speech-only systems require up to three orders of magnitude more data to catch up to their text-based counterparts in terms of their semantic abilities. We show that fine-tuning speech representation models on phoneme classification leads to more context-invariant representations, and language models trained on these units achieve comparable lexical comprehension to ones trained on hundred times more data. 2024.emnlp-main.302 @@ -4311,10 +4311,10 @@ Safely Learning with Private Data: A Federated Learning Framework for Large Language Model - Jia-YingZheng + Jia-YingZheng HainanZhang LingxiangWang - WangjieQiuBeihang University + WangjieQiuBeihang University Hong-WeiZheng Zhi-MingZheng 5293-5306 @@ -4342,7 +4342,7 @@ How Does the Textual Information Affect the Retrieval of Multimodal In-Context Learning? 
YangLuo
- ZangweiZhengNational University of Singapore
+ ZangweiZhengNational University of Singapore
 ZiruiZhuNational University of Singapore
 YangYouNational University of Singapore
 5321-5335

@@ -4356,7 +4356,7 @@
 Shirley AnugrahHayatiUniversity of Minnesota - Twin Cities
 MinhwaLeeUniversity of Massachusetts at Lowell
 DheerajRajagopal
- DongyeopKangUniversity of Minnesota
+ DongyeopKangUniversity of Minnesota
 5336-5366
 Collecting diverse human opinions is costly and challenging. This leads to a recent trend in exploiting large language models (LLMs) for generating diverse data for potential scalable and efficient solutions. However, the extent to which LLMs can generate diverse perspectives on subjective topics is still unclear. In this study, we explore LLMs’ capacity of generating diverse perspectives and rationales on subjective topics such as social norms and argumentative texts. We introduce the problem of extracting maximum diversity from LLMs. Motivated by how humans form opinions based on values, we propose a criteria-based prompting technique to ground diverse opinions. To see how far we can extract diverse perspectives from LLMs, or called diversity coverage, we employ a step-by-step recall prompting to generate more outputs from the model iteratively. Our methods, applied to various tasks, show that LLMs can indeed produce diverse opinions according to the degree of task subjectivity. We also find that LLMs’ performance at extracting maximum diversity is on par with humans.
 2024.emnlp-main.306

@@ -4365,7 +4365,7 @@
 <fixed-case>EXPLORA</fixed-case>: Efficient Exemplar Subset Selection for Complex Reasoning
- KiranPurohitIndian Institute of Technology Kharagpur
+ KiranPurohitIndian Institute of Technology Kharagpur
 VenkteshV
 RaghuramDevalla
 Krishna MohanYerragorla

@@ -4383,7 +4383,7 @@
 An <fixed-case>LLM</fixed-case> Feature-based Framework for Dialogue Constructiveness Assessment
 LexinZhou
 YoumnaFarag
- AndreasVlachosUniversity of Cambridge
+ AndreasVlachosUniversity of Cambridge
 5389-5409
 Research on dialogue constructiveness assessment focuses on (i) analysing conversational factors that influence individuals to take specific actions, win debates, change their perspectives or broaden their open-mindedness and (ii) predicting constructiveness outcomes following dialogues for such use cases. These objectives can be achieved by training either interpretable feature-based models (which often involve costly human annotations) or neural models such as pre-trained language models (which have empirically shown higher task accuracy but lack interpretability). In this paper we propose an LLM feature-based framework for dialogue constructiveness assessment that combines the strengths of feature-based and neural approaches, while mitigating their downsides. The framework first defines a set of dataset-independent and interpretable linguistic features, which can be extracted by both prompting an LLM and simple heuristics. Such features are then used to train LLM feature-based models. We apply this framework to three datasets of dialogue constructiveness and find that our LLM feature-based models outperform or perform at least as well as standard feature-based models and neural models. We also find that the LLM feature-based model learns more robust prediction rules instead of relying on superficial shortcuts, which often trouble neural models.
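The framework above boils down to extracting a small set of interpretable features per dialogue and fitting an interpretable classifier on them. A minimal scikit-learn sketch follows; the heuristic features and toy data are assumptions made for illustration (the paper additionally prompts an LLM to extract such features).

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

def heuristic_features(dialogue: str) -> dict:
    # Trivial stand-ins for interpretable linguistic features.
    turns = dialogue.split("\n")
    return {
        "num_turns": len(turns),
        "avg_turn_len": sum(len(t.split()) for t in turns) / max(len(turns), 1),
        "num_questions": dialogue.count("?"),
        "hedging": sum(dialogue.lower().count(w) for w in ("maybe", "perhaps", "i think")),
    }

dialogues = ["A: I think you have a point.\nB: Maybe, but consider this?",
             "A: You're wrong.\nB: No, you are."]
labels = [1, 0]  # 1 = constructive outcome (toy labels)

clf = make_pipeline(DictVectorizer(), LogisticRegression())
clf.fit([heuristic_features(d) for d in dialogues], labels)
print(clf.predict([heuristic_features("A: Perhaps we agree more than we think?")]))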
2024.emnlp-main.308

@@ -4405,7 +4405,7 @@
 <fixed-case>D</fixed-case>ialog2<fixed-case>F</fixed-case>low: Pre-training Soft-Contrastive Action-Driven Sentence Embeddings for Automatic Dialog Flow Extraction
- SergioBurdissoIdiap Research Institute
+ SergioBurdissoIdiap Research Institute
 SrikanthMadikeriUniversity of Zurich
 PetrMotlicek
 5421-5440

@@ -4417,7 +4417,7 @@
 Words Worth a Thousand Pictures: Measuring and Understanding Perceptual Variability in Text-to-Image Generation
 RaphaelTangComcast
- CrystinaZhangUniversity of Waterloo
+ CrystinaZhangUniversity of Waterloo
 LixinyuXuGeorgetown University
 YaoLu
 WenyanLi

@@ -4433,7 +4433,7 @@
 Investigating <fixed-case>LLM</fixed-case>s as Voting Assistants via Contextual Augmentation: A Case Study on the <fixed-case>E</fixed-case>uropean Parliament Elections 2024
- IliasChalkidis
+ IliasChalkidis
 5455-5467
 In light of the recent 2024 European Parliament elections, we are investigating if LLMs can be used as Voting Advice Applications (VAAs). We audit MISTRAL and MIXTRAL models and evaluate their accuracy in predicting the stance of political parties based on the latest “EU and I” voting assistance questionnaire. Furthermore, we explore alternatives to improve models’ performance by augmenting the input context via Retrieval-Augmented Generation (RAG) relying on web search, and Self-Reflection using staged conversations that aim to re-collect relevant content from the model’s internal memory. We find that MIXTRAL is highly accurate with an 82% accuracy on average with a significant performance disparity across different political groups (50-95%). Augmenting the input context with expert-curated information can lead to a significant boost of approx. 9%, which remains an open challenge for automated RAG approaches, even considering curated content.
 2024.emnlp-main.312

@@ -4445,7 +4445,7 @@
 MayiXuWuhan University
 YongqiLi
 KeSun
- TieyunQianWuhan University
+ TieyunQianWuhan University
 5468-5495
 Large language models (LLMs) have shown excellent capability for solving reasoning problems. Existing approaches do not differentiate the question difficulty when designing prompting methods for them. Clearly, a simple method cannot elicit sufficient knowledge from LLMs to answer a hard question. Meanwhile, a sophisticated one will force the LLM to generate redundant or even inaccurate intermediate steps toward a simple question. Consequently, the performance of existing methods fluctuates among various questions. In this work, we propose Adaption-of-Thought (AdoT), an adaptive method to improve LLMs for the reasoning problem, which first measures the question difficulty and then tailors demonstration set construction and difficulty-adapted retrieval strategies for the adaptive demonstration construction. Experimental results on three reasoning tasks prove the superiority of our proposed method, showing an absolute improvement of up to 5.5% on arithmetic reasoning, 7.4% on symbolic reasoning, and 2.3% on commonsense reasoning.
Our codes and implementation details are available at: https://github.com/NLPGM/AdoT
 2024.emnlp-main.313

@@ -4456,7 +4456,7 @@
 <fixed-case>L</fixed-case>ogic<fixed-case>ST</fixed-case>: A Logical Self-Training Framework for Document-Level Relation Extraction with Incomplete Annotations
 ShengdaFan
 YantingWangBeijing University of Aeronautics and Astronautics
- ShashaMoBeijing University
+ ShashaMoBeijing University
 JianweiNiuBeihang University
 5496-5510
 Document-level relation extraction (DocRE) aims to identify relationships between entities within a document. Due to the vast number of entity pairs, fully annotating all fact triplets is challenging, resulting in datasets with numerous false negative samples. Recently, self-training-based methods have been introduced to address this issue. However, these methods are purely black-box and sub-symbolic, making them difficult to interpret and prone to overlooking symbolic interdependencies between relations. To remedy this deficiency, our insight is that symbolic knowledge, such as logical rules, can be used as diagnostic tools to identify conflicts between pseudo-labels. By resolving these conflicts through logical diagnoses, we can correct erroneous pseudo-labels, thus enhancing the training of neural models. To achieve this, we propose **LogicST**, a neural-logic self-training framework that iteratively resolves conflicts and constructs the minimal diagnostic set for updating models. Extensive experiments demonstrate that LogicST significantly improves performance and outperforms previous state-of-the-art methods. For instance, LogicST achieves an increase of **7.94%** in F1 score compared to CAST (Tan et al., 2023a) on the DocRED benchmark (Yao et al., 2019). Additionally, LogicST is more time-efficient than its self-training counterparts, requiring only **10%** of the training time of CAST.

@@ -4482,7 +4482,7 @@
 FeiHuangAlibaba Group
 RuPeng
 KemingLu
- BowenYuAlibaba Group
+ BowenYuAlibaba Group
 ChangZhou
 JingrenZhouAlibaba Group
 5527-5542

@@ -4494,7 +4494,7 @@
 <fixed-case>NLEB</fixed-case>ench+<fixed-case>N</fixed-case>or<fixed-case>GLM</fixed-case>: A Comprehensive Empirical Analysis and Benchmark Dataset for Generative Language Models in <fixed-case>N</fixed-case>orwegian
 PengLiuNorwegian Institute of Technology
- LemeiZhangNorwegian University of Science and Technology
+ LemeiZhangNorwegian University of Science and Technology
 TerjeFarup
 Even W.Lauvrak
 Jon EspenIngvaldsen

@@ -4510,7 +4510,7 @@
 <fixed-case>RSA</fixed-case>-Control: A Pragmatics-Grounded Lightweight Controllable Text Generation Framework
 YifanWang
- VeraDembergUniversität des Saarlandes
+ VeraDembergUniversität des Saarlandes
 5561-5582
 Despite significant advancements in natural language generation, controlling language models to produce texts with desired attributes remains a formidable challenge. In this work, we introduce RSA-Control, a training-free controllable text generation framework grounded in pragmatics. RSA-Control directs the generation process by recursively reasoning between imaginary speakers and listeners, enhancing the likelihood that target attributes are correctly interpreted by listeners amidst distractors. Additionally, we introduce a self-adjustable rationality parameter, which allows for automatic adjustment of control strength based on context. Our experiments, conducted with two task types and two types of language models, demonstrate that RSA-Control achieves strong attribute control while maintaining language fluency and content consistency.
Our code is available at https://github.com/Ewanwong/RSA-Control.
 2024.emnlp-main.318

@@ -4537,7 +4537,7 @@
 Vishal VivekSaleyIndian Institute of Technology Delhi
 Rocktim JyotiDasMohamed bin Zayed University of Artificial Intelligence
 DineshRaghuIBM Research - New Delhi
- Mausam.Indian Institute of Technology Delhi
+ Mausam.Indian Institute of Technology Delhi
 5596-5612
 End-to-end Task-Oriented Dialog (TOD) systems typically require extensive training datasets to perform well. In contrast, large language model (LLM) based TOD systems can excel even with limited data due to their ability to learn tasks through in-context exemplars. However, these models lack alignment with the style of responses in training data and often generate comprehensive responses, making it difficult for users to grasp the information quickly. In response, we propose SyncTOD that synergizes LLMs with task-specific hints to improve alignment in low-data settings. SyncTOD employs small auxiliary models to provide hints and select exemplars for in-context prompts. With ChatGPT, SyncTOD achieves superior performance compared to LLM-based baselines and SoTA models in low-data settings, while retaining competitive performance in full-data settings.
 2024.emnlp-main.320

@@ -4550,9 +4550,9 @@
 YuhaoWangRenmin University of China
 RuiyangRen
 JunyiLi
- XinZhaoRenmin University of China
+ XinZhaoRenmin University of China
 JingLiuBaidu
- Ji-RongWenRenmin University of China
+ Ji-RongWenRenmin University of China
 5613-5626
 Considering the limited internal parametric knowledge, retrieval-augmented generation (RAG) has been widely used to extend the knowledge scope of large language models (LLMs). Despite the extensive efforts on RAG research, in existing methods, LLMs cannot precisely assess the relevance of retrieved documents, thus likely leading to misleading or even incorrect utilization of external knowledge (i.e., retrieved documents). To address this issue, in this paper, we propose REAR, a RElevance-Aware Retrieval-augmented approach for open-domain question answering (QA). As the key motivation, we aim to enhance the self-awareness regarding the reliability of external knowledge for LLMs, so as to adaptively utilize external knowledge in RAG systems. Specially, we develop a novel architecture for LLM based RAG system, by incorporating a specially designed assessment module that precisely assesses the relevance of retrieved documents. Furthermore, we propose an improved training method based on bi-granularity relevance fusion and noise-resistant training. By combining the improvements in both architecture and training, our proposed REAR can better utilize external knowledge by effectively perceiving the relevance of retrieved documents. Experiments on four open-domain QA tasks show that REAR significantly outperforms a number of previous competitive RAG approaches. Our codes can be accessed at https://github.com/RUCAIBox/REAR.
 2024.emnlp-main.321

@@ -4566,7 +4566,7 @@
 MinzhengWang
 LongzeChen
 FuCheng
- ShengyiLiaoAlibaba Group
+ ShengyiLiaoAlibaba Group
 XinghuaZhang
 BingliWu
 HaiyangYu

@@ -4586,7 +4586,7 @@
 On Mitigating Performance Disparities in Multilingual Speech Recognition
 MonoramaSwain
- Anna Katrine VanZee
+ Anna Katrine VanZee
 AndersSøgaardCopenhagen University
 5647-5655
 How far have we come in mitigating performance disparities across genders in multilingual speech recognition?
We compare the impact on gender disparity of different fine-tuning algorithms for automated speech recognition across model sizes, languages and gender. We look at both performance-focused and fairness-promoting algorithms. Across languages, we see slightly better performance for female speakers for larger models regardless of the fine-tuning algorithm. The best trade-off between performance and parity is found using adapter fusion. Fairness-promoting fine-tuning algorithms (Group-DRO and Spectral Decoupling) hurt performance compared to adapter fusion with only slightly better performance parity. LoRA increases disparities slightly. Fairness-mitigating fine-tuning techniques led to slightly higher variance in performance across languages, with the exception of adapter fusion. @@ -4596,8 +4596,8 @@ Thinking Outside of the Differential Privacy Box: A Case Study in Text Privatization with Language Model Prompting - StephenMeisenbacher - FlorianMatthesTechnische Universität München + StephenMeisenbacher + FlorianMatthesTechnische Universität München 5656-5665 The field of privacy-preserving Natural Language Processing has risen in popularity, particularly at a time when concerns about privacy grow with the proliferation of large language models. One solution consistently appearing in recent literature has been the integration of Differential Privacy (DP) into NLP techniques. In this paper, we take these approaches into critical view, discussing the restrictions that DP integration imposes, as well as bring to light the challenges that such restrictions entail. To accomplish this, we focus on **DP-Prompt**, a recent method for text privatization leveraging language models to rewrite texts. In particular, we explore this rewriting task in multiple scenarios, both with DP and without DP. To drive the discussion on the merits of DP in NLP, we conduct empirical utility and privacy experiments. Our results demonstrate the need for more discussion on the usability of DP in NLP and its benefits over non-DP approaches. 2024.emnlp-main.324 @@ -4618,9 +4618,9 @@ What is “Typological Diversity” in <fixed-case>NLP</fixed-case>? - EstherPloeger + EstherPloeger WesselPoelmanKU Leuven - Miryamde LhoneuxKU Leuven + Miryamde LhoneuxKU Leuven JohannesBjervaAalborg University 5681-5700 The NLP research community has devoted increased attention to languages beyond English, resulting in considerable improvements for multilingual NLP. However, these improvements only apply to a small subset of the world’s languages. An increasing number of papers aspires to enhance generalizable multilingual performance across languages. To this end, linguistic typology is commonly used to motivate language selection, on the basis that a broad typological sample ought to imply generalization across a broad range of languages. These selections are often described as being typologically diverse. In this meta-analysis, we systematically investigate NLP research that includes claims regarding typological diversity. We find there are no set definitions or criteria for such claims. We introduce metrics to approximate the diversity of resulting language samples along several axes and find that the results vary considerably across papers. Crucially, we show that skewed language selection can lead to overestimated multilingual performance. We recommend future work to include an operationalization of typological diversity that empirically justifies the diversity of language samples. 
To help facilitate this, we release the code for our diversity measures. @@ -4631,11 +4631,11 @@ The Computational Anatomy of Humility: Modeling Intellectual Humility in Online Public Discourse - XiaoboGuo + XiaoboGuo NeilPotnisNortheastern University MelodyYu - NabeelGillaniNortheastern University - SoroushVosoughiDartmouth College + NabeelGillaniNortheastern University + SoroushVosoughiDartmouth College 5701-5723 The ability for individuals to constructively engage with one another across lines of difference is a critical feature of a healthy pluralistic society. This is also true in online discussion spaces like social media platforms. To date, much social media research has focused on preventing ills—like political polarization and the spread of misinformation. While this is important, enhancing the quality of online public discourse requires not just reducing ills, but also, promoting foundational human virtues. In this study, we focus on one particular virtue: “intellectual humility” (IH), or acknowledging the potential limitations in one’s own beliefs. Specifically, we explore the development of computational methods for measuring IH at scale. We manually curate and validate an IH codebook on 350 posts about religion drawn from subreddits and use them to develop LLM-based models for automating this measurement. Our best model achieves a Macro-F1 score of 0.64 across labels (and 0.70 when predicting IH/IA/Neutral at the coarse level), higher than an expected naive baseline of 0.51 (0.32 for IH/IA/Neutral) but lower than a human annotator-informed upper bound of 0.85 (0.83 for IH/IA/Neutral). Our results both highlight the challenging nature of detecting IH online—opening the door to new directions in NLP research—and also lay a foundation for computational social science researchers interested in analyzing and fostering more IH in online public discourse. 2024.emnlp-main.327 @@ -4656,13 +4656,13 @@ Benchmarking Vision Language Models for Cultural Understanding - ShravanNayakMontreal Institute for Learning Algorithms, University of Montreal, Université de Montréal + ShravanNayakMontreal Institute for Learning Algorithms, University of Montreal, Université de Montréal KanishkJainUniversité de Montréal RabiulAwalMila - Quebec AI Institute SivaReddyMila, McGill University and Mila, McGill University Sjoerd VanSteenkisteGoogle Lisa AnneHendricksDeepMind - KarolinaStanczakMila - Quebec Artificial Intelligence Institute and McGill University, McGill University + KarolinaStanczakMila - Quebec Artificial Intelligence Institute and McGill University, McGill University AishwaryaAgrawalUniversité de Montréal, Mila – Quebec AI Institute and Google DeepMind 5769-5790 Foundation models and vision-language pre-training have notably advanced Vision Language Models (VLMs), enabling multimodal processing of visual and linguistic data. However, their performance has been typically assessed on general scene understanding - recognizing objects, attributes, and actions - rather than cultural comprehension. This study introduces CulturalVQA, a visual question-answering benchmark aimed at assessing VLM’s geo-diverse cultural understanding. We curate a diverse collection of 2,378 image-question pairs with 1-5 answers per question representing cultures from 11 countries across 5 continents. The questions probe understanding of various facets of culture such as clothing, food, drinks, rituals, and traditions. 
Benchmarking VLMs on CulturalVQA, including GPT-4V and Gemini, reveals disparity in their level of cultural understanding across regions, with strong cultural understanding capabilities for North America while significantly weaker capabilities for Africa. We observe disparity in their performance across cultural facets too, with clothing, rituals, and traditions seeing higher performances than food and drink. These disparities help us identify areas where VLMs lack cultural understanding and demonstrate the potential of CulturalVQA as a comprehensive evaluation set for gauging VLM progress in understanding diverse cultures. @@ -4673,7 +4673,7 @@ Methods of Automatic Matrix Language Determination for Code-Switched Speech OlgaIakovenko - ThomasHainUniversity of Sheffield + ThomasHainUniversity of Sheffield 5791-5800 Code-switching (CS) is the process of speakers interchanging between two or more languages which in the modern world becomes increasingly common. In order to better describe CS speech the Matrix Language Frame (MLF) theory introduces the concept of a Matrix Language, which is the language that provides the grammatical structure for a CS utterance. In this work the MLF theory was used to develop systems for Matrix Language Identity (MLID) determination. The MLID of English/Mandarin and English/Spanish CS text and speech was compared to acoustic language identity (LID), which is a typical way to identify a language in monolingual utterances. MLID predictors from audio show higher correlation with the textual principles than LID in all cases while also outperforming LID in an MLID recognition task based on F1 macro (60%) and correlation score (0.38). This novel approach has identified that non-English languages (Mandarin and Spanish) are preferred over the English language as the ML contrary to the monolingual choice of LID. 2024.emnlp-main.330 @@ -4684,9 +4684,9 @@ Analyzing Key Factors Influencing Emotion Prediction Performance of <fixed-case>VLLM</fixed-case>s in Conversational Contexts JaewookLeeKonkuk University YeajinJang - HongjinKimKonkuk University + HongjinKimKonkuk University WoojinLee - HarksooKimKonkuk University + HarksooKimKonkuk University 5801-5816 Emotional intelligence (EI) in artificial intelligence (AI), which refers to the ability of an AI to understand and respond appropriately to human emotions, has emerged as a crucial research topic. Recent studies have shown that large language models (LLMs) and vision large language models (VLLMs) possess EI and the ability to understand emotional stimuli in the form of text and images, respectively. However, factors influencing the emotion prediction performance of VLLMs in real-world conversational contexts have not been sufficiently explored. This study aims to analyze the key elements affecting the emotion prediction performance of VLLMs in conversational contexts systematically. To achieve this, we reconstructed the MELD dataset, which is based on the popular TV series Friends, and conducted experiments through three sub-tasks: overall emotion tone prediction, character emotion prediction, and contextually appropriate emotion expression selection. We evaluated the performance differences based on various model architectures (e.g., image encoders, modality alignment, and LLMs) and image scopes (e.g., entire scene, person, and facial expression). 
In addition, we investigated the impact of providing persona information on the emotion prediction performance of the models and analyzed how personality traits and speaking styles influenced the emotion prediction process. We conducted an in-depth analysis of the impact of various other factors, such as gender and regional biases, on the emotion prediction performance of VLLMs. The results revealed that these factors significantly influenced the model performance. 2024.emnlp-main.331 @@ -4695,7 +4695,7 @@ Context-Aware Assistant Selection for Improved Inference Acceleration with Large Language Models - JerryHuangMontreal Institute for Learning Algorithms, University of Montreal, Université de Montréal + JerryHuangMontreal Institute for Learning Algorithms, University of Montreal, Université de Montréal PrasannaParthasarathiHuawei Technologies Ltd. MehdiRezagholizadeh SarathChandarPolytechnique Montreal @@ -4732,7 +4732,7 @@ Quantifying the Gaps Between Translation and Native Perception in Training for Multimodal, Multilingual Retrieval - KyleBuettnerUniversity of Pittsburgh and University of Pittsburgh + KyleBuettnerUniversity of Pittsburgh and University of Pittsburgh AdrianaKovashkaUniversity of Pittsburgh 5863-5870 There is a scarcity of multilingual vision-language models that properly account for the perceptual differences that are reflected in image captions across languages and cultures. In this work, through a multimodal, multilingual retrieval case study, we quantify the existing lack of model flexibility. We empirically show performance gaps between training on captions that come from native German perception and captions that have been either machine-translated or human-translated from English into German. To address these gaps, we further propose and evaluate caption augmentation strategies. While we achieve mean recall improvements (+1.3), gaps still remain, indicating an open area of future work for the community. @@ -4742,7 +4742,7 @@ <fixed-case>MTA</fixed-case>4<fixed-case>DPR</fixed-case>: Multi-Teaching-Assistants Based Iterative Knowledge Distillation for Dense Passage Retrieval - QixiLuBeijing Language and Culture University + QixiLuBeijing Language and Culture University EndongXunBeijing Language and Culture University GongboTangBeijing Language and Culture University 5871-5883 @@ -4756,9 +4756,9 @@ Fine-Grained Detection of Solidarity for Women and Migrants in 155 Years of <fixed-case>G</fixed-case>erman Parliamentary Debates AidaKostikova DominikBeese - BenjaminPaassen - OlePützUniversität Bielefeld - GregorWiedemannLeibniz-Institute for Media Research | Hans-Bredow-Institut + BenjaminPaassen + OlePützUniversität Bielefeld + GregorWiedemannLeibniz-Institute for Media Research | Hans-Bredow-Institut SteffenEgerUniversity of Technology Nuremberg 5884-5907 Solidarity is a crucial concept to understand social relations in societies. In this study, we investigate the frequency of (anti-)solidarity towards women and migrants in German parliamentary debates between 1867 and 2022. Using 2,864 manually annotated text snippets, we evaluate large language models (LLMs) like Llama 3, GPT-3.5, and GPT-4. We find that GPT-4 outperforms other models, approaching human annotation accuracy. Using GPT-4, we automatically annotate 18,300 further instances and find that solidarity with migrants outweighs anti-solidarity but that frequencies and solidarity types shift over time. 
Most importantly, group-based notions of (anti-)solidarity fade in favor of compassionate solidarity, focusing on the vulnerability of migrant groups, and exchange-based anti-solidarity, focusing on the lack of (economic) contribution. This study highlights the interplay of historical events, socio-economic needs, and political ideologies in shaping migration discourse and social cohesion.
@@ -4770,7 +4770,7 @@
 <fixed-case>CI</fixed-case>tru<fixed-case>S</fixed-case>: Chunked Instruction-aware State Eviction for Long Sequence Modeling
 YuBai
 XiyuanZou
- HeyanHuangBeijing Institute of Technology
+ HeyanHuangBeijing Institute of Technology
 SanxingChenDuke University
 Marc-AntoineRondeauMila - Quebec Artificial Intelligence Institute
 YangGao
@@ -4798,8 +4798,8 @@
 KuntingLiTsinghua University, Tsinghua University
 YongHu
 LiangHeTsinghua University, Tsinghua University
- FandongMengWeChat AI, Tencent Inc.
- JieZhou
+ FandongMengWeChat AI, Tencent Inc.
+ JieZhou
 5944-5957
 Chinese Spell Checking (CSC) aims to detect and correct spelling errors in sentences. Although Large Language Models (LLMs) exhibit robust capabilities and are widely applied in various tasks, their performance on CSC is often unsatisfactory. We find that LLMs fail to meet the Chinese character-level constraints of the CSC task, namely equal length and phonetic similarity, leading to a performance bottleneck. Further analysis reveals that this issue stems from the granularity of tokenization, as current mixed character-word tokenization struggles to satisfy these character-level constraints. To address this issue, we propose C-LLM, a Large Language Model-based Chinese Spell Checking method that learns to check errors Character by Character. Character-level tokenization enables the model to learn character-level alignment, effectively mitigating issues related to character-level constraints. Furthermore, CSC is simplified to replication-dominated and substitution-supplemented tasks. Experiments on two CSC benchmarks demonstrate that C-LLM achieves a 2.1% enhancement in general scenarios and a significant 12% improvement in vertical domain scenarios compared to existing methods, establishing state-of-the-art performance.
 2024.emnlp-main.340
@@ -4810,7 +4810,7 @@
 <fixed-case>PSC</fixed-case>: Extending Context Window of Large Language Models via Phase Shift Calibration
 WenqiaoZhu
 ChaoXu
- LuluWang
+ LuluWang
 JunWuZhejiang RoyalFlush Network Technology Co., Ltd.
 5958-5970
 Rotary Position Embedding (RoPE) is an efficient position encoding approach and is widely utilized in numerous large language models (LLMs). Recently, many methods have been put forward to further expand the context window based on RoPE. The core concept of those methods is to predefine or search for a set of factors to rescale the base frequencies of RoPE. Nevertheless, it is quite a challenge for existing methods to predefine an optimal factor due to the exponential search space. In view of this, we introduce PSC (Phase Shift Calibration), a small module for calibrating the frequencies predefined by existing methods. With the employment of PSC, we demonstrate that many existing methods can be further enhanced, like PI, YaRN, and LongRoPE. We conducted extensive experiments across multiple models and tasks. The results demonstrate that (1) when PSC is enabled, the comparative reductions in perplexity increase as the context window size is varied from 16k to 32k and up to 64k.
(2) Our approach is broadly applicable and exhibits robustness across a variety of models and tasks.
@@ -4826,7 +4826,7 @@
 BinZhu
 JiaxiCui
 MunanNingPeking University
- PengJin
+ PengJin
 LiYuanPeking University
 5971-5984
 Large Vision-Language Model (LVLM) has enhanced the performance of various downstream tasks in visual-language understanding. Most existing approaches encode images and videos into separate feature spaces, which are then fed as inputs to large language models. However, due to the lack of unified tokenization for images and videos, namely misalignment before projection, it becomes challenging for a Large Language Model (LLM) to learn multi-modal interactions from several poor projection layers. In this work, we unify visual representation into the language feature space to advance the foundational LLM towards a unified LVLM. As a result, we establish a simple but robust LVLM baseline, Video-LLaVA, which learns from a mixed dataset of images and videos, mutually enhancing each other. As a result, Video-LLaVA outperforms Video-ChatGPT by 5.8%, 9.9%, 18.6%, and 10.1% on MSRVTT, MSVD, TGIF, and ActivityNet, respectively. Additionally, our Video-LLaVA also achieves superior performances on a broad range of 9 image benchmarks. Notably, extensive experiments demonstrate that Video-LLaVA mutually benefits images and videos within a unified visual representation, outperforming models designed specifically for images or videos. We aim for this work to provide modest insights into the multi-modal inputs for the LLM.
@@ -4839,8 +4839,8 @@
 TianyangXuPurdue University
 ShujinWu
 ShizheDiaoHong Kong University of Science and Technology
- XiaozeLiu
- XingyaoWangAll Hands AI and University of Illinois Urbana-Champaign
+ XiaozeLiu
+ XingyaoWangAll Hands AI and University of Illinois Urbana-Champaign
 YangyiChenDepartment of Computer Science, University of Illinois at Urbana-Champaign
 JingGaoPurdue University
 5985-5998
@@ -4853,7 +4853,7 @@
 Mitigating Frequency Bias and Anisotropy in Language Model Pre-Training with Syntactic Smoothing
 RichardDiehl Martinez
- ZébulonGoriely
+ ZébulonGoriely
 AndrewCaines
 PaulaButtery
 LisaBeinborn
@@ -4865,10 +4865,10 @@
 <fixed-case>T</fixed-case>oxi<fixed-case>C</fixed-case>loak<fixed-case>CN</fixed-case>: Evaluating Robustness of Offensive Language Detection in <fixed-case>C</fixed-case>hinese with Cloaking Perturbations
- YunzeXiao
+ YunzeXiao
 YujiaHuSingapore University of Technology and Design
- Kenny Tsu WeiChooSingapore University of Technology and Design
- Roy Ka-WeiLeeSingapore University of Technology and Design and University of Saskatchewan
+ Kenny Tsu WeiChooSingapore University of Technology and Design
+ Roy Ka-WeiLeeSingapore University of Technology and Design and University of Saskatchewan
 6012-6025
 Detecting hate speech and offensive language is essential for maintaining a safe and respectful digital environment. This study examines the limitations of state-of-the-art large language models (LLMs) in identifying offensive content within systematically perturbed data, with a focus on Chinese, a language particularly susceptible to such perturbations. We introduce ToxiCloakCN, an enhanced dataset derived from ToxiCN, augmented with homophonic substitutions and emoji transformations, to test the robustness of LLMs against these cloaking perturbations. Our findings reveal that existing models significantly underperform in detecting offensive content when these perturbations are applied.
We provide an in-depth analysis of how different types of offensive content are affected by these perturbations and explore the alignment between human and model explanations of offensiveness. Our work highlights the urgent need for more advanced techniques in offensive language detection to combat the evolving tactics used to evade detection mechanisms.
 2024.emnlp-main.345
@@ -4880,7 +4880,7 @@
 SiyuYuan
 ChengJiayangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology
 LinQiu
- DeqingYangFudan University
+ DeqingYangFudan University
 6026-6036
 Analogical reasoning plays a critical role in human cognition, enabling us to understand new concepts by associating them with familiar ones. Previous research in the AI community has mainly focused on identifying and generating analogies and then examining their quality under human evaluation, which overlooks the practical application of these analogies in real-world settings. Inspired by the human education process, in this paper, we propose to investigate how analogies created by teacher language models (LMs) can assist student LMs in understanding scientific concepts, thereby aligning more closely with practical scenarios. Our results suggest that free-form analogies can indeed aid LMs in understanding concepts. Additionally, analogies generated by student LMs can improve their own performance on scientific question answering, demonstrating their capability to use analogies for self-learning new knowledge. Resources are available at https://github.com/siyuyuan/SCUA.
 2024.emnlp-main.346
@@ -4890,8 +4890,8 @@
 Model Internals-based Answer Attribution for Trustworthy Retrieval-Augmented Generation
 JiruiQi
- GabrieleSartiUniversity of Groningen
- RaquelFernándezUniversity of Amsterdam and University of Amsterdam
+ GabrieleSartiUniversity of Groningen
+ RaquelFernándezUniversity of Amsterdam and University of Amsterdam
 AriannaBisazzaUniversity of Groningen
 6037-6053
 Ensuring the verifiability of model answers is a fundamental challenge for retrieval-augmented generation (RAG) in the question answering (QA) domain. Recently, self-citation prompting was proposed to make large language models (LLMs) generate citations to supporting documents along with their answers. However, self-citing LLMs often struggle to match the required format, refer to non-existent sources, and fail to faithfully reflect LLMs’ context usage throughout the generation. In this work, we present MIRAGE – Model Internals-based RAG Explanations – a plug-and-play approach using model internals for faithful answer attribution in RAG applications. MIRAGE detects context-sensitive answer tokens and pairs them with retrieved documents contributing to their prediction via saliency methods. We evaluate our proposed approach on a multilingual extractive QA dataset, finding high agreement with human answer attribution. On open-ended QA, MIRAGE achieves citation quality and efficiency comparable to self-citation while also allowing for a finer-grained control of attribution parameters. Our qualitative evaluation highlights the faithfulness of MIRAGE’s attributions and underscores the promising application of model internals for RAG answer attribution. Code and data released at https://github.com/Betswish/MIRAGE.
@@ -4903,7 +4903,7 @@
 Do Large Language Models Know How Much They Know?
GabrielePratoMontreal Institute for Learning Algorithms, University of Montreal, University of Montreal
- JerryHuangMontreal Institute for Learning Algorithms, University of Montreal, Université de Montréal
+ JerryHuangMontreal Institute for Learning Algorithms, University of Montreal, Université de Montréal
 PrasannaParthasarathiHuawei Technologies Ltd.
 ShagunSodhaniFacebook
 SarathChandarPolytechnique Montreal
@@ -4916,7 +4916,7 @@
 Investigating Mysteries of <fixed-case>C</fixed-case>o<fixed-case>T</fixed-case>-Augmented Distillation
 SominWadhwaNortheastern University
- SilvioAmirNortheastern University
+ SilvioAmirNortheastern University
 Byron CWallaceNortheastern University, Brown University and Northeastern University
 6071-6086
 Eliciting chain of thought (CoT) rationales - sequences of tokens that convey a “reasoning” process - has been shown to consistently improve LLM performance on tasks like question answering. More recent efforts have shown that such rationales can also be used for model distillation: Including CoT sequences (elicited from a large “teacher” model) in addition to target labels when fine-tuning a small student model yields (often substantial) improvements. In this work we ask: Why and how does this additional training signal help in model distillation? We perform ablations to interrogate this, and report some potentially surprising results. Specifically: (1) Placing CoT sequences after labels (rather than before) realizes consistently better downstream performance – this means that no student “reasoning” is necessary at test time to realize gains. (2) When rationales are appended in this way, they need not be coherent reasoning sequences to yield improvements; performance increases are robust to permutations of CoT tokens, for example. In fact, (3) a small number of key tokens are sufficient to achieve improvements equivalent to those observed when full rationales are used in model distillation.
@@ -4926,7 +4926,7 @@
 <fixed-case>S</fixed-case>ci<fixed-case>P</fixed-case>rompt: Knowledge-augmented Prompting for Fine-grained Categorization of Scientific Topics
- ZhiwenYouUniversity of Illinois Urbana-Champaign
+ ZhiwenYouUniversity of Illinois Urbana-Champaign
 KanyaoHanUniversity of Illinois at Urbana-Champaign
 HaotianZhu
 BertramLudaescher
@@ -4943,7 +4943,7 @@
 SamyadeepBasu
 Shell XuHuSamsung
 MaziarSanjabiMeta
- DanielaMassicetiResearch, Microsoft
+ DanielaMassicetiResearch, Microsoft
 SoheilFeiziUniversity of Maryland, College Park
 6105-6113
 Image-text contrastive models like CLIP have wide applications in zero-shot classification, image-text retrieval, and transfer learning. However, they often struggle on compositional visio-linguistic tasks (e.g., attribute-binding or object-relationships) where their performance is no better than random chance. To address this, we introduce SDS-CLIP, a lightweight and sample-efficient distillation method to enhance CLIP’s compositional visio-linguistic reasoning. Our approach fine-tunes CLIP using a distillation objective borrowed from large text-to-image generative models like Stable-Diffusion, which are known for their strong visio-linguistic reasoning abilities. On the challenging Winoground benchmark, SDS-CLIP improves the visio-linguistic performance of various CLIP models by up to 7%, while on the ARO dataset, it boosts performance by up to 3%.
This work underscores the potential of well-designed distillation objectives from generative models to enhance contrastive image-text models with improved visio-linguistic reasoning capabilities. @@ -4954,8 +4954,8 @@ Learning from Natural Language Explanations for Generalizable Entity Matching SominWadhwaNortheastern University - AditKrishnanAmazon - RunhuiWang + AditKrishnanAmazon + RunhuiWang Byron CWallaceNortheastern University, Brown University and Northeastern University LuyangKongAmazon 6114-6129 @@ -4971,7 +4971,7 @@ ChaoYanVanderbilt University Medical Center KamalikaDasIntuit SricharanKumar - MuratKantarciogluVirginia Polytechnic Institute and State University, University of Texas, Dallas, University of California Berkeley and Harvard University + MuratKantarciogluVirginia Polytechnic Institute and State University, University of Texas, Dallas, University of California Berkeley and Harvard University Bradley A.MalinVanderbilt University Medical Center and Vanderbilt University 6130-6151 Language models (LMs) are known to suffer from hallucinations and misinformation. Retrieval augmented generation (RAG) that retrieves verifiable information from an external knowledge corpus to complement the parametric knowledge in LMs provides a tangible solution to these problems. However, the generation quality of RAG is highly dependent on the relevance between a user’s query and the retrieved documents. Inaccurate responses may be generated when the query is outside of the scope of knowledge represented in the external knowledge corpus or if the information in the corpus is out-of-date. In this work, we establish a statistical framework that assesses how well a query can be answered by an RAG system by capturing the relevance of knowledge. We introduce an online testing procedure that employs goodness-of-fit (GoF) tests to inspect the relevance of each user query to detect out-of-knowledge queries with low knowledge relevance. Additionally, we develop an offline testing framework that examines a collection of user queries, aiming to detect significant shifts in the query distribution which indicates the knowledge corpus is no longer sufficiently capable of supporting the interests of the users. We demonstrate the capabilities of these strategies through a systematic evaluation on eight question-answering (QA) datasets, the results of which indicate that the new testing framework is an efficient solution to enhance the reliability of existing RAG systems. @@ -4981,12 +4981,12 @@ On the Reliability of Psychological Scales on Large Language Models - Jen-tseHuang + Jen-tseHuang WenxiangJiaoTencent AI Lab Man HoLam Eric JohnLi WenxuanWang - MichaelLyuThe Chinese University of Hong Kong + MichaelLyuThe Chinese University of Hong Kong 6152-6173 Recent research has focused on examining Large Language Models’ (LLMs) characteristics from a psychological standpoint, acknowledging the necessity of understanding their behavioral characteristics. The administration of personality tests to LLMs has emerged as a noteworthy area in this context. However, the suitability of employing psychological scales, initially devised for humans, on LLMs is a matter of ongoing debate. Our study aims to determine the reliability of applying personality assessments to LLMs, explicitly investigating whether LLMs demonstrate consistent personality traits. 
Analysis of 2,500 settings per model, including GPT-3.5, GPT-4, Gemini-Pro, and LLaMA-3.1, reveals that various LLMs show consistency in responses to the Big Five Inventory, indicating a satisfactory level of reliability. Furthermore, our research explores the potential of GPT-3.5 to emulate diverse personalities and represent various groups—a capability increasingly sought after in social sciences for substituting human participants with LLMs to reduce costs. Our findings reveal that LLMs have the potential to represent different personalities with specific prompt instructions. 2024.emnlp-main.354 @@ -5010,7 +5010,7 @@ Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models - JeonghwanKim + JeonghwanKim HengJiUniversity of Illinois, Urbana-Champaign 6187-6207 Recent advances in instruction-tuned Large Vision-Language Models (LVLMs) have imbued the models with the ability to generate high-level, image-grounded explanations with ease. While such capability is largely attributed to the rich world knowledge contained within the Large Language Models (LLMs), our work reveals their shortcomings in fine-grained visual categorization (FGVC) across six different benchmark settings. Most recent state-of-the-art LVLMs such as LLaVa-1.5, InstructBLIP and GPT-4V not only severely deteriorate in terms of classification performance, e.g., average drop of 65.58 in EM for Stanford Dogs for LLaVA-1.5, but also struggle to generate descriptive visual attributes based on a concept that appears within an input image despite their prominent zero-shot image captioning ability. In-depth analyses show that instruction-tuned LVLMs suffer from modality gap, showing discrepancy when given textual and visual inputs that correspond to the same concept. In an effort to further the community’s endeavor in this direction, we propose a multiple granularity attribute-centric benchmark and training mixture, Finer, which aims to establish a ground to evaluate LVLMs’ fine-grained visual comprehension ability and provide significantly improved explainability. @@ -5034,16 +5034,16 @@ <fixed-case>VLF</fixed-case>eedback: A Large-Scale <fixed-case>AI</fixed-case> Feedback Dataset for Large Vision-Language Models Alignment - LeiLiUniversity of Hong Kong + LeiLiUniversity of Hong Kong ZhihuiXieShanghai Jiao Tong University MukaiLi ShunianChenShenzhen Research Institute of Big Data PeiyiWang LiangChen - YazhengYang - BenyouWangThe Chinese University of Hong Kong, Shenzhen + YazhengYang + BenyouWangThe Chinese University of Hong Kong, Shenzhen LingpengKongDepartment of Computer Science, The University of Hong Kong - QiLiuUniversity of Hong Kong + QiLiuUniversity of Hong Kong 6227-6246 As large vision-language models (LVLMs) evolve rapidly, the demand for high-quality and diverse data to align these models becomes increasingly crucial. However, the creation of such data with human supervision proves costly and time-intensive. In this paper, we investigate the efficacy of AI feedback to scale supervision for aligning LVLMs. We introduce VLFeedback, the first large-scale vision-language feedback dataset, comprising over 82K multi-modal instructions and comprehensive rationales generated by off-the-shelf models without human annotations. To evaluate the effectiveness of AI feedback for vision-language alignment, we train Silkie, an LVLM fine-tuned via direct preference optimization on VLFeedback. 
Silkie showcases exceptional performance regarding helpfulness, visual faithfulness, and safety metrics. It outperforms its base model by 6.9% and 9.5% in perception and cognition tasks, reduces hallucination issues on MMHal-Bench, and exhibits enhanced resilience against red-teaming attacks. Furthermore, our analysis underscores the advantage of AI feedback, particularly in fostering preference diversity to deliver more comprehensive improvements. Our dataset, training code and models are available at https://vlf-silkie.github.io. 2024.emnlp-main.358 @@ -5054,11 +5054,11 @@ Focused Large Language Models are Stable Many-Shot Learners - PeiwenYuan + PeiwenYuan ShaoxiongFeng YiweiLi - XinglinWang - YueqiZhang + XinglinWang + YueqiZhang ChuyiTan BoyuanPan HedaWang @@ -5087,13 +5087,13 @@ <fixed-case>GAMA</fixed-case>: A Large Audio-Language Model with Advanced Audio Understanding and Complex Reasoning Abilities SreyanGhosh SonalKumar - AshishSeth + AshishSeth Chandra Kiran ReddyEvuru UtkarshTyagi SSakshi OriolNietoAdobe Systems - RamaniDuraiswamiUniversity of Maryland, College Park - DineshManochaUniversity of Maryland, College Park + RamaniDuraiswamiUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 6288-6313 Perceiving and understanding non-speech sounds and non-verbal speech is essential to making decisions that help us interact with our surroundings. In this paper, we propose GAMA, a novel General-purpose Large Audio-Language Model (LALM) with Advanced Audio Understanding and Complex Reasoning Abilities. We build GAMA by integrating an LLM with multiple types of audio representations, including features from a custom Audio Q-Former, a multi-layer aggregator that aggregates features from multiple layers of an audio encoder. We fine-tune GAMA on a large-scale audio-language dataset, which augments it with audio understanding capabilities. Next, we propose CompA-R (Instruction-Tuning for Complex Audio Reasoning), a synthetically generated instruction-tuning (IT) dataset with instructions that require the model to perform complex reasoning on the input audio. We instruction-tune GAMA with CompA-R to endow it with complex reasoning abilities, where we further add a soft prompt as input with high-level semantic evidence by leveraging event tags of the input audio. Finally, we also propose CompA-R-test, a human-labeled evaluation dataset for evaluating the capabilities of LALMs on open-ended audio question-answering that requires complex reasoning. Through automated and expert human evaluations, we show that GAMA outperforms all other LALMs in literature on diverse audio understanding tasks by margins of 1%-84% and demonstrates state-of-the-art performance on deductive reasoning and hallucination evaluation benchmarks. Further, GAMA IT-ed on CompA-R proves to be superior in its complex reasoning capabilities. 2024.emnlp-main.361 @@ -5109,7 +5109,7 @@ Ioan-BogdanIordache Teodor-GeorgeMarchitanUniversity of Bucharest SimonaGeorgescuUniversity of Bucharest - LaurentiuZoicasUniversity of Bucharest + LaurentiuZoicasUniversity of Bucharest 6314-6326 We introduce a new database of cognate words and etymons for the five main Romance languages, the most comprehensive one to date. We propose a strong benchmark for the automatic reconstruction of protowords for Romance languages, by applying a set of machine learning models and features on these data. 
The best results reach 90% accuracy in predicting the protoword of a given cognate set, surpassing existing state-of-the-art results for this task and showing that computational methods can be very useful in assisting linguists with protoword reconstruction. 2024.emnlp-main.362 @@ -5120,7 +5120,7 @@ <fixed-case>C</fixed-case>hat<fixed-case>GPT</fixed-case> Doesn’t Trust Chargers Fans: Guardrail Sensitivity in Context Victoria RLiHarvard University - YidaChenHarvard University, Harvard University + YidaChenHarvard University, Harvard University NaomiSaphraHarvard University 6327-6345 While the biases of language models in production are extensively documented, the biases of their guardrails have been neglected. This paper studies how contextual information about the user influences the likelihood of an LLM to refuse to execute a request. By generating user biographies that offer ideological and demographic information, we find a number of biases in guardrail sensitivity on GPT-3.5. Younger, female, and Asian-American personas are more likely to trigger a refusal guardrail when requesting censored or illegal information. Guardrails are also sycophantic, refusing to comply with requests for a political position the user is likely to disagree with. We find that certain identity groups and seemingly innocuous information, e.g., sports fandom, can elicit changes in guardrail sensitivity similar to direct statements of political ideology. For each demographic category and even for American football team fandom, we find that ChatGPT appears to infer a likely political ideology and modify guardrail behavior accordingly. @@ -5144,11 +5144,11 @@ Satyrn: A Platform for Analytics Augmented Generation MarkoSterbentzNorthwestern University, Northwestern University - CameronBarrie + CameronBarrie ShubhamShahi AbhratanuDuttaNorthwestern University - DonnaHooshmand - HarperPackNorthwestern University + DonnaHooshmand + HarperPackNorthwestern University Kristian JHammond 6360-6385 Large language models (LLMs) are capable of producing documents, and retrieval augmented generation (RAG) has shown itself to be a powerful method for improving accuracy without sacrificing fluency. However, not all information can be retrieved from text. We propose an approach that uses the analysis of structured data to generate fact sets that are used to guide generation in much the same way that retrieved documents are used in RAG. This analytics augmented generation (AAG) approach supports the ability to utilize standard analytic techniques to generate facts that are then converted to text and passed to an LLM. We present a neurosymbolic platform, Satyrn, that leverages AAG to produce accurate, fluent, and coherent reports grounded in large scale databases. In our experiments, we find that Satyrn generates reports in which over 86% of claims are accurate while maintaining high levels of fluency and coherence, even when using smaller language models such as Mistral-7B, as compared to GPT-4 Code Interpreter in which just 57% of claims are accurate. 
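The Satyrn entry above describes analytics augmented generation (AAG): standard analytics over structured data produce a fact set, the facts are verbalized, and the resulting text guides an LLM the way retrieved documents guide RAG. A minimal Python sketch of that pipeline follows; the schema, fact templates, and prompt wording are invented for illustration and are not Satyrn's actual code.

```python
# Minimal AAG sketch: analytics -> verbalized facts -> grounding prompt.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE cases (court TEXT, year INTEGER, filings INTEGER)")
conn.executemany(
    "INSERT INTO cases VALUES (?, ?, ?)",
    [("N.D. Ill.", 2022, 8143), ("N.D. Ill.", 2023, 8610), ("S.D.N.Y.", 2023, 12984)],
)

# Step 1: ordinary analytics over the database produce numeric facts.
rows = conn.execute(
    "SELECT court, SUM(filings) FROM cases GROUP BY court ORDER BY 2 DESC"
).fetchall()

# Step 2: verbalize each fact so the LLM receives text, as RAG does with documents.
facts = [f"{court} recorded {total} filings in total." for court, total in rows]

# Step 3: the fact set guides generation in place of retrieved passages.
prompt = "Write a short report grounded ONLY in these facts:\n- " + "\n- ".join(facts)
print(prompt)
```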
@@ -5159,12 +5159,12 @@ <fixed-case>EH</fixed-case>-<fixed-case>MAM</fixed-case>: Easy-to-Hard Masked Acoustic Modeling for Self-Supervised Speech Representation Learning - AshishSeth + AshishSeth RamaneswaranSelvakumarUniversity of Maryland, College Park SSakshi SonalKumar SreyanGhosh - DineshManochaUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 6386-6400 In this paper, we present EH-MAM (Easy-to-Hard adaptive Masked Acoustic Modeling), a novel self-supervised learning approach for speech representation learning. In contrast to the prior methods that use random masking schemes for Masked Acoustic Modeling (MAM), we introduce a novel selective and adaptive masking strategy. Specifically, during SSL training, we progressively introduce harder regions to the model for reconstruction. Our approach automatically selects hard regions and is built on the observation that the reconstruction loss of individual frames in MAM can provide natural signals to judge the difficulty of solving the MAM pre-text task for that frame. To identify these hard regions, we employ a teacher model that first predicts the frame-wise losses and then decides which frames to mask. By learning to create challenging problems, such as identifying harder frames and solving them simultaneously, the model is able to learn more effective representations and thereby acquire a more comprehensive understanding of the speech. Quantitatively, EH-MAM outperforms several state-of-the-art baselines across various low-resource speech recognition and SUPERB benchmarks by 5%-10%. Additionally, we conduct a thorough analysis to show that the regions masked by EH-MAM effectively capture useful context across speech frames. 2024.emnlp-main.366 @@ -5203,7 +5203,7 @@ HaozhenZhengDepartment of Computer Science BeitongTianUniversity of Illinois at Urbana-Champaign ChengXiangZhaiUniversity of Illinois, Urbana Champaign - KlaraNahrstedt + KlaraNahrstedt ZhitingHuUniversity of California, San Diego and Amazon 6432-6441 Smaller-scale Vision-Language Models (VLMs) often claim to perform on par with larger models in general-domain visual grounding and question-answering benchmarks while offering advantages in computational efficiency and storage. However, their ability to handle rare objects, which fall into the long tail of data distributions, is less understood. To rigorously evaluate this aspect, we introduce the “Uncontextualized Uncommon Objects” (UOUO) benchmark. This benchmark focuses on systematically testing VLMs with both large and small parameter counts on rare and specialized objects. Our comprehensive analysis reveals that while smaller VLMs maintain competitive performance on common datasets, they significantly underperform on tasks involving uncommon objects. We also propose an advanced, scalable pipeline for data collection and cleaning, ensuring the UOUO benchmark provides high-quality, challenging instances. These findings highlight the need to consider long-tail distributions when assessing the true capabilities of VLMs. Code and project details for UOUO can be found at https://zoezheng126.github.io/UOUO-Website/. 
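The EH-MAM entry above selects which frames to mask from a teacher's predicted per-frame reconstruction losses, introducing harder regions as training progresses. The sketch below illustrates only that selection step; the shapes, the linear easy-to-hard schedule, and the 30% mask ratio are assumptions, not the paper's implementation.

```python
# Easy-to-hard mask selection sketch: the fraction of masked frames chosen
# by difficulty (highest predicted loss) grows over training; the remainder
# is masked at random.
import numpy as np

def select_mask(pred_frame_losses: np.ndarray, step: int, total_steps: int,
                mask_ratio: float = 0.3) -> np.ndarray:
    """Return a boolean mask over frames; True = frame is masked."""
    num_frames = pred_frame_losses.shape[0]
    num_masked = int(mask_ratio * num_frames)
    hard_frac = min(1.0, step / max(1, total_steps))  # ramps from 0 to 1
    num_hard = int(hard_frac * num_masked)
    hard = np.argsort(pred_frame_losses)[::-1][:num_hard]  # hardest frames
    rest = np.setdiff1d(np.arange(num_frames), hard)
    rand = np.random.choice(rest, size=num_masked - num_hard, replace=False)
    mask = np.zeros(num_frames, dtype=bool)
    mask[np.concatenate([hard, rand]).astype(int)] = True
    return mask

losses = np.random.rand(100)  # stand-in for teacher-predicted per-frame losses
print(select_mask(losses, step=500, total_steps=1000).sum())  # 30 frames masked
```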
@@ -5213,11 +5213,11 @@ Optimized Speculative Sampling for <fixed-case>GPU</fixed-case> Hardware Accelerators - DominikWagner + DominikWagner SeanieLeeKorea Advanced Institute of Science & Technology IljaBaumannTechnische Hochschule Nürnberg - PhilippSeeberger - KorbinianRiedhammerTechnische Hochschule Nürnberg Georg Simon Ohm + PhilippSeeberger + KorbinianRiedhammerTechnische Hochschule Nürnberg Georg Simon Ohm TobiasBockletTH Nürnberg and Intel 6442-6458 In this work, we optimize speculative sampling for parallel hardware accelerators to improve sampling speed. We notice that substantial portions of the intermediate matrices necessary for speculative sampling can be computed concurrently. This allows us to distribute the workload across multiple GPU threads, enabling simultaneous operations on matrix segments within thread blocks. This results in profiling time improvements ranging from 6% to 13% relative to the baseline implementation, without compromising accuracy. To further accelerate speculative sampling, probability distributions parameterized by softmax are approximated by sigmoid. This approximation approach results in significantly greater relative improvements in profiling time, ranging from 37% to 94%, with a minor decline in accuracy. We conduct extensive experiments on both automatic speech recognition and summarization tasks to validate the effectiveness of our optimization methods. @@ -5228,9 +5228,9 @@ Personalized Pieces: Efficient Personalized Large Language Models through Collaborative Efforts - ZhaoxuanTanUniversity of Notre Dame - ZheyuanLiuUniversity of Notre Dame - MengJiangUniversity of Notre Dame + ZhaoxuanTanUniversity of Notre Dame + ZheyuanLiuUniversity of Notre Dame + MengJiangUniversity of Notre Dame 6459-6475 Personalized large language models (LLMs) aim to tailor interactions, content, and recommendations to individual user preferences. While parameter-efficient fine-tuning (PEFT) methods excel in performance and generalization, they are costly and limit communal benefits when used individually. To this end, we introduce Personalized Pieces (Per-Pcs), a framework that allows users to safely share and assemble personalized PEFT efficiently with collaborative efforts. Per-Pcs involves selecting sharers, breaking their PEFT into pieces, and training gates for each piece. These pieces are added to a pool, from which target users can select and assemble personalized PEFT using their history data. This approach preserves privacy and enables fine-grained user modeling without excessive storage and computation demands. Experimental results show Per-Pcs outperforms non-personalized and PEFT retrieval baselines, offering performance comparable to OPPU with significantly lower resource use across six tasks. Further analysis highlights Per-Pcs’s robustness concerning sharer count and selection strategy, pieces sharing ratio, and scalability in computation time and storage space. Per-Pcs’s modularity promotes safe sharing, making LLM personalization more efficient, effective, and widely accessible through collaborative efforts. 
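The speculative sampling entry above builds on the standard accept/reject rule (Leviathan et al., 2023): a drafted token is kept with probability min(1, p_target/p_draft), otherwise a token is resampled from the clipped residual distribution. A single-step sketch of that rule follows; it shows the baseline being optimized, not the paper's GPU-level kernels or the sigmoid approximation.

```python
# Single verification step of standard speculative sampling.
import numpy as np

rng = np.random.default_rng(0)

def verify_token(token: int, p_target: np.ndarray, p_draft: np.ndarray) -> int:
    """Accept the drafted token or resample from (p_target - p_draft)+."""
    if rng.random() < min(1.0, p_target[token] / p_draft[token]):
        return token  # accepted: output still follows the target distribution
    residual = np.maximum(p_target - p_draft, 0.0)
    residual /= residual.sum()
    return int(rng.choice(len(p_target), p=residual))

p_t = np.array([0.6, 0.3, 0.1])  # target model distribution over 3 tokens
p_d = np.array([0.3, 0.5, 0.2])  # cheaper draft model distribution
draft_token = int(rng.choice(3, p=p_d))
print(verify_token(draft_token, p_t, p_d))
```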
2024.emnlp-main.371
@@ -5241,12 +5241,12 @@
 Democratizing Large Language Models via Personalized Parameter-Efficient Fine-tuning
- ZhaoxuanTanUniversity of Notre Dame
- QingkaiZengUniversity of Notre Dame
- YijunTian
- ZheyuanLiuUniversity of Notre Dame
+ ZhaoxuanTanUniversity of Notre Dame
+ QingkaiZengUniversity of Notre Dame
+ YijunTian
+ ZheyuanLiuUniversity of Notre Dame
 BingYin
- MengJiangUniversity of Notre Dame
+ MengJiangUniversity of Notre Dame
 6476-6491
 Personalization in large language models (LLMs) is increasingly important, aiming to align the LLMs’ interactions, content, and recommendations with individual user preferences. Recent advances have highlighted effective prompt design by enriching user queries with non-parametric knowledge through behavior history retrieval and textual profiles. However, these methods faced limitations due to a lack of model ownership, resulting in constrained customization and privacy issues, and often failed to capture complex, dynamic user behavior patterns. To address these shortcomings, we introduce One PEFT Per User (OPPU), employing personalized parameter-efficient fine-tuning (PEFT) modules to store user-specific behavior patterns and preferences. By plugging in personal PEFT parameters, users can own and use their LLMs individually. OPPU integrates parametric user knowledge in the personal PEFT parameters with non-parametric knowledge from retrieval and profiles, adapting LLMs to user behavior shifts. Experimental results demonstrate that OPPU significantly outperforms existing prompt-based methods across seven diverse tasks in the LaMP benchmark. Further studies reveal OPPU’s enhanced capabilities in handling user behavior shifts, modeling users at different activity levels, maintaining robustness across various user history formats, and displaying versatility with different PEFT methods.
 2024.emnlp-main.372
@@ -5258,8 +5258,8 @@
 Unifying Multimodal Retrieval via Document Screenshot Embedding
 XueguangMa
- Sheng-ChiehLinUniversity of Waterloo
- MinghanLi
+ Sheng-ChiehLinUniversity of Waterloo
+ MinghanLi
 WenhuChenUniversity of Waterloo and Google
 JimmyLinUniversity of Waterloo
 6492-6505
@@ -5281,14 +5281,14 @@
 An Audit on the Perspectives and Challenges of Hallucinations in <fixed-case>NLP</fixed-case>
- PranavNarayanan Venkit
+ PranavNarayanan Venkit
 TatianaChakravorti
 VipulGuptaPennsylvania State University
- HeidiBiggsGeorgia Institute of Technology
+ HeidiBiggsGeorgia Institute of Technology
 MukundSrinath
 KoustavaGoswamiAdobe Systems
 SarahRajtmajerPennsylvania State University
- ShomirWilsonPennsylvania State University
+ ShomirWilsonPennsylvania State University
 6528-6548
 We audit how hallucination in large language models (LLMs) is characterized in peer-reviewed literature, using a critical examination of 103 publications across NLP research. Through the examination of the literature, we identify a lack of agreement with the term ‘hallucination’ in the field of NLP. Additionally, to complement our audit, we conduct a survey with 171 practitioners from the field of NLP and AI to capture varying perspectives on hallucination. Our analysis calls for the necessity of explicit definitions and frameworks outlining hallucination within NLP, highlighting potential challenges, and our survey inputs provide a thematic understanding of the influence and ramifications of hallucination in society.
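The OPPU entry above stores each user's preferences in a personal PEFT module that is plugged into a shared base model. The sketch below illustrates that plug-in pattern with the Hugging Face peft API; the base model choice and the one-adapter-directory-per-user layout are assumptions for illustration, not the paper's released code.

```python
# "One PEFT per user" sketch: shared base model + per-user LoRA adapter.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "facebook/opt-125m"  # small stand-in base model for illustration
base_model = AutoModelForCausalLM.from_pretrained(BASE)
tokenizer = AutoTokenizer.from_pretrained(BASE)

def personalize(user_id: str) -> PeftModel:
    # Hypothetical layout: one LoRA adapter directory per user, produced by
    # fine-tuning on that user's own history.
    return PeftModel.from_pretrained(base_model, f"adapters/{user_id}")

model = personalize("user-042")
inputs = tokenizer("Recommend a paper for me:", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=30)[0]))
```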
2024.emnlp-main.375
@@ -5300,7 +5300,7 @@
 Discovering Knowledge-Critical Subnetworks in Pretrained Language Models
 DenizBayazitEPFL - Lausanne
 NegarForoutanSchool of Computer and Communication Sciences, EPFL - EPF Lausanne
- ZemingChen
+ ZemingChen
 GailWeissEPFL - EPF Lausanne
 AntoineBosselutSwiss Federal Institute of Technology Lausanne
 6549-6583
@@ -5314,7 +5314,7 @@
 JunjieChuCISPA Helmholtz Center for Information Security
 ZeyangShaCISPA, saarland university, saarland informatics campus
 MichaelBackesCISPA Helmholtz Center for Information Security
- YangZhangCISPA Helmholtz Center for Information Security
+ YangZhangCISPA Helmholtz Center for Information Security
 6584-6600
 Significant advancements have recently been made in large language models, represented by GPT models. Users frequently have multi-round private conversations with cloud-hosted GPT models for task optimization. Yet, this operational paradigm introduces additional attack surfaces, particularly in custom GPTs and hijacked chat sessions. In this paper, we introduce a straightforward yet potent Conversation Reconstruction Attack. This attack targets the contents of previous conversations between GPT models and benign users, i.e., the benign users’ input contents during their interaction with GPT models. The adversary could induce GPT models to leak such contents by querying them with designed malicious prompts. Our comprehensive examination of privacy risks during the interactions with GPT models under this attack reveals GPT-4’s considerable resilience. We present two advanced attacks targeting improved reconstruction of past conversations, demonstrating significant privacy leakage across all models under these advanced techniques. Evaluating various defense mechanisms, we find them ineffective against these attacks. Our findings highlight the ease with which privacy can be compromised in interactions with GPT models, urging the community to safeguard against potential abuses of these models’ capabilities.
 2024.emnlp-main.377
@@ -5339,7 +5339,7 @@
 Verifiable, Debuggable, and Repairable Commonsense Logical Reasoning via <fixed-case>LLM</fixed-case>-based Theory Resolution
 ArminToroghi
 WillisGuo
- AliPesaranghaderLG Electronics
+ AliPesaranghaderLG Electronics
 ScottSannerDepartment of Mechanical and Industrial Engineering, University of Toronto and Department of Computer Science
 6634-6652
 Recent advances in Large Language Models (LLM) have led to substantial interest in their application to commonsense reasoning tasks. Despite their potential, LLMs are susceptible to reasoning errors and hallucinations that may be harmful in use cases where accurate reasoning is critical. This challenge underscores the need for verifiable, debuggable, and repairable LLM reasoning. Recent works have made progress toward verifiable reasoning with LLMs by using them as either (i) a reasoner over an axiomatic knowledge base, or (ii) a semantic parser for use in existing logical inference systems. However, both settings are unable to extract commonsense axioms from the LLM that are not already formalized in the knowledge base, and also lack a reliable method to repair missed commonsense inferences. In this work, we present LLM-TRes, a logical reasoning framework based on the notion of “theory resolution” that allows for seamless integration of the commonsense knowledge from LLMs with a verifiable logical reasoning framework that mitigates hallucinations and facilitates debugging of the reasoning procedure as well as repair.
We crucially prove that repaired axioms are theoretically guaranteed to be given precedence over flawed ones in our theory resolution inference process. We conclude by evaluating on three diverse language-based reasoning tasks—preference reasoning, deductive reasoning, and causal commonsense reasoning—and demonstrate the superior performance of LLM-TRes vs. state-of-the-art LLM-based reasoning methods in terms of both accuracy and reasoning correctness. @@ -5363,11 +5363,11 @@ Can Large Language Models Learn Independent Causal Mechanisms? - GaelGendron - Bao TrungNguyenUniversity of Auckland + GaelGendron + Bao TrungNguyenUniversity of Auckland Alex YuxuanPeng - MichaelWitbrockUniversity of Auckland - GillianDobbieUniversity of Auckland + MichaelWitbrockUniversity of Auckland + GillianDobbieUniversity of Auckland 6678-6701 Despite impressive performance on language modelling and complex reasoning tasks, Large Language Models (LLMs) fall short on the same tasks in uncommon settings or with distribution shifts, exhibiting a lack of generalisation ability. By contrast, systems such as causal models, that learn abstract variables and causal relationships, can demonstrate increased robustness against changes in the distribution. One reason for this success is the existence and use of Independent Causal Mechanisms (ICMs) representing high-level concepts that only sparsely interact. In this work, we apply two concepts from causality to learn ICMs within LLMs. We develop a new LLM architecture composed of multiple sparsely interacting language modelling modules. We show that such causal constraints can improve out-of-distribution performance on abstract and causal reasoning tasks. We also investigate the level of independence and domain specialisation and show that LLMs rely on pre-trained partially domain-invariant mechanisms resilient to fine-tuning. 2024.emnlp-main.381 @@ -5390,7 +5390,7 @@ ZiyiLiu AbhishekAnand PeiZhouUniversity of Southern California and USC/ISI - Jen-tseHuang + Jen-tseHuang JieyuZhaoUniversity of Southern California 6718-6746 Large language models (LLMs) have demonstrated the potential to mimic human social intelligence. However, most studies focus on simplistic and static self-report or performance-based tests, which limits the depth and validity of the analysis. In this paper, we developed a novel framework, InterIntent, to assess LLMs’ social intelligence by mapping their ability to understand and manage intentions in a game setting. We focus on four dimensions of social intelligence: situational awareness, self-regulation, self-awareness, and theory of mind. Each dimension is linked to a specific game task: intention selection, intention following, intention summarization, and intention guessing. Our findings indicate that while LLMs exhibit high proficiency in selecting intentions, achieving an accuracy of 88%, their ability to infer the intentions of others is significantly weaker, trailing human performance by 20%. Additionally, game performance correlates with intention understanding, highlighting the importance of the four components towards success in this game. These findings underline the crucial role of intention understanding in evaluating LLMs’ social intelligence and highlight the potential of using social deduction games as a complex testbed to enhance LLM evaluation. InterIntent contributes a structured approach to bridging the evaluation gap in social intelligence within multiplayer LLM-based games. 
@@ -5402,9 +5402,9 @@
 Locating Information Gaps and Narrative Inconsistencies Across Languages: A Case Study of <fixed-case>LGBT</fixed-case> People Portrayals on <fixed-case>W</fixed-case>ikipedia
 FarhanSamirUniversity of British Columbia
 Chan YoungPark
- AnjalieFieldJohns Hopkins University
+ AnjalieFieldJohns Hopkins University
 VeredShwartz
- YuliaTsvetkovDepartment of Computer Science, University of Washington
+ YuliaTsvetkovDepartment of Computer Science, University of Washington
 6747-6762
 To explain social phenomena and identify systematic biases, much research in computational social science focuses on comparative text analyses. These studies often rely on coarse corpus-level statistics or local word-level analyses, mainly in English. We introduce the InfoGap method—an efficient and reliable approach to locating information gaps and inconsistencies in articles at the fact level, across languages. We evaluate InfoGap by analyzing LGBT people’s portrayals, across 2.7K biography pages on English, Russian, and French Wikipedias. We find large discrepancies in factual coverage across the languages. Moreover, our analysis reveals that biographical facts carrying negative connotations are more likely to be highlighted in Russian Wikipedia. Crucially, InfoGap both facilitates large scale analyses, and pinpoints local document- and fact-level information gaps, laying a new foundation for targeted and nuanced comparative language analysis at scale.
 2024.emnlp-main.384
@@ -5415,7 +5415,7 @@
 From Local Concepts to Universals: Evaluating the Multicultural Understanding of Vision-Language Models
 MeharBhatia
 SahithyaRaviUniversity of British Columbia
- AdityaChinchureUniversity of British Columbia
+ AdityaChinchureUniversity of British Columbia
 EunJeongHwangUniversity of British Columbia
 VeredShwartz
 6763-6782
@@ -5428,7 +5428,7 @@
 Dynamic Multi-Reward Weighting for Multi-Style Controllable Generation
 KarinDe LangisUniversity of Minnesota - Twin Cities
 RyanKoo
- DongyeopKangUniversity of Minnesota
+ DongyeopKangUniversity of Minnesota
 6783-6800
 Textual style expresses a diverse set of information, including interpersonal dynamics (e.g., formality) and the author’s emotions or attitudes (e.g., disgust). An open question is how language models can be explicitly controlled so that they weave together target styles when generating text: for example, to produce text that is both negative and non-toxic. One approach to such controlled generation is multi-objective reinforcement learning (RL), but how to best combine multiple objectives in a reward function is an open question. In this paper, we investigate various formulations of multi-style rewards, including calibrated outputs from discriminators and dynamic weighting by discriminator gradient magnitudes. We find that our proposed dynamic weighting outperforms static weighting approaches with respect to style control while maintaining linguistic quality, and we explore its effectiveness in 2- and 3-style control.
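The dynamic weighting described in the entry above combines several style rewards using discriminator gradient magnitudes. One plausible scheme, sketched below, normalizes each objective by the inverse of its gradient norm so that no single discriminator dominates the update; the paper's exact formulation may differ, and the stand-in discriminators here are toy functions.

```python
# Toy sketch of gradient-magnitude-based dynamic reward weighting.
import torch

def combine_rewards(policy_logits: torch.Tensor, reward_fns: list) -> torch.Tensor:
    rewards, grad_norms = [], []
    for fn in reward_fns:
        r = fn(policy_logits)
        (g,) = torch.autograd.grad(r, policy_logits, retain_graph=True)
        rewards.append(r)
        grad_norms.append(g.norm() + 1e-8)
    # Down-weight objectives with large gradients so one style reward
    # cannot swamp the others.
    inv = torch.stack([1.0 / n for n in grad_norms])
    weights = inv / inv.sum()
    return sum(w * r for w, r in zip(weights, rewards))

logits = torch.randn(8, requires_grad=True)
sentiment = lambda x: torch.sigmoid(x.mean())     # stand-in discriminator
formality = lambda x: torch.tanh(x.sum()).abs()   # stand-in discriminator
print(combine_rewards(logits, [sentiment, formality]))
```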
2024.emnlp-main.386 @@ -5438,10 +5438,10 @@ <fixed-case>MMN</fixed-case>euron: Discovering Neuron-Level Domain-Specific Interpretation in Multimodal Large Language Model JiahaoHuoHong Kong University of Science and Technology and Tongji University - YiboYanThe Hong Kong University of Science and Technology and Squirrel AI + YiboYanThe Hong Kong University of Science and Technology and Squirrel AI BorenHu - YutaoYueHong Kong University of Science and Technology (Guangzhou) and Institute of Deep Perception Technology, JITRI - XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology + YutaoYueHong Kong University of Science and Technology (Guangzhou) and Institute of Deep Perception Technology, JITRI + XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology 6801-6816 Projecting visual features into word embedding space has become a significant fusion strategy adopted by Multimodal Large Language Models (MLLMs). However, its internal mechanisms have yet to be explored. Inspired by multilingual research, we identify domain-specific neurons in multimodal large language models. Specifically, we investigate the distribution of domain-specific neurons and the mechanism of how MLLMs process features from diverse domains. Furthermore, we propose a three-stage framework for language model modules in MLLMs when handling projected image features, and verify this hypothesis using logit lens. Extensive experiments indicate that while current MLLMs exhibit Visual Question Answering (VQA) capability, they may not fully utilize domain-specific information. Manipulating domain-specific neurons properly will result in a 10% change of accuracy at most, shedding light on the development of cross-domain, all-encompassing MLLMs in the future. The source code is available at https://anonymous.4open.science/r/MMNeuron. 2024.emnlp-main.387 @@ -5450,13 +5450,13 @@ Learning to Extract Structured Entities Using Language Models - HaolunWu - YeYuan + HaolunWu + YeYuan LianaMikaelyanMicrosoft AlexanderMeulemansSwiss Federal Institute of Technology XueLiuMcGill University JamesHensmanMicrosoft Research - BhaskarMitraMicrosoft Research + BhaskarMitraMicrosoft Research 6817-6834 Recent advances in machine learning have significantly impacted the field of information extraction, with Language Models (LMs) playing a pivotal role in extracting structured information from unstructured text. Prior works typically represent information extraction as triplet-centric and use classical metrics such as precision and recall for evaluation. We reformulate the task to be entity-centric, enabling the use of diverse metrics that can provide more insights from various perspectives. We contribute to the field by introducing Structured Entity Extraction and proposing the Approximate Entity Set OverlaP (AESOP) metric, designed to appropriately assess model performance. Later, we introduce a new Multistage Structured Entity Extraction (MuSEE) model that harnesses the power of LMs for enhanced effectiveness and efficiency by decomposing the extraction task into multiple stages. Quantitative and human side-by-side evaluations confirm that our model outperforms baselines, offering promising directions for future advancements in structured entity extraction. Our source code is available at https://github.com/microsoft/Structured-Entity-Extraction. 
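The structured entity extraction entry above evaluates predictions with an entity-set overlap metric (AESOP). The real metric involves an assignment between predicted and gold entity sets; the greedy, name-matched overlap below is only a simplified illustration of the entity-centric (rather than triplet-centric) evaluation idea, not the AESOP definition.

```python
# Simplified entity-level F1: match entities by name, then credit each
# matched entity by its property-level overlap.
def entity_f1(pred: dict, gold: dict) -> float:
    """pred/gold map entity name -> dict of properties."""
    if not pred or not gold:
        return 0.0
    matched = set(pred) & set(gold)
    credit = sum(
        len(set(pred[e].items()) & set(gold[e].items()))
        / max(len(pred[e]), len(gold[e]), 1)
        for e in matched
    )
    precision, recall = credit / len(pred), credit / len(gold)
    return 2 * precision * recall / (precision + recall) if credit else 0.0

gold = {"Marie Curie": {"field": "physics", "born": "1867"}}
pred = {"Marie Curie": {"field": "physics", "born": "1868"}}
print(entity_f1(pred, gold))  # 0.5: right entity, one property wrong
```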
2024.emnlp-main.388 @@ -5466,7 +5466,7 @@ Efficient <fixed-case>LLM</fixed-case> Comparative Assessment: A Product of Experts Framework for Pairwise Comparisons AdianLiusieUniversity of Cambridge - VatsalRaina + VatsalRaina YassirFathullahUniversity of Cambridge MarkGalesUniversity of Cambridge 6835-6855 @@ -5478,7 +5478,7 @@ A Survey of <fixed-case>AMR</fixed-case> Applications - ShiraWeinAmherst College + ShiraWeinAmherst College JuriOpitzRuprecht-Karls-Universität Heidelberg and University of Zurich 6856-6875 In the ten years since the development of the Abstract Meaning Representation (AMR) formalism, substantial progress has been made on AMR-related tasks such as parsing and alignment. Still, the engineering applications of AMR are not fully understood. In this survey, we categorize and characterize more than 100 papers which use AMR for downstream tasks— the first survey of this kind for AMR. Specifically, we highlight (1) the range of applications for which AMR has been harnessed, and (2) the techniques for incorporating AMR into those applications. We also detect broader AMR engineering patterns and outline areas of future work that seem ripe for AMR incorporation. We hope that this survey will be useful to those interested in using AMR and that it sparks discussion on the role of symbolic representations in the age of neural-focused NLP research. @@ -5489,9 +5489,9 @@ Beyond Embeddings: The Promise of Visual Table in Visual Reasoning YiwuZhongThe Chinese University of Hong Kong - Zi-YuanHu - MichaelLyuThe Chinese University of Hong Kong - LiweiWangThe Chinese University of Hong Kong + Zi-YuanHu + MichaelLyuThe Chinese University of Hong Kong + LiweiWangThe Chinese University of Hong Kong 6876-6911 Visual representation learning has been a cornerstone in computer vision, involving typical forms such as visual embeddings, structural symbols, and text-based representations. Despite the success of CLIP-type visual embeddings, they often lack access to world knowledge critical for visual reasoning. In this work, we propose Visual Table, a novel form of visual representation tailored for visual reasoning. Visual tables are constructed as hierarchical descriptions of visual scenes, featuring a scene description and multiple object-centric descriptions covering categories, attributes, and knowledge. Thanks to the structural and textual formats, visual tables offer unique properties over mere visual embeddings, such as explainability and controllable editing. Furthermore, they deliver instance-level world knowledge and detailed attributes that are essential for visual reasoning. To create visual tables, we develop a generator trained on the dataset with collected, small-scale annotations. Extensive results on 11 visual reasoning benchmarks demonstrate that the generated visual tables significantly outperform previous structural and text-based representations. Moreover, they consistently enhance state-of-the-art multi-modal large language models across diverse benchmarks, showcasing their potential for advancing visual reasoning tasks. Our code is available at https://github.com/LaVi-Lab/Visual-Table. 
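The Visual Table entry above describes a hierarchical, text-based representation: a scene description plus object-centric records of categories, attributes, and knowledge. The sketch below shows what such a record might look like as a data structure; the field names are guesses for illustration, and the paper's repository defines the actual format.

```python
# Hypothetical "visual table" record, following the description above.
from dataclasses import dataclass, field, asdict
import json

@dataclass
class ObjectEntry:
    category: str
    attributes: list[str] = field(default_factory=list)
    knowledge: str = ""

@dataclass
class VisualTable:
    scene: str
    objects: list[ObjectEntry] = field(default_factory=list)

table = VisualTable(
    scene="A cyclist waits at a rainy downtown intersection.",
    objects=[
        ObjectEntry("bicycle", ["red", "road bike"],
                    "Bicycles yield to pedestrians at crosswalks."),
        ObjectEntry("traffic light", ["illuminated", "red signal"],
                    "A red signal requires vehicles to stop."),
    ],
)
print(json.dumps(asdict(table), indent=2))  # structured, editable, explainable
```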
2024.emnlp-main.391
@@ -5500,12 +5500,12 @@
 <fixed-case>C</fixed-case>are<fixed-case>C</fixed-case>orpus+: Expanding and Augmenting Caregiver Strategy Data to Support Pediatric Rehabilitation
- ShahlaFarzana
+ ShahlaFarzana
 IvanaLucero
 VivianVillegasUniversity of Illinois at Chicago
- Vera CKaelinUniversity of Illinois at Chicago
+ Vera CKaelinUniversity of Illinois at Chicago
 MaryKhetani
- NataliePardeUniversity of Illinois Chicago
+ NataliePardeUniversity of Illinois Chicago
 6912-6927
 Caregiver strategy classification in pediatric rehabilitation contexts is strongly motivated by real-world clinical constraints but highly under-resourced and seldom studied in natural language processing settings. We introduce a large dataset of 4,037 caregiver strategies in this setting, a five-fold increase over the nearest contemporary dataset. These strategies are manually categorized into clinically established constructs with high agreement (\kappa=0.68-0.89). We also propose two techniques to further address identified data constraints. First, we manually supplement target task data with publicly relevant data from online child health forums. Next, we propose a novel data augmentation technique to generate synthetic caregiver strategies with high downstream task utility. Extensive experiments showcase the quality of our dataset. They also establish evidence that both the publicly available data and the synthetic strategies result in large performance gains, with relative F_1 increases of 22.6% and 50.9%, respectively.
 2024.emnlp-main.392
@@ -5521,7 +5521,7 @@
 JiayiYuanRice University
 HongyeJinTexas A&M
 ZiruiLiuUniversity of Minnesota - Twin Cities
- VipinChaudharyCase Western Reserve University
+ VipinChaudharyCase Western Reserve University
 ShuaiXuCase Western Reserve University
 JamesCaverleeGoogle and Texas A&M University - College Station
 XiaHuRice University
@@ -5538,7 +5538,7 @@
 YuZhaoNankai University
 BaohangZhouNankai University
 XuhuiSui
- LiZhang
+ LiZhang
 KehuiSong
 6942-6952
 Temporal Knowledge Graph Question Answering (TKGQA) aims to answer temporal questions using knowledge in Temporal Knowledge Graphs (TKGs). Previous works employ pre-trained TKG embeddings or graph neural networks to incorporate the knowledge of TKGs. However, these methods fail to fully understand the complex semantic information of time constraints in questions. In contrast, Large Language Models (LLMs) have shown exceptional performance in knowledge graph reasoning, unifying both semantic understanding and structural reasoning. To further enhance LLMs’ temporal reasoning ability, this paper aims to integrate relevant temporal knowledge from TKGs into LLMs through a Time-aware Retrieve-Rewrite-Retrieve-Rerank framework, which we named TimeR^4. Specifically, to reduce temporal hallucination in LLMs, we propose a retrieve-rewrite module to rewrite questions using background knowledge stored in the TKGs, thereby acquiring explicit time constraints. Then, we implement a retrieve-rerank module aimed at retrieving semantically and temporally relevant facts from the TKGs and reranking them according to the temporal constraints. To achieve this, we fine-tune a retriever using the contrastive time-aware learning framework. Our approach achieves great improvements, with relative gains of 47.8% and 22.5% on two datasets, underscoring its effectiveness in boosting the temporal reasoning abilities of LLMs. Our code is available at https://github.com/qianxinying/TimeR4.
@@ -5556,7 +5556,7 @@ YangXu YunLuowestlake university PengfeiLiu - YueZhangWestlake University + YueZhangWestlake University ZhengZhangNew York University 6953-6975 Large Language Models (LLMs) have shown impressive capabilities but also a concerning tendency to hallucinate. This paper presents RefChecker, a framework that introduces claim-triplets to represent claims in LLM responses, aiming to detect fine-grained hallucinations. In RefChecker, an extractor generates claim-triplets from a response, which are then evaluated by a checker against a reference. We delineate three task settings: Zero, Noisy and Accurate Context, to reflect various real-world use cases. We curated a benchmark spanning various NLP tasks and annotated 11k claim-triplets from 2.1k responses by seven LLMs. RefChecker supports both proprietary and open-source models as the extractor and checker. Experiments demonstrate that claim-triplets enable superior hallucination detection, compared to other granularities such as response, sentence and sub-sentence level claims. RefChecker outperforms prior methods by 18.2 to 27.2 points on our benchmark and the checking results of RefChecker are strongly aligned with human judgments. @@ -5590,7 +5590,7 @@ Automatic Instruction Evolving for Large Language Models WeihaoZeng CanXuMicrosoft and Peking University - YingxiuZhao + YingxiuZhao Jian-GuangLouMicrosoft WeizhuChenMicrosoft GenAI 6998-7018 @@ -5602,14 +5602,14 @@ <fixed-case>R</fixed-case>ep<fixed-case>E</fixed-case>val: Effective Text Evaluation with <fixed-case>LLM</fixed-case> Representation ShuqianSheng - YiXu + YiXu TianhangZhang ZanweiShen LuoyiFu - JiaxinDingShanghai Jiaotong University + JiaxinDingShanghai Jiaotong University LeiZhouShanghai Jiaotong University - XiaoyingGanShanghai Jiaotong University - XinbingWangShanghai Jiao Tong University + XiaoyingGanShanghai Jiaotong University + XinbingWangShanghai Jiao Tong University ChenghuZhouIGSNRR, Chinese Academy of Sciences, Beijing, China 7019-7033 The era of Large Language Models (LLMs) raises new demands for automatic evaluation metrics, which should be adaptable to various application scenarios while maintaining low cost and effectiveness. Traditional metrics for automatic text evaluation are often tailored to specific scenarios, while LLM-based evaluation metrics are costly, requiring fine-tuning or rely heavily on the generation capabilities of LLMs. Besides, previous LLM-based metrics ignore the fact that, within the space of LLM representations, there exist direction vectors that indicate the estimation of text quality. To this end, we introduce RepEval, a metric that leverages the projection of LLM representations for evaluation. Through simple prompt modifications, RepEval can easily transition to various tasks, requiring only minimal sample pairs for direction vector construction. Results on fourteen datasets across two evaluation tasks demonstrate the high effectiveness of our method, which exhibits a higher correlation with human judgments than previous methods, even in complex evaluation scenarios involving pair-wise selection under nuanced aspects. Our work underscores the richness of information regarding text quality embedded within LLM representations, offering insights for the development of new metrics. 
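The RepEval entry above builds on the observation that the LLM representation space contains direction vectors indicating text quality. A hedged numpy sketch of that core idea, not the authors' implementation: estimate a direction from a handful of better/worse representation pairs, then score new texts by projection.

    import numpy as np

    def quality_direction(better, worse):
        # better, worse: (n, d) arrays of LLM representations for higher-
        # and lower-quality texts on the same prompts.
        v = better.mean(axis=0) - worse.mean(axis=0)
        return v / np.linalg.norm(v)

    def rep_score(rep, direction):
        # Project a single representation onto the quality direction.
        return float(rep @ direction)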
@@ -5619,7 +5619,7 @@ Generative Models for Automatic Medical Decision Rule Extraction from Text - YuxinHeHarbin Institute of Technology + YuxinHeHarbin Institute of Technology BuzhouTangHarbin Institute of Technology XiaolingWangEast China Normal University 7034-7048 @@ -5636,7 +5636,7 @@ ZhiyuanHu XiaobaoWu Cong-Duy TNguyenSchool of Computer Science and Engineering, Nanyang Technological University - See-KiongNgNational University of Singapore + See-KiongNgNational University of Singapore Anh TuanLuuNanyang Technological University 7049-7066 Seeking answers effectively for long videos is essential to build video question answering (videoQA) systems. Previous methods adaptively select frames and regions from long videos to save computations. However, this fails to reason over the whole sequence of video, leading to sub-optimal performance. To address this problem, we introduce a state space layer (SSL) into multi-modal Transformer to efficiently integrate global semantics of the video, which mitigates the video information loss caused by frame and region selection modules. Our SSL includes a gating unit to enable controllability over the flow of global semantics into visual representations. To further enhance the controllability, we introduce a cross-modal compositional congruence objective to encourage global semantics aligned with the question. To rigorously evaluate long-form videoQA capacity, we construct two new benchmarks Ego-QA and MAD-QA featuring videos of considerably long length, i.e. 17.5 minutes and 1.9 hours, respectively. Extensive experiments demonstrate the superiority of our framework on these new as well as existing datasets. @@ -5648,12 +5648,12 @@ Towards Understanding Jailbreak Attacks in <fixed-case>LLM</fixed-case>s: A Representation Space Analysis - YupingLinMichigan State University + YupingLinMichigan State University PengfeiHeMichigan State University - HanXuUniversity of Arizona + HanXuUniversity of Arizona YueXingMichigan State University MakotoYamadaOkinawa Institute of Science and Technology (OIST) - HuiLiu + HuiLiu JiliangTangMichigan State University 7067-7085 Large language models (LLMs) are susceptible to a type of attack known as jailbreaking, which misleads LLMs to output harmful contents. Although there are diverse jailbreak attack strategies, there is no unified understanding on why some methods succeed and others fail. This paper explores the behavior of harmful and harmless prompts in the LLM’s representation space to investigate the intrinsic properties of successful jailbreak attacks. We hypothesize that successful attacks share some similar properties: They are effective in moving the representation of the harmful prompt towards the direction to the harmless prompts. We leverage hidden representations into the objective of existing jailbreak attacks to move the attacks along the acceptance direction, and conduct experiments to validate the above hypothesis using the proposed objective. We hope this study provides new insights into understanding how LLMs understand harmfulness information. @@ -5663,11 +5663,11 @@ Enhancing Legal Case Retrieval via Scaling High-quality Synthetic Query-Candidate Pairs - ChengGao + ChengGao ChaojunXiao ZhenghaoLiuNortheastern University HuiminChenTsinghua University, Tsinghua University - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University MaosongSun 7086-7100 Legal case retrieval (LCR) aims to provide similar cases as references for a given fact description. 
This task is crucial for promoting consistent judgments in similar cases, effectively enhancing judicial fairness and improving work efficiency for judges. However, existing works face two main challenges for real-world applications: existing works mainly focus on case-to-case retrieval using lengthy queries, which does not match real-world scenarios; and the limited data scale, with current datasets containing only hundreds of queries, is insufficient to satisfy the training requirements of existing data-hungry neural models. To address these issues, we introduce an automated method to construct synthetic query-candidate pairs and build the largest LCR dataset to date, LEAD, which is hundreds of times larger than existing datasets. This data construction method can provide ample training signals for LCR models. Experimental results demonstrate that model training with our constructed data can achieve state-of-the-art results on two widely-used LCR benchmarks. Besides, the construction method can also be applied to civil cases and achieve promising results. The data and codes can be found in https://github.com/thunlp/LEAD. @@ -5683,7 +5683,7 @@ YantuanXianKunming University of Science and Technology ShengxiangGaoKunming University of Science and Technology KangLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - ZhengtaoYuKunming University of Science and Technology + ZhengtaoYuKunming University of Science and Technology 7101-7113 Large language models (LLMs) have demonstrated remarkable capabilities in comprehensively handling various types of natural language processing (NLP) tasks. However, there are significant differences in the knowledge and abilities required for different tasks. Therefore, it is important to understand whether the same LLM processes different tasks in the same way. Are there specific neurons in a LLM for different tasks? Inspired by neuroscience, this paper pioneers the exploration of whether distinct neurons are activated when a LLM handles different tasks. Compared with current research exploring the neurons of language and knowledge, task-specific neurons present a greater challenge due to their abstractness, diversity, and complexity. To address these challenges, this paper proposes a method for task-specific neuron localization based on Causal Gradient Variation with Special Tokens (CGVST). CGVST identifies task-specific neurons by concentrating on the most significant tokens during task processing, thereby eliminating redundant tokens and minimizing interference from non-essential neurons. Compared to traditional neuron localization methods, our approach can more effectively identify task-specific neurons. We conduct experiments across eight different public tasks. Experiments involving the inhibition and amplification of identified neurons demonstrate that our method can accurately locate task-specific neurons. 
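The task-specific-neuron entry above localizes neurons by causal gradient variation over the most significant tokens. As a rough point of reference only, a generic gradient-times-activation attribution (a common baseline, not the paper's CGVST method) looks like this in PyTorch:

    import torch

    def neuron_importance(activations: torch.Tensor, logit: torch.Tensor) -> torch.Tensor:
        # activations: a hidden layer's output that participated in computing
        # `logit` (i.e., still part of the autograd graph). Returns one score
        # per neuron: |activation * d(logit)/d(activation)|, averaged over tokens.
        (grads,) = torch.autograd.grad(logit, activations, retain_graph=True)
        return (activations * grads).abs().mean(dim=0)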
2024.emnlp-main.403 @@ -5692,7 +5692,7 @@ Liar, Liar, Logical Mire: A Benchmark for Suppositional Reasoning in Large Language Models - PhilippMondorfLudwig-Maximilians-Universität München + PhilippMondorfLudwig-Maximilians-Universität München BarbaraPlankLudwig-Maximilians-Universität München and IT University of Copenhagen 7114-7137 2024.emnlp-main.404 @@ -5705,7 +5705,7 @@ Advancing Test-Time Adaptation in Wild Acoustic Test Settings HongfuLiuNational University of Singapore HengguanHuangCopenhagen University - YeWang + YeWang 7138-7155 Acoustic foundation models, fine-tuned for Automatic Speech Recognition (ASR), suffer from performance degradation in wild acoustic test settings when deployed in real-world scenarios. Stabilizing online Test-Time Adaptation (TTA) under these conditions remains an open and unexplored question. Existing wild vision TTA methods often fail to handle speech data effectively due to the unique characteristics of high-entropy speech frames, which are unreliably filtered out even when containing crucial semantic content. Furthermore, unlike static vision data, speech signals follow short-term consistency, requiring specialized adaptation strategies. In this work, we propose a novel wild acoustic TTA method tailored for ASR fine-tuned acoustic foundation models. Our method, Confidence-Enhanced Adaptation, performs frame-level adaptation using a confidence-aware weight scheme to avoid filtering out essential information in high-entropy frames. Additionally, we apply consistency regularization during test-time optimization to leverage the inherent short-term consistency of speech signals. Our experiments on both synthetic and real-world datasets demonstrate that our approach outperforms existing baselines under various wild acoustic test settings, including Gaussian noise, environmental sounds, accent variations, and sung speech. 2024.emnlp-main.405 @@ -5720,7 +5720,7 @@ HarshJhamtaniMicrosoft PatrickXiaMicrosoft RichardShinGoogle - JasonEisnerMicrosoft and Johns Hopkins University + JasonEisnerMicrosoft and Johns Hopkins University BenjaminVan DurmeMicrosoft and Johns Hopkins University 7156-7168 We introduce iterative retrieval, a novel framework that empowers retrievers to make iterative decisions through policy optimization. Finding an optimal portfolio of retrieved items is a combinatorial optimization problem, generally considered NP-hard. This approach provides a learned approximation to such a solution, meeting specific task requirements under a given family of large language models (LLMs). We propose a training procedure based on reinforcement learning, incorporating feedback from LLMs. We instantiate an iterative retriever for composing in-context learning (ICL) exemplars and apply it to various semantic parsing tasks that demand synthesized programs as outputs. By adding only 4M additional parameters for state encoding, we convert an off-the-shelf dense retriever into a stateful iterative retriever, outperforming previous methods in selecting ICL exemplars on semantic parsing datasets such as CalFlow, TreeDST, and MTOP. Additionally, the trained iterative retriever generalizes across different inference LLMs beyond the one used during training. 
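The iterative-retrieval entry above frames exemplar selection as sequential decisions by a stateful retriever. A toy greedy stand-in (the paper learns the policy with reinforcement learning; the fixed state-update rule below is an assumption for illustration):

    import numpy as np

    def iterative_select(query_vec, cand_vecs, k=4):
        # Keep a running state (query blended with picks so far) and
        # greedily pick the candidate most similar to the current state.
        state = query_vec.copy()
        chosen = []
        for _ in range(k):
            sims = cand_vecs @ state
            sims[chosen] = -np.inf  # never re-pick an exemplar
            i = int(np.argmax(sims))
            chosen.append(i)
            state = (state + cand_vecs[i]) / 2.0
        return chosen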
@@ -5731,12 +5731,12 @@ Taxonomy-guided Semantic Indexing for Academic Paper Search - SeongKuKangUniversity of Illinois Urbana-Champaign - YunyiZhangUniversity of Illinois Urbana-Champaign - PengchengJiangUniversity of Illinois at Urbana-Champaign - DonghaLeeYonsei University + SeongKuKangUniversity of Illinois Urbana-Champaign + YunyiZhangUniversity of Illinois Urbana-Champaign + PengchengJiangUniversity of Illinois at Urbana-Champaign + DonghaLeeYonsei University JiaweiHan - HwanjoYuPohang University of Science and Technology + HwanjoYuPohang University of Science and Technology 7169-7184 Academic paper search is an essential task for efficient literature discovery and scientific advancement. While dense retrieval has advanced various ad-hoc searches, it often struggles to match the underlying academic concepts between queries and documents, which is critical for paper search. To enable effective academic concept matching for paper search, we propose Taxonomy-guided Semantic Indexing (TaxoIndex) framework. TaxoIndex extracts key concepts from papers and organizes them as a semantic index guided by an academic taxonomy, and then leverages this index as foundational knowledge to identify academic concepts and link queries and documents. As a plug-and-play framework, TaxoIndex can be flexibly employed to enhance existing dense retrievers. Extensive experiments show that TaxoIndex brings significant improvements, even with highly limited training data, and greatly enhances interpretability. 2024.emnlp-main.407 @@ -5767,7 +5767,7 @@ Advancing Adversarial Suffix Transfer Learning on Aligned Large Language Models HongfuLiuNational University of Singapore YuxiXie - YeWang + YeWang MichaelShiehNational University of Singapore 7213-7224 Language Language Models (LLMs) face safety concerns due to potential misuse by malicious users. Recent red-teaming efforts have identified adversarial suffixes capable of jailbreaking LLMs using the gradient-based search algorithm Greedy Coordinate Gradient (GCG). However, GCG struggles with computational inefficiency, limiting further investigations regarding suffix transferability and scalability across models and data. In this work, we bridge the connection between search efficiency and suffix transferability. We propose a two-stage transfer learning framework, DeGCG, which decouples the search process into behavior-agnostic pre-searching and behavior-relevant post-searching. Specifically, we employ direct first target token optimization in pre-searching to facilitate the search process. We apply our approach to cross-model, cross-data, and self-transfer scenarios. Furthermore, we introduce an interleaved variant of our approach, i-DeGCG, which iteratively leverages self-transferability to accelerate the search process. Experiments on HarmBench demonstrate the efficiency of our approach across various models and domains. Notably, our i-DeGCG outperforms the baseline on Llama2-chat-7b with ASRs of 43.9 (+ 22.2) and 39.0 (+19.5) on valid and test sets, respectively. Further analysis on cross-model transfer indicates the pivotal role of first target token optimization in leveraging suffix transferability for efficient searching. 
@@ -5781,7 +5781,7 @@ ZhiyuCao PeifengLiSoochow University, China YaxinFan - QiaomingZhuSoochow University + QiaomingZhuSoochow University 7225-7238 Although existing fashionable generation methods on Incomplete Utterance Rewriting (IUR) can generate coherent utterances, they often result in the inclusion of irrelevant and redundant tokens in rewritten utterances due to their inability to focus on critical tokens in dialogue context. Furthermore, the limited size of the training datasets also contributes to the insufficient training of the IUR model. To address the first issue, we propose a multi-task learning framework EO-IUR (Editing Operation-guided Incomplete Utterance Rewriting) that introduces the editing operation labels generated by sequence labeling module to guide generation model to focus on critical tokens. Furthermore, we introduce a token-level heterogeneous graph to represent dialogues. To address the second issue, we propose a two-dimensional utterance augmentation strategy, namely editing operation-based incomplete utterance augmentation and LLM-based historical utterance augmentation. The experimental results on three datasets demonstrate that our EO-IUR outperforms previous state-of-the-art (SOTA) baselines in both open-domain and task-oriented dialogue. 2024.emnlp-main.410 @@ -5801,11 +5801,11 @@ Aligning Large Language Models with Diverse Political Viewpoints - DominikStammbach + DominikStammbach PhilineWidmer EunjungChoETHZ - ETH Zurich CaglarGulcehreEPFL - EPF Lausanne - ElliottAshSwiss Federal Institute of Technology + ElliottAshSwiss Federal Institute of Technology 7257-7267 Large language models such as ChatGPT exhibit striking political biases. If users query them about political information, they often take a normative stance. To overcome this, we align LLMs with diverse political viewpoints from 100,000 comments written by candidates running for national parliament in Switzerland. Models aligned with this data can generate more accurate political viewpoints from Swiss parties, compared to commercial models such as ChatGPT. We also propose a procedure to generate balanced overviews summarizing multiple viewpoints using such models. The replication package contains all code and data. 2024.emnlp-main.412 @@ -5833,7 +5833,7 @@ DongliangXu QingYang HongtaoLiuDu Xiaoman Financial - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 7288-7301 Scaling the rotary position embedding (RoPE) has become a common method for extending the context window of RoPE-based large language models (LLMs). However, existing scaling methods often rely on empirical approaches and lack a profound understanding of the internal distribution within RoPE, resulting in suboptimal performance in extending the context window length. In this paper, we propose to optimize the context window extending task from the view of rotary angle distribution. Specifically, we first estimate the distribution of the rotary angles within the model and analyze the extent to which length extension perturbs this distribution. Then, we present a novel extension strategy that minimizes the disturbance between rotary angle distributions to maintain consistency with the pre-training phase, enhancing the model’s capability to generalize to longer sequences. Experimental results compared to the strong baseline methods demonstrate that our approach reduces by up to 72% of the distributional disturbance when extending LLaMA2’s context window to 8k, and reduces by up to 32% when extending to 16k. 
On the LongBench-E benchmark, our method achieves an average improvement of up to 4.33% over existing state-of-the-art methods. Furthermore, Our method maintains the model’s performance on the Hugging Face Open LLM benchmark after context window extension, with only an average performance fluctuation ranging from -0.12 to +0.22. 2024.emnlp-main.414 @@ -5857,9 +5857,9 @@ DaquanZhouBytedance HongyuRen ZhenDongNexusflow.ai Inc - KurtKeutzerUniversity of California Berkeley - See-KiongNgNational University of Singapore - JiashiFengByteDance + KurtKeutzerUniversity of California Berkeley + See-KiongNgNational University of Singapore + JiashiFengByteDance 7315-7332 Large Language Models (LLMs) have significantly advanced natural language processing, demonstrating exceptional reasoning, tool usage, and memory capabilities. As their applications expand into multi-agent environments, there arises a need for a comprehensive evaluation framework that captures LLMs’ reasoning, planning, collaboration, and other social abilities. This work introduces a novel competition-based benchmark framework specifically designed to assess LLMs within multi-agent settings, providing quantitative metrics to evaluate their judgment, reasoning, deception, self-awareness, cooperation, coordination, and rationality.We utilize two social deduction games alongside three game-theory scenarios to create diverse environments.Our frame is fortified with the probabilistic graphic modeling (PGM) method, enhancing the LLMs’ capabilities in navigating complex social and cognitive dimensions. We evaluate seven LLMs, quantitatively highlighting a significant capability gap of over threefold between the strongest, GPT o1, and the weakest, Llama-2-70B. It also confirms that our PGM enhancement boosts the abilities of all selected models by an average of 37%. Our data and code can be found here https://github.com/cathyxl/MAgIC. 2024.emnlp-main.416 @@ -5869,7 +5869,7 @@ Position Engineering: Boosting Large Language Models through Positional Information Manipulation ZhiyuanHeMicrosoft - HuiqiangJiangMicrosoft + HuiqiangJiangMicrosoft ZilongWangMicrosoft Research YuqingYangResearch, Microsoft Luna K.QiuMicrosoft @@ -5889,10 +5889,10 @@ ShunianChenShenzhen Research Institute of Big Data Guiming HardyChenUniversity of Texas at Dallas XidongWang - ZhenyangCaiThe Chinese University of Hong Kong, Shenzhen + ZhenyangCaiThe Chinese University of Hong Kong, Shenzhen KeJi XiangWanShenzhen Research Institute of Big Data - BenyouWangThe Chinese University of Hong Kong, Shenzhen + BenyouWangThe Chinese University of Hong Kong, Shenzhen 7346-7370 The rapid development of multimodal large language models (MLLMs), such as GPT-4V, has led to significant advancements. However, these models still face challenges in medical multimodal capabilities due to limitations in the quantity and quality of medical vision-text data, stemming from data privacy concerns and high annotation costs. While pioneering approaches utilize PubMed’s large-scale, de-identified medical image-text pairs to address these limitations, they often fall short due to inherent data noise. To tackle this, we refined medical image-text pairs from PubMed and employed MLLMs (GPT-4V) in an ‘unblinded’ capacity to denoise and reformat the data, resulting in the creation of the **PubMedVision** dataset with 1.3 million medical VQA samples. 
Our validation demonstrates that: (1) PubMedVision can significantly enhance the medical multimodal capabilities of MLLMs, showing significant improvement in benchmarks including the MMMU Health & Medicine track; (2) manual checks by medical experts and empirical results validate the superior data quality of our dataset compared to other data construction methods. Using PubMedVision, we train a 34B medical MLLM **HuatuoGPT-Vision**, which shows superior performance in medical multimodal scenarios among open-source MLLMs. Our code and data are available at https://github.com/FreedomIntelligence/HuatuoGPT-Vision. 2024.emnlp-main.418 @@ -5903,11 +5903,11 @@ <fixed-case>ADELIE</fixed-case>: Aligning Large Language Models on Information Extraction YunjiaQiTsinghua University - HaoPengTsinghua University, Tsinghua University - XiaozhiWangDepartment of Computer Science and Technology, Tsinghua University - BinXu - LeiHouTsinghua University, Tsinghua University - JuanziLi + HaoPengTsinghua University, Tsinghua University + XiaozhiWangDepartment of Computer Science and Technology, Tsinghua University + BinXu + LeiHouTsinghua University, Tsinghua University + JuanziLi 7371-7387 2024.emnlp-main.419 2024.emnlp-main.419.software.zip @@ -5918,10 +5918,10 @@ Unveiling Factual Recall Behaviors of Large Language Models through Knowledge Neurons YifeiWang YuhengChen - WantingWen - YuShengInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - LinjingLiInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - Daniel DajunZengInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + WantingWen + YuShengInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + LinjingLiInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + Daniel DajunZengInstitute of automation, Chinese academy of science, Chinese Academy of Sciences 7388-7402 In this paper, we investigate whether Large Language Models (LLMs) actively recall or retrieve their internal repositories of factual knowledge when faced with reasoning tasks. Through an analysis of LLMs’ internal factual recall at each reasoning step via Knowledge Neurons, we reveal that LLMs fail to harness the critical factual associations under certain circumstances. Instead, they tend to opt for alternative, shortcut-like pathways to answer reasoning questions. By manually manipulating the recall process of parametric knowledge in LLMs, we demonstrate that enhancing this recall process directly improves reasoning performance whereas suppressing it leads to notable degradation. Furthermore, we assess the effect of Chain-of-Thought (CoT) prompting, a powerful technique for addressing complex reasoning tasks. Our findings indicate that CoT can intensify the recall of factual knowledge by encouraging LLMs to engage in orderly and reliable reasoning. Furthermore, we explored how contextual conflicts affect the retrieval of facts during the reasoning process to gain a comprehensive understanding of the factual recall behaviors of LLMs. Code and data will be available soon. 2024.emnlp-main.420 @@ -5930,8 +5930,8 @@ Lexically Grounded Subword Segmentation - JindřichLibovickýCharles University Prague - JindřichHelclCharles University + JindřichLibovickýCharles University Prague + JindřichHelclCharles University 7403-7420 We present three innovations in tokenization and subword segmentation. 
First, we propose to use unsupervised morphological analysis with Morfessor as pre-tokenization. Second, we present an algebraic method for obtaining subword embeddings grounded in a word embedding space. Based on that, we design a novel subword segmentation algorithm that uses the embeddings, ensuring that the procedure considers lexical meaning. Third, we introduce an efficient segmentation algorithm based on a subword bigram model that can be initialized with the lexically aware segmentation method to avoid using Morfessor and large embedding tables at inference time. We evaluate the proposed approaches using two intrinsic metrics and measure their performance on two downstream tasks: part-of-speech tagging and machine translation. Our experiments show significant improvements in the morphological plausibility of the segmentation when evaluated using segmentation precision on morpheme boundaries and improved Rényi efficiency in 8 languages. Although the proposed tokenization methods do not have a large impact on automatic translation quality, we observe consistent performance gains in the arguably more morphological task of part-of-speech tagging. 2024.emnlp-main.421 @@ -5955,10 +5955,10 @@ Do Text-to-Vis Benchmarks Test Real Use of Visualisations? HyNguyenUniversity of Sydney, University of Sydney XuefeiHeUniversity of Hong Kong - AndrewReesonCSIRO - CecileParisCSIRO - JosiahPoonUniversity of Sydney - Jonathan K.KummerfeldUniversity of Sydney + AndrewReesonCSIRO + CecileParisCSIRO + JosiahPoonUniversity of Sydney + Jonathan K.KummerfeldUniversity of Sydney 7433-7441 Large language models are able to generate code for visualisations in response to simple user requests.This is a useful application and an appealing one for NLP research because plots of data provide grounding for language.However, there are relatively few benchmarks, and those that exist may not be representative of what users do in practice.This paper investigates whether benchmarks reflect real-world use through an empirical study comparing benchmark datasets with code from public repositories.Our findings reveal a substantial gap, with evaluations not testing the same distribution of chart types, attributes, and actions as real-world examples.One dataset is representative, but requires extensive modification to become a practical end-to-end benchmark. This shows that new benchmarks are needed to support the development of systems that truly address users’ visualisation needs.These observations will guide future data creation, highlighting which features hold genuine significance for users. 2024.emnlp-main.423 @@ -5969,7 +5969,7 @@ Gold Panning in Vocabulary: An Adaptive Method for Vocabulary Expansion of Domain-Specific <fixed-case>LLM</fixed-case>s ChengyuanLiu - ShihangWangAlibaba Group + ShihangWangAlibaba Group LizhiQingAlibaba Group KunKuangZhejiang University YangyangKangAlibaba Group @@ -5996,8 +5996,8 @@ Multi-Dialect <fixed-case>V</fixed-case>ietnamese: Task, Dataset, Baseline Models and Challenges Nguyen VanDinh Thanh ChiDang - LuanThanh NguyenUniversity of Information Technology, Vietnam National University Ho Chi Minh City - Kiet VanNguyenUniversity of Information Technology, VNU-HCM + LuanThanh NguyenUniversity of Information Technology, Vietnam National University Ho Chi Minh City + Kiet VanNguyenUniversity of Information Technology, VNU-HCM 7476-7498 Vietnamese, a low-resource language, is typically categorized into three primary dialect groups that belong to Northern, Central, and Southern Vietnam. 
However, each province within these regions exhibits its own distinct pronunciation variations. Despite the existence of various speech recognition datasets, none of them has provided a fine-grained classification of the 63 dialects specific to individual provinces of Vietnam. To address this gap, we introduce Vietnamese Multi-Dialect (ViMD) dataset, a novel comprehensive dataset capturing the rich diversity of 63 provincial dialects spoken across Vietnam. Our dataset comprises 102.56 hours of audio, consisting of approximately 19,000 utterances, and the associated transcripts contain over 1.2 million words. To provide benchmarks and simultaneously demonstrate the challenges of our dataset, we fine-tune state-of-the-art pre-trained models for two downstream tasks: (1) Dialect identification and (2) Speech recognition. The empirical results suggest two implications including the influence of geographical factors on dialects, and the constraints of current approaches in speech recognition tasks involving multi-dialect speech data. Our dataset is available for research purposes. 2024.emnlp-main.426 @@ -6006,7 +6006,7 @@ Is <fixed-case>LLM</fixed-case>-as-a-Judge Robust? Investigating Universal Adversarial Attacks on Zero-shot <fixed-case>LLM</fixed-case> Assessment - VyasRaina + VyasRaina AdianLiusieUniversity of Cambridge MarkGalesUniversity of Cambridge 7499-7517 @@ -6021,8 +6021,8 @@ ZhicongLu LiJin PeiguangLi - YuTian - LinhaoZhang + YuTian + LinhaoZhang SiruiWang GuangluanXuUniversity of the Chinese Academy of Sciences ChangyuanTian @@ -6039,7 +6039,7 @@ More Than Catastrophic Forgetting: Integrating General Capabilities For Domain-Specific <fixed-case>LLM</fixed-case>s ChengyuanLiu YangyangKangAlibaba Group - ShihangWangAlibaba Group + ShihangWangAlibaba Group LizhiQingAlibaba Group FubangZhaoAlibaba Group ChaoWu @@ -6054,10 +6054,10 @@ Muting Whisper: A Universal Acoustic Adversarial Attack on Speech Foundation Models - VyasRaina + VyasRaina RaoMa - CharlesMcGhee - KateKnillUniversity of Cambridge + CharlesMcGhee + KateKnillUniversity of Cambridge MarkGalesUniversity of Cambridge 7549-7565 2024.emnlp-main.430 @@ -6068,7 +6068,7 @@ <fixed-case>GENRA</fixed-case>: Enhancing Zero-shot Retrieval with Rank Aggregation GeorgiosKatsimprasNCSR Demokritos - GeorgiosPaliourasNCSR “Demokritos” + GeorgiosPaliourasNCSR “Demokritos” 7566-7577 Large Language Models (LLMs) have been shown to effectively perform zero-shot document retrieval, a process that typically consists of two steps: i) retrieving relevant documents, and ii) re-ranking them based on their relevance to the query. This paper presents GENRA, a new approach to zero-shot document retrieval that incorporates rank aggregation to improve retrieval effectiveness. Given a query, GENRA first utilizes LLMs to generate informative passages that capture the query’s intent. These passages are then employed to guide the retrieval process, selecting similar documents from the corpus. Next, we use LLMs again for a second refinement step. This step can be configured for either direct relevance assessment of each retrieved document or for re-ranking the retrieved documents. Ultimately, both approaches ensure that only the most relevant documents are kept. Upon this filtered set of documents, we perform multi-document retrieval, generating individual rankings for each document. As a final step, GENRA leverages rank aggregation, combining the individual rankings to produce a single refined ranking. 
Extensive experiments on benchmark datasets demonstrate that GENRA improves existing approaches, highlighting the effectiveness of the proposed methodology in zero-shot retrieval. 2024.emnlp-main.431 @@ -6080,7 +6080,7 @@ ZichenChenUniversity of California, Santa Barbara JiandaChenNanyang Technological University AmbujSinghUC Santa Barbara - MishaSraUniversity of California, Santa Barbara + MishaSraUniversity of California, Santa Barbara 7578-7596 Large Language Models (LLMs) have achieved remarkable success in natural language tasks, yet understanding their reasoning processes remains a significant challenge. We address this by introducing XplainLLM, a dataset accompanying an explanation framework designed to enhance LLM transparency and reliability. Our dataset comprises 24,204 instances where each instance interprets the LLM’s reasoning behavior using knowledge graphs (KGs) and graph attention networks (GAT), and includes explanations of LLMs such as the decoder-only Llama-3 and the encoder-only RoBERTa. XplainLLM also features a framework for generating grounded explanations and the debugger-scores for multidimensional quality analysis. Our explanations include why-choose and why-not-choose components, reason-elements, and debugger-scores that collectively illuminate the LLM’s reasoning behavior. Our evaluations demonstrate XplainLLM’s potential to reduce hallucinations and improve grounded explanation generation in LLMs. XplainLLM is a resource for researchers and practitioners to build trust and verify the reliability of LLM outputs. Our code and dataset are publicly available. 2024.emnlp-main.432 @@ -6089,8 +6089,8 @@ Divide and Conquer Radiology Report Generation via Observation Level Fine-grained Pretraining and Prompt Tuning - YuanpinZhou - HuogenWang + YuanpinZhou + HuogenWang 7597-7610 The automation of radiology report generation (RRG) holds immense potential to alleviate radiologists’ workloads and improve diagnostic accuracy. Despite advancements in image captioning and vision-language pretraining, RRG remains challenging due to the lengthy and complex nature of radiology reports. In this work, we proposes the Divide and Conquer Radiology Report Generation (DCRRG) model, which breaks down full-text radiology reports into concise observation descriptions. This approach enables the model to capture fine-grained representations from each observation through a two-stage process: an encoding stage focusing on observation prediction tasks to learn fine-grained representations, and a decoding stage for integrating these descriptions into cohesive and comprehensive radiology reports. Experimental results on two benchmark datasets demonstrate that DCRRG achieves significant improvements across all evaluated metrics, underscoring its capability to generate semantically coherent and clinically accurate radiology reports. 
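The GENRA entry above closes with a rank-aggregation step that merges the per-document rankings into one refined ranking. A minimal stand-in using a plain Borda count (the paper's exact aggregation scheme may differ):

    from collections import defaultdict

    def borda_aggregate(rankings):
        # rankings: list of rankings, each a list of doc ids ordered best-first.
        scores = defaultdict(float)
        for ranking in rankings:
            n = len(ranking)
            for pos, doc in enumerate(ranking):
                scores[doc] += n - pos  # earlier position -> more points
        return sorted(scores, key=scores.get, reverse=True)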
2024.emnlp-main.433 @@ -6101,7 +6101,7 @@ <fixed-case>SUR</fixed-case>f: Teaching Large Vision-Language Models to Selectively Utilize Retrieved Information JiashuoSun JihaiZhangThe Chinese University of Hong Kong - YuchengZhouUniversity of Macau + YuchengZhouUniversity of Macau ZhaochenSu XiaoyeQuShanghai Artificial Intelligence Laboratory YuChengThe Chinese University of Hong Kong @@ -6114,12 +6114,12 @@ <fixed-case>UNO</fixed-case> Arena for Evaluating Sequential Decision-Making Capability of Large Language Models ZhanyueQin - HaochuanWang + HaochuanWang DeyuanLiu - ZiyangSongHarbin Institute of Technology + ZiyangSongHarbin Institute of Technology CunhangFan ZhaoLvSchool of Computer Science and Technology, Anhui University, Hefei 230601, China - JinlinWuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + JinlinWuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences ZhenLeiInstitute of Automation, Chinese Academy of Sciences ZhiyingTu DianhuiChu @@ -6134,13 +6134,13 @@ Middleware for <fixed-case>LLM</fixed-case>s: Tools Are Instrumental for Language Agents in Complex Environments YuGu - YihengShuThe Ohio State University + YihengShuThe Ohio State University HaoYu - XiaoLiu - YuxiaoDongTsinghua University + XiaoLiu + YuxiaoDongTsinghua University JieTangTsinghua University, Tsinghua University JayanthSrinivasa - HugoLatapieCisco + HugoLatapieCisco YuSuOhio State University and Microsoft 7646-7663 The applications of large language models (LLMs) have expanded well beyond the confines of text processing, signaling a new era where LLMs are envisioned as generalist agents capable of operating within complex environments. These environments are often highly expansive, making it impossible for the LLM to process them within its short-term memory. Motivated by recent research on extending the capabilities of LLMs with tools, we seek to investigate the intriguing potential of tools to augment LLMs in handling such complexity by introducing a novel class of tools, termed *middleware*, to aid in the proactive exploration within these massive environments. Such specialized tools can serve as a middleware layer shielding the LLM from environmental complexity. In two representative complex environments—knowledge bases (KBs) and databases—we demonstrate the significant potential of augmenting language agents with tools in complex environments. Notably, equipped with the middleware, GPT-4 achieves **2.8**X the performance of the best baseline in tasks requiring access to database content and **2.2**X in KB tasks. Our findings illuminate the path for advancing language agents in real-world applications. @@ -6153,8 +6153,8 @@ <fixed-case>MORPHEUS</fixed-case>: Modeling Role from Personalized Dialogue History by Exploring and Utilizing Latent Space YihongTang - BoWang - DongmingZhao + BoWang + DongmingZhao JinxiaojiaJinxiaojia ZhangjijunZhangjijun RuifangHeTianjin University @@ -6174,7 +6174,7 @@ RuiYeShanghai Jiaotong University JingyiChai SihengChenShanghai Jiao Tong University - YanfengWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University 7677-7695 The success of large language models (LLMs) facilitate many parties to fine-tune LLMs on their own private data. However, this practice raises privacy concerns due to the memorization of LLMs. 
Existing solutions, such as utilizing synthetic data for substitution, struggle to simultaneously improve performance and preserve privacy.They either rely on a local model for generation, resulting in a performance decline, or take advantage of APIs, directly exposing the data to API servers. To address this issue, we propose KnowledgeSG, a novel client-server framework which enhances synthetic data quality and improves model performance while ensuring privacy. We achieve this by learning local knowledge from the private data with differential privacy (DP) and distilling professional knowledge from the server. Additionally, inspired by federated learning, we transmit models rather than data between the client and server to prevent privacy leakage.Extensive experiments in medical and financial domains demonstrate the effectiveness of *KnowledgeSG*. Our code is now publicly available at https://github.com/wwh0411/KnowledgeSG. 2024.emnlp-main.438 @@ -6185,7 +6185,7 @@ <fixed-case>DAMRO</fixed-case>: Dive into the Attention Mechanism of <fixed-case>LVLM</fixed-case> to Reduce Object Hallucination XuanGong TianshiMing - XinpengWang + XinpengWang ZhihuaWeiTongji University 7696-7712 Despite the great success of Large Vision-Language Models (LVLMs), they inevitably suffer from hallucination. As we know, both the visual encoder and the Large Language Model (LLM) decoder in LVLMs are Transformer-based, allowing the model to extract visual information and generate text outputs via attention mechanisms. We find that the attention distribution of LLM decoder on image tokens is highly consistent with the visual encoder and both distributions tend to focus on particular background tokens rather than the referred objects in the image. We attribute to the unexpected attention distribution to an inherent flaw in the visual encoder itself, which misguides LLMs to over emphasize the redundant information and generate object hallucination. To address the issue, we propose DAMRO, a novel training-free strategy that **D**ive into **A**ttention **M**echanism of LVLM to **R**educe **O**bject Hallucination. Specifically, our approach employs classification token (CLS) of ViT to filter out high-attention tokens scattered in the background and then eliminate their influence during decoding stage. We evaluate our method on LVLMs including LLaVA-1.5, LLaVA-NeXT and InstructBLIP, using various benchmarks such as POPE, CHAIR, MME and GPT-4V Aided Evaluation. The results demonstrate that our approach significantly reduces the impact of these outlier tokens, thus effectively alleviating the hallucination of LVLMs. @@ -6225,10 +6225,10 @@ An Empirical Study of Multilingual Reasoning Distillation for Question Answering PatompornPayoungkhamdeeVidyasirimedhi Institute of Science and Technology PeeratLimkonchotiwatAI Singapore - JinheonBaekGoogle and Korea Advanced Institute of Science & Technology + JinheonBaekGoogle and Korea Advanced Institute of Science & Technology PotsaweeManakulSCB 10X CanUdomcharoenchaikitVidyasirimedhi Institute of Science and Technology (VISTEC) - EkapolChuangsuwanichChulalongkorn University + EkapolChuangsuwanichChulalongkorn University SaranaNutanong 7739-7751 Reasoning is one crucial capability in Large Language Models (LLMs), allowing them to perform complex tasks such as solving math problems and multi-step planning. While reasoning capability can emerge in larger models, smaller ones usually have to rely on distillation to transfer this capability from a larger model. 
However, recent efforts to distill reasoning capabilities have focused mainly on English, leaving multilingual distillation underexplored. To address this gap, this paper examines existing English reasoning distillation methods that utilize a variety of positive rationales in multilingual settings and proposes d-CoT-nR, a novel approach that incorporates incorrect rationales as additional guidance. Empirical results from multilingual high-school examinations show that d-CoT-nR significantly surpasses the baseline, improving accuracy in unseen languages and correctness in step-by-step reasoning. @@ -6253,7 +6253,7 @@ GalYonaResearch, Google RoeeAharoniGoogle MatanEyalAllen Institute for Artificial Intelligence - AmirFederColumbia University and Google + AmirFederColumbia University and Google RoiReichartTechnion, Israel Institute of Technology JonathanHerzigResearch, Google 7765-7784 @@ -6266,7 +6266,7 @@ Bridging Modalities: Enhancing Cross-Modality Hate Speech Detection with Few-Shot In-Context Learning Ming ShanHee AditiKumaresan - Roy Ka-WeiLeeSingapore University of Technology and Design and University of Saskatchewan + Roy Ka-WeiLeeSingapore University of Technology and Design and University of Saskatchewan 7785-7799 The widespread presence of hate speech on the internet, including formats such as text-based tweets and multimodal memes, poses a significant challenge to digital platform safety. Recent research has developed detection models tailored to specific modalities; however, there is a notable gap in transferring detection capabilities across different formats. This study conducts extensive experiments using few-shot in-context learning with large language models to explore the transferability of hate speech detection between modalities. Our findings demonstrate that text-based hate speech examples can significantly enhance the classification accuracy of vision-language hate speech. Moreover, text-based demonstrations outperform vision-language demonstrations in few-shot learning settings. These results highlight the effectiveness of cross-modality knowledge transfer and offer valuable insights for improving hate speech detection systems. 2024.emnlp-main.445 @@ -6275,20 +6275,20 @@ <fixed-case>MIND</fixed-case>: Multimodal Shopping Intention Distillation from Large Vision-language Models for <fixed-case>E</fixed-case>-commerce Purchase Understanding - BaixuanXuHong Kong University of Science and Technology - WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology + BaixuanXuHong Kong University of Science and Technology + WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology HaochenShi WenxuanDingUniversity of Texas at Austin - HuihaoJing + HuihaoJing TianqingFangTencent AI Lab - JiaxinBaiThe Hong Kong University of Science and Technology - XinLiuAmazon + JiaxinBaiThe Hong Kong University of Science and Technology + XinLiuAmazon ChanglongYuDepartment of Computer Science and Engineering, The Hong Kong University of Science and Technology ZhengLiAmazon - ChenLuoAmazon + ChenLuoAmazon QingyuYinAmazon BingYin - LongChenThe Hong Kong University of Science and Technology + LongChenThe Hong Kong University of Science and Technology YangqiuSongThe Hong Kong University of Science and Technology 7800-7815 Improving user experience and providing personalized search results in E-commerce platforms heavily rely on understanding purchase intention. 
However, existing methods for acquiring large-scale intentions bank on distilling large language models with human annotation for verification. Such an approach tends to generate product-centric intentions, overlook valuable visual information from product images, and incurs high costs for scalability. To address these issues, we introduce MIND, a multimodal framework that allows Large Vision-Language Models (LVLMs) to infer purchase intentions from multimodal product metadata and prioritize human-centric ones. Using Amazon Review data, we apply MIND and create a multimodal intention knowledge base, which contains 1,264,441 intentions derived from 126,142 co-buy shopping records across 107,215 products. Extensive human evaluations demonstrate the high plausibility and typicality of our obtained intentions and validate the effectiveness of our distillation framework and filtering mechanism. Further experiments reveal the positive downstream benefits that MIND brings to intention comprehension tasks and highlight the importance of multimodal generation and role-aware filtering. Additionally, MIND shows robustness to different prompts and superior generation quality compared to previous methods. @@ -6300,13 +6300,13 @@ <fixed-case>ECON</fixed-case>: On the Detection and Resolution of Evidence Conflicts ChengJiayangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology - ChunkitChan + ChunkitChan QianqianZhuang LinQiu TianhangZhang - TengxiaoLiuUniversity of California, Santa Barbara + TengxiaoLiuUniversity of California, Santa Barbara YangqiuSongThe Hong Kong University of Science and Technology - YueZhangWestlake University + YueZhangWestlake University PengfeiLiu ZhengZhangNew York University 7816-7844 @@ -6317,8 +6317,8 @@ “Image, Tell me your story!” Predicting the original meta-context of visual misinformation - JonathanTonglet - Marie-FrancineMoensKU Leuven, KU Leuven + JonathanTonglet + Marie-FrancineMoensKU Leuven, KU Leuven IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt 7845-7864 To assist human fact-checkers, researchers have developed automated approaches for visual misinformation detection. These methods assign veracity scores by identifying inconsistencies between the image and its caption, or by detecting forgeries in the image. However, they neglect a crucial point of the human fact-checking process: identifying the original meta-context of the image. By explaining what is actually true about the image, fact-checkers can better detect misinformation, focus their efforts on check-worthy visual content, engage in counter-messaging before misinformation spreads widely, and make their explanation more convincing. Here, we fill this gap by introducing the task of automated image contextualization. We create 5Pils, a dataset of 1,676 fact-checked images with question-answer pairs about their original meta-context. Annotations are based on the 5 Pillars fact-checking framework. We implement a first baseline that grounds the image in its original meta-context using the content of the image and textual evidence retrieved from the open web. Our experiments show promising results while highlighting several open challenges in retrieval and reasoning. @@ -6335,7 +6335,7 @@ ChenxinDiao KaustubhVyasHuawei Technologies Ltd. 
YuanyiJi - Jeff Z.PanUniversity of Edinburgh, University of Edinburgh + Jeff Z.PanUniversity of Edinburgh, University of Edinburgh 7865-7879 2024.emnlp-main.449 shen-etal-2024-improving @@ -6343,10 +6343,10 @@ Mixture-of-Subspaces in Low-Rank Adaptation - TaiqiangWu + TaiqiangWu JiahaoWang ZheZhao - NgaiWongThe University of Hong Kong + NgaiWongThe University of Hong Kong 7880-7899 In this paper, we introduce a subspace-inspired Low-Rank Adaptation (LoRA) method, which is computationally efficient, easy to implement, and readily applicable to large language, multimodal, and diffusion models. Initially, we equivalently decompose the weights of LoRA into two subspaces, and find that simply mixing them can enhance performance. To study such a phenomenon, we revisit it through a fine-grained subspace lens, showing that such modification is equivalent to employing a fixed mixer to fuse the subspaces. To be more flexible, we jointly learn the mixer with the original LoRA weights, and term the method as Mixture-of-Subspaces LoRA (MoSLoRA). MoSLoRA consistently outperforms LoRA on tasks in different modalities, including commonsense reasoning, visual instruction tuning, and subject-driven text-to-image generation, demonstrating its effectiveness and robustness. 2024.emnlp-main.450 @@ -6357,10 +6357,10 @@ <fixed-case>PARIKSHA</fixed-case>: A Large-Scale Investigation of Human-<fixed-case>LLM</fixed-case> Evaluator Agreement on Multilingual and Multi-Cultural Data IshaanWattsGoogle DeepMind - VarunGummaMicrosoft + VarunGummaMicrosoft AdityaYadavalliKarya Inc VivekSeshadriResearch, Microsoft - ManoharSwaminathanResearch, Microsoft + ManoharSwaminathanResearch, Microsoft SunayanaSitaramMicrosoft 7900-7932 Evaluation of multilingual Large Language Models (LLMs) is challenging due to a variety of factors – the lack of benchmarks with sufficient linguistic diversity, contamination of popular benchmarks into LLM pre-training data and the lack of local, cultural nuances in translated benchmarks. In this work, we study human and LLM-based evaluation in a multilingual, multi-cultural setting. We evaluate 30 models across 10 Indic languages by conducting 90K human evaluations and 30K LLM-based evaluations and find that models such as GPT-4o and Llama-3 70B consistently perform best for most Indic languages. We build leaderboards for two evaluation settings - pairwise comparison and direct assessment and analyse the agreement between humans and LLMs. We find that humans and LLMs agree fairly well in the pairwise setting but the agreement drops for direct assessment evaluation especially for languages such as Bengali and Odia. We also check for various biases in human and LLM-based evaluation and find evidence of self-bias in the GPT-based evaluator. Our work presents a significant step towards scaling up multilingual evaluation of LLMs. 
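The MoSLoRA entry above describes inserting a learnable r x r mixer between the two LoRA subspaces and training it jointly with the original LoRA weights. A minimal PyTorch sketch of that structure (an illustration of the idea as described, not the released code; the initialization choices are assumptions):

    import torch
    import torch.nn as nn

    class MixedSubspaceLoRA(nn.Module):
        def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
            super().__init__()
            self.base = base  # frozen pretrained projection
            self.A = nn.Parameter(torch.randn(base.in_features, r) * 0.01)
            self.mixer = nn.Parameter(torch.eye(r))  # learned jointly with A, B
            self.B = nn.Parameter(torch.zeros(r, base.out_features))
            self.scale = alpha / r

        def forward(self, x):
            return self.base(x) + (x @ self.A @ self.mixer @ self.B) * self.scale

With the mixer initialized to the identity, the layer starts out exactly equivalent to vanilla LoRA and departs from it only as the mixer is learned.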
@@ -6377,8 +6377,8 @@ ZhuoHan AlanHuangScience and Engineering Magnet School SongyangZhangShanghai AI Laboratory - KaiChenShanghai AI Laboratory - ZhixinYinNanjing University + KaiChenShanghai AI Laboratory + ZhixinYinNanjing University ZongwenShen JidongGeNanjing University VincentNgUniversity of Texas at Dallas @@ -6391,7 +6391,7 @@ Efficient Performance Tracking: Leveraging Large Language Models for Automated Construction of Scientific Leaderboards FurkanŞahinuçTechnische Universität Darmstadt - Thy ThyTran + Thy ThyTran YuliaGrishinaAmazon Development Center Germany YufangHouTechnische Universität Darmstadt and IBM Research Ireland BeiChen @@ -6406,7 +6406,7 @@ Efficient Vision-Language pre-training via domain-specific learning for human activities - AdrianBulatSamsung AI Center Cambridge + AdrianBulatSamsung AI Center Cambridge YassineOualiSamsung RicardoGuerreroSamsung AI Center BraisMartinezSamsung @@ -6439,11 +6439,11 @@ XinfengYuan SiyuYuan YuhanCui - TianheLin + TianheLin XintaoWang RuiXu JiangjieChenByteDance Inc. - DeqingYangFudan University + DeqingYangFudan University 8015-8036 Large language models (LLMs) have demonstrated impressive performance and spurred numerous AI applications, in which role-playing agents (RPAs) are particularly popular, especially for fictional characters. The prerequisite for these RPAs lies in the capability of LLMs to understand characters from fictional works. Previous efforts have evaluated this capability via basic classification tasks or characteristic imitation, failing to capture the nuanced character understanding with LLMs. In this paper, we propose evaluating LLMs’ character understanding capability via the character profiling task, i.e., summarizing character profiles from corresponding materials, a widely adopted yet understudied practice for RPA development. Specifically, we construct the CROSS dataset from literature experts and assess the generated profiles by comparing them with ground truth references and evaluating their applicability in downstream tasks. Our experiments, which cover various summarization methods and LLMs, have yielded promising results. These results strongly validate the character understanding capability of LLMs. Resources are available at https://github.com/Joanna0123/character_profiling. 2024.emnlp-main.456 @@ -6456,10 +6456,10 @@ ChangjiangGaonanjing university WenhaoZhuNanjing University JiajunChenNanjing University - XinHuangChina Mobile Communications Company Limited Research Institute + XinHuangChina Mobile Communications Company Limited Research Institute XueHan - JunlanFeng - ChaoDengChina Mobile Research Institute + JunlanFeng + ChaoDengChina Mobile Research Institute ShujianHuangNanjing University 8037-8051 Recently, Large Language Models (LLMs) have shown impressive language capabilities, while most of them have very unbalanced performance across different languages. Multilingual alignment based on the translation parallel data is an effective method to enhance LLMs’ multilingual capabilities. In this work, we first discover and comprehensively investigate the spontaneous multilingual alignment of LLMs. Firstly, we find that LLMs instruction-tuned on the question translation data (i.e. without annotated answers) are able to encourage the alignment between English and a wide range of languages, even including those unseen during instruction-tuning. 
Additionally, we utilize different settings and mechanistic interpretability methods to analyze the LLM’s performance in the multilingual scenario comprehensively. Our work suggests that LLMs have enormous potential for improving multilingual alignment efficiently with great language generalization and task generalization. @@ -6469,13 +6469,13 @@ <fixed-case>A</fixed-case>da<fixed-case>S</fixed-case>witch: Adaptive Switching between Small and Large Agents for Effective Cloud-Local Collaborative Learning - HaoSun + HaoSun JiayiWuEast China Normal University HengyiCai XiaochiWeiBaidu YueFeng - BoWangSchool of Computer Science & Technology, Beijing Institute of Technology - ShuaiqiangWang + BoWangSchool of Computer Science & Technology, Beijing Institute of Technology + ShuaiqiangWang YanZhangPeking University DaweiYinBaidu 8052-8062 @@ -6486,11 +6486,11 @@ <fixed-case>C</fixed-case>o<fixed-case>B</fixed-case>a: Convergence Balancer for Multitask Finetuning of Large Language Models - ZiGong + ZiGong HangYuAnt Group CongLiaoAnt Group - BingchangLiu - ChaoyuChenAlibaba Group + BingchangLiu + ChaoyuChenAlibaba Group JianguoLiAnt Group 8063-8077 Multi-task learning (MTL) benefits the fine-tuning of large language models (LLMs) by providing a single model with improved performance and generalization ability across tasks, presenting a resource-efficient alternative to developing separate models for each task. Yet, existing MTL strategies for LLMs often fall short by either being computationally intensive or failing to ensure simultaneous task convergence. This paper presents CoBa, a new MTL approach designed to effectively manage task convergence balance with minimal computational overhead. Utilizing Relative Convergence Scores (RCS), Absolute Convergence Scores (ACS), and a Divergence Factor (DF), CoBa dynamically adjusts task weights during the training process, ensuring that the validation loss of all tasks progress towards convergence at an even pace while mitigating the issue of individual task divergence. The results of our experiments involving three disparate datasets underscore that this approach not only fosters equilibrium in task improvement but enhances the LLMs’ performance by up to 13% relative to the second-best baselines. Code is open-sourced at https://github.com/codefuse-ai/MFTCoder. @@ -6506,7 +6506,7 @@ NanXuUniversity of Southern California ShengZhangMicrosoft HoifungPoonMicrosoft - MuhaoChenUniversity of California, Davis and University of Southern California + MuhaoChenUniversity of California, Davis and University of Southern California 8078-8088 Direct preference optimization (DPO) has shown to be an effective method for large language model (LLM) alignment. Recent works have attempted to apply DPO to multimodal scenarios but have found it challenging to achieve consistent improvement. Through a comparative experiment, we identify the unconditional preference problem in multimodal preference optimization, where the model overlooks the image condition. To address this problem, we propose mDPO, a multimodal DPO objective that prevents the over-prioritization of language-only preferences by also optimizing image preference. Moreover, we introduce a reward anchor that forces the reward to be positive for chosen responses, thereby avoiding the decrease in their likelihood—an intrinsic problem of relative preference optimization. 
Experiments on two multimodal LLMs of different sizes and three widely used benchmarks demonstrate that mDPO effectively addresses the unconditional preference problem in multimodal preference optimization and significantly improves model performance, particularly in reducing hallucination. 2024.emnlp-main.460 @@ -6532,11 +6532,11 @@ KajBostromUniversity of Texas, Austin HarshJhamtaniMicrosoft HaoFangMicrosoft - SamThomsonMicrosoft + SamThomsonMicrosoft RichardShinGoogle PatrickXiaMicrosoft BenjaminVan DurmeMicrosoft and Johns Hopkins University - JasonEisnerMicrosoft and Johns Hopkins University + JasonEisnerMicrosoft and Johns Hopkins University JacobAndreasMassachusetts Institute of Technology and Microsoft 8101-8112 Tools for translating natural language into code promise natural, open-ended interaction with databases, web APIs, and other software systems. However, this promise is complicated by the diversity and continual development of these systems, each with its own interface and distinct set of features. Building a new language-to-code translator, even starting with a large language model (LM), typically requires annotating a large set of natural language commands with their associated programs. In this paper, we describe ICIP (In-Context Inverse Programming), a method for bootstrapping a language-to-code system using mostly (or entirely) unlabeled programs written using a potentially unfamiliar (but human-readable) library or API. ICIP uses a pre-trained LM to assign candidate natural language descriptions to these programs, then iteratively refines the descriptions to ensure global consistency. Across nine different application domains from the Overnight and Spider benchmarks and text-davinci-003 and CodeLlama-7b-Instruct models, ICIP outperforms a number of prompting baselines. Indeed, in a “nearly unsupervised” setting with only a single annotated program and 100 unlabeled examples, it achieves up to 85% of the performance of a fully supervised system. @@ -6546,7 +6546,7 @@ Attribute or Abstain: Large Language Models as Long Document Assistants - JanBuchmannUKP (TU Darmstadt) + JanBuchmannUKP (TU Darmstadt) XiaoLiuTechnische Universität Darmstadt IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt 8113-8140 @@ -6563,7 +6563,7 @@ JiaqiWang HoupingXiaoGeorgia State University JinghuiChenPennsylvania State University - FenglongMaPennsylvania State University + FenglongMaPennsylvania State University 8141-8154 Foundation models have demonstrated remarkable capabilities in handling diverse modalities and tasks, outperforming conventional artificial intelligence (AI) approaches that are highly task-specific and modality-reliant. In the medical domain, however, the development of comprehensive foundation models is constrained by limited access to diverse modalities and stringent privacy regulations. To address these constraints, this study introduces a novel knowledge injection approach, FedKIM, designed to scale the medical foundation model within a federated learning framework. FedKIM leverages lightweight local models to extract healthcare knowledge from private data and integrates this knowledge into a centralized foundation model using a designed adaptive Multitask Multimodal Mixture Of Experts (M^3OE) module. This method not only preserves privacy but also enhances the model’s ability to handle complex medical tasks involving multiple modalities. 
Our extensive experiments across twelve tasks in seven modalities demonstrate the effectiveness of FedKIM in various settings, highlighting its potential to scale medical foundation models without direct access to sensitive data. Source codes are available at https://github.com/XiaochenWang-PSU/FedKIM. 2024.emnlp-main.464 @@ -6574,9 +6574,9 @@ Retrieved In-Context Principles from Previous Mistakes - HaoSun + HaoSun YongJiang - BoWangSchool of Computer Science & Technology, Beijing Institute of Technology + BoWangSchool of Computer Science & Technology, Beijing Institute of Technology YingyanHou YanZhangPeking University PengjunXie @@ -6632,12 +6632,12 @@ Towards Verifiable Text Generation with Evolving Memory and Self-Reflection - HaoSun + HaoSun HengyiCai - BoWangSchool of Computer Science & Technology, Beijing Institute of Technology + BoWangSchool of Computer Science & Technology, Beijing Institute of Technology YingyanHou XiaochiWeiBaidu - ShuaiqiangWang + ShuaiqiangWang YanZhangPeking University DaweiYinBaidu 8211-8227 @@ -6650,7 +6650,7 @@ Pelican: Correcting Hallucination in Vision-<fixed-case>LLM</fixed-case>s via Claim Decomposition and Program of Thought Verification PritishSahuSRI International KaranSikka - AjayDivakaranSRI International + AjayDivakaranSRI International 8228-8248 Large Visual Language Models (LVLMs) struggle with hallucinations in visual instruction following task(s). These issues hinder their trustworthiness and real-world applicability. We propose Pelican – a novel framework designed to detect and mitigate hallucinations through claim verification. Pelican first decomposes the visual claim into a chain of sub-claims based on first-order predicates. These sub-claims consist of (predicate, question) pairs and can be conceptualized as nodes of a computational graph. We then use Program-of-Thought prompting to generate Python code for answering these questions through flexible composition of external tools. Pelican improves over prior work by introducing (1) intermediate variables for precise grounding of object instances, and (2) shared computation for answering the sub-question to enable adaptive corrections and inconsistency identification. We finally use the reasoning abilities of the LLM to verify the correctness of the claim by considering the consistency and confidence of the (question, answer) pairs from each sub-claim. Our experiments demonstrate consistent performance improvements over various baseline LVLMs and existing hallucination mitigation approaches across several benchmarks. 2024.emnlp-main.470 @@ -6663,8 +6663,8 @@ JeroneAndrewsSony AI DoraZhaoStanford University OrestisPapakyriakopoulosTechnische Universität München - ApostolosModasSony Europe Ltd. - YutaNakashimaOsaka University + ApostolosModasSony Europe Ltd. + YutaNakashimaOsaka University AliceXiangSony AI 8249-8267 We tackle societal bias in image-text datasets by removing spurious correlations between protected groups and image attributes. Traditional methods only target labeled attributes, ignoring biases from unlabeled ones. Using text-guided inpainting models, our approach ensures protected group independence from all attributes and mitigates inpainting biases through data filtering. Evaluations on multi-label image classification and image captioning tasks show our method effectively reduces bias without compromising performance across various models.
Specifically, we achieve an average societal bias reduction of 46.1% in leakage-based bias metrics for multi-label classification and 74.8% for image captioning. 2024.emnlp-main.471 @@ -6675,7 +6675,7 @@ <fixed-case>R</fixed-case>eal<fixed-case>V</fixed-case>ul: Can We Detect Vulnerabilities in Web Applications with <fixed-case>LLM</fixed-case>? DiCao - YongLiaoUniversity of Science and Technology of China and China Academy of Electronics and Information Technology + YongLiaoUniversity of Science and Technology of China and China Academy of Electronics and Information Technology XiuweiShang 8268-8282 The latest advancements in large language models (LLMs) have sparked interest in their potential for software vulnerability detection. However, there is currently a lack of research specifically focused on vulnerabilities in the PHP language, and challenges in data sampling and processing persist, hindering the model’s ability to effectively capture the characteristics of specific vulnerabilities. In this paper, we present RealVul, the first LLM-based framework designed for PHP vulnerability detection, addressing these issues. By improving code sampling methods and employing normalization techniques, we can isolate potential vulnerability triggers while streamlining the code and eliminating unnecessary semantic information, enabling the model to better understand and learn from the generated vulnerability samples. We also address the issue of insufficient PHP vulnerability samples by improving data synthesis methods. To evaluate RealVul’s performance, we conduct an extensive analysis using five distinct code LLMs on vulnerability data from 180 PHP projects. The results demonstrate a significant improvement in both effectiveness and generalization compared to existing methods, effectively boosting the vulnerability detection capabilities of these models. @@ -6699,8 +6699,8 @@ Guiming HardyChenUniversity of Texas at Dallas ShunianChenShenzhen Research Institute of Big Data ZicheLiu - FengJiangThe Chinese University of Hong Kong, Shenzhen - BenyouWangThe Chinese University of Hong Kong, Shenzhen + FengJiangThe Chinese University of Hong Kong, Shenzhen + BenyouWangThe Chinese University of Hong Kong, Shenzhen 8301-8327 Adopting human and large language models (LLM) as judges (*a.k.a* human- and LLM-as-a-judge) for evaluating the performance of LLMs has recently gained attention. Nonetheless, this approach concurrently introduces potential biases from human and LLMs, questioning the reliability of the evaluation results. In this paper, we propose a novel framework that is free from referencing groundtruth annotations for investigating **Misinformation Oversight Bias**, **Gender Bias**, **Authority Bias** and **Beauty Bias** on LLM and human judges. We curate a dataset referring to the revised Bloom’s Taxonomy and conduct thousands of evaluations. Results show that human and LLM judges are vulnerable to perturbations to various degrees, and that even the cutting-edge judges possess considerable biases. We further exploit these biases to conduct attacks on LLM judges. We hope that our work can notify the community of the bias and vulnerability of human- and LLM-as-a-judge, as well as the urgency of developing robust evaluation systems.
2024.emnlp-main.474 @@ -6743,10 +6743,10 @@ <fixed-case>M</fixed-case>eta<fixed-case>R</fixed-case>eflection: Learning Instructions for Language Agents using Past Reflections - PriyanshuGuptaMicrosoft + PriyanshuGuptaMicrosoft ShashankKirtaniaMicrosoft AnanyaSinghaResearch, Microsoft - SumitGulwaniResearch, Microsoft + SumitGulwaniResearch, Microsoft ArjunRadhakrishnaMicrosoft GustavoSoaresMicrosoft SherryShiMicrosoft @@ -6759,7 +6759,7 @@ Stepwise Verification and Remediation of Student Reasoning Errors with Large Language Model Tutors NicoDaheimTechnische Universität Darmstadt - JakubMacinaDepartment of Computer Science, ETHZ - ETH Zurich + JakubMacinaDepartment of Computer Science, ETHZ - ETH Zurich ManuKapurETHZ - ETH Zurich IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt MrinmayaSachanSwiss Federal Institute of Technology @@ -6785,9 +6785,9 @@ ZetianOuyang YishuaiQiu LinlinWang - GerardDe MeloHasso Plattner Institute and University of Potsdam + GerardDe MeloHasso Plattner Institute and University of Potsdam YaZhangShanghai Jiao Tong University - YanfengWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University LiangHe 8428-8438 With the proliferation of Large Language Models (LLMs) in diverse domains, there is a particular need for unified evaluation standards in clinical medical scenarios, where models need to be examined very thoroughly. We present CliMedBench, a comprehensive benchmark with 14 expert-guided core clinical scenarios specifically designed to assess the medical ability of LLMs across 7 pivot dimensions. It comprises 33,735 questions derived from real-world medical reports of top-tier tertiary hospitals and authentic examination exercises. The reliability of this benchmark has been confirmed in several ways. Subsequent experiments with existing LLMs have led to the following findings: (i) Chinese medical LLMs underperform on this benchmark, especially where medical reasoning and factual consistency are vital, underscoring the need for advances in clinical knowledge and diagnostic accuracy. (ii) Several general-domain LLMs demonstrate substantial potential in medical clinics, while the limited input capacity of many medical LLMs hinders their practical use. These findings reveal both the strengths and limitations of LLMs in clinical scenarios and offer critical insights for medical research. @@ -6797,8 +6797,8 @@ The Best Defense is Attack: Repairing Semantics in Textual Adversarial Examples - HengYang - KeLiUniversity of Exeter + HengYang + KeLiUniversity of Exeter 8439-8457 Recent studies have revealed the vulnerability of pre-trained language models to adversarial attacks. Adversarial defense techniques have been proposed to reconstruct adversarial examples within feature or text spaces. However, these methods struggle to effectively repair the semantics in adversarial examples, resulting in unsatisfactory defense performance. To repair the semantics in adversarial examples, we introduce a novel approach named Reactive Perturbation Defocusing (Rapid), which employs an adversarial detector to identify the fake labels of adversarial examples and leverages adversarial attackers to repair the semantics in adversarial examples. Our extensive experimental results, conducted on four public datasets, demonstrate the consistent effectiveness of Rapid in various adversarial attack scenarios. 
For easy evaluation, we provide a click-to-run demo of Rapid at https://tinyurl.com/22ercuf8. 2024.emnlp-main.481 @@ -6824,8 +6824,8 @@ Catarina GBelémUniversity of California, Irvine MarkelleKelly MarkSteyversUniversity of California, Irvine - SameerSinghUniversity of California, Irvine and Allen Institute for Artificial Intelligence - PadhraicSmythUniversity of California, Irvine + SameerSinghUniversity of California, Irvine and Allen Institute for Artificial Intelligence + PadhraicSmythUniversity of California, Irvine 8467-8502 *Uncertainty expressions* such as ‘probably’ or ‘highly unlikely’ are pervasive in human language. While prior work has established that there is population-level agreement in terms of how humans quantitatively interpret these expressions, there has been little inquiry into the abilities of language models in the same context. In this paper, we investigate how language models map linguistic expressions of uncertainty to numerical responses. Our approach assesses whether language models can employ theory of mind in this setting: understanding the uncertainty of another agent about a particular statement, independently of the model’s own certainty about that statement. We find that 7 out of 10 models are able to map uncertainty expressions to probabilistic responses in a human-like manner. However, we observe systematically different behavior depending on whether a statement is actually true or false. This sensitivity indicates that language models are substantially more susceptible to bias based on their prior knowledge (as compared to humans). These findings raise important questions and have broad implications for human-AI and AI-AI communication. 2024.emnlp-main.483 @@ -6834,7 +6834,7 @@ Explaining and Improving Contrastive Decoding by Extrapolating the Probabilities of a Huge and Hypothetical <fixed-case>LM</fixed-case> - Haw-ShiuanChangDepartment of Computer Science, University of Massachusetts at Amherst + Haw-ShiuanChangDepartment of Computer Science, University of Massachusetts at Amherst NanyunPengUniversity of California, Los Angeles MohitBansalUniversity of North Carolina at Chapel Hill AnilRamakrishnaAmazon @@ -6847,10 +6847,10 @@ Zero-shot Cross-domain Dialogue State Tracking via Context-aware Auto-prompting and Instruction-following Contrastive Decoding - XiaoyuDong - YujieFengHong Kong Polytechnic University - ZexinLuHong Kong Polytechnic University - GuangyuanShi + XiaoyuDong + YujieFengHong Kong Polytechnic University + ZexinLuHong Kong Polytechnic University + GuangyuanShi Xiao-MingWuHong Kong Polytechnic University 8527-8540 Zero-shot cross-domain dialogue state tracking (DST) enables us to manage task-oriented dialogues in new, unseen domains without the cost of collecting in-domain data. Previous studies have implemented slot-based input improvements, such as schema-driven descriptions and question-answering formats, but still suffer from negative transfer for seen slots and inefficient transfer for unseen slots due to the significant source-target domain gap. To address these issues, we introduce a novel framework called Context-aware Auto-prompting and Instruction-following Contrastive Decoding (CAPID). This framework generates dynamic, context-aware slot queries, effectively improving the model’s transferability. Our context-aware auto-prompting approach tailors slot queries to the current dialogue context, increasing flexibility and reducing ambiguities. 
Additionally, an instruction-following contrastive decoding strategy helps reduce errors related to off-topic slots by penalizing deviations from the provided instructions. Extensive experiments on two datasets, with varying model sizes (from 60M to 7B), demonstrate the superior performance of CAPID. The source code is provided for reproducibility. @@ -6864,8 +6864,8 @@ ZehanQiTsinghua University ZhijiangGuoUniversity of Cambridge CunxiangWang - HongruWangThe Chinese University of Hong Kong - YueZhangWestlake University + HongruWangThe Chinese University of Hong Kong + YueZhangWestlake University WeiXuTsinghua University, Tsinghua University 8541-8565 This survey provides an in-depth analysis of knowledge conflicts for large language models (LLMs), highlighting the complex challenges they encounter when blending contextual and parametric knowledge. Our focus is on three categories of knowledge conflicts: context-memory, inter-context, and intra-memory conflict. These conflicts can significantly impact the trustworthiness and performance of LLMs, especially in real-world applications where noise and misinformation are common. By categorizing these conflicts, exploring the causes, examining the behaviors of LLMs under such conflicts, and reviewing available solutions, this survey aims to shed light on strategies for improving the robustness of LLMs, thereby serving as a valuable resource for advancing research in this evolving area. @@ -6899,12 +6899,12 @@ A Thorough Examination of Decoding Methods in the Era of <fixed-case>LLM</fixed-case>s - ChufanShi + ChufanShi HaoranYang DengCaiTencent AI Lab ZhisongZhangTencent - YifanWangTsinghua University - YujiuYangGraduate School at Shenzhen,Tsinghua University + YifanWangTsinghua University + YujiuYangGraduate School at Shenzhen,Tsinghua University WaiLamThe Chinese University of Hong Kong 8601-8629 Decoding methods play an indispensable role in converting language models from next-token predictors into practical task solvers. Prior research on decoding methods, primarily focusing on task-specific models, may not extend to the current era of general-purpose large language models (LLMs). Moreover, the recent influx of decoding strategies has further complicated this landscape. This paper provides a comprehensive and multifaceted analysis of various decoding methods within the context of LLMs, evaluating their performance, robustness to hyperparameter changes, and decoding speeds across a wide range of tasks, models, and deployment environments. Our findings reveal that decoding method performance is notably task-dependent and influenced by factors such as alignment, model size, and quantization. Intriguingly, sensitivity analysis exposes that certain methods achieve superior performance at the cost of extensive hyperparameter tuning, highlighting the trade-off between attaining optimal results and the practicality of implementation in varying contexts. @@ -6943,9 +6943,9 @@ Exploring Nested Named Entity Recognition with Large Language Models: Methods, Challenges, and Insights - HongjinKimKonkuk University + HongjinKimKonkuk University Jai-EunKimSaltlux Inc. - HarksooKimKonkuk University + HarksooKimKonkuk University 8653-8670 Nested Named Entity Recognition (NER) poses a significant challenge in Natural Language Processing (NLP), demanding sophisticated techniques to identify entities within entities. 
This research investigates the application of Large Language Models (LLMs) to nested NER, exploring methodologies from prior work and introducing specific reasoning techniques and instructions to improve LLM efficacy. Through experiments conducted on the ACE 2004, ACE 2005, and GENIA datasets, we evaluate the impact of these approaches on nested NER performance. Results indicate that output format critically influences nested NER performance, methodologies from previous works are less effective, and our nested NER-tailored instructions significantly enhance performance. Additionally, we find that label information and descriptions of nested cases are crucial in eliciting the capabilities of LLMs for nested NER, especially in specific domains (i.e., the GENIA dataset). However, these methods still do not outperform BERT-based models, highlighting the ongoing need for innovative approaches in nested NER with LLMs. 2024.emnlp-main.492 @@ -6960,7 +6960,7 @@ MinxingZhang RongGeDuke University JianPeiDuke University and Simon Fraser University - Neil ZhenqiangGongDuke University + Neil ZhenqiangGongDuke University BhuwanDhingraDuke University 8671-8689 The rapid scaling of large language models (LLMs) has raised concerns about the transparency and fair use of the data used in their pretraining. Detecting such content is challenging due to the scale of the data and limited exposure of each instance during training. We propose ReCaLL (Relative Conditional Log-Likelihood), a novel membership inference attack (MIA) to detect LLMs’ pretraining data by leveraging their conditional language modeling capabilities. ReCaLL examines the relative change in conditional log-likelihoods when prefixing target data points with non-member context. Our empirical findings show that conditioning member data on non-member prefixes induces a larger decrease in log-likelihood compared to non-member data. We conduct comprehensive experiments and show that ReCaLL achieves state-of-the-art performance on the WikiMIA dataset, even with random and synthetic prefixes, and can be further improved using an ensemble approach. Moreover, we conduct an in-depth analysis of LLMs’ behavior with different membership contexts, providing insights into how LLMs leverage membership information for effective inference at both the sequence and token level. @@ -6970,7 +6970,7 @@ “Flex Tape Can’t Fix That”: Bias and Misinformation in Edited Language Models - Karina HHalevy + Karina HHalevy AnnaSotnikovaEPFL - EPF Lausanne BadrAlKhamissiEPFL - EPF Lausanne SyrielleMontariolEPFL - EPF Lausanne @@ -7013,7 +7013,7 @@ Jellyfish: Instruction-Tuning Local Large Language Models for Data Preprocessing HaochenZhang YuyangDong - ChuanXiaoOsaka University + ChuanXiaoOsaka University MasafumiOyamadaNEC 8754-8782 This paper explores the utilization of LLMs for data preprocessing (DP), a crucial step in the data mining pipeline that transforms raw data into a clean format. We instruction-tune local LLMs as universal DP task solvers that operate on a local, single, and low-priced GPU, ensuring data security and enabling further customization. We select a collection of datasets across four representative DP tasks and construct instruction data using data configuration, knowledge injection, and reasoning data distillation techniques tailored to DP. 
By tuning Mistral-7B, Llama 3-8B, and OpenOrca-Platypus2-13B, our models, Jellyfish-7B/8B/13B, deliver competitiveness compared to GPT-3.5/4 models and strong generalizability to unseen tasks while barely compromising the base models’ abilities in NLP tasks. Meanwhile, Jellyfish offers enhanced reasoning capabilities compared to GPT-3.5. Our models are available at: https://huggingface.co/NECOUDBFM/Jellyfish. Our instruction dataset is available at: https://huggingface.co/datasets/NECOUDBFM/Jellyfish-Instruct @@ -7024,12 +7024,12 @@ A Comprehensive Survey of Scientific Large Language Models and Their Applications in Scientific Discovery - YuZhangTexas A&M University - College Station - XiusiChenUniversity of Illinois at Urbana-Champaign - BowenJin + YuZhangTexas A&M University - College Station + XiusiChenUniversity of Illinois at Urbana-Champaign + BowenJin ShengWang - ShuiwangJiTexas A&M University - WeiWangUniversity of California, Los Angeles + ShuiwangJiTexas A&M University + WeiWangUniversity of California, Los Angeles JiaweiHan 8783-8817 In many scientific fields, large language models (LLMs) have revolutionized the way text and other modalities of data (e.g., molecules and proteins) are handled, achieving superior performance in various applications and augmenting the scientific discovery process. Nevertheless, previous surveys on scientific LLMs often concentrate on one or two fields or a single modality. In this paper, we aim to provide a more holistic view of the research landscape by unveiling cross-field and cross-modal connections between scientific LLMs regarding their architectures and pre-training techniques. To this end, we comprehensively survey over 260 scientific LLMs, discuss their commonalities and differences, as well as summarize pre-training datasets and evaluation tasks for each field and modality. Moreover, we investigate how LLMs have been deployed to benefit scientific discovery. Resources related to this survey are available at https://github.com/yuzhimanhua/Awesome-Scientific-Language-Models. @@ -7051,8 +7051,8 @@ Beyond Label Attention: Transparency in Language Models for Automated Medical Coding via Dictionary Learning JohnWu - DavidWu - JimengSunUniversity of Illinois, Urbana Champaign, College of Computing and Georgia Institute of Technology + DavidWu + JimengSunUniversity of Illinois, Urbana Champaign, College of Computing and Georgia Institute of Technology 8848-8871 Medical coding, the translation of unstructured clinical text into standardized medical codes, is a crucial but time-consuming healthcare practice. Though large language models (LLM) could automate the coding process and improve the efficiency of such tasks, interpretability remains paramount for maintaining patient trust. Current efforts in interpretability of medical coding applications rely heavily on label attention mechanisms, which often leads to the highlighting of extraneous tokens irrelevant to the ICD code. To facilitate accurate interpretability in medical language models, this paper leverages dictionary learning that can efficiently extract sparsely activated representations from dense language model embeddings in superposition. Compared with common label attention mechanisms, our model goes beyond token-level representations by building an interpretable dictionary which enhances the mechanistic-based explanations for each ICD code prediction, even when the highlighted tokens are medically irrelevant.
We show that dictionary features are human interpretable, can elucidate the hidden meanings of upwards of 90% of medically irrelevant tokens, and steer model behavior. 2024.emnlp-main.500 @@ -7074,7 +7074,7 @@ From <fixed-case>RAG</fixed-case> to Riches: Retrieval Interlaced with Sequence Generation - PalakJainGoogle + PalakJainGoogle LivioBaldini SoaresGoogle Deepmind TomKwiatkowski 8887-8904 @@ -7101,7 +7101,7 @@ Learning to Correct for <fixed-case>QA</fixed-case> Reasoning with Black-box <fixed-case>LLM</fixed-case>s JaehyungKim DongyoungKimKAIST - YimingYangSchool of Computer Science, Carnegie Mellon University + YimingYangSchool of Computer Science, Carnegie Mellon University 8916-8937 An open challenge in recent machine learning is about how to improve the reasoning capability of large language models (LLMs) in a black-box setting, i.e., without access to detailed information such as output token probabilities. Existing approaches either rely on accessibility (which is often unrealistic) or involve significantly increased train- and inference-time costs. This paper addresses those limitations or shortcomings by proposing a novel approach, namely CoBB (Correct for improving QA reasoning of Black-Box LLMs). It uses a trained adaptation model to perform a seq2seq mapping from the often-imperfect reasonings of the original black-box LLM to the correct or improved reasonings. Specifically, the adaptation model is initialized with a relatively small open-source LLM and adapted over a collection of sub-sampled training pairs. To select the representative pairs of correct and incorrect reasonings, we formulated the dataset construction as an optimization problem that minimizes the statistical divergence between the sampled subset and the entire collection, and solved it via a genetic algorithm. We then train the adaptation model over the sampled pairs by contrasting the likelihoods of correct and incorrect reasonings. Our experimental results demonstrate that CoBB significantly improves reasoning accuracy across various QA benchmarks, compared to the best-performing adaptation baselines. 2024.emnlp-main.504 @@ -7140,7 +7140,7 @@ XiaoyuShenAmazon RexhinaBlloshmiAmazon Development Center Germany DaweiZhu - JiahuanPeiCentrum voor Wiskunde en Informatica + JiahuanPeiCentrum voor Wiskunde en Informatica WeiZhangEastern Institute of Technology, Ningbo, China. 8988-9003 Retrieval-augmented generation has gained popularity as a framework to enhance large language models with external knowledge. However, its effectiveness hinges on the retrieval robustness of the model. If the model lacks retrieval robustness, its performance is constrained by the accuracy of the retriever, resulting in significant compromises when the retrieved context is irrelevant. In this paper, we evaluate the “implicit” retrieval robustness of various large language models, instructing them to directly output the final answer without explicitly judging the relevance of the retrieved context. Our findings reveal that fine-tuning on a mix of gold and distracting context significantly enhances the model’s robustness to retrieval inaccuracies, while still maintaining its ability to extract correct answers when retrieval is accurate. This suggests that large language models can implicitly handle relevant or irrelevant retrieved context by learning solely from the supervision of the final answer in an end-to-end manner. 
Introducing an additional process for explicit relevance judgment can be unnecessary and disrupt the end-to-end approach. @@ -7151,11 +7151,11 @@ On the Relationship between Truth and Political Bias in Language Models SuyashFulayMassachusetts Institute of Technology - WilliamBrannonMassachusetts Institute of Technology + WilliamBrannonMassachusetts Institute of Technology ShresthaMohantyMassachusetts Institute of Technology CassandraOverney - ElinorPoole-DayanMassachusetts Institute of Technology - DebRoyMassachusetts Institute of Technology + ElinorPoole-DayanMassachusetts Institute of Technology + DebRoyMassachusetts Institute of Technology JadKabbaraMassachusetts Institute of Technology 9004-9018 Language model alignment research often attempts to ensure that models are not only helpful and harmless, but also truthful and unbiased. However, optimizing these objectives simultaneously can obscure how improving one aspect might impact the others. In this work, we focus on analyzing the relationship between two concepts essential in both language model alignment and political science: truthfulness and political bias. We train reward models on various popular truthfulness datasets and subsequently evaluate their political bias. Our findings reveal that optimizing reward models for truthfulness on these datasets tends to result in a left-leaning political bias. We also find that existing open-source reward models (i.e., those trained on standard human preference datasets) already show a similar bias and that the bias is larger for larger models. These results raise important questions about the datasets used to represent truthfulness, potential limitations of aligning models to be both truthful and politically unbiased, and what language models capture about the relationship between truth and politics. @@ -7166,7 +7166,7 @@ Can Active Label Correction Improve <fixed-case>LLM</fixed-case>-based Modular <fixed-case>AI</fixed-case> Systems? KaranTanejaCollege of Computing, Georgia Institute of Technology - AshokGoel + AshokGoel 9019-9031 Modular AI systems can be developed using LLM-prompt-based modules to minimize deployment time even for complex tasks. However, these systems do not always perform well and improving them using the data traces collected from a deployment remains an open challenge. The data traces contain LLM inputs and outputs, but the annotations from LLMs are noisy. We hypothesize that Active Label Correction (ALC) can be used on the collected data to train smaller task-specific improved models that can replace LLM-based modules. In this paper, we study the noise in three GPT-3.5-annotated datasets and their denoising with human feedback. We also propose a novel method ALC3 that iteratively applies three updates to the training dataset: auto-correction, correction using human feedback and filtering. Our results show that ALC3 can lead to oracle performance with feedback on 17-24% fewer examples than the number of noisy examples in the dataset across three different NLP tasks.
2024.emnlp-main.509 @@ -7175,9 +7175,9 @@ Statistical Uncertainty in Word Embeddings: <fixed-case>G</fixed-case>lo<fixed-case>V</fixed-case>e-<fixed-case>V</fixed-case> - AndreaVallebueno + AndreaVallebueno CassandraHandan-NaderNew York University - Christopher DManningComputer Science Department, Stanford University + Christopher DManningComputer Science Department, Stanford University Daniel E.HoStanford University 9032-9047 Static word embeddings are ubiquitous in computational social science applications and contribute to practical decision-making in a variety of fields including law and healthcare. However, assessing the statistical uncertainty in downstream conclusions drawn from word embedding statistics has remained challenging. When using only point estimates for embeddings, researchers have no streamlined way of assessing the degree to which their model selection criteria or scientific conclusions are subject to noise due to sparsity in the underlying data used to generate the embeddings. We introduce a method to obtain approximate, easy-to-use, and scalable reconstruction error variance estimates for GloVe, one of the most widely used word embedding models, using an analytical approximation to a multivariate normal model. To demonstrate the value of embeddings with variance (GloVe-V), we illustrate how our approach enables principled hypothesis testing in core word embedding tasks, such as comparing the similarity between different word pairs in vector space, assessing the performance of different models, and analyzing the relative degree of ethnic or gender bias in a corpus using different word lists. @@ -7202,7 +7202,7 @@ NigelFernandezDepartment of Computer Science, University of Massachusetts at Amherst AlexanderScarlatosDepartment of Computer Science, University of Massachusetts at Amherst WanyongFeng - SimonWoodheadEedi + SimonWoodheadEedi AndrewLanUniversity of Massachusetts, Amherst 9063-9081 High-quality distractors are crucial to both the assessment and pedagogical value of multiple-choice questions (MCQs), where manually crafting ones that anticipate knowledge deficiencies or misconceptions among real students is difficult. Meanwhile, automated distractor generation, even with the help of large language models (LLMs), remains challenging for subjects like math. It is crucial to not only identify plausible distractors but also understand the error behind them. In this paper, we introduce DiVERT (Distractor Generation with Variational Errors Represented as Text), a novel variational approach that learns an interpretable representation of errors behind distractors in math MCQs. Through experiments on a real-world math MCQ dataset with 1,434 questions used by hundreds of thousands of students, we show that DiVERT, despite using a base open-source LLM with 7B parameters, outperforms state-of-the-art approaches using GPT-4o on downstream distractor generation. We also conduct a human evaluation with math educators and find that DiVERT leads to error labels that are of comparable quality to human-authored ones. 
@@ -7225,11 +7225,11 @@ <fixed-case>C</fixed-case>lean<fixed-case>G</fixed-case>en: Mitigating Backdoor Attacks for Generation Tasks in Large Language Models YuetaiLi - ZhangchenXu - FengqingJiangUniversity of Washington - LuyaoNiu - DinukaSahabandu - BhaskarRamasubramanianWestern Washington University + ZhangchenXu + FengqingJiangUniversity of Washington + LuyaoNiu + DinukaSahabandu + BhaskarRamasubramanianWestern Washington University RadhaPoovendranUniversity of Washington, Seattle 9101-9118 The remarkable performance of large language models (LLMs) in generation tasks has enabled practitioners to leverage publicly available models to power custom applications, such as chatbots and virtual assistants. However, the data used to train or fine-tune these LLMs is often undisclosed, allowing an attacker to compromise the data and inject backdoors into the models. In this paper, we develop a novel inference time defense, named CleanGen, to mitigate backdoor attacks for generation tasks in LLMs. CleanGen is a lightweight and effective decoding strategy that is compatible with the state-of-the-art (SOTA) LLMs. Our insight behind CleanGen is that compared to other LLMs, backdoored LLMs assign significantly higher probabilities to tokens representing the attacker-desired contents. These discrepancies in token probabilities enable CleanGen to identify suspicious tokens favored by the attacker and replace them with tokens generated by another LLM that is not compromised by the same attacker, thereby avoiding generation of attacker-desired content. We evaluate CleanGen against five SOTA backdoor attacks. Our results show that CleanGen achieves lower attack success rates (ASR) compared to five SOTA baseline defenses for all five backdoor attacks. Moreover, LLMs deploying CleanGen maintain helpfulness in their responses when serving benign user queries with minimal added computational overhead. @@ -7254,7 +7254,7 @@ Words Matter: Reducing Stigma in Online Conversations about Substance Use with Large Language Models - LaylaBouzoubaaDrexel University + LaylaBouzoubaaDrexel University ElhamAghakhani RezvanehRezapourDrexel University 9139-9156 @@ -7278,12 +7278,12 @@ <fixed-case>S</fixed-case>ign<fixed-case>CLIP</fixed-case>: Connecting Text and Sign Language by Contrastive Learning - ZifanJiang + ZifanJiang GerardSant AmitMoryossef - MathiasMüllerUniversity of Zurich + MathiasMüllerUniversity of Zurich RicoSennrichUniversity of Zurich and University of Edinburgh - SarahEblingUniversity of Zurich + SarahEblingUniversity of Zurich 9171-9193 We present SignCLIP, which re-purposes CLIP (Contrastive Language-Image Pretraining) to project spoken language text and sign language videos, two classes of natural languages of distinct modalities, into the same space. SignCLIP is an efficient method of learning useful visual representations for sign language processing from large-scale, multilingual video-text pairs, without directly optimizing for a specific task or sign language which is often of limited size. We pretrain SignCLIP on Spreadthesign, a prominent sign language dictionary consisting of ~500 thousand video clips in up to 44 sign languages, and evaluate it with various downstream datasets. SignCLIP discerns in-domain signing with notable text-to-video/video-to-text retrieval accuracy.
It also performs competitively for out-of-domain downstream tasks such as isolated sign language recognition upon essential few-shot prompting or fine-tuning.We analyze the latent space formed by the spoken language text and sign language poses, which provides additional linguistic insights. Our code and models are openly available. 2024.emnlp-main.518 @@ -7292,11 +7292,11 @@ <fixed-case>APPLS</fixed-case>: Evaluating Evaluation Metrics for Plain Language Summarization - YueGuoUniversity of Illinois at Urbana-Champaign + YueGuoUniversity of Illinois at Urbana-Champaign TalAugust - GondyLeroyUniversity of Arizona + GondyLeroyUniversity of Arizona TrevorCohenUniversity of Washington - Lucy LuWangUniversity of Washington and Allen Institute for Artificial Intelligence + Lucy LuWangUniversity of Washington and Allen Institute for Artificial Intelligence 9194-9211 While there has been significant development of models for Plain Language Summarization (PLS), evaluation remains a challenge. PLS lacks a dedicated assessment metric, and the suitability of text generation evaluation metrics is unclear due to the unique transformations involved (e.g., adding background explanations, removing jargon). To address these questions, our study introduces a granular meta-evaluation testbed, APPLS, designed to evaluate metrics for PLS. We identify four PLS criteria from previous work—informativeness, simplification, coherence, and faithfulness—and define a set of perturbations corresponding to these criteria that sensitive metrics should be able to detect. We apply these perturbations to extractive hypotheses for two PLS datasets to form our testbed. Using APPLS, we assess performance of 14 metrics, including automated scores, lexical features, and LLM prompt-based evaluations. Our analysis reveals that while some current metrics show sensitivity to specific criteria, no single method captures all four criteria simultaneously. We therefore recommend a suite of automated metrics be used to capture PLS quality along all relevant criteria. This work contributes the first meta-evaluation testbed for PLS and a comprehensive evaluation of existing metrics. 
2024.emnlp-main.519 @@ -7334,8 +7334,8 @@ <fixed-case>R</fixed-case>u<fixed-case>BL</fixed-case>i<fixed-case>MP</fixed-case>: <fixed-case>R</fixed-case>ussian Benchmark of Linguistic Minimal Pairs EkaterinaTaktashevaUniversity of Edinburgh, University of Edinburgh - MaximBazhukovHigher School of Economics - KirillKoncha + MaximBazhukovHigher School of Economics + KirillKoncha AlenaFenogenovaSaluteDevices EkaterinaArtemovaToloka AI VladislavMikhailovUniversity of Oslo @@ -7348,10 +7348,10 @@ Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction ZheyeDengDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology - ChunkitChan - WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology - YuxiSun - WeiFanHong Kong University of Science and Technology + ChunkitChan + WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology + YuxiSun + WeiFanHong Kong University of Science and Technology TianshiZhengThe Hong Kong University of Science and Technology YauwaiYim YangqiuSongThe Hong Kong University of Science and Technology @@ -7366,8 +7366,8 @@ Toward Compositional Behavior in Neural Models: A Survey of Current Views KateMcCurdyUniversität des Saarlandes and University of Edinburgh, University of Edinburgh PaulSoulos - PaulSmolenskyMicrosoft and Johns Hopkins University - RolandFernandezMicrosoft Research AI + PaulSmolenskyMicrosoft and Johns Hopkins University + RolandFernandezMicrosoft Research AI JianfengGaoMicrosoft Research 9323-9339 Compositionality is a core property of natural language, and compositional behavior (CB) is a crucial goal for modern NLP systems. The research literature, however, includes conflicting perspectives on how CB should be defined, evaluated, and achieved. We propose a conceptual framework to address these questions and survey researchers active in this area. We find consensus on several key points. Researchers broadly accept our proposed definition of CB, agree that it is not solved by current models, and doubt that scale alone will achieve the target behavior. In other areas, we find the field is split on how to move forward, identifying diverse opportunities for future research. @@ -7378,11 +7378,11 @@ Optimizing Instructions and Demonstrations for Multi-Stage Language Model Programs KristaOpsahl-OngStanford University - Michael JRyanStanford University + Michael JRyanStanford University JoshPurtell - DavidBromanKTH Royal Institute of Technology - ChristopherPottsStanford University - MateiZahariaUniversity of California, Berkeley and Databricks + DavidBromanKTH Royal Institute of Technology + ChristopherPottsStanford University + MateiZahariaUniversity of California, Berkeley and Databricks OmarKhattab 9340-9366 Language Model Programs, i.e. sophisticated pipelines of modular language model (LM) calls, are increasingly advancing NLP tasks, but they require crafting prompts that are jointly effective for all modules. We study prompt optimization for LM programs, i.e. how to update these prompts to maximize a downstream metric without access to module-level labels or gradients. To make this tractable, we factorize our problem into optimizing the free-form instructions and few-shot demonstrations of every module and introduce several strategies to craft task-grounded instructions and navigate credit assignment across modules.
Our strategies include (i) program- and data-aware techniques for proposing effective instructions, (ii) a stochastic mini-batch evaluation function for learning a surrogate model of our objective, and (iii) a meta-optimization procedure in which we refine how LMs construct proposals over time. Using these insights we develop MIPRO, a novel algorithm for optimizing LM programs. MIPRO outperforms baseline optimizers on five of seven diverse multi-stage LM programs using a best-in-class open-source model (Llama-3-8B), by as high as 13% accuracy. We have released our new optimizers and benchmark in DSPy at [http://dspy.ai](http://dspy.ai). @@ -7432,7 +7432,7 @@ Less is More: Parameter-Efficient Selection of Intermediate Tasks for Transfer Learning DavidSchulte - FelixHamborgHumboldt Universität Berlin + FelixHamborgHumboldt Universität Berlin AlanAkbikHumboldt Universität Berlin 9431-9442 Intermediate task transfer learning can greatly improve model performance. If, for example, one has little training data for emotion detection, first fine-tuning a language model on a sentiment classification dataset may improve performance strongly. But which task to choose for transfer learning? Prior methods producing useful task rankings are infeasible for large source pools, as they require forward passes through all source language models. We overcome this by introducing Embedding Space Maps (ESMs), light-weight neural networks that approximate the effect of fine-tuning a language model. We conduct the largest study on NLP task transferability and task selection with 12k source-target pairs. We find that applying ESMs on a prior method reduces execution time and disk space usage by factors of 10 and 278, respectively, while retaining high selection performance (avg. regret@5 score of 2.95). @@ -7444,8 +7444,8 @@ The effects of distance on <fixed-case>NPI</fixed-case> illusive effects in <fixed-case>BERT</fixed-case> - So YoungLeeMiami University - Mai HaVuUniversity of Toronto + So YoungLeeMiami University + Mai HaVuUniversity of Toronto 9443-9457 Previous studies have examined the syntactic capabilities of large pre-trained language models, such as BERT, by using stimuli from psycholinguistic studies. Studying well-known processing errors, such as NPI illusive effects can reveal whether a model prioritizes linear or hierarchical information when processing language. Recent experiments have found that BERT is mildly susceptible to Negative Polarity Item (NPI) illusion effects (Shin et al., 2023; Vu and Lee, 2022). We expand on these results by examining the effect of distance on the illusive effect, using and modifying stimuli from Parker and Phillips (2016). We also further tease apart whether the model is more affected by hierarchical distance or linear distance. We find that BERT is highly sensitive to syntactic hierarchical information: added hierarchical layers affected its processing capabilities compared to added linear distance. 
2024.emnlp-main.530 @@ -7457,11 +7457,11 @@ NathanielWeirAmazon KateSandersDepartment of Computer Science, Whiting School of Engineering OrionWeller - ShreyaSharma + ShreyaSharma DongweiJiang ZhengpingJiangJohns Hopkins University BhavanaDalvi MishraAllen Institute for Artificial Intelligence - OyvindTafjordAllen Institute for Artificial Intelligence + OyvindTafjordAllen Institute for Artificial Intelligence PeterJansenUniversity of Arizona PeterClarkAllen Institute for Artificial Intelligence BenjaminVan DurmeMicrosoft and Johns Hopkins University @@ -7486,14 +7486,14 @@ Read Anywhere Pointed: Layout-aware <fixed-case>GUI</fixed-case> Screen Reading with Tree-of-Lens Grounding YueFan - LeiDingUniversity of California, Santa Cruz + LeiDingUniversity of California, Santa Cruz Ching-ChenKuo ShanJiangeBay Inc. YangZhao XinzeGuanOregon State University JieYangCybever, Inc. YiZhangUniversity of California, Santa Cruz - Xin EricWangUniversity of California, Santa Cruz + Xin EricWangUniversity of California, Santa Cruz 9503-9522 Graphical User Interfaces (GUIs) are central to our interaction with digital devices and growing efforts have been made to build models for various GUI understanding tasks. However, these efforts largely overlook an important GUI-referring task: screen reading based on user-indicated points, which we name the Screen Point-and-Read (ScreenPR) task. Currently, this task is predominantly handled by rigid accessible screen reading tools, in great need of new models driven by advancements in Multimodal Large Language Models (MLLMs). In this paper, we propose a Tree-of-Lens (ToL) agent, utilizing a novel ToL grounding mechanism, to address the ScreenPR task. Based on the input point coordinate and the corresponding GUI screenshot, our ToL agent constructs a Hierarchical Layout Tree. Based on the tree, our ToL agent not only comprehends the content of the indicated area but also articulates the layout and spatial relationships between elements. Such layout information is crucial for accurately interpreting information on the screen, distinguishing our ToL agent from other screen reading tools. We also thoroughly evaluate the ToL agent against other baselines on a newly proposed ScreenPR benchmark, which includes GUIs from mobile, web, and operating systems. Last but not least, we test the ToL agent on mobile GUI navigation tasks, demonstrating its utility in identifying incorrect actions along the path of agent execution trajectories. Code and data: https://screen-point-and-read.github.io. 
2024.emnlp-main.533 @@ -7555,10 +7555,10 @@ <fixed-case>A</fixed-case>rxiv<fixed-case>DIGEST</fixed-case>ables: Synthesizing Scientific Literature into Tables using Language Models BenjaminNewmanUniversity of Washington - YoonjooLeeKorea Advanced Institute of Science & Technology + YoonjooLeeKorea Advanced Institute of Science & Technology AakankshaNaikAllen Institute for Artificial Intelligence and National Institutes of Health PaoSiangliulueAllen Institute for Artificial Intelligence - RaymondFokUniversity of Washington + RaymondFokUniversity of Washington JuhoKimKorea Advanced Institute of Science and Technology Daniel SWeldDepartment of Computer Science, University of Washington Joseph CheeChangAllen Institute for Artificial Intelligence @@ -7571,9 +7571,9 @@ Development of Cognitive Intelligence in Pre-trained Language Models - Raj SanjayShahGeorgia Institute of Technology - KhushiBhardwaj - SashankVarmaGeorgia Institute of Technology + Raj SanjayShahGeorgia Institute of Technology + KhushiBhardwaj + SashankVarmaGeorgia Institute of Technology 9632-9657 Recent studies show evidence for emergent cognitive abilities in Large Pre-trained Language Models (PLMs). The increasing cognitive alignment of these models has made them candidates for cognitive science theories. Prior research into the emergent cognitive abilities of PLMs has been path-independent to model training, i.e. has only looked at the final model weights and not the intermediate steps. However, building plausible models of human cognition using PLMs also requires aligning their performance during training to the developmental trajectories of children’s thinking. Guided by psychometric tests of human intelligence, we choose four task categories to investigate the alignment of ten popular families of PLMs and evaluate each of their available intermediate and final training steps: Numerical ability, Linguistic abilities, Conceptual understanding, and Fluid reasoning. We find a striking regularity: regardless of model size, the developmental trajectories of PLMs consistently exhibit a window of maximal alignment to human cognitive development. Before that window, training appears to endow models with the requisite structure to be poised to rapidly learn from experience. After that window, training appears to serve the engineering goal of reducing loss but not the scientific goal of increasing alignment with human cognition. 2024.emnlp-main.539 @@ -7585,14 +7585,14 @@ Modeling Layout Reading Order as Ordering Relations for Visually-rich Document Understanding ChongZhangFudan University - YiTu + YiTu YixiZhao - ChenshuYuanNankai University + ChenshuYuanNankai University HuanChen YueZhang MingxuChai - YaGuo - HuijiaZhu + YaGuo + HuijiaZhu QiZhangFudan University TaoGuiFudan University 9658-9678 @@ -7604,10 +7604,10 @@ Birdie: Advancing State Space Language Modeling with Dynamic Mixtures of Training Objectives - SamBlouirGeorge Mason University - Jimmy T.h.SmithLiquid AI and Stanford University - AntoniosAnastasopoulosAthena Research Center and George Mason University - AmardaShehuGeorge Mason University + SamBlouirGeorge Mason University + Jimmy T.h.SmithLiquid AI and Stanford University + AntoniosAnastasopoulosAthena Research Center and George Mason University + AmardaShehuGeorge Mason University 9679-9705 Efficient state space models (SSMs), including linear recurrent neural networks and linear attention variants, have emerged as potential alternative language models to Transformers. 
While efficient, SSMs struggle with tasks requiring in-context retrieval, such as text copying and associative recall, limiting their usefulness in practical settings. Prior work on how to meet this challenge has focused on the internal model architecture and has not investigated the role of the training procedure. This paper proposes a new training procedure that improves the performance of SSMs on retrieval-intensive tasks. This novel pre-training procedure combines a bidirectional processing of the input with dynamic mixtures of pre-training objectives to improve the utilization of the SSM’s fixed-size state. Our experimental evaluations show that this procedure significantly improves performance on retrieval-intensive tasks that challenge current SSMs, such as phone book lookup, long paragraph question-answering, and infilling tasks. Our findings offer insights into a new direction to advance the training of SSMs to close the performance gap with Transformers. 2024.emnlp-main.541 @@ -7631,7 +7631,7 @@ SheridanFeucht DavidAtkinsonNortheastern University Byron CWallaceNortheastern University, Brown University and Northeastern University - DavidBauNortheastern University + DavidBauNortheastern University 9727-9739 LLMs process text as sequences of tokens that roughly correspond to words, where less common words are represented by multiple tokens. However, individual tokens are often semantically unrelated to the meanings of the words/concepts they comprise. For example, Llama-2-7b’s tokenizer splits the word “patrolling” into two tokens, “pat” and “rolling”, neither of which correspond to semantically meaningful units like “patrol” or “-ing.” Similarly, the overall meanings of named entities like “Neil Young” and multi-word expressions like “break a leg” cannot be directly inferred from their constituent tokens. Mechanistically, how do LLMs convert such arbitrary groups of tokens into useful higher-level representations? In this work, we find that last token representations of named entities and multi-token words exhibit a pronounced “erasure” effect, where information about previous and current tokens is rapidly forgotten in early layers. Using this observation, we propose a method to “read out” the implicit vocabulary of an autoregressive LLM by examining differences in token representations across layers, and present results of this method for Llama-2-7b and Llama-3-8B. To our knowledge, this is the first attempt to probe the implicit vocabulary of an LLM. 2024.emnlp-main.543 @@ -7654,9 +7654,9 @@ Evaluating the Effectiveness of Large Language Models in Establishing Conversational Grounding BisweshMohapatraINRIA - Manav NitinKapadnis + Manav NitinKapadnis LaurentRomaryINRIA - JustineCassellINRIA and Carnegie Mellon University + JustineCassellINRIA and Carnegie Mellon University 9767-9781 Conversational grounding, vital for building dependable dialog systems, involves ensuring a mutual understanding of shared information. Despite its importance, there has been limited research on this aspect of conversation in recent years, especially after the advent of Large Language Models (LLMs). Previous studies have highlighted the shortcomings of pre-trained language models in conversational grounding. However, most testing for conversational grounding capabilities involves human evaluations that are costly and time-consuming. This has led to a lack of testing across multiple models of varying sizes, a critical need given the rapid rate of new model releases.
This gap in research becomes more significant considering recent advances in language models, which have led to new emergent capabilities. In this paper, we aim to evaluate the performance of LLMs in various aspects of conversational grounding and analyze why some models perform better than others. We demonstrate a direct correlation between the size of the pre-training data and conversational grounding abilities, meaning that models have independently acquired a specific form of pragmatic capability from larger pre-training datasets. Finally, we propose ways to enhance the capabilities of the models that lag in this aspect.
 2024.emnlp-main.545
 
@@ -7670,7 +7670,7 @@
 YawenWuAmazon
 JacksonTaylorDSCG Solutions Inc. and College of William and Mary
 CaoXiaoGE Healthcare
- FengZhengSouthern University of Science and Technology
+ FengZhengSouthern University of Science and Technology
 WeiwenJiangGeorge Mason University
 ShangqianGaoFlorida State University
 YanfuZhangCollege of William and Mary
 
@@ -7682,7 +7682,7 @@
 If <fixed-case>CLIP</fixed-case> Could Talk: Understanding Vision-Language Model Representations Through Their Preferred Concept Descriptions
- RezaEsfandiarpoorBrown University
+ RezaEsfandiarpoorBrown University
 CristinaMenghiniScale AI
 StephenBachComputer Science Department, Brown University and Snorkel AI
 9797-9819
 
@@ -7720,7 +7720,7 @@
 AnejSvete
 NadavBorenstein
 MikeZhou
- IsabelleAugensteinUniversity of Copenhagen
+ IsabelleAugensteinUniversity of Copenhagen
 RyanCotterellSwiss Federal Institute of Technology
 9851-9867
 Much theoretical work has described the ability of transformers to represent formal languages. However, linking theoretical results to empirical performance is not straightforward due to the complex interplay between the architecture, the learning algorithm, and training data. To test whether theoretical lower bounds imply learnability of formal languages, we turn to recent work relating transformers to n-gram language models (LMs). We study transformers’ ability to learn random n-gram LMs of two kinds: ones with arbitrary next-symbol probabilities and ones where those are defined with shared parameters. We find that classic estimation techniques for n-gram LMs such as add-\lambda smoothing outperform transformers on the former, while transformers perform better on the latter, outperforming methods specifically designed to learn n-gram LMs.
 
@@ -7756,10 +7756,10 @@
 
 Multi-pass Decoding for Grammatical Error Correction
- XiaoyingWang
- LinglingMu
+ XiaoyingWang
+ LinglingMu
 JingyiZhang
- HongfeiXuZhengzhou University
+ HongfeiXuZhengzhou University
 9904-9916
 Sequence-to-sequence (seq2seq) models achieve comparable or better grammatical error correction performance compared to sequence-to-edit (seq2edit) models. Seq2edit models normally iteratively refine the correction result, while seq2seq models decode only once without being aware of subsequent tokens. Iteratively refining the correction results of seq2seq models via Multi-Pass Decoding (MPD) may lead to better performance. However, MPD increases the inference costs. Deleting or replacing corrections in previous rounds may lose useful information in the source input. We present an early-stop mechanism to alleviate the efficiency issue. To address the source information loss issue, we propose to merge the source input with the previous round correction result into one sequence.
Experiments on the CoNLL-14 test set and BEA-19 test set show that our approach can lead to consistent and significant improvements over strong BART and T5 baselines (+1.80, +1.35, and +2.02 F0.5 for BART 12-2, large and T5 large respectively on CoNLL-14 and +2.99, +1.82, and +2.79 correspondingly on BEA-19), obtaining F0.5 scores of 68.41 and 75.36 on CoNLL-14 and BEA-19 respectively. 2024.emnlp-main.553 @@ -7768,10 +7768,10 @@ Into the Unknown Unknowns: Engaged Human Learning through Participation in Language Model Agent Conversations - YuchengJiang + YuchengJiang YijiaShaoComputer Science Department, Stanford University DekunMa - SinaSemnani + SinaSemnani MonicaLamStanford University 9917-9955 While language model (LM)-powered chatbots and generative search engines excel at answering concrete queries, discovering information in the terrain of unknown unknowns remains challenging for users. To emulate the common educational scenario where children/students learn by listening to and participating in conversations of their parents/teachers, we create Collaborative STORM (Co-STORM). Unlike QA systems that require users to ask all the questions, Co-STORM lets users observe and occasionally steer the discourse among several LM agents. The agents ask questions on the user’s behalf, allowing the user to discover unknown unknowns serendipitously. To facilitate user interaction, Co-STORM assists users in tracking the discourse by organizing the uncovered information into a dynamic mind map, ultimately generating a comprehensive report as takeaways. For automatic evaluation, we construct the WildSeek dataset by collecting real information-seeking records with user goals. Co-STORM outperforms baseline methods on both discourse trace and report quality. In a further human evaluation, 70% of participants prefer Co-STORM over a search engine, and 78% favor it over a RAG chatbot. @@ -7827,7 +7827,7 @@ ZhengyangQi Lawrence KeunhoJang RussSalakhutdinovCarnegie-Mellon University, Carnegie Mellon University and Department of Computer Science - Louis-PhilippeMorencyCarnegie Mellon University + Louis-PhilippeMorencyCarnegie Mellon University Paul PuLiangMassachusetts Institute of Technology 10006-10030 Advances in multimodal models have greatly improved how interactions relevant to various tasks are modeled. Today’s multimodal models mainly focus on the correspondence between images and text, using this for tasks like image-text matching. However, this covers only a subset of real-world interactions. Novel interactions, such as sarcasm expressed through opposing spoken words and gestures or humor expressed through utterances and tone of voice, remain challenging. In this paper, we introduce an approach to enhance multimodal models, which we call Multimodal Mixtures of Experts (MMoE). The key idea in MMoE is to train separate expert models for each type of multimodal interaction, such as redundancy present in both modalities, uniqueness in one modality, or synergy that emerges when both modalities are fused. On a sarcasm detection task (MUStARD) and a humor detection task (URFUNNY), we obtain new state-of-the-art results. MMoE is also able to be applied to various types of models to gain improvement. @@ -7877,7 +7877,7 @@ <fixed-case>ESC</fixed-case>: Efficient Speech Coding with Cross-Scale Residual Vector Quantized Transformers YuzheGu - EnmaoDiaoColAI + EnmaoDiaoColAI 10084-10096 Neural speech codecs aim to compress input signals into minimal bits while maintaining content quality in a low-latency manner. 
However, existing neural codecs often trade model complexity for reconstruction performance. These codecs primarily use convolutional blocks for feature transformation, which are not inherently suited for capturing the local redundancies in speech signals. To compensate, they require either adversarial discriminators or a large number of model parameters to enhance audio quality. In response to these challenges, we introduce the Efficient Speech Codec (ESC), a lightweight, parameter-efficient speech codec based on a cross-scale residual vector quantization scheme and transformers. Our model employs mirrored hierarchical window transformer blocks and performs step-wise decoding from coarse-to-fine feature representations. To enhance bitrate efficiency, we propose a novel combination of vector quantization techniques along with a pre-training paradigm. Extensive experiments demonstrate that ESC can achieve high-fidelity speech reconstruction with significantly lower model complexity, making it a promising alternative to existing convolutional audio codecs. 2024.emnlp-main.562 @@ -7901,7 +7901,7 @@ Detecting Subtle Differences between Human and Model Languages Using Spectrum of Relative Likelihood - YangXuSouthern University of Science and Technology + YangXuSouthern University of Science and Technology YuWang HaoAnSouthern University of Science and Technology ZhichenLiu @@ -7917,12 +7917,12 @@ Optimizing Language Models with Fair and Stable Reward Composition in Reinforcement Learning JiahuiLiZhejiang University - HanlinZhangAlibaba Group + HanlinZhangAlibaba Group FengdaZhangNanyang Technological University - Tai-WeiChangAnt Group + Tai-WeiChangAnt Group KunKuangZhejiang University - LongChenThe Hong Kong University of Science and Technology - JunZhouAnt Group + LongChenThe Hong Kong University of Science and Technology + JunZhouAnt Group 10122-10140 Reinforcement learning from human feedback (RLHF) and AI-generated feedback (RLAIF) have become prominent techniques that significantly enhance the functionality of pre-trained language models (LMs). These methods harness feedback, sourced either from humans or AI, as direct rewards or to shape reward models that steer LM optimization. Nonetheless, the effective integration of rewards from diverse sources presents a significant challenge due to their disparate characteristics. To address this, recent research has developed algorithms incorporating strategies such as weighting, ranking, and constraining to handle this complexity. Despite these innovations, a bias toward disproportionately high rewards can still skew the reinforcement learning process and negatively impact LM performance. This paper explores a methodology for reward composition that enables simultaneous improvements in LMs across multiple dimensions. Inspired by fairness theory, we introduce a training algorithm that aims to reduce disparity and enhance stability among various rewards. Our method treats the aggregate reward as a dynamic weighted sum of individual rewards, with alternating updates to the weights and model parameters. For efficient and straightforward implementation, we employ an estimation technique rooted in the mirror descent method for weight updates, eliminating the need for gradient computations. The empirical results under various types of rewards across a wide range of scenarios demonstrate the effectiveness of our method. 
2024.emnlp-main.565 @@ -7931,9 +7931,9 @@ Fine-grained Pluggable Gradient Ascent for Knowledge Unlearning in Language Models - XiaoHuaFeng - ChaochaoChenZhejiang University - YuyuanLiHangzhou Dianzi University + XiaoHuaFeng + ChaochaoChenZhejiang University + YuyuanLiHangzhou Dianzi University ZibinLinZhejiang University 10141-10155 Pre-trained language models acquire knowledge from vast amounts of text data, which can inadvertently contain sensitive information. To mitigate the presence of undesirable knowledge, the task of knowledge unlearning becomes crucial for language models. Previous research relies on gradient ascent methods to achieve knowledge unlearning, which is simple and effective. However, this approach calculates all the gradients of tokens in the sequence, potentially compromising the general ability of language models. To overcome this limitation, we propose an adaptive objective that calculates gradients with fine-grained control specifically targeting sensitive tokens. Our adaptive objective is pluggable, ensuring simplicity and enabling extension to the regularization-based framework that utilizes non-target data or other models to preserve general ability. Through extensive experiments targeting the removal of typical sensitive data, we demonstrate that our proposed method enhances the general ability of language models while achieving knowledge unlearning. Additionally, it demonstrates the capability to adapt to behavior alignment, eliminating all the undesirable knowledge within a specific domain. @@ -7945,10 +7945,10 @@ <fixed-case>ARM</fixed-case>: An Alignment-and-Replacement Module for <fixed-case>C</fixed-case>hinese Spelling Check Based on <fixed-case>LLM</fixed-case>s ChangchunLiu - KaiZhang - JunzheJiang + KaiZhang + JunzheJiang ZiruiLiu - HanqingTaoChina University of Mining Technology - Xuzhou + HanqingTaoChina University of Mining Technology - Xuzhou MinGaoUniversity of Science and Technology of China EnhongChenUniversity of Science and Technology of China 10156-10168 @@ -7974,7 +7974,7 @@ Atomic Inference for <fixed-case>NLI</fixed-case> with Generated Facts as Atoms JoeStaceyImperial College London - PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh + PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh HaimDubossarskyQueen Mary University of London Oana-MariaCamburuDepartment of Computer Science, University College London, University of London MarekReiImperial College London @@ -7989,14 +7989,14 @@ Towards Robust Speech Representation Learning for Thousands of Languages WilliamChen WangyouZhang - YifanPengCarnegie Mellon University + YifanPengCarnegie Mellon University XinjianLiCarnegie Mellon University JinchuanTian JiatongShi - XuankaiChang + XuankaiChang SoumiMaitiCarnegie Mellon University KarenLivescuToyota Technological Institute at Chicago - ShinjiWatanabeCarnegie Mellon University + ShinjiWatanabeCarnegie Mellon University 10205-10224 Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world’s 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. 
To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8% and 4.4% respectively, despite having fewer parameters or less pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.
 2024.emnlp-main.570
 
@@ -8006,7 +8006,7 @@
 <fixed-case>I</fixed-case> Learn Better If You Speak My Language: Understanding the Superior Performance of Fine-Tuning Large Language Models with <fixed-case>LLM</fixed-case>-Generated Responses
- XuanRen
+ XuanRen
 BiaoWu
 LingqiaoLiuUniversity of Adelaide
 10225-10245
 
@@ -8047,7 +8047,7 @@
 
 When Parts Are Greater Than Sums: Individual <fixed-case>LLM</fixed-case> Components Can Outperform Full Models
 Ting-YunChangUniversity of Southern California
- JesseThomasonUniversity of Southern California and Amazon
+ JesseThomasonUniversity of Southern California and Amazon
 RobinJiaUniversity of Southern California
 10280-10299
 This paper studies in-context learning by decomposing the output of large language models into the individual contributions of attention heads and MLPs (components). We observe curious components: good-performing ones that individually do well on a classification task, even when the model performs poorly; bad-performing ones that do much worse than chance; and label-biased components that always predict the same label. We find that component accuracies are well-correlated across different demonstration sets and perturbations of prompt templates. Based on our findings, we propose component reweighting, which learns to linearly re-scale the component activations from a few labeled examples. Given 24 labeled examples, our method improves by an average of 6.0% accuracy points over 24-shot ICL across 8 tasks on Llama-2-7B. Overall, this paper both enriches our understanding of ICL and provides a practical method for improvement by examining model internals.
 2024.emnlp-main.574
 
@@ -8058,13 +8058,13 @@
 
 Multimodal Clickbait Detection by De-confounding Biases Using Causal Representation Inference
- JianxingYuSUN YAT-SEN UNIVERSITY
+ JianxingYuSUN YAT-SEN UNIVERSITY
 ShiqiWangSUN YAT-SEN UNIVERSITY
 HanYin
 ZhenlongSun
- RuobingXie
+ RuobingXie
 BoZhangTencent AI Lab
- YanghuiRaoSUN YAT-SEN UNIVERSITY
+ YanghuiRaoSUN YAT-SEN UNIVERSITY
 10300-10317
 This paper focuses on detecting clickbait posts on the Web. These posts often use eye-catching disinformation in mixed modalities to mislead users into clicking for profit. That affects the user experience and thus would be blocked by content providers. To escape detection, malicious creators use tricks to add some irrelevant non-bait content into bait posts, dressing them up as legitimate to fool the detector. This content often has biased relations with non-bait labels, yet traditional detectors tend to make predictions based on simple co-occurrence rather than grasping inherent factors that lead to malicious behavior. This spurious bias would easily cause misjudgments. To address this problem, we propose a new debiased method based on causal inference. We first employ a set of features in multiple modalities to characterize the posts.
Considering these features are often mixed up with unknown biases, we then disentangle three kinds of latent factors from them, including the invariant factor that indicates intrinsic bait intention; the causal factor which reflects deceptive patterns in a certain scenario, and non-causal noise. By eliminating the noise that causes bias, we can use invariant and causal factors to build a robust model with good generalization ability. Experiments on three popular datasets show the effectiveness of our approach. 2024.emnlp-main.575 @@ -8074,9 +8074,9 @@ Matryoshka-Adaptor: Unsupervised and Supervised Tuning for Smaller Embedding Dimensions JinsungYoonGoogle - RajarishiSinhaGoogle - Sercan OArikGoogle - TomasPfisterGoogle + RajarishiSinhaGoogle + Sercan OArikGoogle + TomasPfisterGoogle 10318-10336 Embeddings from Large Language Models (LLMs) have emerged as critical components in various applications, particularly for information retrieval. While high-dimensional embeddings generally demonstrate superior performance as they contain more salient information, their practical application is frequently hindered by elevated computational latency and the associated higher cost. To address these challenges, we propose Matryoshka-Adaptor, a novel tuning framework designed for the customization of LLM embeddings. Matryoshka-Adaptor facilitates substantial dimensionality reduction while maintaining comparable performance levels, thereby achieving a significant enhancement in computational efficiency and cost-effectiveness. Our framework directly modifies the embeddings from pre-trained LLMs which is designed to be seamlessly integrated with any LLM architecture, encompassing those accessible exclusively through black-box APIs. Also, it exhibits efficacy in both unsupervised and supervised learning settings. A rigorous evaluation conducted across a diverse corpus of English, multilingual, and multimodal datasets consistently reveals substantial gains with Matryoshka-Adaptor. Notably, with Google and OpenAI Embedding APIs, Matryoshka-Adaptor achieves a reduction in dimensionality ranging from two- to twelve-fold without compromising performance across multiple BEIR datasets. 2024.emnlp-main.576 @@ -8086,7 +8086,7 @@ <fixed-case>KNN</fixed-case>-Instruct: Automatic Instruction Construction with K Nearest Neighbor Deduction JianshangKou - BenfengXu + BenfengXu ChiweiZhu ZhendongMaoUniversity of Science and Technology of China 10337-10350 @@ -8097,9 +8097,9 @@ Contextualized Sequence Likelihood: Enhanced Confidence Scores for Natural Language Generation - ZhenLin + ZhenLin ShubhenduTrivediMassachusetts Institute of Technology - JimengSunUniversity of Illinois, Urbana Champaign, College of Computing and Georgia Institute of Technology + JimengSunUniversity of Illinois, Urbana Champaign, College of Computing and Georgia Institute of Technology 10351-10368 The advent of large language models (LLMs) has dramatically advanced the state-of-the-art in numerous natural language generation tasks. For LLMs to be applied reliably, it is essential to have an accurate measure of their confidence. Currently, the most commonly used confidence score function is the likelihood of the generated sequence, which, however, conflates semantic and syntactic components. For instance, in question-answering (QA) tasks, an awkward phrasing of the correct answer might result in a lower probability prediction. Additionally, different tokens should be weighted differently depending on the context. 
In this work, we propose enhancing the predicted sequence probability by assigning different weights to various tokens using attention values elicited from the base LLM. By employing a validation set, we can identify the relevant attention heads, thereby significantly improving the reliability of the vanilla sequence probability confidence measure. We refer to this new score as the Contextualized Sequence Likelihood (CSL). CSL is easy to implement, fast to compute, and offers considerable potential for further improvement with task-specific prompts. Across several QA datasets and a diverse array of LLMs, CSL has demonstrated significantly higher reliability than state-of-the-art baselines in predicting generation quality, as measured by the AUROC or AUARC.
 2024.emnlp-main.578
 
@@ -8127,8 +8127,8 @@
 <fixed-case>CARER</fixed-case> - <fixed-case>C</fixed-case>linic<fixed-case>A</fixed-case>l Reasoning-Enhanced Representation for Temporal Health Risk Prediction
 Tuan DungNguyen
 Thanh TrungHuynh
- Minh HieuPhanUniversity of Adelaide
- Quoc Viet HungNguyenGriffith University
+ Minh HieuPhanUniversity of Adelaide
+ Quoc Viet HungNguyenGriffith University
 Phi LeNguyenHanoi University of Science and Technology
 10392-10407
 The increasing availability of multimodal data from electronic health records (EHR) has paved the way for deep learning methods to improve diagnosis accuracy. However, deep learning models are data-driven, requiring large-scale datasets to achieve high generalizability. Inspired by how human experts leverage reasoning for medical diagnosis, we propose CARER, a novel health risk prediction framework that enhances deep learning models with clinical rationales derived from medically proficient Large Language Models (LLMs). In addition, we provide a cross-view alignment loss which aligns the “local” view from the patient’s health status with the “global” view from the external LLM’s clinical reasoning to boost the mutual feature learning. Through extensive experiments on two predictive tasks using two popular EHR datasets, our CARER significantly exceeds the performance of state-of-the-art models by up to 11.2%, especially in improving data efficiency and generalizability. Our code is available at https://github.com/tuandung2812/CARER-EMNLP-2024
 2024.emnlp-main.580
 
@@ -8139,12 +8139,12 @@
 
 “In-Dialogues We Learn”: Towards Personalized Dialogue Without Pre-defined Profiles through In-Dialogue Learning
 ChuanqiChengRenmin University of China
- QuanTu
- WeiWuAnt Research
- ShuoShang
+ QuanTu
+ WeiWuAnt Research
+ ShuoShang
 CunliMao
- ZhengtaoYuKunming University of Science and Technology
- RuiYanRenmin University of China
+ ZhengtaoYuKunming University of Science and Technology
+ RuiYanRenmin University of China
 10408-10422
 Personalized dialogue systems have gained significant attention in recent years for their ability to generate responses in alignment with different personas. However, most existing approaches rely on pre-defined personal profiles, which are not only time-consuming and labor-intensive to create but also lack flexibility. We propose In-Dialogue Learning (IDL), a fine-tuning framework that enhances the ability of pre-trained large language models to leverage dialogue history to characterize persona for personalized dialogue generation tasks without pre-defined profiles. Our experiments on three datasets demonstrate that IDL brings substantial improvements, with BLEU and ROUGE scores increasing by up to 200% and 247%, respectively.
Additionally, the results of human evaluations further validate the efficacy of our proposed method. 2024.emnlp-main.581 @@ -8158,7 +8158,7 @@ GuangyiChenMohamed bin Zayed University of Artificial Intelligence and Carnegie Mellon University YifeiWangMassachusetts Institute of Technology LinGuiKing’s College London, University of London - YulanHeKing’s College London, University of London + YulanHeKing’s College London, University of London 10423-10435 To better interpret the intrinsic mechanism of large language models (LLMs), recent studies focus on monosemanticity on its basic units. A monosemantic neuron is dedicated to a single and specific concept, which forms a one-to-one correlation between neurons and concepts. Despite extensive research in monosemanticity probing, it remains unclear whether monosemanticity is beneficial or harmful to model capacity. To explore this question, we revisit monosemanticity from the feature decorrelation perspective and advocate for its encouragement. We experimentally observe that the current conclusion by (CITATION), which suggests that decreasing monosemanticity enhances model performance, does not hold when the model changes. Instead, we demonstrate that monosemanticity consistently exhibits a positive correlation with model capacity, in the preference alignment process. Consequently, we apply feature correlation as a proxy for monosemanticity and incorporate a feature decorrelation regularizer into the dynamic preference optimization process. The experiments show that our method not only enhances representation diversity and activation sparsity but also improves preference alignment performance. 2024.emnlp-main.582 @@ -8168,7 +8168,7 @@ Enhancing Language Model Factuality via Activation-Based Confidence Calibration and Guided Decoding XinLiuUniversity of Michigan - Ann Arbor - FarimaFatahi Bayat + FarimaFatahi Bayat LuWangNortheastern University, Northeastern University and University of Michigan 10436-10448 Calibrating language models (LMs) aligns their generation confidence with the actual likelihood of answer correctness, which can inform users about LMs’ reliability and mitigate hallucinated content. However, prior calibration methods, such as self-consistency-based and logit-based approaches, are either limited in inference-time efficiency or fall short of providing informative signals. Moreover, simply filtering out low-confidence responses reduces the LM’s helpfulness when the answers are correct. Therefore, effectively using calibration techniques to enhance an LM’s factuality remains an unsolved challenge. In this paper, we first propose an activation-based calibration method, ActCab, which trains a linear layer on top of the LM’s last-layer activations that can better capture the representations of knowledge. Built on top of ActCab, we further propose CoDec, a confidence-guided decoding strategy to elicit truthful answers with high confidence from LMs. By evaluating on five popular QA benchmarks, ActCab achieves superior calibration performance than all competitive baselines, e.g., by reducing the average expected calibration error (ECE) score by up to 39%. Further experiments on CoDec show consistent improvements in several LMs’ factuality on challenging QA datasets, such as TruthfulQA, highlighting the value of confidence signals in enhancing the factuality. 
@@ -8201,7 +8201,7 @@ MozhiZhang KeRenFudan University BotianJiang - XipengQiuFudan University + XipengQiuFudan University 10460-10479 As large language models (LLMs) rapidly evolve, they are increasingly being customized through fine-tuning to suit the specific needs of various applications. A critical aspect of this advancement is the alignment process, which ensures that these models perform tasks in ways that align with human values and expectations. Current alignment methods, such as direct preference optimization (DPO) and reinforcement learning from human feedback (RLHF), focus primarily on alignment during training phase. However, these methods often involve complex and resource-intensive training processes, posing significant challenge for their implementation. Therefore, we propose InferAligner, a simple yet effective method for harmlessness alignment during inference phase. InferAligner decouples harmlessness from helpfulness. During the training phase, it focuses solely on enhancing the target model’s capabilities on downstream tasks. In the inference phase, it utilizes safety steering vectors extracted from the aligned model to guide the target model towards harmlessness alignment. Experimental results show that our method can be very effectively applied to domain-specific models in finance, medicine, and mathematics, as well as to multimodal large language models (MLLMs) such as LLaVA. It significantly diminishes the attack success rate (ASR) of both harmful instructions and jailbreak instructions, while maintaining almost unchanged performance in downstream tasks. 2024.emnlp-main.585 @@ -8210,9 +8210,9 @@ Belief Revision: The Adaptability of Large Language Models Reasoning - BryanWilie - SamuelCahyawijaya - EtsukoIshiiThe Hong Kong University of Science and Technology + BryanWilie + SamuelCahyawijaya + EtsukoIshiiThe Hong Kong University of Science and Technology JunxianHeHong Kong University of Science and Technology PascaleFungHKUST 10480-10496 @@ -8239,13 +8239,13 @@ Bio-<fixed-case>RFX</fixed-case>: Refining Biomedical Extraction via Advanced Relation Classification and Structural Constraints - MinjiaWangHarvard University, Harvard University - FangzhouLiuTsinghua University + MinjiaWangHarvard University, Harvard University + FangzhouLiuTsinghua University XiuxingLiBeijing Institute of Technology BowenDongTsinghua University, Tsinghua University and Tencent AI Lab - ZhenyuLi - TengyuPan - JianyongWangTsinghua University, Tsinghua University + ZhenyuLi + TengyuPan + JianyongWangTsinghua University, Tsinghua University 10524-10539 The ever-growing biomedical publications magnify the challenge of extracting structured data from unstructured texts. This task involves two components: biomedical entity identification (Named Entity Recognition, NER) and their interrelation determination (Relation Extraction, RE). However, existing methods often neglect unique features of the biomedical literature, such as ambiguous entities, nested proper nouns, and overlapping relation triplets, and underutilize prior knowledge, leading to an intolerable performance decline in the biomedical domain, especially with limited annotated training data. In this paper, we propose the Biomedical Relation-First eXtraction (Bio-RFX) model by leveraging sentence-level relation classification before entity extraction to tackle entity ambiguity. 
Moreover, we exploit structural constraints between entities and relations to guide the model’s hypothesis space, enhancing extraction performance across different training scenarios. Comprehensive experimental results on biomedical datasets show that Bio-RFX achieves significant improvements on both NER and RE tasks. Even under the low-resource training scenarios, it outperforms all baselines in NER and has highly competitive performance compared to the state-of-the-art fine-tuned baselines in RE. 2024.emnlp-main.588 @@ -8255,11 +8255,11 @@ Decoding Matters: Addressing Amplification Bias and Homogeneity Issue in Recommendations for Large Language Models KeqinBao - JizhiZhangUniversity of Science and Technology of China - YangZhangNational University of Singapore + JizhiZhangUniversity of Science and Technology of China + YangZhangNational University of Singapore XinyueHuoHuawei Technologies Ltd. ChongChenHuawei Technologies Ltd. - FuliFengUniversity of Science and Technology of China + FuliFengUniversity of Science and Technology of China 10540-10552 Adapting Large Language Models (LLMs) for recommendation requires careful consideration of the decoding process, given the inherent differences between generating items and natural language. Existing approaches often directly apply LLMs’ original decoding methods. However, we find these methods encounter significant challenges: 1) amplification bias—where standard length normalization inflates scores for items containing tokens with generation probabilities close to 1 (termed ghost tokens), and 2) homogeneity issue—generating multiple similar or repetitive items for a user. To tackle these challenges, we introduce a new decoding approach named Debiasing-Diversifying Decoding (D^3). D^3 disables length normalization for ghost tokens to alleviate amplification bias, and it incorporates a text-free assistant model to encourage tokens less frequently generated by LLMs for counteracting recommendation homogeneity. Extensive experiments on real-world datasets demonstrate the method’s effectiveness in enhancing accuracy and diversity. 2024.emnlp-main.589 @@ -8270,7 +8270,7 @@ <fixed-case>LLM</fixed-case>s Are Prone to Fallacies in Causal Inference NitishJoshiNew York University AbulhairSaparovPurdue University - YixinWangUniversity of Michigan - Ann Arbor + YixinWangUniversity of Michigan - Ann Arbor HeHeNew York University 10553-10569 Recent work shows that causal facts can be effectively extracted from LLMs through prompting, facilitating the creation of causal graphs for causal inference tasks. However, it is unclear if this success is limited to explicitly-mentioned causal facts in the pretraining data which the model can memorize. Thus, this work investigates: Can LLMs infer causal relations from other relational data in text? To disentangle the role of memorized causal facts vs inferred causal relations, we finetune LLMs on synthetic data containing temporal, spatial and counterfactual relations, and measure whether the LLM can then infer causal relations. We find that: (a) LLMs are susceptible to inferring causal relations from the order of two entity mentions in text (e.g. X mentioned before Y implies X causes Y); (b) if the order is randomized, LLMs still suffer from the post hoc fallacy, i.e. X occurs before Y (temporal relation) implies X causes Y. 
We also find that while LLMs can correctly deduce the absence of causal relations from temporal and spatial relations, they have difficulty inferring causal relations from counterfactuals, questioning their understanding of causality.
 2024.emnlp-main.590
 
@@ -8283,7 +8283,7 @@
 RyanLouieStanford University
 AnanjanNandiStanford University
 WilliamFang
- ChengChang
+ ChengChang
 EmmaBrunskillStanford University and Stanford University
 DiyiYangStanford University
 10570-10603
 
@@ -8307,11 +8307,11 @@
 
 When Generative Adversarial Networks Meet Sequence Labeling Challenges
- YuTong
- GeChenMidea Group
+ YuTong
+ GeChenMidea Group
 GuokaiZheng
 RuiLiShantou University
- JiangDazhiShantou University
+ JiangDazhiShantou University
 10625-10635
 The current framework for sequence labeling encompasses a feature extractor and a sequence tagger. This study introduces a unified framework named SLGAN, which harnesses the capabilities of Generative Adversarial Networks to address the challenges associated with Sequence Labeling tasks. SLGAN not only mitigates the limitation of GANs in backpropagating loss to discrete data but also exhibits strong adaptability to various sequence labeling tasks. Unlike traditional GANs, the discriminator within SLGAN does not discriminate whether data originates from the discriminator or the generator; instead, it focuses on predicting the correctness of each tag within the tag sequence. We conducted evaluations on six different tasks spanning four languages, including Chinese, Japanese, and Korean Word Segmentation, Chinese and English Named Entity Recognition, and Chinese Part-of-Speech Tagging. Our experimental results illustrate that SLGAN represents a versatile and highly effective solution, consistently achieving state-of-the-art or competitive performance results, irrespective of the specific task or language under consideration.
 2024.emnlp-main.593
 
@@ -8321,10 +8321,10 @@
 Evidence-Focused Fact Summarization for Knowledge-Augmented Zero-Shot Question Answering
 SunghoKoYonsei University
- HyunjinCho
+ HyunjinCho
 HyungjooChae
- JinyoungYeoYonsei University
- DonghaLeeYonsei University
+ JinyoungYeoYonsei University
+ DonghaLeeYonsei University
 10636-10651
 Recent studies have investigated utilizing Knowledge Graphs (KGs) to enhance Question Answering (QA) performance of Large Language Models (LLMs), yet structured KG verbalization remains challenging. Existing methods, like concatenation or free-form textual conversion of triples, have limitations, including duplicated entities or relations, reduced evidence density, and failure to highlight crucial evidence. To address these issues, we propose EFSum, an Evidence-focused Fact Summarization framework for enhanced QA with knowledge-augmented LLMs. We optimize an LLM as a fact summarizer through distillation and preference alignment. Our extensive experiments show that EFSum improves LLM’s zero-shot QA performance with its helpful and faithful summaries, especially when noisy facts are retrieved.
 2024.emnlp-main.594
 
@@ -8335,12 +8335,12 @@
 Speechworthy Instruction-tuned Language Models
 Hyundong JustinChoUSC/ISI
 Nicolaas PaulJedemaAmazon
- Leonardo F. R.RibeiroAmazon
+ Leonardo F. R.RibeiroAmazon
 KarishmaSharma
 PedroSzekelyUniversity of Southern California
- AlessandroMoschittiAmazon AGI
+ AlessandroMoschittiAmazon AGI
 RubenJanssen
- JonathanMayUniversity of Southern California and USC/ISI
+ JonathanMayUniversity of Southern California and USC/ISI
 10652-10670
 Current instruction-tuned language models are exclusively trained with textual preference data and thus may not be aligned to the unique requirements of other modalities, such as speech. To better align language models with the speech domain, we explore i) prompting strategies based on radio-industry best practices and ii) preference learning using a novel speech-based preference dataset of 20K samples collected by annotators who listen to response pairs. Both human and automatic evaluation show that both prompting and preference learning increase the speech-suitability of popular instruction tuned LLMs. More interestingly, we show that these methods are additive; combining them achieves the best win rates in head-to-head comparison, resulting in responses that are preferred or tied to the base model in 76.2% of comparisons on average. Lastly, we share lexical, syntactical, and qualitative analyses that elicit how our studied methods differ from baselines in generating more speech-suitable responses.
 2024.emnlp-main.595
 
@@ -8367,7 +8367,7 @@
 Fine-Tuning and Prompt Optimization: Two Great Steps that Work Better Together
 DilaraSoylu
- ChristopherPottsStanford University
+ ChristopherPottsStanford University
 OmarKhattab
 10696-10710
 Natural Language Processing (NLP) systems are increasingly taking the form of sophisticated modular pipelines, e.g., Retrieval Augmented Generation (RAG), where each module may involve a distinct Language Model (LM) and an associated prompt template. These compound systems often lack intermediate labels or gradient flow to optimize each module, making their end-to-end optimization challenging. Here we seek strategies to optimize both the module-level LM weights and the associated prompt templates of such systems to maximize a downstream task metric. We propose for the first time combining the weight and prompt optimization strategies to optimize a modular LM pipeline by alternating between the two to get the same LM to teach itself. In experiments with multi-hop QA, mathematical reasoning, and feature-based classification using mistral-7b, llama-2-7b, and llama-3-8b, these BetterTogether strategies optimizing the weights and prompts of a pipeline together outperform directly optimizing weights alone and prompts alone by up to 60% and 6%, respectively, on average across LMs and tasks. Our BetterTogether optimizer is released in DSPy at [http://dspy.ai](http://dspy.ai).
 
@@ -8379,7 +8379,7 @@
 Demystifying Verbatim Memorization in Large Language Models
 JingHuangStanford University
 DiyiYangStanford University
- ChristopherPottsStanford University
+ ChristopherPottsStanford University
 10711-10732
 Large Language Models (LLMs) frequently memorize long sequences verbatim, often with serious legal and privacy implications. Much prior work has studied such verbatim memorization using observational data. To complement such work, we develop a framework to study verbatim memorization in a controlled setting by continuing pre-training from Pythia checkpoints with injected sequences.
We find that (1) non-trivial amounts of repetition are necessary for verbatim memorization to happen; (2) later (and presumably better) checkpoints are more likely to verbatim memorize sequences, even for out-of-distribution sequences; (3) the generation of memorized sequences is triggered by distributed model states that encode high-level features and makes important use of general language modeling capabilities. Guided by these insights, we develop stress tests to evaluate unlearning methods and find they often fail to remove the verbatim memorized information, while also degrading the LM. Overall, these findings challenge the hypothesis that verbatim memorization stems from specific model weights or mechanisms. Rather, verbatim memorization is intertwined with the LM’s general capabilities and thus will be very difficult to isolate and suppress without degrading model quality. 2024.emnlp-main.598 @@ -8389,7 +8389,7 @@ <fixed-case>A</fixed-case>mbig<fixed-case>NLG</fixed-case>: Addressing Task Ambiguity in Instruction for <fixed-case>NLG</fixed-case> AyanaNiwaMegagon Labs - HayateIsoMegagon Labs, US + HayateIsoMegagon Labs, US 10733-10752 2024.emnlp-main.599 niwa-iso-2024-ambignlg @@ -8419,7 +8419,7 @@ Towards Fast Multilingual <fixed-case>LLM</fixed-case> Inference: Speculative Decoding and Specialized Drafters - EuiinYiKorea Advanced Institute of Science & Technology + EuiinYiKorea Advanced Institute of Science & Technology TaehyeonKimKorea Advanced Institute of Science and Technology HongseokJeungKorea Telecom Research Du-SeongChang @@ -8441,7 +8441,7 @@ DongliangXu QingYang HongtaoLiuDu Xiaoman Financial - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 10803-10821 News summarization in today’s global scene can be daunting with its flood of multilingual content and varied viewpoints from different sources. However, current studies often neglect such real-world scenarios as they tend to focus solely on either single-language or single-document tasks. To bridge this gap, we aim to unify Multi-lingual, Cross-lingual and Multi-document Summarization into a novel task, i.e., MCMS, which encapsulates the real-world requirements all-in-one. Nevertheless, the lack of a benchmark inhibits researchers from adequately studying this invaluable problem. To tackle this, we have meticulously constructed the GLOBESUMM dataset by first collecting a wealth of multilingual news reports and restructuring them into event-centric format. Additionally, we introduce the method of protocol-guided prompting for high-quality and cost-effective reference annotation. In MCMS, we also highlight the challenge of conflicts between news reports, in addition to the issues of redundancies and omissions, further enhancing the complexity of GLOBESUMM. Through extensive experimental analysis, we validate the quality of our dataset and elucidate the inherent challenges of the task. We firmly believe that GLOBESUMM, given its challenging nature, will greatly contribute to the multilingual communities and the evaluation of LLMs. 
2024.emnlp-main.603
 
@@ -8451,11 +8451,11 @@
 Breaking the Curse of Multilinguality with Cross-lingual Expert Language Models
 TerraBlevinsUniversität Vienna
- TomaszLimisiewicz
+ TomaszLimisiewicz
 SuchinGururanganFacebook and University of Washington, Seattle
 MargaretLiMeta and University of Washington
 HilaGonenUniversity of Washington
- Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
+ Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
 LukeZettlemoyerUniversity of Washington, Facebook and Meta
 10822-10837
 Despite their popularity in non-English NLP, multilingual language models often underperform monolingual ones due to inter-language competition for model parameters. We propose Cross-lingual Expert Language Models (X-ELM), which mitigate this competition by independently training language models on subsets of the multilingual corpus. This process specializes X-ELMs to different languages while remaining effective as a multilingual ensemble. Our experiments show that when given the same compute budget, X-ELM outperforms jointly trained multilingual models across all 16 considered languages and that these gains transfer to downstream tasks. X-ELM provides additional benefits beyond performance improvements: new experts can be iteratively added, adapting X-ELM to new languages without catastrophic forgetting. Furthermore, training is asynchronous, reducing the hardware requirements for multilingual training and democratizing multilingual modeling.
 2024.emnlp-main.604
 
@@ -8466,9 +8466,9 @@
 More Insightful Feedback for Tutoring: Enhancing Generation Mechanisms and Automatic Evaluation
 WenckeLiermannElectronics and Telecommunications Research Institute and Chungnam National University
- Jin-XiaHuangElectronics and Telecommunications Research Institute
+ Jin-XiaHuangElectronics and Telecommunications Research Institute
 YohanLeeElectronics and Telecommunications Research Institute
- Kong JooLeeChungnam National University
+ Kong JooLeeChungnam National University
 10838-10851
 Incorrect student answers can become valuable learning opportunities, provided that the student understands where they went wrong and why. To this end, rather than being given the correct answer, students should receive elaborated feedback on how to correct a mistake on their own. Highlighting the complex demands that the generation of such feedback places on a model’s input utilization abilities, we propose two extensions to the training pipeline. Firstly, we employ a KL regularization term between a standard and enriched input format to achieve more targeted input representations. Secondly, we add a preference optimization step to encourage student answer-adaptive feedback generation. The effectiveness of those extensions is underlined by a significant increase in model performance of 3.3 METEOR points. We go beyond traditional surface form-based metrics to assess two important dimensions of feedback quality, i.e., faithfulness and informativeness. We are the first to propose an automatic metric measuring the degree to which feedback divulges the correct answer, which we call the Informativeness Index I^2. We verify to what extent each metric captures feedback quality.
2024.emnlp-main.605 @@ -8478,7 +8478,7 @@ Stable Language Model Pre-training by Reducing Embedding Variability WoojinChung - JiwooHongKorea Advanced Institute of Science & Technology + JiwooHongKorea Advanced Institute of Science & Technology Na MinAnKAIST JamesThorneKAIST Se-YoungYunKAIST @@ -8491,8 +8491,8 @@ What is lost in Normalization? Exploring Pitfalls in Multilingual <fixed-case>ASR</fixed-case> Model Evaluations - KavyaManohar - Leena GPillaiDigital University Kerala and University of Kerala + KavyaManohar + Leena GPillaiDigital University Kerala and University of Kerala 10864-10869 This paper explores the pitfalls in evaluating multilingual automatic speech recognition (ASR) models, with a particular focus on Indic language scripts. We investigate the text normalization routine employed by leading ASR models, including OpenAI Whisper, Meta’s MMS, Seamless, and Assembly AI’s Conformer, and their unintended consequences on performance metrics. Our research reveals that current text normalization practices, while aiming to standardize ASR outputs for fair comparison, by removing inconsistencies such as variations in spelling, punctuation, and special characters, are fundamentally flawed when applied to Indic scripts. Through empirical analysis using text similarity scores and in-depth linguistic examination, we demonstrate that these flaws lead to artificially improved performance metrics for Indic languages. We conclude by proposing a shift towards developing text normalization routines that leverage native linguistic expertise, ensuring more robust and accurate evaluations of multilingual ASR models. 2024.emnlp-main.607 @@ -8501,7 +8501,7 @@ Diversity Over Size: On the Effect of Sample and Topic Sizes for Topic-Dependent Argument Mining Datasets - BenjaminSchillerTechnische Universität Darmstadt + BenjaminSchillerTechnische Universität Darmstadt JohannesDaxenbergersummetix GmbH AndreasWaldisTechnische Universität Darmstadt and Lucerne University of Applied Sciences and Arts IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt @@ -8520,8 +8520,8 @@ SeunghyunHwang WonbyungLeeSung Kyun Kwan University DongyanNanSung Kyun Kwan University - Bernard JJansenHBKU - Jang HyunKimSung Kyun Kwan University + Bernard JJansenHBKU + Jang HyunKimSung Kyun Kwan University 10888-10901 This study is the first to explore whether multi-modal large language models (LLMs) can align their behaviors with visual personas, addressing a significant gap in the literature that predominantly focuses on text-based personas. We developed a novel dataset of 5K fictional avatar images for assignment as visual personas to LLMs, and analyzed their negotiation behaviors based on the visual traits depicted in these images, with a particular focus on aggressiveness. The results indicate that LLMs assess the aggressiveness of images in a manner similar to humans and output more aggressive negotiation behaviors when prompted with an aggressive visual persona. Interestingly, the LLM exhibited more aggressive negotiation behaviors when the opponent’s image appeared less aggressive than their own, and less aggressive behaviors when the opponent’s image appeared more aggressive. 
2024.emnlp-main.609
 
@@ -8530,7 +8530,7 @@
 <fixed-case>ATM</fixed-case>: Adversarial Tuning Multi-agent System Makes a Robust Retrieval-Augmented Generator
- JundaZhuBeijing University of Aeronautics and Astronautics
+ JundaZhuBeijing University of Aeronautics and Astronautics
 LingyongYanBaidu Inc.
 HaiboShi
 DaweiYinBaidu
 
@@ -8544,10 +8544,10 @@
 
 Dynamic Multi-granularity Attribution Network for Aspect-based Sentiment Analysis
- YanjiangChen
- KaiZhang
+ YanjiangChen
+ KaiZhang
 FengHu
- XianquanWang
+ XianquanWang
 RuikangLiUniversity of Science and Technology of China
 QiLiuUniversity of Science and Technology of China
 10920-10931
 
@@ -8558,10 +8558,10 @@
 Unlabeled Debiasing in Downstream Tasks via Class-wise Low Variance Regularization
- ShahedMasoudian
+ ShahedMasoudian
 MarkusFrohmannJohannes Kepler Universität Linz
- NavidRekabsazThomson Reuters
- MarkusSchedlJohannes Kepler Universität Linz
+ NavidRekabsazThomson Reuters
+ MarkusSchedlJohannes Kepler Universität Linz
 10932-10938
 Language models frequently inherit societal biases from their training data. Numerous techniques have been proposed to mitigate these biases during both the pre-training and fine-tuning stages. However, fine-tuning a pre-trained debiased language model on a downstream task can reintroduce biases into the model. Additionally, existing debiasing methods for downstream tasks either (i) require labels of protected attributes (e.g., age, race, or political views) that are often not available or (ii) rely on indicators of bias, which restricts their applicability to gender debiasing since they rely on gender-specific words. To address this, we introduce a novel debiasing regularization technique based on the class-wise variance of embeddings. Crucially, our method does not require attribute labels and targets any attribute, thus addressing the shortcomings of existing debiasing methods. Our experiments on encoder language models and three datasets demonstrate that our method outperforms existing strong debiasing baselines that rely on target attribute labels while maintaining performance on the target task.
 2024.emnlp-main.612
 
@@ -8570,7 +8570,7 @@
 Large Language Models Know What is Key Visual Entity: An <fixed-case>LLM</fixed-case>-assisted Multimodal Retrieval for <fixed-case>VQA</fixed-case>
- PuJianUniversity of the Chinese Academy of Sciences
+ PuJianUniversity of the Chinese Academy of Sciences
 DongleiYu
 JiajunZhangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
 10939-10956
 
@@ -8582,7 +8582,7 @@
 Towards Probing Speech-Specific Risks in Large Multimodal Models: A Taxonomy, Benchmark, and Insights
 HaoYangMonash University
- LizhenQuMonash University
+ LizhenQuMonash University
 EhsanShareghiMonash University and University of Cambridge
 RezaHafMonash University
 10957-10973
 
@@ -8594,9 +8594,9 @@
 Self-<fixed-case>AMPLIFY</fixed-case>: Improving Small Language Models with Self Post Hoc Explanations
 MilanBhan
- Jean-NoëlVittautLIP6 and Sorbonne Université - Faculté des Sciences (Paris VI)
+ Jean-NoëlVittautLIP6 and Sorbonne Université - Faculté des Sciences (Paris VI)
 NicolasChesneau
- Marie-JeanneLesotLIP6
+ Marie-JeanneLesotLIP6
 10974-10991
 Incorporating natural language rationales in the prompt and In-Context Learning (ICL) have led to a significant improvement in Large Language Model (LLM) performance. However, generating high-quality rationales requires human annotation or the use of auxiliary proxy models.
In this work, we propose Self-AMPLIFY to automatically generate rationales from post hoc explanation methods applied to Small Language Models (SLMs) to improve their own performance. Self-AMPLIFY is a 3-step method that targets samples, generates rationales and builds a final prompt to leverage ICL. Self-AMPLIFY performance is evaluated on four SLMs and five datasets requiring strong reasoning abilities. Self-AMPLIFY achieves good results against competitors, leading to strong accuracy improvement. Self-AMPLIFY is the first method to apply post hoc explanation methods to autoregressive language models to generate rationales to improve their own performance in a fully automated manner. 2024.emnlp-main.615 @@ -8619,10 +8619,10 @@ Paraphrase Types Elicit Prompt Engineering Capabilities - Jan PhilipWahleUniversity of Göttingen, Germany - TerryRuasGeorg-August Universität Göttingen + Jan PhilipWahleUniversity of Göttingen, Germany + TerryRuasGeorg-August Universität Göttingen YangXuDepartment of Computer Science, University of Toronto - BelaGippGeorg-August Universität Göttingen + BelaGippGeorg-August Universität Göttingen 11004-11033 Much of the success of modern language models depends on finding a suitable prompt to instruct the model. Until now, it has been largely unknown how variations in the linguistic expression of prompts affect these models. This study systematically and empirically evaluates which linguistic features influence models through paraphrase types, i.e., different linguistic changes at particular positions. We measure behavioral changes for five models across 120 tasks and six families of paraphrases (i.e., morphology, syntax, lexicon, lexico-syntax, discourse, and others). We also control for other prompt engineering factors (e.g., prompt length, lexical diversity, and proximity to training data). Our results show a potential for language models to improve tasks when their prompts are adapted in specific paraphrase types (e.g., 6.7% median gain in Mixtral 8x7B; 5.5% in LLaMA 3 8B). In particular, changes in morphology and lexicon, i.e., the vocabulary used, showed promise in improving prompts. These findings contribute to developing more robust language models capable of handling variability in linguistic expression. 2024.emnlp-main.617 @@ -8633,8 +8633,8 @@ <fixed-case>VLEU</fixed-case>: a Method for Automatic Evaluation for Generalizability of Text-to-Image Models JingtaoCao ZhangZheng - HongruWangThe Chinese University of Hong Kong - Kam-FaiWongThe Chinese University of Hong Kong + HongruWangThe Chinese University of Hong Kong + Kam-FaiWongThe Chinese University of Hong Kong 11034-11049 Progress in Text-to-Image (T2I) models has significantly advanced the generation of images from textual descriptions. Existing metrics, such as CLIP, effectively measure the semantic alignment between single prompts and their corresponding images. However, they fall short in evaluating a model’s ability to generalize across a broad spectrum of textual inputs. To address this gap, we propose the VLEU (Visual Language Evaluation Understudy) metric. VLEU leverages the power of Large Language Models (LLMs) to sample from the visual text domain, encompassing the entire range of potential inputs for the T2I task, to generate a wide variety of visual text. The images generated by T2I models from these prompts are then assessed for their alignment with the input text using the CLIP model. 
VLEU quantitatively measures a model’s generalizability by computing the Kullback-Leibler (KL) divergence between the visual text marginal distribution and the conditional distribution over the images generated by the model. This provides a comprehensive metric for comparing the overall generalizability of T2I models, beyond single-prompt evaluations, and offers valuable insights during the finetuning process. Our experimental results demonstrate VLEU’s effectiveness in evaluating the generalizability of various T2I models, positioning it as an essential metric for future research and development in image synthesis from text prompts. Our code and data will be publicly available at https://github.com/mio7690/VLEU.
      2024.emnlp-main.618


@@ -8645,9 +8645,9 @@


      Towards Online Continuous Sign Language Recognition and Translation
-      RonglaiZuo
+      RonglaiZuo
      FangyunWei
-      BrianMakHong Kong University of Science and Technology
+      BrianMakHong Kong University of Science and Technology
      11050-11067
      Research on continuous sign language recognition (CSLR) is essential to bridge the communication gap between deaf and hearing individuals. Numerous previous studies have trained their models using the connectionist temporal classification (CTC) loss. During inference, these CTC-based models generally require the entire sign video as input to make predictions, a process known as offline recognition, which suffers from high latency and substantial memory usage. In this work, we take the first step towards online CSLR. Our approach consists of three phases: 1) developing a sign dictionary; 2) training an isolated sign language recognition model on the dictionary; and 3) employing a sliding window approach on the input sign sequence, feeding each sign clip to the optimized model for online recognition. Additionally, our online recognition model can be extended to support online translation by integrating a gloss-to-text network and can enhance the performance of any offline model. With these extensions, our online approach achieves new state-of-the-art performance on three popular benchmarks across various task settings. Code and models are available at https://github.com/FangyunWei/SLRT.
      2024.emnlp-main.619


@@ -8659,8 +8659,8 @@


      Mitigate Extrinsic Social Bias in Pre-trained Language Models via Continuous Prompts Adjustment
      YiweiDai
      HengruiGuJilin University
-      YingWangJilin University
-      XinWangJilin University
+      YingWangJilin University
+      XinWangJilin University
      11068-11083
      Although pre-trained language models (PLMs) have been widely used in natural language understanding (NLU), they are still exposed to fairness issues. Most existing extrinsic debiasing methods rely on manually curated word lists for each sensitive group to modify training data or to add regular constraints. However, these word lists are often limited by length and scope, resulting in degraded performance of extrinsic bias mitigation. To address the aforementioned issues, we propose a **C**ontinuous **P**rompts **A**djustment **D**ebiasing method (CPAD), which generates continuous token lists from the entire vocabulary space and uses them to bridge the gap between outputs and targets in the fairness learning process. Specifically, CPAD encapsulates the fine-tuning objective and debiasing objectives into several independent prompts. To avoid the limitation of manual word lists, in the fairness learning phase, we extract outputs from the entire vocabulary space via the fine-tuned PLM.
Then, we aggregate the outputs from the same sensitive group as continuous token lists to map the outputs into protected attribute labels. Finally, after we learn the debiasing prompts from the perspective of adversarial learning, we improve fairness by adjusting continuous prompts at model inference time. Through extensive experiments on three NLU tasks, we evaluate the debiasing performance from the perspectives of group fairness and fairness through unawareness. The experimental results show that CPAD outperforms all baselines in terms of single- and two-attribute debiasing performance.
      2024.emnlp-main.620


@@ -8675,7 +8675,7 @@
      DaoyuanWuHong Kong University of Science and Technology
      ShuaiWangHong Kong University of Science and Technology
      CuiyunGaoHarbin Institute of Technology
-      YangLiuNanyang Technological University
+      YangLiuNanyang Technological University
      11084-11108
      Large language models (LLMs) have shown promise as automated evaluators for assessing the quality of answers generated by AI systems. However, LLM-based evaluators exhibit position bias, or inconsistency, when used to evaluate candidate answers in pairwise comparisons, favoring either the first or second answer regardless of content. To address this limitation, we propose PORTIA, an alignment-based system designed to mimic human comparison strategies to calibrate position bias in a lightweight yet effective manner. Specifically, PORTIA splits the answers into multiple segments, taking into account both length and semantics, and merges them back into a single prompt for evaluation by LLMs. Extensive experiments with six LLMs on 11,520 answer pairs demonstrate that PORTIA markedly enhances the consistency rates for all models and forms of comparison tested, achieving an average relative improvement of 47.46%. It also enables PORTIA-enhanced GPT-3.5 to achieve agreement rates with humans comparable to GPT-4 and elevates GPT-4’s consistency rate up to 98%. Subsequent human evaluations indicate that the PORTIA-enhanced GPT-3.5 model can even surpass standalone GPT-4 in terms of alignment with human evaluators, highlighting PORTIA’s ability to correct position bias, improve LLM consistency, and boost performance while keeping cost efficiency.
      2024.emnlp-main.621




      WendaXu
      JiachenLiUniversity of California, Santa Barbara
      William YangWangUC Santa Barbara
-      LeiLiSchool of Computer Science, Carnegie Mellon University
+      LeiLiSchool of Computer Science, Carnegie Mellon University
      11125-11139
      Direct alignment from preferences (DAP) has emerged as a promising paradigm for aligning large language models (LLMs) to human desiderata from pre-collected, offline preference datasets. While recent studies indicate that existing offline DAP methods can directly benefit from online training samples, we highlight the need to develop specific online DAP algorithms to fully harness the power of online training. Specifically, we identify that the learned LLM should adhere to the proximity of the behavior LLM, which collects the training samples. To this end, we propose online Preference Optimization in proximity to the Behavior LLM (BPO), emphasizing the importance of constructing a proper trust region for LLM alignment. We conduct extensive experiments to validate the effectiveness and applicability of our approach by integrating it with various DAP methods, resulting in significant performance improvements across a wide range of tasks when training with the same amount of preference data.
Even when only introducing one additional data collection phase, our online BPO improves its offline DAP baseline from 72.0% to 80.2% on TL;DR and from 82.2% to 89.1% on Anthropic Helpfulness in terms of win rate against human reference text. 2024.emnlp-main.623 @@ -8709,7 +8709,7 @@ <fixed-case>O</fixed-case>ne2<fixed-case>S</fixed-case>et + Large Language Model: Best Partners for Keyphrase Generation LiangyingShao - LiangZhang + LiangZhang MinlongPengBaidu GuoqiMa HaoYue @@ -8724,10 +8724,10 @@ Unlocking Markets: A Multilingual Benchmark to Cross-Market Question Answering - YifeiYuanCopenhagen University + YifeiYuanCopenhagen University YangDengSingapore Management University AndersSøgaardCopenhagen University - MohammadAliannejadiUniversity of Amsterdam + MohammadAliannejadiUniversity of Amsterdam 11154-11169 Users post numerous product-related questions on e-commerce platforms, affecting their purchase decisions. Product-related question answering (PQA) entails utilizing product-related resources to provide precise responses to users. We propose a novel task of Multilingual Cross-market Product-based Question Answering (MCPQA) and define the task as providing answers to product-related questions in a main marketplace by utilizing information from another resource-rich auxiliary marketplace in a multilingual context. We introduce a large-scale dataset comprising over 7 million questions from 17 marketplaces across 11 languages. We then perform automatic translation on the Electronics category of our dataset, naming it as McMarket. We focus on two subtasks: review-based answer generation and product-related question ranking. For each subtask, we label a subset of McMarket using an LLM and further evaluate the quality of the annotations via human assessment. We then conduct experiments to benchmark our dataset, using models ranging from traditional lexical models to LLMs in both single-market and cross-market scenarios across McMarket and the corresponding LLM subset. Results show that incorporating cross-market information significantly enhances performance in both tasks. 2024.emnlp-main.625 @@ -8737,7 +8737,7 @@ <fixed-case>ORPO</fixed-case>: Monolithic Preference Optimization without Reference Model - JiwooHongKorea Advanced Institute of Science & Technology + JiwooHongKorea Advanced Institute of Science & Technology NoahLeeKAIST JamesThorneKAIST 11170-11189 @@ -8760,11 +8760,11 @@ Do <fixed-case>LLM</fixed-case>s suffer from Multi-Party Hangover? A Diagnostic Approach to Addressee Recognition and Response Selection in Conversations - NicolòPenzo + NicolòPenzo MaryamSajediniaUniversity of Turin - BrunoLepriFondazione Bruno Kessler + BrunoLepriFondazione Bruno Kessler SaraTonelli - MarcoGueriniFondazione Bruno Kessler + MarcoGueriniFondazione Bruno Kessler 11210-11233 Assessing the performance of systems to classify Multi-Party Conversations (MPC) is challenging due to the interconnection between linguistic and structural characteristics of conversations. Conventional evaluation methods often overlook variances in model behavior across different levels of structural complexity on interaction graphs. In this work, we propose a methodological pipeline to investigate model performance across specific structural attributes of conversations. As a proof of concept we focus on Response Selection and Addressee Recognition tasks, to diagnose model weaknesses. 
To this end, we extract representative diagnostic subdatasets with a fixed number of users and a good structural variety from a large and open corpus of online MPCs. We further frame our work in terms of data minimization, avoiding the use of original usernames to preserve privacy, and propose alternatives to using original text messages. Results show that response selection relies more on the textual content of conversations, while addressee recognition requires capturing their structural dimension. Using an LLM in a zero-shot setting, we further highlight how sensitivity to prompt variations is task-dependent. 2024.emnlp-main.628 @@ -8778,7 +8778,7 @@ HaritzPuertoTU Darmstadt MartinTutekTechnion - Israel Institute of Technology, Technion SomakAdityaIndian Institute of Technology Kharagpur - XiaodanZhuQueen’s University + XiaodanZhuQueen’s University IrynaGurevychInstitute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt 11234-11258 Reasoning is a fundamental component of language understanding. Recent prompting techniques, such as chain of thought, have consistently improved LLMs’ performance on various reasoning tasks. Nevertheless, there is still little understanding of what triggers reasoning abilities in LLMs in the inference stage. In this paper, we investigate the effect of the input representation on the reasoning abilities of LLMs. We hypothesize that representing natural language tasks as code can enhance specific reasoning abilities such as entity tracking or logical reasoning. To study this, we propose code prompting, a methodology we operationalize as a chain of prompts that transforms a natural language problem into code and directly prompts the LLM using the generated code without resorting to external code execution. We find that code prompting exhibits a high-performance boost for multiple LLMs (up to 22.52 percentage points on GPT 3.5, 7.75 on Mixtral, and 16.78 on Mistral) across multiple conditional reasoning datasets. We then conduct comprehensive experiments to understand how the code representation triggers reasoning abilities and which capabilities are elicited in the underlying models. Our analysis on GPT 3.5 reveals that the code formatting of the input problem is essential for performance improvement. Furthermore, the code representation improves sample efficiency of in-context learning and facilitates state tracking of entities. @@ -8812,15 +8812,15 @@ <fixed-case>C</fixed-case>ode<fixed-case>A</fixed-case>gent: Autonomous Communicative Agents for Code Review - XunzhuTang - KisubKim - YeweiSong - CedricLothritzLuxembourg Institute of Science and Technology + XunzhuTang + KisubKim + YeweiSong + CedricLothritzLuxembourg Institute of Science and Technology BeiLiMeituan SaadEzziniLancaster University HaoyeTian - JacquesKleinUniversity of Luxemburg - Tegawendé F.BissyandéUniversity of Luxemburg + JacquesKleinUniversity of Luxemburg + Tegawendé F.BissyandéUniversity of Luxemburg 11279-11313 Code review, which aims at ensuring the overall quality and reliability of software, is a cornerstone of software development. Unfortunately, while crucial, Code review is a labor-intensive process that the research community is looking to automate. Existing automated methods rely on single input-output generative models and thus generally struggle to emulate the collaborative nature of code review. 
This work introduces CodeAgent, a novel multi-agent Large Language Model (LLM) system for code review automation. CodeAgent incorporates a supervisory agent, QA-Checker, to ensure that all the agents’ contributions address the initial review question. We evaluated CodeAgent on critical code review tasks: (1) detect inconsistencies between code changes and commit messages, (2) identify vulnerability introductions, (3) validate code style adherence, and (4) suggest code revisions. The results demonstrate CodeAgent’s effectiveness, contributing to a new state-of-the-art in code review automation. Our data and code are publicly available (https://github.com/Daniel4SE/codeagent). 2024.emnlp-main.632 @@ -8833,9 +8833,9 @@ <fixed-case>T</fixed-case>ro<fixed-case>L</fixed-case>: Traversal of Layers for Large Language and Vision Models Byung-KwanLeeKorea Advanced Institute of Science and Technology SangyunChungKAIST - Chae WonKim + Chae WonKim BeomchanParkKAIST - Yong ManRoKorea Advanced Institute of Science and Technology + Yong ManRoKorea Advanced Institute of Science and Technology 11314-11342 Large language and vision models (LLVMs) have been driven by the generalization power of large language models (LLMs) and the advent of visual instruction tuning. Along with scaling them up directly, these models enable LLVMs to showcase powerful vision language (VL) performances by covering diverse tasks via natural language instructions. However, existing open-source LLVMs that perform comparably to closed-source LLVMs such as GPT-4V are often considered too large (e.g., 26B, 34B, and 110B parameters), having a larger number of layers. These large models demand costly, high-end resources for both training and inference. To address this issue, we present a new efficient LLVM family with 1.8B, 3.8B, and 7B LLM model sizes, Traversal of Layers (TroL), which enables the reuse of layers in a token-wise manner. This layer traversing technique simulates the effect of looking back and retracing the answering stream while increasing the number of forward propagation layers without physically adding more layers. We demonstrate that TroL employs a simple layer traversing approach yet efficiently outperforms the open-source LLVMs with larger model sizes and rivals the performances of the closed-source LLVMs with substantial sizes. 2024.emnlp-main.633 @@ -8847,7 +8847,7 @@ ShunWangUniversity of Sheffield GeZhang HanWuUniversity of International Business and Economics - TylerLoakman + TylerLoakman WenhaoHuang ChenghuaLinUniversity of Manchester 11343-11358 @@ -8859,8 +8859,8 @@ Revisiting Supertagging for faster <fixed-case>HPSG</fixed-case> parsing - OlgaZamaraevaUniversidad de La Coruña - CarlosGómez-RodríguezUniversidade da Coruña + OlgaZamaraevaUniversidad de La Coruña + CarlosGómez-RodríguezUniversidade da Coruña 11359-11374 We present new supertaggers trained on English HPSG-based treebanks and test the effects of the best tagger on parsing speed and accuracy. HPSG treebanks are produced automatically by large manually built grammars and feature high-quality annotation based on a well-developed linguistic theory. The English Resource Grammar treebanks include diverse and challenging test datasets, beyond the usual WSJ section 23 and Wikipedia data. HPSG supertagging has previously relied on MaxEnt-based models. 
We use SVM and neural CRF- and BERT-based methods and show that both SVM and neural supertaggers achieve considerably higher accuracy compared to the baseline and lead to an increase not only in the parsing speed but also the parser accuracy with respect to gold dependency structures. Our fine-tuned BERT-based tagger achieves 97.26% accuracy on 950 sentences from WSJ23 and 93.88% on the out-of-domain technical essay The Cathedral and the Bazaar. We present experiments with integrating the best supertagger into an HPSG parser and observe a speedup of a factor of 3 with respect to the system which uses no tagging at all, as well as large recall gains and an overall precision gain. We also compare our system to an existing integrated tagger and show that although the well-integrated tagger remains the fastest, our experimental system can be more accurate. Finally, we hope that the diverse and difficult datasets we used for evaluation will gain more popularity in the field: we show that results can differ depending on the dataset, even if it is an in-domain one. We contribute the complete datasets reformatted for Huggingface token classification. 2024.emnlp-main.635 @@ -8870,9 +8870,9 @@ Improve Dense Passage Retrieval with Entailment Tuning - LuDaiHong Kong University of Science and Technology - HaoLiuThe Hong Kong University of Science and Technology (Guangzhou) - HuiXiongHong Kong University of Science and Technology + LuDaiHong Kong University of Science and Technology + HaoLiuThe Hong Kong University of Science and Technology (Guangzhou) + HuiXiongHong Kong University of Science and Technology 11375-11387 Retrieval module can be plugged into many downstream NLP tasks to improve their performance, such as open-domain question answering and retrieval-augmented generation. The key to a retrieval system is to calculate relevance scores to query and passage pairs. However, the definition of relevance is often ambiguous. We observed that a major class of relevance aligns with the concept of entailment in NLI tasks. Based on this observation, we designed a method called entailment tuning to improve the embedding of dense retrievers. Specifically, we unify the form of retrieval data and NLI data using existence claim as a bridge. Then, we train retrievers to predict the claims entailed in a passage with a variant task of masked prediction. Our method can be efficiently plugged into current dense retrieval methods, and experiments show the effectiveness of our method. 2024.emnlp-main.636 @@ -8884,16 +8884,16 @@ YuxiangZhang JingChenZhejiang University JunjieWang - YaxinLiu + YaxinLiu ChengYang - ChufanShi - XinyuZhuUniversity of Virginia, Charlottesville + ChufanShi + XinyuZhuUniversity of Virginia, Charlottesville ZihaoLin - HanwenWan - YujiuYangGraduate School at Shenzhen,Tsinghua University - TetsuyaSakaiNAVER and Waseda University - TianFengZhejiang University - HayatoYamanaWaseda University + HanwenWan + YujiuYangGraduate School at Shenzhen,Tsinghua University + TetsuyaSakaiNAVER and Waseda University + TianFengZhejiang University + HayatoYamanaWaseda University 11388-11422 Tool-augmented large language models (LLMs) are rapidly being integrated into real-world applications. Due to the lack of benchmarks, the community has yet to fully understand the hallucination issues within these models. To address this challenge, we introduce a comprehensive diagnostic benchmark, ToolBH. Specifically, we assess the LLM’s hallucinations through two perspectives: depth and breadth. 
In terms of depth, we propose a multi-level diagnostic process, including (1) solvability detection, (2) solution planning, and (3) missing-tool analysis. For breadth, we consider three scenarios based on the characteristics of the toolset: missing necessary tools, potential tools, and limited functionality tools. Furthermore, we developed seven tasks and collected 700 evaluation samples through multiple rounds of manual annotation. The results show the significant challenges presented by the ToolBH benchmark. The current advanced models Gemini-1.5-Pro and GPT-4o only achieve total scores of 45.3 and 37.0, respectively, on a scale of 100. In this benchmark, larger model parameters do not guarantee better performance; the training data and response strategies also play crucial roles in tool-enhanced LLM scenarios. Our diagnostic analysis indicates that the primary reason for model errors lies in assessing task solvability. Additionally, open-weight models suffer from performance drops with verbose replies, whereas proprietary models excel with longer reasoning. 2024.emnlp-main.637 @@ -8903,9 +8903,9 @@ <fixed-case>TEMA</fixed-case>: Token Embeddings Mapping for Enriching Low-Resource Language Models - RodolfoZevallos - NúriaBelUniversitat Pompeu Fabra - MireiaFarrúsUniversitat de Barcelona + RodolfoZevallos + NúriaBelUniversitat Pompeu Fabra + MireiaFarrúsUniversitat de Barcelona 11423-11435 The objective of the research we present is to remedy the problem of the low quality of language models for low-resource languages. We introduce an algorithm, the Token Embedding Mapping Algorithm (TEMA), that maps the token embeddings of a richly pre-trained model L1 to a poorly trained model L2, thus creating a richer L2’ model. Our experiments show that the L2’ model reduces perplexity with respect to the original monolingual model L2, and that for downstream tasks, including SuperGLUE, the results are state-of-the-art or better for the most semantic tasks. The models obtained with TEMA are also competitive or better than multilingual or extended models proposed as solutions for mitigating the low-resource language problems. 2024.emnlp-main.638 @@ -8918,7 +8918,7 @@ AnthonyDiazUniversity of California, Davis ZixunChen QingyangWuColumbia University - KunQian + KunQian ErikVossTeachers College, Columbia University ZhouYuColumbia University 11436-11458 @@ -8931,8 +8931,8 @@ <fixed-case>T</fixed-case>ext2<fixed-case>C</fixed-case>hart31: Instruction Tuning for Chart Generation with Automatic Feedback FatemehPesaran ZadehSeoul National University, Seoul National University JuyeonKim - Jin-HwaKimSeoul National University and NAVER - GunheeKimSeoul National University + Jin-HwaKimSeoul National University and NAVER + GunheeKimSeoul National University 11459-11480 Large language models (LLMs) have demonstrated strong capabilities across various language tasks, notably through instruction-tuning methods. However, LLMs face challenges in visualizing complex, real-world data through charts and plots. Firstly, existing datasets rarely cover a full range of chart types, such as 3D, volumetric, and gridded charts. Secondly, supervised fine-tuning methods do not fully leverage the intricate relationships within rich datasets, including text, code, and figures. To address these challenges, we propose a hierarchical pipeline and a new dataset for chart generation. 
Our dataset, Text2Chart31, includes 31 unique plot types referring to the Matplotlib library, with 11.1K tuples of descriptions, code, data tables, and plots. Moreover, we introduce a reinforcement learning-based instruction tuning technique for chart generation tasks without requiring human feedback. Our experiments show that this approach significantly enhances the model performance, enabling smaller models to outperform larger open-source models and be comparable to state-of-the-art proprietary models in data visualization tasks. 2024.emnlp-main.640 @@ -8954,10 +8954,10 @@ Universal Vulnerabilities in Large Language Models: Backdoor Attacks for In-context Learning - ShuaiZhao + ShuaiZhao MeihuiziJia Anh TuanLuuNanyang Technological University - FengjunPan + FengjunPan JinmingWen 11507-11522 In-context learning, a paradigm bridging the gap between pre-training and fine-tuning, has demonstrated high efficacy in several NLP tasks, especially in few-shot settings. Despite being widely applied, in-context learning is vulnerable to malicious attacks. In this work, we raise security concerns regarding this paradigm. Our studies demonstrate that an attacker can manipulate the behavior of large language models by poisoning the demonstration context, without the need for fine-tuning the model. Specifically, we design a new backdoor attack method, named ICLAttack, to target large language models based on in-context learning. Our method encompasses two types of attacks: poisoning demonstration examples and poisoning demonstration prompts, which can make models behave in alignment with predefined intentions. ICLAttack does not require additional fine-tuning to implant a backdoor, thus preserving the model’s generality. Furthermore, the poisoned examples are correctly labeled, enhancing the natural stealth of our attack method. Extensive experimental results across several language models, ranging in size from 1.3B to 180B parameters, demonstrate the effectiveness of our attack method, exemplified by a high average attack success rate of 95.0% across the three datasets on OPT models. @@ -8969,8 +8969,8 @@ Repairs in a Block World: A New Benchmark for Handling User Corrections with Multi-Modal Language Models - JavierChiyah-Garcia - AlessandroSugliaHeriot-Watt University + JavierChiyah-Garcia + AlessandroSugliaHeriot-Watt University ArashEshghiHeriot-Watt University 11523-11542 In dialogue, the addressee may initially misunderstand the speaker and respond erroneously, often prompting the speaker to correct the misunderstanding in the next turn with a Third Position Repair (TPR). The ability to process and respond appropriately to such repair sequences is thus crucial in conversational AI systems. In this paper, we first collect, analyse, and publicly release BlockWorld-Repairs: a dataset of multi-modal TPR sequences in an instruction-following manipulation task that is, by design, rife with referential ambiguity. We employ this dataset to evaluate several state-of-the-art Vision and Language Models (VLM) across multiple settings, focusing on their capability to process and accurately respond to TPRs and thus recover from miscommunication. We find that, compared to humans, all models significantly underperform in this task. We then show that VLMs can benefit from specialised losses targeting relevant tokens during fine-tuning, achieving better performance and generalising better to new scenarios. 
Our results suggest that these models are not yet ready to be deployed in multi-modal collaborative settings where repairs are common, and highlight the need to design training regimes and objectives that facilitate learning from interaction. Our code and data are available at www.github.com/JChiyah/blockworld-repairs

      2024.emnlp-main.643


@@ -8980,7 +8980,7 @@
      Beyond the Turn-Based Game: Enabling Real-Time Conversations with Duplex Models
-      XinrongZhangTsinghua University
+      XinrongZhangTsinghua University
      YingfaChen
      ShengdingHu
      XuHanTsinghua University, Tsinghua University
@@ -8988,7 +8988,7 @@
      YuanweiXuModelBest
      WeilinZhaoTsinghua University, Tsinghua University
      MaosongSun
-      ZhiyuanLiuTsinghua University
+      ZhiyuanLiuTsinghua University
      11543-11557
      As large language models (LLMs) increasingly permeate daily lives, there is a growing demand for real-time interactions that mirror human conversations. Traditional turn-based chat systems driven by LLMs prevent users from verbally interacting with the system while generating responses. To overcome these limitations, we adapt existing LLMs to duplex models so that they can listen to users while generating output and dynamically adjust themselves to provide instant feedback. Specifically, we divide the queries and responses of conversations into several time slices and then adopt a time-division-multiplexing (TDM) encoding-decoding strategy to process these slices pseudo-simultaneously. Furthermore, to make LLMs proficient enough to handle real-time conversations, we build a fine-tuning dataset consisting of alternating time slices of queries and responses and covering typical feedback types in instantaneous interactions. Our experiments show that although the queries and responses of conversations are segmented into incomplete slices for processing, LLMs can preserve their original performance on standard benchmarks with a few fine-tuning steps on our dataset. Automatic and human evaluation indicate that duplex models make user-AI interactions more natural and human-like, and greatly improve user satisfaction compared to vanilla LLMs. Our duplex model and dataset will be released soon.
      2024.emnlp-main.644


@@ -9012,8 +9012,8 @@


      Puzzle Solving using Reasoning of Large Language Models: A Survey
      PanagiotisGiadikiaroglou
-      MariaLymperaiouNational Technical University of Athens
-      GiorgosFilandrianosNational Technical University of Athens
+      MariaLymperaiouNational Technical University of Athens
+      GiorgosFilandrianosNational Technical University of Athens
      GiorgosStamouNational Technical University of Athens
      11574-11591
      Exploring the capabilities of Large Language Models (LLMs) in puzzle solving unveils critical insights into their potential and challenges in AI, marking a significant step towards understanding their applicability in complex reasoning tasks. This survey leverages a unique taxonomy—dividing puzzles into rule-based and rule-less categories—to critically assess LLMs through various methodologies, including prompting techniques, neuro-symbolic approaches, and fine-tuning. Through a critical review of relevant datasets and benchmarks, we assess LLMs’ performance, identifying significant challenges in complex puzzle scenarios. Our findings highlight the disparity between LLM capabilities and human-like reasoning, particularly in those requiring advanced logical inference.
The survey underscores the necessity for novel strategies and richer datasets to advance LLMs’ puzzle-solving proficiency and contribute to AI’s logical reasoning and creative problem-solving advancements. @@ -9023,20 +9023,20 @@ <fixed-case>S</fixed-case>ci<fixed-case>E</fixed-case>x: Benchmarking Large Language Models on Scientific Exams with Human Expert Grading and Automatic Grading - Tu AnhDinhKarlsruher Institut für Technologie + Tu AnhDinhKarlsruher Institut für Technologie CarlosMullovKarlsruher Institut für Technologie - LeonardBärmannKarlsruher Institut für Technologie + LeonardBärmannKarlsruher Institut für Technologie ZhaolinLiKarlsruher Institut für Technologie DanniLiuKarlsruher Institut für Technologie - SimonReißKarlsruher Institut für Technologie - JueunLeeKarlsruher Institut für Technologie + SimonReißKarlsruher Institut für Technologie + JueunLeeKarlsruher Institut für Technologie NathanLerzerKarlsruher Institut für Technologie - JianfengGao - FabianPeller-Konrad + JianfengGao + FabianPeller-Konrad TobiasRöddiger AlexanderWaibel TamimAsfourKarlsruhe Institute of Technology - MichaelBeiglKarlsruher Institut für Technologie + MichaelBeiglKarlsruher Institut für Technologie RainerStiefelhagenKarlsruhe Institute of Technology CarstenDachsbacherKarlsruhe Institute of Technology KlemensBöhmKarlsruher Institut für Technologie @@ -9053,7 +9053,7 @@ XiaofeiWen BangzhengLiUniversity of Southern California TenghaoHuang - MuhaoChenUniversity of California, Davis and University of Southern California + MuhaoChenUniversity of California, Davis and University of Southern California 11611-11630 Most language models currently available are prone to self-contradiction during dialogues. To mitigate this issue, this study explores a novel contradictory dialogue processing task that aims to detect and modify contradictory statements in a conversation. This task is inspired by research on context faithfulness and dialogue comprehension, which have demonstrated that the detection and understanding of contradictions often necessitate detailed explanations. We develop a dataset comprising contradictory dialogues, in which one side of the conversation contradicts itself. Each dialogue is accompanied by an explanatory label that highlights the location and details of the contradiction. With this dataset, we present a Red Teaming framework for contradictory dialogue processing. The framework detects and attempts to explain the dialogue, then modifies the existing contradictory content using the explanation. Our experiments demonstrate that the framework improves the ability to detect contradictory dialogues and provides valid explanations. Additionally, it showcases distinct capabilities for modifying such dialogues. Our study highlights the importance of the logical inconsistency problem in conversational AI. 2024.emnlp-main.648 @@ -9064,7 +9064,7 @@ Fishing for Magikarp: Automatically Detecting Under-trained Tokens in Large Language Models SanderLandCohere - MaxBartoloCohere and University College London + MaxBartoloCohere and University College London 11631-11646 The disconnect between tokenizer creation and model training in language models allows for specific inputs, such as the infamous SolidGoldMagikarp token, to induce unwanted model behaviour. Although such ‘glitch tokens’, tokens present in the tokenizer vocabulary but that are nearly or entirely absent during model training, have been observed across various models, a reliable method to identify and address them has been missing. 
We present a comprehensive analysis of Large Language Model tokenizers, specifically targeting this issue of detecting under-trained tokens. Through a combination of tokenizer analysis, model weight-based indicators, and prompting techniques, we develop novel and effective methods for automatically detecting these problematic tokens. Our findings demonstrate the prevalence of such tokens across a diverse set of models and provide insights into improving the efficiency and safety of language models. 2024.emnlp-main.649 @@ -9086,7 +9086,7 @@ Pragmatic Norms Are All You Need – Why The Symbol Grounding Problem Does Not Apply to <fixed-case>LLM</fixed-case>s - RetoGubelmannUniversity of Zurich and University of Zurich + RetoGubelmannUniversity of Zurich and University of Zurich 11663-11678 Do LLMs fall prey to Harnad’s symbol grounding problem (SGP), as it has recently been claimed? We argue that this is not the case. Starting out with countering the arguments of Bender and Koller (2020), we trace the origins of the SGP to the computational theory of mind (CTM), and we show that it only arises with natural language when questionable theories of meaning are presupposed. We conclude by showing that it would apply to LLMs only if they were interpreted in the manner of how the CTM conceives the mind, i.e., by postulating that LLMs rely on a version of a language of thought, or by adopting said questionable theories of meaning; since neither option is rational, we conclude that the SGP does not apply to LLMs. 2024.emnlp-main.651 @@ -9097,7 +9097,7 @@ Major Entity Identification: A Generalizable Alternative to Coreference Resolution Kawshik ManikantanSundar ShubhamToshniwalNVIDIA - MakarandTapaswiInternational Institute of Information Technology Hyderabad and Wadhwani Institute for Artificial Intelligence + MakarandTapaswiInternational Institute of Information Technology Hyderabad and Wadhwani Institute for Artificial Intelligence VineetGandhiInternational Institute of Information Technology Hyderabad 11679-11695 The limited generalization of coreference resolution (CR) models has been a major bottleneck in the task’s broad application. Prior work has identified annotation differences, especially for mention detection, as one of the main reasons for the generalization gap and proposed using additional annotated target domain data. Rather than relying on this additional annotation, we propose an alternative referential task, Major Entity Identification (MEI), where we: (a) assume the target entities to be specified in the input, and (b) limit the task to only the frequent entities. Through extensive experiments, we demonstrate that MEI models generalize well across domains on multiple datasets with supervised models and LLM-based few-shot prompting. Additionally, MEI fits the classification framework, which enables the use of robust and intuitive classification-based metrics. Finally, MEI is also of practical use as it allows a user to search for all mentions of a particular entity or a group of entities of interest. @@ -9107,10 +9107,10 @@ Enhancing High-order Interaction Awareness in <fixed-case>LLM</fixed-case>-based Recommender Model - XinfengWang - JinCui - FumiyoFukumotoYamanashi University - YoshimiSuzukiYamanashi University + XinfengWang + JinCui + FumiyoFukumotoYamanashi University + YoshimiSuzukiYamanashi University 11696-11711 Large language models (LLMs) have demonstrated prominent reasoning capabilities in recommendation tasks by transforming them into text-generation tasks. 
However, existing approaches either disregard or ineffectively model the user-item high-order interactions. To this end, this paper presents an enhanced LLM-based recommender (ELMRec). We enhance whole-word embeddings to substantially enhance LLMs’ interpretation of graph-constructed interactions for recommendations, without requiring graph pre-training. This finding may inspire endeavors to incorporate rich knowledge graphs into LLM-based recommenders via whole-word embedding. We also found that LLMs often recommend items based on users’ earlier interactions rather than recent ones, and present a reranking solution. Our ELMRec outperforms state-of-the-art (SOTA) methods, especially achieving a 124.3% to 293.7% improvement over SOTA LLM-based methods in direct recommendations. Our code is available online. 2024.emnlp-main.653 @@ -9123,9 +9123,9 @@ AkshayParuchuriDepartment of Computer Science, University of North Carolina at Chapel Hill JakeGarrisonGoogle ShunLiaoGoogle - John BHernandez + John BHernandez JacobSunshineUniversity of Washington - TimAlthoffDepartment of Computer Science, University of Washington + TimAlthoffDepartment of Computer Science, University of Washington XinLiuGoogle DanielMcDuff 11712-11733 @@ -9140,8 +9140,8 @@ <fixed-case>MARE</fixed-case>: Multi-Aspect Rationale Extractor on Unsupervised Rationale Extraction HanJiang JunwenDuanCentral South University - ZheQu - JianxinWangCentral South University + ZheQu + JianxinWangCentral South University 11734-11745 Unsupervised rationale extraction aims to extract text snippets to support model predictions without explicit rationale annotation.Researchers have made many efforts to solve this task. Previous works often encode each aspect independently, which may limit their ability to capture meaningful internal correlations between aspects. While there has been significant work on mitigating spurious correlations, our approach focuses on leveraging the beneficial internal correlations to improve multi-aspect rationale extraction. In this paper, we propose a Multi-Aspect Rationale Extractor (MARE) to explain and predict multiple aspects simultaneously. Concretely, we propose a Multi-Aspect Multi-Head Attention (MAMHA) mechanism based on hard deletion to encode multiple text chunks simultaneously. Furthermore, multiple special tokens are prepended in front of the text with each corresponding to one certain aspect. Finally, multi-task training is deployed to reduce the training overhead. Experimental results on two unsupervised rationale extraction benchmarks show that MARE achieves state-of-the-art performance. Ablation studies further demonstrate the effectiveness of our method. Our codes have been available at https://github.com/CSU-NLP-Group/MARE. 2024.emnlp-main.655 @@ -9151,8 +9151,8 @@ <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case>-Guard: Parameter-Efficient Guardrail Adaptation for Content Moderation of Large Language Models HayderElesedySamsung - Pedro MEsperancaSamsung - Silviu VladOpreaSamsung + Pedro MEsperancaSamsung + Silviu VladOpreaSamsung MeteOzaySamsung Research 11746-11765 Guardrails have emerged as an alternative to safety alignment for content moderation of large language models (LLMs). Existing model-based guardrails have not been designed for resource-constrained computational portable devices, such as mobile phones, more and more of which are running LLM-based applications locally. 
We introduce LoRA-Guard, a parameter-efficient guardrail adaptation method that relies on knowledge sharing between LLMs and guardrail models. LoRA-Guard extracts language features from the LLMs and adapts them for the content moderation task using low-rank adapters, while a dual-path design prevents any performance degradation on the generative task. We show that LoRA-Guard outperforms existing approaches with 100-1000x lower parameter overhead while maintaining accuracy, enabling on-device content moderation. @@ -9165,7 +9165,7 @@ ZhijunXu SiyuYuan LingjieChen - DeqingYangFudan University + DeqingYangFudan University 11766-11782 Puns play a vital role in academic research due to their distinct structure and clear definition, which aid in the comprehensive analysis of linguistic humor. However, the understanding of puns in large language models (LLMs) has not been thoroughly examined, limiting their use in creative writing and humor creation. In this paper, we leverage three popular tasks, i.e., pun recognition, explanation and generation to systematically evaluate the capabilities of LLMs in pun understanding. In addition to adopting the automated evaluation metrics from prior research, we introduce new evaluation methods and metrics that are better suited to the in-context learning paradigm of LLMs. These new metrics offer a more rigorous assessment of an LLM’s ability to understand puns and align more closely with human cognition than previous metrics. Our findings reveal the “lazy pun generation” pattern and identify the primary challenges LLMs encounter in understanding puns. 2024.emnlp-main.657 @@ -9175,10 +9175,10 @@ <fixed-case>QGE</fixed-case>val: Benchmarking Multi-dimensional Evaluation for Question Generation WeipingFuXi’an Jiaotong University - BifanWeiXi’an Jiaotong University + BifanWeiXi’an Jiaotong University JianxiangHuXi’an Jiaotong University ZhongminCaiXi’an Jiaotong University - JunLiuXi’an Jiaotong University + JunLiuXi’an Jiaotong University 11783-11803 Automatically generated questions often suffer from problems such as unclear expression or factual inaccuracies, requiring a reliable and comprehensive evaluation of their quality. Human evaluation is widely used in the field of question generation (QG) and serves as the gold standard for automatic metrics. However, there is a lack of unified human evaluation criteria, which hampers consistent and reliable evaluations of both QG models and automatic metrics. To address this, we propose **QGEval**, a multi-dimensional **Eval**uation benchmark for **Q**uestion **G**eneration, which evaluates both generated questions and existing automatic metrics across 7 dimensions: fluency, clarity, conciseness, relevance, consistency, answerability, and answer consistency. We demonstrate the appropriateness of these dimensions by examining their correlations and distinctions. Through consistent evaluations of QG models and automatic metrics with QGEval, we find that 1) most QG models perform unsatisfactorily in terms of answerability and answer consistency, and 2) existing metrics fail to align well with human judgments when evaluating generated questions across the 7 dimensions. We expect this work to foster the development of both QG technologies and their evaluation. 
2024.emnlp-main.658 @@ -9189,9 +9189,9 @@ Dependency Graph Parsing as Sequence Labeling - AnaEzquerro - DavidVilaresUniversidade da Coruña - CarlosGómez-RodríguezUniversidade da Coruña + AnaEzquerro + DavidVilaresUniversidade da Coruña + CarlosGómez-RodríguezUniversidade da Coruña 11804-11828 Various linearizations have been proposed to cast syntactic dependency parsing as sequence labeling. However, these approaches do not support more complex graph-based representations, such as semantic dependencies or enhanced universal dependencies, as they cannot handle reentrancy or cycles. By extending them, we define a range of unbounded and bounded linearizations that can be used to cast graph parsing as a tagging task, enlarging the toolbox of problems that can be solved under this paradigm. Experimental results on semantic dependency and enhanced UD parsing show that with a good choice of encoding, sequence-labeling semantic dependency parsers combine high efficiency with accuracies close to the state of the art, in spite of their simplicity. 2024.emnlp-main.659 @@ -9202,7 +9202,7 @@ <fixed-case>N</fixed-case>u<fixed-case>NER</fixed-case>: Entity Recognition Encoder Pre-training via <fixed-case>LLM</fixed-case>-Annotated Data SergeiBogdanovNuMind AlexandreConstantin - TimothéeBernardUniversité Paris Cité + TimothéeBernardUniversité Paris Cité BenoitCrabbéUniversité de Paris Etienne PBernard 11829-11841 @@ -9213,8 +9213,8 @@ Towards a <fixed-case>G</fixed-case>reek Proverb Atlas: Computational Spatial Exploration and Attribution of <fixed-case>G</fixed-case>reek Proverbs - JohnPavlopoulosAthens University of Economics and Business - PanosLouridasAthens University of Economics and Business + JohnPavlopoulosAthens University of Economics and Business + PanosLouridasAthens University of Economics and Business PanagiotisFilosUniversity of Ioannina 11842-11854 Proverbs carry wisdom transferred orally from generation to generation. Based on the place they were recorded, this study introduces a publicly-available and machine-actionable dataset of more than one hundred thousand Greek proverb variants. By quantifying the spatial distribution of proverbs, we show that the most widespread proverbs come from the mainland while the least widespread proverbs come primarily from the islands. By focusing on the least dispersed proverbs, we present the most frequent tokens per location and undertake a benchmark in geographical attribution, using text classification and regression (text geocoding). Our results show that this is a challenging task for which specific locations can be attributed more successfully compared to others. The potential of our resource and benchmark is showcased by two novel applications. First, we extracted terms moving the regression prediction toward the four cardinal directions. Second, we leveraged conformal prediction to attribute 3,676 unregistered proverbs with statistically rigorous predictions of locations each of these proverbs was possibly registered in. 
@@ -9226,9 +9226,9 @@ Unraveling <fixed-case>B</fixed-case>abel: Exploring Multilingual Activation Patterns of <fixed-case>LLM</fixed-case>s and Their Applications WeizeLiu YinlongXu - HongxiaXu - JintaiChen - XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology + HongxiaXu + JintaiChen + XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology JianWu 11855-11881 Recently, large language models (LLMs) have achieved tremendous breakthroughs in the field of NLP, but still lack understanding of their internal neuron activities when processing different languages. We designed a method to convert dense LLMs into fine-grained MoE architectures, and then visually studied the multilingual activation patterns of LLMs through expert activation frequency heatmaps. Through comprehensive experiments on different model families, different model sizes, and different variants, we analyzed the similarities and differences in the internal neuron activation patterns of LLMs when processing different languages. Specifically, we investigated the distribution of high-frequency activated experts, multilingual shared experts, whether multilingual activation patterns are related to language families, and the impact of instruction tuning on activation patterns. We further explored leveraging the discovered differences in expert activation frequencies to guide sparse activation and pruning. Experimental results demonstrated that our method significantly outperformed random expert pruning and even exceeded the performance of unpruned models in some languages. Additionally, we found that configuring different pruning rates for different layers based on activation level differences could achieve better results. Our findings reveal the multilingual processing mechanisms within LLMs and utilize these insights to offer new perspectives for applications such as sparse activation and model pruning. @@ -9239,7 +9239,7 @@ Advancing Semantic Textual Similarity Modeling: A Regression Framework with Translated <fixed-case>R</fixed-case>e<fixed-case>LU</fixed-case> and Smooth K2 Loss BowenZhang - ChunpingLiTsinghua University, Tsinghua University + ChunpingLiTsinghua University, Tsinghua University 11882-11893 Since the introduction of BERT and RoBERTa, research on Semantic Textual Similarity (STS) has made groundbreaking progress. Particularly, the adoption of contrastive learning has substantially elevated state-of-the-art performance across various STS benchmarks. However, contrastive learning categorizes text pairs as either semantically similar or dissimilar, failing to leverage fine-grained annotated information and necessitating large batch sizes to prevent model collapse. These constraints pose challenges for researchers engaged in STS tasks that involve nuanced similarity levels or those with limited computational resources, compelling them to explore alternatives like Sentence-BERT. Despite its efficiency, Sentence-BERT tackles STS tasks from a classification perspective, overlooking the progressive nature of semantic relationships, which results in suboptimal performance. To bridge this gap, this paper presents an innovative regression framework and proposes two simple yet effective loss functions: Translated ReLU and Smooth K2 Loss. 
Experimental results demonstrate that our method achieves convincing performance across seven established STS benchmarks and offers the potential for further optimization of contrastive learning pre-trained models. 2024.emnlp-main.663 @@ -9251,7 +9251,7 @@ Rationalizing Transformer Predictions via End-To-End Differentiable Self-Training Marc FelixBrinnerUniversität Bielefeld - SinaZarrießBielefeld University + SinaZarrießBielefeld University 11894-11907 We propose an end-to-end differentiable training paradigm for stable training of a rationalized transformer classifier. Our approach results in a single model that simultaneously classifies a sample and scores input tokens based on their relevance to the classification. To this end, we build on the widely-used three-player-game for training rationalized models, which typically relies on training a rationale selector, a classifier and a complement classifier. We simplify this approach by making a single model fulfill all three roles, leading to a more efficient training paradigm that is not susceptible to the common training instabilities that plague existing approaches. Further, we extend this paradigm to produce class-wise rationales while incorporating recent advances in parameterizing and regularizing the resulting rationales, thus leading to substantially improved and state-of-the-art alignment with human annotations without any explicit supervision. 2024.emnlp-main.664 @@ -9266,7 +9266,7 @@ IgorSterner IvanVulićUniversity of Cambridge and PolyAI Limited BenjaminMinixhoferUniversity of Cambridge - MarkusSchedlJohannes Kepler Universität Linz + MarkusSchedlJohannes Kepler Universität Linz 11908-11941 Segmenting text into sentences plays an early and crucial role in many NLP systems. This is commonly achieved by using rule-based or statistical methods relying on lexical features such as punctuation. Although some recent works no longer exclusively rely on punctuation, we find that no prior method achieves all of (i) robustness to missing punctuation, (ii) effective adaptability to new domains, and (iii) high efficiency. We introduce a new model — Segment any Text (SaT) — to solve this problem. To enhance robustness, we propose a new pretraining scheme that ensures less reliance on punctuation. To address adaptability, we introduce an extra stage of parameter-efficient fine-tuning, establishing state-of-the-art performance in distinct domains such as verses from lyrics and legal documents. Along the way, we introduce architectural modifications that result in a threefold gain in speed over the previous state of the art and solve spurious reliance on context far in the future. Finally, we introduce a variant of our model with fine-tuning on a diverse, multilingual mixture of sentence-segmented data, acting as a drop-in replacement and enhancement for existing segmentation tools. Overall, our contributions provide a universal approach for segmenting any text. Our method outperforms all baselines — including strong LLMs — across 8 corpora spanning diverse domains and languages, especially in practically relevant situations where text is poorly formatted. Our models and code, including documentation, are readily available at https://github.com/segment-any-text/wtpsplit under the MIT license. 
2024.emnlp-main.665


@@ -9275,9 +9275,9 @@


      Applying Contrastive Learning to Code Vulnerability Type Classification
-      ChenJi
+      ChenJi
      SuYang
-      HongyuSun
+      HongyuSun
      YuqingZhangUniversity of Chinese Academy of Sciences
      11942-11952
      Vulnerability classification is a crucial task in software security analysis, essential for identifying and mitigating potential security risks. Learning-based methods often perform poorly due to the long-tail distribution of vulnerability classification datasets. Recent approaches try to address the problem but treat each CWE class in isolation, ignoring their relationships. This results in non-scalable code vector representations, causing significant performance drops when handling complex real-world vulnerabilities. We propose a hierarchical contrastive learning framework for code vulnerability type classification to bring vector representations of related CWEs closer together. To address the issue of class collapse and enhance model robustness, we mix a self-supervised contrastive learning loss into our loss function. Additionally, we employ max-pooling to enable the model to handle longer vulnerability code inputs. Extensive experiments demonstrate that our proposed framework outperforms state-of-the-art methods by 2.97%-17.90% on accuracy and 0.98%-22.27% on weighted-F1, with even better performance on higher-quality datasets. We also utilize an ablation study to prove each component’s contribution. These findings underscore the potential and advantages of our approach in the multi-class vulnerability classification task.


@@ -9287,13 +9287,13 @@
      <fixed-case>T</fixed-case>heorem<fixed-case>L</fixed-case>lama: Transforming General-Purpose <fixed-case>LLM</fixed-case>s into Lean4 Experts
-      RuidaWangUniversity of Wisconsin - Madison
+      RuidaWangUniversity of Wisconsin - Madison
      JipengZhang
-      YizhenJiaUniversity of Illinois at Urbana-Champaign
-      RuiPanUniversity of Illinois at Urbana-Champaign
+      YizhenJiaUniversity of Illinois at Urbana-Champaign
+      RuiPanUniversity of Illinois at Urbana-Champaign
      ShizheDiaoHong Kong University of Science and Technology
      RenjiePi
-      TongZhangUIUC
+      TongZhangUIUC
      11953-11974
      Proving mathematical theorems using computer-verifiable formal languages like Lean significantly impacts mathematical reasoning. One approach to formal theorem proving involves generating complete proofs using Large Language Models (LLMs) based on Natural Language (NL) proofs. However, due to the scarcity of aligned NL and Formal Language (FL) theorem-proving data, most modern LLMs exhibit suboptimal performance. This scarcity results in a paucity of methodologies for training LLMs and techniques to fully utilize their capabilities in composing formal proofs. To address these challenges, this paper proposes **TheoremLlama**, an end-to-end framework that trains a general-purpose LLM to be a Lean4 expert. **TheoremLlama** includes an NL-FL dataset generation and bootstrapping method to obtain an aligned dataset, curriculum learning and block training techniques to train the model, and an iterative proof-writing method to write Lean4 proofs that work together synergistically. Using the dataset generation method in **TheoremLlama**, we provide *Open Bootstrapped Theorems* (OBT), an NL-FL aligned and bootstrapped dataset. Our novel NL-FL bootstrapping method, where NL proofs are integrated into Lean4 code for training datasets, leverages the NL reasoning ability of LLMs for formal reasoning.
The **TheoremLlama** framework achieves cumulative accuracies of 36.48% and 33.61% on MiniF2F-Valid and Test datasets respectively, surpassing the GPT-4 baseline of 22.95% and 25.41%. Our code, model checkpoints, and the generated dataset is published in GitHub 2024.emnlp-main.667 @@ -9302,15 +9302,15 @@ Multi-Level Cross-Modal Alignment for Speech Relation Extraction - LiangZhang + LiangZhang ZhenYang BiaoFu ZiyaoLuWeChat AI LiangyingShao ShiyuLiu - FandongMengWeChat AI, Tencent Inc. - JieZhou - XiaoliWangXiamen University + FandongMengWeChat AI, Tencent Inc. + JieZhou + XiaoliWangXiamen University JinsongSuXiamen University 11975-11986 Speech Relation Extraction (SpeechRE) aims to extract relation triplets from speech data. However, existing studies usually use synthetic speech to train and evaluate SpeechRE models, hindering the further development of SpeechRE due to the disparity between synthetic and real speech. Meanwhile, the modality gap issue, unexplored in SpeechRE, limits the performance of existing models. In this paper, we construct two real SpeechRE datasets to facilitate subsequent researches and propose a Multi-level Cross-modal Alignment Model (MCAM) for SpeechRE. Our model consists of three components: 1) a speech encoder, extracting speech features from the input speech; 2) an alignment adapter, mapping these speech features into a suitable semantic space for the text decoder; and 3) a text decoder, autoregressively generating relation triplets based on the speech features. During training, we first additionally introduce a text encoder to serve as a semantic bridge between the speech encoder and the text decoder, and then train the alignment adapter to align the output features of speech and text encoders at multiple levels. In this way, we can effectively train the alignment adapter to bridge the modality gap between the speech encoder and the text decoder. Experimental results and in-depth analysis on our datasets strongly demonstrate the efficacy of our method. @@ -9321,8 +9321,8 @@ Self-Training for Sample-Efficient Active Learning for Text Classification with Pre-Trained Language Models - ChristopherSchröderScaDS.AI and Leipzig University - GerhardHeyerUniversität Leipzig + ChristopherSchröderScaDS.AI and Leipzig University + GerhardHeyerUniversität Leipzig 11987-12004 Active learning is an iterative labeling process that is used to obtain a small labeled subset, despite the absence of labeled data, thereby enabling to train a model for supervised tasks such as text classification.While active learning has made considerable progress in recent years due to improvements provided by pre-trained language models, there is untapped potential in the often neglected unlabeled portion of the data, although it is available in considerably larger quantities than the usually small set of labeled data. In this work, we investigate how self-training, a semi-supervised approach that uses a model to obtain pseudo-labels for unlabeled data, can be used to improve the efficiency of active learning for text classification. Building on a comprehensive reproduction of four previous self-training approaches, some of which are evaluated for the first time in the context of active learning or natural language processing, we introduce HAST, a new and effective self-training strategy, which is evaluated on four text classification benchmarks. 
Our results show that it outperforms the reproduced self-training approaches and reaches classification results comparable to previous experiments for three out of four datasets, using as little as 25% of the data. The code is publicly available at https://github.com/chschroeder/self-training-for-sample-efficient-active-learning. 2024.emnlp-main.669 @@ -9331,8 +9331,8 @@ <fixed-case>PANDA</fixed-case>: Persona Attributes Navigation for Detecting and Alleviating Overuse Problem in Large Language Models - JinsungKim - SeonminKooKorea University + JinsungKim + SeonminKooKorea University HeuiseokLimKorea University 12005-12026 In the persona-grounded dialogue (PGD) task, it is required not only to respond fluently, but also to ground the attributes according to the current conversation topic properly. However, due to their tendency to overly ground given attributes, LLMs often generate unnatural responses provoked by using attributes that deviate from the flow of the conversation or by exploiting too many attributes at once. We term this phenomenon the *overuse* problem of LLMs. Unfortunately, research devising precise criteria and frameworks to quantitatively verify LLMs’ *overuse* problem is obviously insufficient. To address this issue, we propose **P**ersona **A**ttributes **N**avigation for **D**etecting and **A**lleviating the *overuse* problem (**PANDA**) framework. **PANDA** is the first study to quantify the persona *overuse* problem of LLMs by establishing clear standards of the problem and verifying various LLMs based on them. Moreover, this framework navigates us into understanding persona attributes by introducing diverse and detailed dialogue topics that consider practical conversation situations. We provide insights related to LLMs’ persona attribute *overuse* problem through comprehensive verification and analysis with **PANDA** in the PGD task. Our code and resources can be found at http://github.com/jin62304/PANDA. @@ -9343,7 +9343,7 @@ The Multilingual Alignment Prism: Aligning Global and Local Preferences to Reduce Harm Aakanksha - ArashAhmadian + ArashAhmadian BeyzaErmisCohere AI SeraphinaGoldfarb-Tarrant JuliaKreutzerCohere for AI @@ -9367,8 +9367,8 @@ Explicit, Implicit, and Scattered: Revisiting Event Extraction to Capture Complex Arguments - OmarSharifDartmouth College and Chittagong University of Engineering and Techology - JosephGattoDartmouth College + OmarSharifDartmouth College and Chittagong University of Engineering and Techology + JosephGattoDartmouth College MadhusudanBasakDartmouth College and Bangladesh University of Engineering and Technology Sarah MasudPreumDartmouth College 12061-12081 @@ -9379,9 +9379,9 @@ Let Me Teach You: Pedagogical Foundations of Feedback for Language Models - BeatrizBorgesSchool of Computer and Communication Sciences, EPFL - EPF Lausanne + BeatrizBorgesSchool of Computer and Communication Sciences, EPFL - EPF Lausanne NiketTandonAllen Institute for Artificial Intelligence - TanjaKäserSchool of Computer and Communication Sciences, EPFL - EPF Lausanne + TanjaKäserSchool of Computer and Communication Sciences, EPFL - EPF Lausanne AntoineBosselutSwiss Federal Institute of Technology Lausanne 12082-12104 Natural Language Feedback (NLF) is an increasingly popular mechanism for aligning Large Language Models (LLMs) to human preferences. Despite the diversity of the information it can convey, NLF methods are often hand-designed and arbitrary, with little systematic grounding. 
At the same time, research in learning sciences has long established several effective feedback models. In this opinion piece, we compile ideas from pedagogy to introduce FELT, a feedback framework for LLMs that outlines various characteristics of the feedback space, and a feedback content taxonomy based on these variables, providing a general mapping of the feedback space. In addition to streamlining NLF designs, FELT also brings out new, unexplored directions for research in NLF. We make our taxonomy available to the community, providing guides and examples for mapping our categorizations to future research. @@ -9392,8 +9392,8 @@ Unknown Claims: Generation of Fact-Checking Training Examples from Unstructured and Structured Data Jean-FlavienBussotti - LucaRagazzi - GiacomoFrisoni + LucaRagazzi + GiacomoFrisoni GianlucaMoroDISI - University of Bologna PaoloPapottiEurecom 12105-12122 @@ -9404,7 +9404,7 @@ <fixed-case>TL</fixed-case>-<fixed-case>CL</fixed-case>: Task And Language Incremental Continual Learning - ShreySatapara + ShreySatapara P. K.SrijithIndian Institute of Technology Hyderabad 12123-12142 This paper introduces and investigates the problem of Task and Language Incremental Continual Learning (TLCL), wherein a multilingual model is systematically updated to accommodate new tasks in previously learned languages or new languages for established tasks. This significant yet previously unexplored area holds substantial practical relevance as it mirrors the dynamic requirements of real-world applications. We benchmark a representative set of continual learning (CL) algorithms for TLCL. Furthermore, we propose Task and Language-Specific Adapters (TLSA), an adapter-based parameter-efficient fine-tuning strategy. TLSA facilitates cross-lingual and cross-task transfer and outperforms other parameter-efficient fine-tuning techniques. Crucially, TLSA reduces parameter growth stemming from saving adapters to linear complexity from polynomial complexity as it was with parameter isolation-based adapter tuning. We conducted experiments on several NLP tasks arising across several languages. We observed that TLSA outperforms all other parameter-efficient approaches without requiring access to historical data for replay. @@ -9419,7 +9419,7 @@ Daniel PJeongMachine Learning Department, Carnegie Mellon University SaurabhGarg Zachary ChaseLiptonCarnegie Mellon University - MichaelOberstCarnegie Mellon University and Johns Hopkins University + MichaelOberstCarnegie Mellon University and Johns Hopkins University 12143-12170 Several recent works seek to develop foundation models specifically for medical applications, adapting general-purpose large language models (LLMs) and vision-language models (VLMs) via continued pretraining on publicly available biomedical corpora. These works typically claim that such domain-adaptive pretraining (DAPT) improves performance on downstream medical tasks, such as answering medical licensing exam questions. In this paper, we compare seven public “medical” LLMs and two VLMs against their corresponding base models, arriving at a different conclusion: all medical VLMs and nearly all medical LLMs fail to consistently improve over their base models in the zero-/few-shot prompting regime for medical question-answering (QA) tasks. 
For instance, across the tasks and model pairs we consider in the 3-shot setting, medical LLMs only outperform their base models in 12.1% of cases, reach a (statistical) tie in 49.8% of cases, and are significantly worse than their base models in the remaining 38.2% of cases. Our conclusions are based on (i) comparing each medical model head-to-head, directly against the corresponding base model; (ii) optimizing the prompts for each model separately; and (iii) accounting for statistical uncertainty in comparisons. While these basic practices are not consistently adopted in the literature, our ablations show that they substantially impact conclusions. Our findings suggest that state-of-the-art general-domain models may already exhibit strong medical knowledge and reasoning capabilities, and offer recommendations to strengthen the conclusions of future studies. 2024.emnlp-main.677 @@ -9428,7 +9428,7 @@ Empowering Multi-step Reasoning across Languages via Program-Aided Language Models - LeonardoRanaldi + LeonardoRanaldi GiuliaPucci BarryHaddowUniversity of Edinburgh AlexandraBirchUniversity of Edinburgh @@ -9442,8 +9442,8 @@ Do <fixed-case>LLM</fixed-case>s Overcome Shortcut Learning? An Evaluation of Shortcut Challenges in Large Language Models YuYuan - LiliZhao - KaiZhang + LiliZhao + KaiZhang GuangtingZheng QiLiuUniversity of Science and Technology of China 12188-12200 @@ -9458,7 +9458,7 @@ NingWu JianhuiChang LinjunShou - JiaLiThe Hong Kong University of Science and Technology + JiaLiThe Hong Kong University of Science and Technology 12201-12217 Utilizing large language models (LLMs) for data augmentation has yielded encouraging results in mathematical reasoning. However, these approaches face constraints in problem diversity, potentially restricting them to in-domain/distribution data generation. To this end, we propose **ControlMath**, an iterative method involving an equation-generator module and two LLM-based agents. The module creates diverse equations, which the Problem-Crafter agent then transforms into math word problems. The Reverse-Agent filters and selects high-quality data, adhering to the “less is more” principle. This approach enables the generation of diverse math problems, not limited to specific domains or distributions. As a result, we collect ControlMathQA, which involves 190k math word problems. Extensive results prove that combining our dataset with in-domain datasets like GSM8K can help improve the model’s mathematical ability to generalize, leading to improved performance both within and beyond specific domains. 2024.emnlp-main.680 @@ -9478,10 +9478,10 @@ <fixed-case>R</fixed-case>ead<fixed-case>M</fixed-case>e++: Benchmarking Multilingual Language Models for Multi-Domain Readability Assessment - TarekNaous - Michael JRyanStanford University + TarekNaous + Michael JRyanStanford University AntonLavroukGeorgia Institute of Technology - MohitChandra + MohitChandra WeiXuGeorgia Institute of Technology 12230-12266 We present a comprehensive evaluation of large language models for multilingual readability assessment. Existing evaluation resources lack domain and language diversity, limiting the ability for cross-domain and cross-lingual analyses. This paper introduces ReadMe++, a multilingual multi-domain dataset with human annotations of 9757 sentences in Arabic, English, French, Hindi, and Russian, collected from 112 different data sources. This benchmark will encourage research on developing robust multilingual readability assessment methods. 
Using ReadMe++, we benchmark multilingual and monolingual language models in the supervised, unsupervised, and few-shot prompting settings. The domain and language diversity in ReadMe++ enable us to test more effective few-shot prompting, and identify shortcomings in state-of-the-art unsupervised methods. Our experiments also reveal exciting results of superior domain generalization and enhanced cross-lingual transfer capabilities by models trained on ReadMe++. We will make our data publicly available and release a python package tool for multilingual sentence readability prediction using our trained models at: https://github.com/tareknaous/readme @@ -9491,9 +9491,9 @@ <fixed-case>G</fixed-case>loss<fixed-case>LM</fixed-case>: A Massively Multilingual Corpus and Pretrained Model for Interlinear Glossed Text - MichaelGinnUniversity of Colorado at Boulder + MichaelGinnUniversity of Colorado at Boulder LindiaTjuatjaCMU, Carnegie Mellon University - TaiqiHe + TaiqiHe EnoraRiceUniversity of Colorado at Boulder GrahamNeubigCarnegie Mellon University AlexisPalmerUniversity of Colorado at Boulder @@ -9507,15 +9507,15 @@ <fixed-case>GDTB</fixed-case>: Genre Diverse Data for <fixed-case>E</fixed-case>nglish Shallow Discourse Parsing across Modalities, Text Types, and Domains - Yang JanetLiu - TatsuyaAoyama + Yang JanetLiu + TatsuyaAoyama WesleyScivetti YilunZhu ShabnamBehzad Lauren ElizabethLevineGeorgetown University - JessicaLinGeorgetown University + JessicaLinGeorgetown University DevikaTiwari - AmirZeldesGeorgetown University + AmirZeldesGeorgetown University 12287-12303 Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the existing UD English GUM corpus, for which discourse relation annotations in other frameworks already exist. In a series of experiments on cross-domain relation classification, we show that while our dataset is compatible with PDTB, substantial out-of-domain degradation is observed, which can be alleviated by joint training on both datasets. 2024.emnlp-main.684 @@ -9525,11 +9525,11 @@ <fixed-case>RA</fixed-case>2<fixed-case>FD</fixed-case>: Distilling Faithfulness into Efficient Dialogue Systems ZhiyuanZhu - YushengLiao + YushengLiao ChenxinXu YunfengGuanShanghai Jiaotong University - YanfengWangShanghai Jiao Tong University - YuWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University + YuWangShanghai Jiao Tong University 12304-12317 Generating faithful and fast responses is crucial in the knowledge-grounded dialogue. Retrieval Augmented Generation (RAG) strategies are effective but are inference inefficient, while previous Retrieval Free Generations (RFG) are more efficient but sacrifice faithfulness. To solve this faithfulness-efficiency trade-off dilemma, we propose a novel retrieval-free model training scheme named Retrieval Augmented to Retrieval Free Distillation (RA2FD) to build a retrieval-free model that achieves higher faithfulness than the previous RFG method while maintaining inference efficiency. The core idea of RA2FD is to use a teacher-student framework to distill the faithfulness capacity of a teacher, which is an oracle RAG model that generates multiple knowledge-infused responses. 
The student retrieval-free model learns how to generate faithful responses from these teacher labels through sequence-level distillation and contrastive learning. Experiment results show that RA2FD let the faithfulness performance of an RFG model surpass the previous SOTA RFG baseline on three knowledge-grounded dialogue datasets by an average of 33% and even matching an RAG model’s performance while significantly improving inference efficiency. Our code is available at https://github.com/zzysjtuiwct/RA2FD. 2024.emnlp-main.685 @@ -9563,12 +9563,12 @@ Leveraging Estimated Transferability Over Human Intuition for Model Selection in Text Ranking - JunBai + JunBai ZhuofanChenBeijing University of Aeronautics and Astronautics ZhenziLi HanhuaHong JianfeiZhangBeihang University - ChenLiBeijing University of Aeronautics and Astronautics + ChenLiBeijing University of Aeronautics and Astronautics ChenghuaLinUniversity of Manchester WengeRongBeihang University 12356-12374 @@ -9618,9 +9618,9 @@ Latent Concept-based Explanation of <fixed-case>NLP</fixed-case> Models - XueminYuDalhousie University + XueminYuDalhousie University FahimDalviHamad Bin Khalifa University - NadirDurraniQatar Computing Research Institute + NadirDurraniQatar Computing Research Institute MarziaNouriSharif University of Technology HassanSajjadDalhousie University 12435-12459 @@ -9643,8 +9643,8 @@ Enhancing Data Quality through Simple De-duplication: Navigating Responsible Computational Social Science Research YidaMu - MaliJinUniversity of Sheffield - XingyiSongUniversity of Sheffield + MaliJinUniversity of Sheffield + XingyiSongUniversity of Sheffield NikolaosAletrasUniversity of Sheffield, University of Sheffield and Amazon 12477-12492 Research in natural language processing (NLP) for Computational Social Science (CSS) heavily relies on data from social media platforms. This data plays a crucial role in the development of models for analysing socio-linguistic phenomena within online communities. In this work, we conduct an in-depth examination of 20 datasets extensively used in NLP for CSS to comprehensively examine data quality. Our analysis reveals that social media datasets exhibit varying levels of data duplication. Consequently, this gives rise to challenges like label inconsistencies and data leakage, compromising the reliability of models. Our findings also suggest that data duplication has an impact on the current claims of state-of-the-art performance, potentially leading to an overestimation of model effectiveness in real-world scenarios. Finally, we propose new protocols and best practices for improving dataset development from social media data and its usage. @@ -9677,7 +9677,7 @@ Neeko: Leveraging Dynamic <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> for Efficient Multi-Character Role-Playing Agent XiaoyanYuBeijing Institute of Technology TongxuLuo - YifanWei + YifanWei FangyuLei YimingHuang HaoPeng @@ -9691,7 +9691,7 @@ <fixed-case>SLANG</fixed-case>: New Concept Comprehension of Large Language Models LingruiMei - ShenghuaLiu + ShenghuaLiu YiweiWangUniversity of California, Merced BaolongBi XueqiCheng, Chinese Academy of Sciences @@ -9716,7 +9716,7 @@ Why Does New Knowledge Create Messy Ripple Effects in <fixed-case>LLM</fixed-case>s? 
 JiaxinQin
 ZixuanZhang
-ChiHan
+ChiHan
 PengfeiYuBoson AI and University of Illinois at Urbana-Champaign
 ManlingLiNorthwestern University
 HengJiUniversity of Illinois, Urbana-Champaign
@@ -9730,7 +9730,7 @@
 Lifelong Event Detection via Optimal Transport
 VietDao
 Van-CuongPham
-QuyenTranVinAI Research
+QuyenTranVinAI Research
 Thanh-ThienLe
 Linh VanNgoHanoi University of Science and Technology
 Thien HuuNguyen, University of Oregon
@@ -9745,7 +9745,7 @@
 <fixed-case>SUPER</fixed-case>: Evaluating Agents on Setting Up and Executing Tasks from Research Repositories
 BenBoginAllen Institute for Artificial Intelligence
 KejuanYang
-ShashankGuptaAllen Institute for Artificial Intelligence
+ShashankGuptaAllen Institute for Artificial Intelligence
 KyleRichardsonAllen Institute for Artificial Intelligence
 ErinBransomAllen Institute for Artificial Intelligence
 PeterClarkAllen Institute for Artificial Intelligence
@@ -9760,14 +9760,14 @@
 <fixed-case>FIRST</fixed-case>: Teach A Reliable Large Language Model Through Efficient Trustworthy Distillation
-KaShunShum
+KaShunShum
 MinruiXu
 JianshuZhang
 ZixinChen
 ShizheDiaoHong Kong University of Science and Technology
 HanzeDongSalesForce
 JipengZhang
-Muhammad OmerRazaPurdue University
+Muhammad OmerRazaPurdue University
 12646-12659
 Large language models (LLMs) have become increasingly prevalent in our daily lives, leading to an expectation for LLMs to be trustworthy — both accurate and well-calibrated (the prediction confidence should align with its ground truth correctness likelihood). Nowadays, fine-tuning has become the most popular method for adapting a model to practical usage by significantly increasing accuracy on downstream tasks. Despite the great accuracy it achieves, we found fine-tuning is still far away from satisfactory trustworthiness due to “tuning-induced mis-calibration”. In this paper, we delve deeply into why and how mis-calibration exists in fine-tuned models, and how distillation can alleviate the issue. Then we further propose a brand new method named Efficient Trustworthy Distillation (FIRST), which utilizes a small portion of teacher’s knowledge to obtain a reliable language model in a cost-efficient way. Specifically, we identify the “concentrated knowledge” phenomenon during distillation, which can significantly reduce the computational burden. Then we apply a “trustworthy maximization” process to optimize the utilization of this small portion of concentrated knowledge before transferring it to the student. Experimental results demonstrate the effectiveness of our method, where better accuracy (+2.3%) and less mis-calibration (-10%) are achieved on average across both in-domain and out-of-domain scenarios, indicating better trustworthiness.
 2024.emnlp-main.703
@@ -9810,7 +9810,7 @@
 “Global is Good, Local is Bad?”: Understanding Brand Bias in <fixed-case>LLM</fixed-case>s
 MahammedKamruzzamanUniversity of South Florida
 Hieu MinhNguyen
-Gene LouisKimUniversity of South Florida
+Gene LouisKimUniversity of South Florida
 12695-12702
 Many recent studies have investigated social biases in LLMs but brand bias has received little attention. This research examines the biases exhibited by LLMs towards different brands, a significant concern given the widespread use of LLMs in affected use cases such as product recommendation and market analysis. Biased models may perpetuate societal inequalities, unfairly favoring established global brands while marginalizing local ones. Using a curated dataset across four brand categories, we probe the behavior of LLMs in this space. We find a consistent pattern of bias in this space—both in terms of disproportionately associating global brands with positive attributes and disproportionately recommending luxury gifts for individuals in high-income countries. We also find LLMs are subject to country-of-origin effects which may boost local brand preference in LLM outputs in specific contexts.
 2024.emnlp-main.707
@@ -9835,7 +9835,7 @@
 RyanSheaColumbia University
 AymenKallala
 Xin LucyLiuColumbia University
-Michael W.MorrisColumbia University
+Michael W.MorrisColumbia University
 ZhouYuColumbia University
 12720-12749
 The growing prominence of LLMs has led to an increase in the development of AI tutoring systems. These systems are crucial in providing underrepresented populations with improved access to valuable education. One important area of education that is unavailable to many learners is strategic bargaining related to negotiation. To address this, we develop a LLM-based Assistant for Coaching nEgotiation (ACE). ACE not only serves as a negotiation partner for users but also provides them with targeted feedback for improvement. To build our system, we collect a dataset of negotiation transcripts between MBA students. These transcripts come from trained negotiators and emulate realistic bargaining scenarios. We use the dataset, along with expert consultations, to design an annotation scheme for detecting negotiation mistakes. ACE employs this scheme to identify mistakes and provide targeted feedback to users. To test the effectiveness of ACE-generated feedback, we conducted a user experiment with two consecutive trials of negotiation and found that it improves negotiation performances significantly compared to a system that doesn’t provide feedback and one which uses an alternative method of providing feedback.
@@ -9851,12 +9851,12 @@
 CaishuangHuang
 YilongWu
 ShichunLiu
-HuiyuanZheng
+HuiyuanZheng
 YuruiDong
 YujiongShen
 ShihanDou
 JunZhao
-JunjieYe
+JunjieYe
 QiZhangFudan University
 TaoGuiFudan University
 XuanjingHuangFudan University
@@ -9869,13 +9869,13 @@
 <fixed-case>PATIENT</fixed-case>-<tex-math>\psi</tex-math>: Using Large Language Models to Simulate Patients for Training Mental Health Professionals
 RuiyiWangUniversity of California, San Diego
-StephanieMilaniCarnegie Mellon University
+StephanieMilaniCarnegie Mellon University
 Jamie C.ChiuPrinceton University
 JiayinZhi
-Shaun M.EackUniversity of Pittsburgh
+Shaun M.EackUniversity of Pittsburgh
 TravisLabrum
 Samuel MMurphy
-NevJonesUniversity of Pittsburgh
+NevJonesUniversity of Pittsburgh
 Kate VHardyStanford University
 HongShenCarnegie Mellon University
 FeiFangCarnegie Mellon University
@@ -9890,7 +9890,7 @@
 <fixed-case>DKEC</fixed-case>: Domain Knowledge Enhanced Multi-Label Classification for Diagnosis Prediction
 XuerenGeUniversity of Virginia, Charlottesville
 AbhishekSatpathy
-Ronald DeanWilliamsUniversity of Virginia, Charlottesville
+Ronald DeanWilliamsUniversity of Virginia, Charlottesville
 JohnStankovicUniversity of Virginia, Charlottesville
 HomaAlemzadeh
 12798-12813
@@ -9901,12 +9901,12 @@
 <tex-math>\texttt{ModSCAN}</tex-math>: Measuring Stereotypical Bias in Large Vision-Language Models from Vision and Language Modalities
-YukunJiangCISPA
+YukunJiangCISPA
-ZhengLiCISPA Helmholtz Center for Information Security
+ZhengLiCISPA Helmholtz Center for Information Security
 XinyueShenCISPA Helmholtz Center for Information Security
-YugengLiuCISPA Helmholtz Center for Information Security
+YugengLiuCISPA Helmholtz Center for Information Security
 MichaelBackesCISPA Helmholtz Center for Information Security
-YangZhangCISPA Helmholtz Center for Information Security
+YangZhangCISPA Helmholtz Center for Information Security
 12814-12845
 2024.emnlp-main.713
 jiang-etal-2024-modscan
 Large Language Models Can Self-Correct with Key Condition Verification
-ZhenyuWuUniversity of Notre Dame and Xi’an Jiaotong University
+ZhenyuWuUniversity of Notre Dame and Xi’an Jiaotong University
-QingkaiZengUniversity of Notre Dame
+QingkaiZengUniversity of Notre Dame
 ZhihanZhang
-ZhaoxuanTanUniversity of Notre Dame
+ZhaoxuanTanUniversity of Notre Dame
 ChaoShenXi’an Jiaotong University
-MengJiangUniversity of Notre Dame
+MengJiangUniversity of Notre Dame
 12846-12867
 Intrinsic self-correction is a method that instructs large language models (LLMs) to verify and correct their responses without external feedback. Unfortunately, prior work concluded that LLMs could not yet self-correct reasoning. We find that a simple yet effective prompting method enhances LLM performance in identifying and correcting inaccurate answers without external feedback. That is to mask a key condition in the question, add the current response to construct a verification question, and predict the condition to verify the response. The condition can be an entity in an open-domain question or a numerical value in an arithmetic question, which requires minimal effort (via prompting) to identify. We propose an iterative verify-then-correct framework to progressively identify and correct (probably) false responses, named ProCo. We conduct experiments on three reasoning tasks. On average, ProCo, with GPT-3.5-Turbo-1106 as the backend LLM, yields +6.8 exact match on four open-domain question answering datasets, +14.1 accuracy on three arithmetic reasoning datasets, and +9.6 accuracy on a commonsense reasoning dataset, compared to Self-Correct. Our implementation is made publicly available at https://wzy6642.github.io/proco.github.io/.
 2024.emnlp-main.714
@@ -9942,7 +9942,7 @@
 Defending Against Social Engineering Attacks in the Age of <fixed-case>LLM</fixed-case>s
 LinAi
 Tharindu SandaruwanKumarageArizona State University
-AmritaBhattacharjeeArizona State University
+AmritaBhattacharjeeArizona State University
 ZizhouLiu
 ZhengHuiMicrosoft and Columbia University
 Michael S.DavinroyAptima, Inc.
 MatthiasKirchnerKitware
 ArslanBasharatKitware, Inc.
 AnthonyHoogsKitware, Inc.
-JoshuaGarlandArizona State University
+JoshuaGarlandArizona State University
-HuanLiuArizona State University
+HuanLiuArizona State University
 JuliaHirschbergColumbia University
 12880-12902
 2024.emnlp-main.716
@@ -9979,7 +9979,7 @@
 Make Some Noise: Unlocking Language Model Parallel Inference Capability through Noisy Training
 YixuanWang
 XianzhenLuoHarbin Institute of Technology
-FuxuanWei
+FuxuanWei
 YijunLiu
 QingfuZhuHarbin Institute of Technology
 XuanyuZhang
@@ -9997,7 +9997,7 @@
 ErnieChangMeta AI
 Pin-JieLin
 YangLiIowa State University
-ChangshengZhaoMeta Inc.
+ChangshengZhaoMeta Inc.
 DaeilKimFacebook
 RastislavRabatinFacebook
 ZechunLiuMeta Inc.
@@ -10018,7 +10018,7 @@
 HyosangAhn
 SreyaMuppallaUniversity of California, Los Angeles
 Kai-WeiChangUniversity of California, Los Angeles
-WeiWangUniversity of California, Los Angeles
+WeiWangUniversity of California, Los Angeles
 NanyunPengUniversity of California, Los Angeles
 12936-12965
 Social media is often the first place where communities discuss the latest societal trends. Prior works have utilized this platform to extract epidemic-related information (e.g. infections, preventive measures) to provide early warnings for epidemic prediction. However, these works only focused on English posts, while epidemics can occur anywhere in the world, and early discussions are often in the local, non-English languages. In this work, we introduce the first multilingual Event Extraction (EE) framework SPEED++ for extracting epidemic event information for any disease and language. To this end, we extend a previous epidemic ontology with 20 argument roles; and curate our multilingual EE dataset SPEED++ comprising 5.1K tweets in four languages for four diseases. Annotating data in every language is infeasible; thus we develop zero-shot cross-lingual cross-disease models (i.e., training only on English COVID data) utilizing multilingual pre-training and show their efficacy in extracting epidemic-related events for 65 diverse languages across different diseases. Experiments demonstrate that our framework can provide epidemic warnings for COVID-19 in its earliest stages in Dec 2019 (3 weeks before global discussions) from Chinese Weibo posts without any training in Chinese. Furthermore, we exploit our framework’s argument extraction capabilities to aggregate community epidemic discussions like symptoms and cure measures, aiding misinformation detection and public attention monitoring. Overall, we lay a strong foundation for multilingual epidemic preparedness.
@@ -10056,10 +10056,10 @@
 Story Morals: Surfacing value-driven narrative schemas using large language models
-David GHobsonMcGill University, McGill University
+David GHobsonMcGill University, McGill University
 HaiqiZhou
 DerekRuthsMcGill University
-AndrewPiperMcGill University
+AndrewPiperMcGill University
 12998-13032
 Stories are not only designed to entertain but encode lessons reflecting their authors’ beliefs about the world. In this paper, we propose a new task of narrative schema labelling based on the concept of “story morals” to identify the values and lessons conveyed in stories. Using large language models (LLMs) such as GPT-4, we develop methods to automatically extract and validate story morals across a diverse set of narrative genres, including folktales, novels, movies and TV, personal stories from social media and the news. Our approach involves a multi-step prompting sequence to derive morals and validate them through both automated metrics and human assessments. The findings suggest that LLMs can effectively approximate human story moral interpretations and offer a new avenue for computational narrative understanding. By clustering the extracted morals on a sample dataset of folktales from around the world, we highlight the commonalities and distinctiveness of narrative values, providing preliminary insights into the distribution of values across cultures. This work opens up new possibilities for studying narrative schemas and their role in shaping human beliefs and behaviors.
 2024.emnlp-main.723
@@ -10076,7 +10076,7 @@
 JayneBottarini
 PeichenLiu
 EricRiceUniversity of Southern California
-SwabhaSwayamdiptaUniversity of Southern California
+SwabhaSwayamdiptaUniversity of Southern California
 13033-13059
 Warning: Contents of this paper may be upsetting. Public attitudes towards key societal issues, expressed on online media, are of immense value in policy and reform efforts, yet challenging to understand at scale. We study one such social issue: homelessness in the U.S., by leveraging the remarkable capabilities of large language models to assist social work experts in analyzing millions of posts from Twitter. We introduce a framing typology: Online Attitudes Towards Homelessness (OATH) Frames: nine hierarchical frames capturing critiques, responses and perceptions. We release annotations with varying degrees of assistance from language models, with immense benefits in scaling: 6.5× speedup in annotation time while only incurring a 3 point F1 reduction in performance with respect to the domain experts. Our experiments demonstrate the value of modeling OATH-Frames over existing sentiment and toxicity classifiers. Our large-scale analysis with predicted OATH-Frames on 2.4M posts on homelessness reveal key trends in attitudes across states, time periods and vulnerable populations, enabling new insights on the issue. Our work provides a general framework to understand nuanced public attitudes at scale, on issues beyond homelessness.
 2024.emnlp-main.724
@@ -10090,11 +10090,11 @@
 XiaoYeArizona State University
 AndrewWangDepartment of Computer Science, Whiting School of Engineering
 JacobChoi
-YiningLuUniversity of Notre Dame
+YiningLuUniversity of Notre Dame
-ShreyaSharma
+ShreyaSharma
 LingfengShenByteDance Inc.
 Vijay MurariTiyyala
-NicholasAndrewsJohns Hopkins University
+NicholasAndrewsJohns Hopkins University
 DanielKhashabiJohns Hopkins University
 13060-13082
 Humans regularly engage in analogical thinking, relating personal experiences to current situations (X is analogous to Y because of Z). Analogical thinking allows humans to solve problems in creative ways, grasp difficult concepts, and articulate ideas more effectively. Can language models (LMs) do the same? To answer this question, we propose AnaloBench, a benchmark to determine analogical reasoning ability in LMs. Our benchmarking approach focuses on aspects of this ability that are common among humans: (i) recalling related experiences from a large amount of information, and (ii) applying analogical reasoning to complex and lengthy scenarios. We collect a set of 340 high quality, human written analogies for use in our benchmark, which constitutes the largest such collection to date. We then test a broad collection of models consisting of 12 open source and 3 proprietary in various sizes and architectures. As in prior results, scaling up LMs results in some performance boosts. Surprisingly, scale offers minimal gains when, (i) analogies involve lengthy scenarios, or (ii) recalling relevant scenarios from a large pool of information, a process analogous to finding a needle in a haystack. We hope these observations encourage further research in this field.
@@ -10106,7 +10106,7 @@
 <fixed-case>S</fixed-case>ci<fixed-case>ER</fixed-case>: An Entity and Relation Extraction Dataset for Datasets, Methods, and Tasks in Scientific Documents
 QiZhang
 ZhijiaChenFacebook
-HuitongPanTemple University
+HuitongPanTemple University
 CorneliaCarageaUniversity of Illinois, Chicago
 Longin JanLateckiTemple University
 EduardDragutTemple University
@@ -10119,10 +10119,10 @@
 Analysis of Plan-based Retrieval for Grounded Text Generation
 AmeyaGodboleUniversity of Southern California
-NicholasMonathGoogle
+NicholasMonathGoogle
 SeungyeonKimGoogle DeepMind
 Ankit SinghRawatGoogle
-AndrewMcCallumUniversity of Massachusetts Amherst
+AndrewMcCallumUniversity of Massachusetts Amherst
 ManzilZaheerZaheer and DeepMind
 13101-13119
 In text generation, hallucinations refer to the generation of seemingly coherent text that contradicts established knowledge. One compelling hypothesis is that hallucinations occur when a language model is given a generation task outside its parametric knowledge (due to rarity, recency, domain, etc.). A common strategy to address this limitation is to infuse the language models with retrieval mechanisms, providing the model with relevant knowledge for the task. In this paper, we leverage the planning capabilities of instruction-tuned LLMs and analyze how planning can be used to guide retrieval to further reduce the frequency of hallucinations. We empirically evaluate several variations of our proposed approach on long-form text generation tasks. By improving the coverage of relevant facts, plan-guided retrieval and generation can produce more informative responses while providing a higher rate of attribution to source documents.
@@ -10144,7 +10144,7 @@
 <fixed-case>RLHF</fixed-case> Can Speak Many Languages: Unlocking Multilingual Preference Optimization for <fixed-case>LLM</fixed-case>s
 JohnDangCohere
-ArashAhmadian
+ArashAhmadian
 KellyMarchisioCohere and Cohere
 JuliaKreutzerCohere for AI
 AhmetÜstünCohere For AI
@@ -10157,7 +10157,7 @@
 Boosting Logical Fallacy Reasoning in <fixed-case>LLM</fixed-case>s via Logical Structure Tree
-YuanyuanLei
+YuanyuanLei
 RuihongHuangTexas A&M University
 13157-13173
 Logical fallacy uses invalid or faulty reasoning in the construction of a statement. Despite the prevalence and harmfulness of logical fallacies, detecting and classifying logical fallacies still remains a challenging task. We observe that logical fallacies often use connective words to indicate an intended logical relation between two arguments, while the argument semantics does not actually support the logical relation. Inspired by this observation, we propose to build a logical structure tree to explicitly represent and track the hierarchical logic flow among relation connectives and their arguments in a statement. Specifically, this logical structure tree is constructed in an unsupervised manner guided by the constituency tree and a taxonomy of connectives for ten common logical relations, with relation connectives as non-terminal nodes and textual arguments as terminal nodes, and the latter are mostly elementary discourse units. We further develop two strategies to incorporate the logical structure tree into LLMs for fallacy reasoning. Firstly, we transform the tree into natural language descriptions and feed the textualized tree into LLMs as a part of the hard text prompt. Secondly, we derive a relation-aware tree embedding and insert the tree embedding into LLMs as a soft prompt. Experiments on benchmark datasets demonstrate that our approach based on logical structure tree significantly improves precision and recall for both fallacy detection and fallacy classification.
@@ -10172,7 +10172,7 @@
 ErwanFagnouUniversité Paris-Dauphine - PSL
 PaulCaillon, Université Paris-Dauphine (Paris IX)
 BlaiseDelattre
-AlexandreAllauzenEcole supérieure de physique et chimie and Univeristé Paris-Dauphine
+AlexandreAllauzenEcole supérieure de physique et chimie and Univeristé Paris-Dauphine
 13174-13188
 This paper investigates the limitations of transformers for entity-tracking tasks in large language models. We identify a theoretical constraint, showing that transformers require at least \log_2 (n+1) layers to handle entity tracking with n state changes. To address this issue, we propose an efficient and frugal enhancement to the standard attention mechanism, enabling it to manage long-term dependencies more efficiently. By considering attention as an adjacency matrix, our model can track entity states with a single layer. Empirical results demonstrate significant improvements in entity tracking datasets while keeping competitive performance on standard natural language modeling. Our modified attention allows us to achieve the same performance with drastically fewer layers. Additionally, our enhanced mechanism reveals structured internal representations of attention. Extensive experiments on both toy and complex datasets validate our approach. Our contributions include theoretical insights, an improved attention mechanism, and empirical validation.
2024.emnlp-main.731 @@ -10182,7 +10182,7 @@ <fixed-case>BEEAR</fixed-case>: Embedding-based Adversarial Removal of Safety Backdoors in Instruction-tuned Language Models - YiZeng + YiZeng WeiyuSun TranHuynhVirginia Polytechnic Institute and State University DawnSongUniversity of California Berkeley @@ -10196,8 +10196,8 @@ A <fixed-case>B</fixed-case>ayesian Approach to Harnessing the Power of <fixed-case>LLM</fixed-case>s in Authorship Attribution - ZhengmianHuAdobe Systems - TongZheng + ZhengmianHuAdobe Systems + TongZheng HengHuangUniversity of Maryland, College Park 13216-13227 Authorship attribution aims to identify the origin or author of a document. Traditional approaches have heavily relied on manual features and fail to capture long-range correlations, limiting their effectiveness. Recent advancements leverage text embeddings from pre-trained language models, which require significant fine-tuning on labeled data, posing challenges in data dependency and limited interpretability. Large Language Models (LLMs), with their deep reasoning capabilities and ability to maintain long-range textual associations, offer a promising alternative. This study explores the potential of pre-trained LLMs in one-shot authorship attribution, specifically utilizing Bayesian approaches and probability outputs of LLMs. Our methodology calculates the probability that a text entails previous writings of an author, reflecting a more nuanced understanding of authorship. By utilizing only pre-trained models such as Llama-3-70B, our results on the IMDb and blog datasets show an impressive 85% accuracy in one-shot authorship classification across ten authors. Our findings set new baselines for one-shot authorship analysis using LLMs and expand the application scope of these models in forensic linguistics. This work also includes extensive ablation studies to validate our approach. @@ -10210,8 +10210,8 @@ <fixed-case>FAC</fixed-case><tex-math>^2</tex-math><fixed-case>E</fixed-case>: Better Understanding Large Language Model Capabilities by Dissociating Language and Cognition XiaoqiangWang LingfeiWuAnytime AI and Pinterest - TengfeiMaState University of New York at Stony Brook - BangLiuUniversity of Montreal + TengfeiMaState University of New York at Stony Brook + BangLiuUniversity of Montreal 13228-13243 Large language models (LLMs) are primarily evaluated by overall performance on various text understanding and generation tasks. However, such a paradigm fails to comprehensively differentiate the fine-grained language and cognitive skills, rendering the lack of sufficient interpretation to LLMs’ capabilities. In this paper, we present FAC^2E, a framework for Fine-grAined and Cognition-grounded LLMs’ Capability Evaluation. Specifically, we formulate LLMs’ evaluation in a multi-dimensional and explainable manner by dissociating the language-related capabilities and the cognition-related ones. Besides, through extracting the intermediate reasoning from LLMs, we further break down the process of applying a specific capability into three sub-steps: recalling relevant knowledge, utilizing knowledge, and solving problems. Finally, FAC^2E evaluates each sub-step of each fine-grained capability, providing a two-faceted diagnosis for LLMs. Utilizing FAC^2E, we identify a common shortfall in knowledge utilization among models and propose a straightforward, knowledge-enhanced method to mitigate this issue. 
Our results not only showcase promising performance enhancements but also highlight a direction for future LLM advancements. 2024.emnlp-main.734 @@ -10232,9 +10232,9 @@ Language Concept Erasure for Language-invariant Dense Retrieval ZhiqiHuang - PuxuanYu + PuxuanYu ShauliRavfogel - JamesAllanUniversity of Massachusetts, Amherst + JamesAllanUniversity of Massachusetts, Amherst 13261-13273 Multilingual models aim for language-invariant representations but still prominently encode language identity. This, along with the scarcity of high-quality parallel retrieval data, limits their performance in retrieval. We introduce LANCER, a multi-task learning framework that improves language-invariant dense retrieval by reducing language-specific signals in the embedding space. Leveraging the notion of linear concept erasure, we design a loss function that penalizes cross-correlation between representations and their language labels. LANCER leverages only English retrieval data and general multilingual corpora, training models to focus on language-invariant retrieval by semantic similarity without necessitating a vast parallel corpus. Experimental results on various datasets show our method consistently improves over baselines, with extensive analyses demonstrating greater language agnosticism. 2024.emnlp-main.736 @@ -10248,8 +10248,8 @@ HanlinZhuElectrical Engineering & Computer Science Department, University of California Berkeley XiaomengYangGoogle DeepMind AndrewCohen - LeiLiSchool of Computer Science, Carnegie Mellon University - YuandongTianMeta AI (FAIR) + LeiLiSchool of Computer Science, Carnegie Mellon University + YuandongTianMeta AI (FAIR) 13274-13292 Recent research has increasingly focused on evaluating large language models’ (LLMs) alignment with diverse human values and preferences, particularly for open-ended tasks like story generation. Traditional evaluation metrics rely heavily on lexical similarity with human-written references, often showing poor correlation with human judgments and failing to account for alignment with the diversity of human preferences. To address these challenges, we introduce PerSE, an interpretable evaluation framework designed to assess alignment with specific human preferences. It is tuned to infer specific preferences from an in-context personal profile and evaluate the alignment between the generated content and personal preferences. PerSE enhances interpretability by providing detailed comments and fine-grained scoring, facilitating more personalized content generation. Our 13B LLaMA-2-based PerSE shows a 15.8% increase in Kendall correlation and a 13.7% rise in accuracy with zero-shot reviewers compared to GPT-4. It also outperforms GPT-4 by 46.01% in Kendall correlation on new domains, indicating its transferability 2024.emnlp-main.737 @@ -10260,7 +10260,7 @@ Large Language Models Are Involuntary Truth-Tellers: Exploiting Fallacy Failure for Jailbreak Attacks YueZhouUniversity of Illinois at Chicago Henry PengZouUniversity of Illinois at Chicago - BarbaraDi EugenioUniversity of Illinois, Chicago + BarbaraDi EugenioUniversity of Illinois, Chicago YangZhang 13293-13304 We find that language models have difficulties generating fallacious and deceptive reasoning. When asked to generate deceptive outputs, language models tend to leak honest counterparts but believe them to be false. Exploiting this deficiency, we propose a jailbreak attack method that elicits an aligned language model for malicious output. 
Specifically, we query the model to generate a fallacious yet deceptively real procedure for the harmful behavior. Since a fallacious procedure is generally considered fake and thus harmless by LLMs, it helps bypass the safeguard mechanism. Yet the output is factually harmful since the LLM cannot fabricate fallacious solutions but proposes truthful ones. We evaluate our approach over five safety-aligned large language models, comparing four previous jailbreak methods, and show that our approach achieves competitive performance with more harmful outputs. We believe the findings could be extended beyond model safety, such as self-verification and hallucination. @@ -10279,7 +10279,7 @@ TianxiangSun HangYanAI lab DahuaLinThe Chinese University of Hong Kong - XipengQiuFudan University + XipengQiuFudan University 13305-13320 Sparse Mixture of Experts (MoE) models are popular for training large language models due to their computational efficiency. However, the commonly used top-k routing mechanism suffers from redundancy computation and memory costs due to the unbalanced routing. Some experts are overflow, where the exceeding tokens are dropped. While some experts are empty, which are padded with zeros, negatively impacting model performance. To address the dropped tokens and padding, we propose the Rectify-Router, comprising the Intra-GPU Rectification and the Fill-in Rectification. The Intra-GPU Rectification handles dropped tokens, efficiently routing them to experts within the GPU where they are located to avoid inter-GPU communication. The Fill-in Rectification addresses padding by replacing padding tokens with the tokens that have high routing scores. Our experimental results demonstrate that the Intra-GPU Rectification and the Fill-in Rectification effectively handle dropped tokens and padding, respectively. Furthermore, the combination of them achieves superior performance, surpassing the accuracy of the vanilla top-1 router by 4.7%. 2024.emnlp-main.739 @@ -10288,8 +10288,8 @@ Null-Shot Prompting: Rethinking Prompting Large Language Models With Hallucination - PittawatTaveekitworachaiRitsumeikan University - FebriAbdullahRitsumeikan University + PittawatTaveekitworachaiRitsumeikan University + FebriAbdullahRitsumeikan University RuckThawonmasRitsumeikan University 13321-13361 This paper presents a series of investigations into an interesting phenomenon where we observe performance increases in large language models (LLMs) when providing a prompt that causes and exploits hallucination. We propose null-shot prompting, a counter-intuitive approach where we intentionally instruct LLMs to look at and utilize information from a null section. We investigate null-shot prompting on a wide range of tasks, including arithmetic reasoning, commonsense reasoning, and reading comprehension. We observe a substantial increase in performance in arithmetic reasoning tasks for various models, with up to a 44.62% increase compared to a baseline in one model. Therefore, we investigate deeper into this task by utilizing a more challenging mathematics problem-solving benchmark. We observe that LLMs benefit from hallucination in null-shot prompting in this task and discuss the mathematical topics that benefit the most from introducing hallucination in the prompt. We continue our investigation by evaluating hallucination detection abilities of the LLMs when using null-shot prompting. We find surprising results where hallucination in prompts can improve hallucination detection abilities of many LLMs. 
We also examine the effects of introducing both reasoning, which is known to mitigate hallucination, and hallucination simultaneously in the prompt and observe another surprising turn for the mathematics problem-solving benchmark with many performance improvements. We hope this paper will spark more interest, investigations, and discussions on how hallucination in prompts LLMs and even bolsters them in certain cases. @@ -10302,7 +10302,7 @@ <fixed-case>C</fixed-case>omm<fixed-case>VQA</fixed-case>: Situating Visual Question Answering in Communicative Contexts Nandita ShankarNaikStanford University - ChristopherPottsStanford University + ChristopherPottsStanford University ElisaKreissUniversity of California, Los Angeles 13362-13377 Current visual question answering (VQA) models tend to be trained and evaluated on image-question pairs in isolation. However, the questions people ask are dependent on their informational needs and prior knowledge about the image content. To evaluate how situating images within naturalistic contexts shapes visual questions, we introduce CommVQA, a VQA dataset consisting of images, image descriptions, real-world communicative scenarios where the image might appear (e.g., a travel website), and follow-up questions and answers conditioned on the scenario and description. CommVQA, which contains 1000 images and 8,949 question-answer pairs, poses a challenge for current models. Error analyses and a human-subjects study suggest that generated answers still contain high rates of hallucinations, fail to fittingly address unanswerable questions, and don’t suitably reflect contextual information. @@ -10317,10 +10317,10 @@ XuHanTsinghua University, Tsinghua University WangXu ChaojunXiao - XinrongZhangTsinghua University + XinrongZhangTsinghua University YeweiFang KaihuoZhang - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University MaosongSun 13378-13393 Speculative decoding is a widely used method that accelerates the generation process of large language models (LLMs) with no compromise in model performance. It achieves this goal by using an existing smaller model for drafting and then employing the target LLM to verify the draft in a low-cost parallel manner. Under such a drafting-verification framework, drafting efficiency has become a bottleneck in the final speedup of speculative decoding. Therefore, generating longer drafts at less cost can lead to better decoding speedup. To achieve this, we introduce Ouroboros, which can generate draft phrases to parallelize the drafting process and meanwhile lengthen drafts in a training-free manner. The experimental results on various typical text generation tasks show that Ouroboros can achieve speedups of up to 2.4\times over speculative decoding and 3.9\times over vanilla decoding, without fine-tuning draft and target models. Code available at https://github.com/thunlp/Ouroboros. @@ -10332,11 +10332,11 @@ 1+1>2: Can Large Language Models Serve as Cross-Lingual Knowledge Aggregators? YueHuang - ChenruiFan + ChenruiFan YuanLi SiyuanWu - TianyiZhouUniversity of Maryland, College Park - XiangliangZhangUniversity of Notre Dame + TianyiZhouUniversity of Maryland, College Park + XiangliangZhangUniversity of Notre Dame LichaoSunLehigh University 13394-13412 Large Language Models (LLMs) have garnered significant attention due to their remarkable ability to process information across various languages. 
Despite their capabilities, they exhibit inconsistencies in handling identical queries in different languages, presenting challenges for further advancement. This paper introduces a method to enhance the multilingual performance of LLMs by aggregating knowledge from diverse languages. This approach incorporates a low-resource knowledge detector specific to a language, a strategic language selection process, and mechanisms for answer replacement and integration. Our extensive experiments demonstrate notable performance improvements, particularly in reducing the performance disparity across languages. An ablation study confirms that each component of our method significantly contributes to these enhancements. This research highlights the inherent potential of LLMs to harmonize multilingual capabilities and offers valuable insights for further exploration. @@ -10349,11 +10349,11 @@ How to Leverage Demonstration Data in Alignment for Large Language Model? A Self-Imitation Learning Perspective TengXiao - MingxiaoLiTencent AI Lab - YigeYuan + MingxiaoLiTencent AI Lab + YigeYuan HuaishengZhu - ChaoCui - Vasant GHonavarPennsylvania State University + ChaoCui + Vasant GHonavarPennsylvania State University 13413-13426 This paper introduces a novel generalized self-imitation learning GSIL framework, which effectively and efficiently aligns large language models with offline demonstration data. We develop GSIL by deriving a surrogate objective of imitation learning with density ratio estimates, facilitating the use of self-generated data and optimizing the imitation learning objective with simple classification losses. GSIL eliminates the need for complex adversarial training in standard imitation learning, achieving lightweight and efficient fine-tuning for large language models. In addition, GSIL encompasses a family of offline losses parameterized by a general class of convex functions for density ratio estimation and enables a unified view for alignment with demonstration data. Extensive experiments show that GSIL consistently and significantly outperforms baselines in many challenging benchmarks, such as coding (HuamnEval), mathematical reasoning (GSM8K) and instruction-following benchmark (MT-Bench). Code is public available at https://github.com/tengxiao1/GSIL. 2024.emnlp-main.744 @@ -10377,7 +10377,7 @@ KunLiChinese University of Hong Kong, The Chinese University of Hong Kong HongyinLuoMassachusetts Institute of Technology XixinWuThe Chinese University of Hong Kong - James R.GlassMassachusetts Institute of Technology + James R.GlassMassachusetts Institute of Technology Helen M.MengThe Chinese University of Hong Kong 13444-13461 Query rewriting is a crucial technique for passage retrieval in open-domain conversational question answering (CQA). It decontexualizes conversational queries into self-contained questions suitable for off-the-shelf retrievers. Existing methods attempt to incorporate retriever’s preference during the training of rewriting models. However, these approaches typically rely on extensive annotations such as in-domain rewrites and/or relevant passage labels, limiting the models’ generalization and adaptation capabilities. In this paper, we introduce AdaQR (Adaptive Query Rewriting), a framework for training query rewriting models with limited rewrite annotations from seed datasets and completely no passage label. Our approach begins by fine-tuning compact large language models using only 10% of rewrite annotations from the seed dataset training split. 
The models are then utilized to self-sample rewrite candidates for each query instance, further eliminating the expense of human labeling or larger language model prompting often adopted in curating preference data. A novel approach is then proposed to assess the retriever’s preference for these candidates with the probability of answers conditioned on the conversational query by marginalizing the Top-K passages. This serves as the reward for optimizing the rewriter further using Direct Preference Optimization (DPO), a process free of rewrite and retrieval annotations. Experimental results on four open-domain CQA datasets demonstrate that AdaQR not only enhances the in-domain capabilities of the rewriter with limited annotation requirement, but also adapts effectively to out-of-domain datasets.
@@ -10387,9 +10387,9 @@
Grasping the Essentials: Tailoring Large Language Models for Zero-Shot Relation Extraction
- SizheZhou
- YuMengUniversity of Virginia
- BowenJin
+ SizheZhou
+ YuMengUniversity of Virginia
+ BowenJin
JiaweiHan
13462-13486
Relation extraction (RE) aims to identify semantic relationships between entities within text. Despite considerable advancements, existing models predominantly require extensive annotated training data, which is both costly and labor-intensive to collect. Moreover, these models often struggle to adapt to new or unseen relations. Few-shot learning, aiming to lessen annotation demands, typically provides incomplete and biased supervision for target relations, leading to degraded and unstable performance. To accurately and explicitly describe relation semantics while minimizing annotation demands, we explore the definition-only zero-shot RE setting, where only relation definitions expressed in natural language are used to train an RE model. We introduce REPaL, comprising three stages: (1) We leverage large language models (LLMs) to generate initial seed instances from relation definitions and an unlabeled corpus. (2) We fine-tune a bidirectional Small Language Model (SLM) with initial seeds to learn relations for the target domain. (3) We expand pattern coverage and mitigate bias from initial seeds by integrating feedback from the SLM’s predictions on the unlabeled corpus and the synthesis history. To accomplish this, we leverage the multi-turn conversation ability of LLMs to generate new instances in follow-up dialogues, informed by both the feedback and synthesis history. Studies reveal that definition-oriented seed synthesis enhances pattern coverage whereas indiscriminately increasing seed quantity leads to performance saturation. Experiments on two datasets show REPaL significantly improved cost-effective zero-shot performance by large margins.
@@ -10400,14 +10400,14 @@ <fixed-case>DA</fixed-case>-Code: Agent Data Science Code Generation Benchmark for Large Language Models YimingHuang - JianwenLuo + JianwenLuo YanYu YitongZhangBeijing Institute of Technology FangyuLei - YifanWei + YifanWei ShizhuHeInstitute of automation, Chinese academy of science, Chinese Academy of Sciences LifuHuangVirginia Tech - XiaoLiuMicrosoft Research Asia + XiaoLiuMicrosoft Research Asia JunZhaoInstitute of automation, Chinese academy of science KangLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences 13487-13521 @@ -10418,9 +10418,9 @@ Leveraging Context-Aware Prompting for Commit Message Generation - ZhihuaJiang + ZhihuaJiang JianweiChen - DongningRaoGuangdong University of Technology + DongningRaoGuangdong University of Technology GuanghuiYeHunan University 13522-13540 Writing comprehensive commit messages is tedious yet important, because these messages describe changes of code, such as fixing bugs or adding new features. However, most existing methods focus on either only the changed lines or nearest context lines, without considering the effectiveness of selecting useful contexts. On the other hand, it is possible that introducing excessive contexts can lead to noise. To this end, we propose a code model COMMIT (Context-aware prOMpting based comMIt-message generaTion) in conjunction with a code dataset CODEC (COntext and metaData Enhanced Code dataset). Leveraging program slicing, CODEC consolidates code changes along with related contexts via property graph analysis. Further, utilizing CodeT5+ as the backbone model, we train COMMIT via context-aware prompt on CODEC. Experiments show that COMMIT can surpass all compared models including pre-trained language models for code (code-PLMs) such as CommitBART and large language models for code (code-LLMs) such as Code-LlaMa. Besides, we investigate several research questions (RQs), further verifying the effectiveness of our approach. We release the data and code at: https://github.com/Jnunlplab/COMMIT.git. 
@@ -10433,7 +10433,7 @@
Linguistic Bias in <fixed-case>C</fixed-case>hat<fixed-case>GPT</fixed-case>: Language Models Reinforce Dialect Discrimination
EveFleisig
- GenevieveSmithUniversity of California, Berkeley and University of Oxford
+ GenevieveSmithUniversity of California, Berkeley and University of Oxford
MadelineBossiUniversity of California, Berkeley
IshitaRustagiUniversity of California, Berkeley
XavierYinUniversity of California, Berkeley
@@ -10446,9 +10446,9 @@
Lifelong Knowledge Editing for <fixed-case>LLM</fixed-case>s with Retrieval-Augmented Continuous Prompt Learning
- QizhouChen
+ QizhouChen
TaolinZhangAlibaba Group
- XiaofengHeEast China Normal University
+ XiaofengHeEast China Normal University
DongyangLiEast China Normal University
ChengyuWangAlibaba Group
LongtaoHuangAlibaba Group
@@ -10493,13 +10493,13 @@
An Analysis and Mitigation of the Reversal Curse
- AngLv
+ AngLv
KaiyiZhangRenmin University of China
ShufangXieRenmin University of China
- QuanTu
- YuhanChenXiaomi Corporation
- Ji-RongWenRenmin University of China
- RuiYanRenmin University of China
+ QuanTu
+ YuhanChenXiaomi Corporation
+ Ji-RongWenRenmin University of China
+ RuiYanRenmin University of China
13603-13615
2024.emnlp-main.754
lv-etal-2024-analysis
@@ -10523,7 +10523,7 @@
<fixed-case>O</fixed-case>ne<fixed-case>N</fixed-case>et: A Fine-Tuning Free Framework for Few-Shot Entity Linking via Large Language Model Prompting
XukaiLiuUniversity of Science and Technology of China
YeLiuUniversity of Science and Technology of China
- KaiZhang
+ KaiZhang
KehangWang
QiLiuUniversity of Science and Technology of China
EnhongChenUniversity of Science and Technology of China
@@ -10538,8 +10538,8 @@
YangDengSingapore Management University
YongZhao
MoxinLi
- See-KiongNgNational University of Singapore
- Tat-SengChuaNational University of Singapore
+ See-KiongNgNational University of Singapore
+ Tat-SengChuaNational University of Singapore
13652-13673
Despite the remarkable abilities of Large Language Models (LLMs) to answer questions, they often display a considerable level of overconfidence even when the question does not have a definitive answer. To avoid providing hallucinated answers to these unknown questions, existing studies typically investigate approaches to refusing to answer these questions. In this work, we propose a novel and scalable self-alignment method to utilize the LLM itself to enhance its response-ability to different types of unknown questions, being capable of not just refusing to answer but further proactively providing explanations to the unanswerability of unknown questions. Specifically, the Self-Align method first employs a two-stage class-aware self-augmentation approach to generate a large amount of unknown question-response data. Then we conduct disparity-driven self-curation to select qualified data for fine-tuning the LLM itself for aligning the responses to unknown questions as desired. Experimental results on two datasets across four types of unknown questions validate the superiority of the Self-Align method over existing baselines in terms of three types of task formulation.
2024.emnlp-main.757 @@ -10550,8 +10550,8 @@ Fewer is More: Boosting Math Reasoning with Reinforced Context Pruning XijieHuangHong Kong University of Science and Technology Li LynaZhangMicrosoft Research Asia - Kwang-TingChengHong Kong University of Science and Technology - FanYangResearch, Microsoft + Kwang-TingChengHong Kong University of Science and Technology + FanYangResearch, Microsoft MaoYang 13674-13695 Large Language Models (LLMs) have shown impressive capabilities, yet they still struggle with math reasoning. In this work, we propose CoT-Influx, a novel approach that pushes the boundary of few-shot Chain-of-Thoughts (CoT) learning to improve LLM mathematical reasoning. Motivated by the observation that adding more concise CoT examples in the prompt can improve LLM reasoning performance, CoT-Influx employs a coarse-to-fine pruner to maximize the input of effective and concise CoT examples. The pruner first selects as many crucial CoT examples as possible and then prunes unimportant tokens to fit the context window. As a result, by enabling more CoT examples with double the context window size in tokens, CoT-Influx significantly outperforms various prompting baselines across various LLMs (LLaMA2-7B, 13B, 70B) and 5 math datasets, achieving up to 4.55% absolute improvements. Remarkably, without any fine-tuning, LLaMA2-70B with CoT-Influx surpasses GPT-3.5 and a wide range of larger LLMs (PaLM, Minerva 540B, etc.) on the GSM8K. CoT-Influx is a plug-and-play module for LLMs, adaptable in various scenarios. It’s compatible with advanced reasoning prompting techniques, such as self-consistency, and supports different long-context LLMs, including Mistral-7B-v0.3-32K and Yi-6B-200K. @@ -10567,7 +10567,7 @@ QingyuYinAmazon JingfengYangAmazon XianfengTangAmazon - ChenLuoAmazon + ChenLuoAmazon MingZengCarnegie Mellon University HaomingJiangAmazon YifanGaoAmazon @@ -10589,9 +10589,9 @@ Holistic Automated Red Teaming for Large Language Models through Top-Down Test Case Generation and Multi-turn Interaction - JinchuanZhangUniversity of the Chinese Academy of Sciences + JinchuanZhangUniversity of the Chinese Academy of Sciences YanZhou - YaxinLiuInstitute of Information Engineering, Chinese Academy of Sciences + YaxinLiuInstitute of Information Engineering, Chinese Academy of Sciences ZimingLi SonglinHu 13711-13736 @@ -10617,7 +10617,7 @@ <fixed-case>D</fixed-case>ynamic<fixed-case>ER</fixed-case>: Resolving Emerging Mentions to Dynamic Entities for <fixed-case>RAG</fixed-case> JinyoungKimSeoul National University DayoonKo - GunheeKimSeoul National University + GunheeKimSeoul National University 13752-13770 In the rapidly evolving landscape of language, resolving new linguistic expressions in continuously updating knowledge bases remains a formidable challenge. This challenge becomes critical in retrieval-augmented generation (RAG) with knowledge bases, as emerging expressions hinder the retrieval of relevant documents, leading to generator hallucinations. To address this issue, we introduce a novel task aimed at resolving emerging mentions to dynamic entities and present DynamicER benchmark. Our benchmark includes dynamic entity mention resolution and entity-centric knowledge-intensive QA task, evaluating entity linking and RAG model’s adaptability to new expressions, respectively. We discovered that current entity linking models struggle to link these new expressions to entities. 
Therefore, we propose a temporal segmented clustering method with continual adaptation, effectively managing the temporal dynamics of evolving entities and emerging mentions. Extensive experiments demonstrate that our method outperforms existing baselines, enhancing RAG model performance on QA task with resolved mentions. 2024.emnlp-main.762 @@ -10628,7 +10628,7 @@ Preserving Generalization of Language models in Few-shot Continual Relation Extraction - QuyenTranVinAI Research + QuyenTranVinAI Research Nguyen XuanThanh Nguyen HoangAnh Nam LeHaiHanoi University of Science and Technology @@ -10653,10 +10653,10 @@ IsratJahanYork University AmranBhuiyan Chee WeiTanNanyang Technological University - Md RizwanParvezQatar Computing Research Institute and Bosch + Md RizwanParvezQatar Computing Research Institute and Bosch EnamulHoqueYork University ShafiqJotySalesForce.com and Nanyang Technological University - JimmyHuangYork University and York University + JimmyHuangYork University and York University 13785-13816 Large Language Models (LLMs) have recently gained significant attention due to their remarkable capabilities in performing diverse tasks across various domains. However, a thorough evaluation of these models is crucial before deploying them in real-world applications to ensure they produce reliable performance. Despite the well-established importance of evaluating LLMs in the community, the complexity of the evaluation process has led to varied evaluation setups, causing inconsistencies in findings and interpretations. To address this, we systematically review the primary challenges and limitations causing these inconsistencies and unreliable evaluations in various steps of LLM evaluation. Based on our critical review, we present our perspectives and recommendations to ensure LLM evaluations are reproducible, reliable, and robust. 2024.emnlp-main.764 @@ -10665,7 +10665,7 @@ Consecutive Batch Model Editing with <fixed-case>H</fixed-case>oo<fixed-case>K</fixed-case> Layers - ShuaiyiLiChinese University of Hong Kong, The Chinese University of Hong Kong + ShuaiyiLiChinese University of Hong Kong, The Chinese University of Hong Kong YangDengSingapore Management University DengCaiTencent AI Lab HongyuanLuThe Chinese University of Hong Kong @@ -10680,8 +10680,8 @@ Topic-Oriented Open Relation Extraction with A Priori Seed Generation LinyiDing - JinfengXiao - SizheZhou + JinfengXiao + SizheZhou ChaoqiYang JiaweiHan 13834-13845 @@ -10692,7 +10692,7 @@ Related Work and Citation Text Generation: A Survey - XiangciLi + XiangciLi JessicaOuyangUniversity of Texas at Dallas 13846-13864 To convince readers of the novelty of their research paper, authors must perform a literature review and compose a coherent story that connects and relates prior works to the current work. This challenging nature of literature review writing makes automatic related work generation (RWG) academically and computationally interesting, and also makes it an excellent test bed for examining the capability of SOTA natural language processing (NLP) models. Since the initial proposal of the RWG task, its popularity has waxed and waned, following the capabilities of mainstream NLP approaches. In this work, we survey the zoo of RWG historical works, summarizing the key approaches and task definitions and discussing the ongoing challenges of RWG. 
@@ -10708,7 +10708,7 @@ ShengjunCheng JunRaoHarbin Institute of Technology TengfeiYu - HexuanDengHarbin Institute of Technology, Shenzhen + HexuanDengHarbin Institute of Technology, Shenzhen MinZhangHarbin Institute of Technology, Shenzhen 13865-13881 Consistency learning (CL) has proven to be a valuable technique for improving the robustness of models in conditional sentence generation (CSG) tasks by ensuring stable predictions across various input data forms. However, models augmented with CL often face challenges in optimizing consistency features, which can detract from their efficiency and effectiveness. To address these challenges, we introduce Curriculum Consistency Learning (CCL), a novel strategy that guides models to learn consistency in alignment with their current capacity to differentiate between features. CCL is designed around the inherent aspects of CL-related losses, promoting task independence and simplifying implementation. Implemented across four representative CSG tasks, including instruction tuning (IT) for large language models and machine translation (MT) in three modalities (text, speech, and vision), CCL demonstrates marked improvements. Specifically, it delivers +2.0 average accuracy point improvement compared with vanilla IT and an average increase of +0.7 in COMET scores over traditional CL methods in MT tasks. Our comprehensive analysis further indicates that models utilizing CCL are particularly adept at managing complex instances, showcasing the effectiveness and efficiency of CCL in improving CSG models. Code and scripts are available at https://github.com/xinxinxing/Curriculum-Consistency-Learning. @@ -10720,8 +10720,8 @@ A Systematic Analysis of Large Language Models as Soft Reasoners: The Case of Syllogistic Inferences LeonardoBertolazziUniversity of Trento - AlbertGattUtrecht University - RaffaellaBernardiUniversity of Trento + AlbertGattUtrecht University + RaffaellaBernardiUniversity of Trento 13882-13905 The reasoning abilities of Large Language Models (LLMs) are becoming a central focus of study in NLP. In this paper, we consider the case of syllogistic reasoning, an area of deductive reasoning studied extensively in logic and cognitive psychology. Previous research has shown that pre-trained LLMs exhibit reasoning biases, such as content effects, avoid answering that no conclusion follows, align with human difficulties, and struggle with multi-step reasoning. We contribute to this research line by systematically investigating the effects of chain-of-thought reasoning, in-context learning (ICL), and supervised fine-tuning (SFT) on syllogistic reasoning, considering syllogisms with conclusions that support or violate world knowledge and with multiple premises. Crucially, we go beyond the standard focus on accuracy, with an in-depth analysis of the conclusions generated by the models. Our results suggest that the behavior of pre-trained LLMs can be explained by heuristics studied in cognitive science and that both ICL and SFT improve model performance on valid inferences, although only the latter can mitigate most reasoning biases while being consistent. 
2024.emnlp-main.769
@@ -10731,7 +10731,7 @@
Pre-training Cross-lingual Open Domain Question Answering with Large-scale Synthetic Supervision
FanJiang
- TomDrummondUniversity of Melbourne
+ TomDrummondUniversity of Melbourne
TrevorCohnGoogle and The University of Melbourne
13906-13933
2024.emnlp-main.770
@@ -10740,15 +10740,15 @@
<fixed-case>MOSEL</fixed-case>: 950,000 Hours of Speech Data for Open-Source Speech Foundation Model Training on <fixed-case>EU</fixed-case> Languages
- MarcoGaidoFondazione Bruno Kessler
- SaraPapi
+ MarcoGaidoFondazione Bruno Kessler
+ SaraPapi
LuisaBentivogliFondazione Bruno Kessler
- AlessioBruttiFondazione Bruno Kessler
- MauroCettoloFondazione Bruno Kessler
+ AlessioBruttiFondazione Bruno Kessler
+ MauroCettoloFondazione Bruno Kessler
RobertoGretterFondazione Bruno Kessler
MarcoMatassoniFondazione Bruno Kessler
- MohamedNabihFondazione Bruno Kessler
- MatteoNegriFondazione Bruno Kessler
+ MohamedNabihFondazione Bruno Kessler
+ MatteoNegriFondazione Bruno Kessler
13934-13947
The rise of foundation models (FMs), coupled with regulatory efforts addressing their risks and impacts, has sparked significant interest in open-source models. However, existing speech FMs (SFMs) fall short of full compliance with the open-source principles, even if claimed otherwise, as no existing SFM has model weights, code, and training data publicly available under open-source terms. In this work, we take the first step toward filling this gap by focusing on the 24 official languages of the European Union (EU). We collect suitable training data by surveying automatic speech recognition datasets and unlabeled speech corpora under open-source compliant licenses, for a total of 950k hours. Additionally, we release automatic transcripts for 441k hours of unlabeled data under the permissive CC-BY license, thereby facilitating the creation of open-source SFMs for the EU languages.
2024.emnlp-main.771
@@ -10758,12 +10758,12 @@
Improving Knowledge Graph Completion with Structure-Aware Supervised Contrastive Learning
- JiashiLinNorthwest Polytechnical University Xi’an
- LifangWangNorthwest Polytechnical University Xi’an
+ JiashiLinNorthwest Polytechnical University Xi’an
+ LifangWangNorthwest Polytechnical University Xi’an
XinyuLuXi’an University of Finance and Economics
- ZhongtianHu
- WeiZhangNorthwest Polytechnical University Xi’an
- WenxuanLuNorthwest Polytechnical University Xi’an
+ ZhongtianHu
+ WeiZhangNorthwest Polytechnical University Xi’an
+ WenxuanLuNorthwest Polytechnical University Xi’an
13948-13959
Knowledge Graphs (KGs) often suffer from incomplete knowledge, which restricts their utility. Recently, Contrastive Learning (CL) has been introduced to Knowledge Graph Completion (KGC), significantly improving the discriminative capabilities of KGC models and setting new benchmarks in performance. However, existing contrastive methods primarily focus on individual triples, overlooking the broader structural connectivities and topologies of KGs. This narrow focus limits a comprehensive understanding of the graph’s structural knowledge. To address this gap, we propose StructKGC, a novel contrastive learning framework designed to flexibly accommodate the diverse topologies inherent in KGs. Additionally, we introduce four contrastive tasks specifically tailored to KG data: Vertex-level CL, Neighbor-level CL, Path-level CL, and Relation composition level CL.
These tasks are trained synergistically during the fine-tuning of pre-trained language models (PLMs), allowing for a more nuanced capture of subgraph semantics. To validate the effectiveness of our method, we perform a comprehensive set of experiments on several real-world datasets. The experimental results demonstrate that our approach achieves SOTA performance under standard supervised and low-resource settings. Furthermore, the different levels of structure-aware tasks introduced can mutually reinforce each other, leading to consistent performance improvements. 2024.emnlp-main.772 @@ -10774,7 +10774,7 @@ Contribution of Linguistic Typology to <fixed-case>U</fixed-case>niversal <fixed-case>D</fixed-case>ependency Parsing: An Empirical Investigation - AliBasiratUniversity of Copenhagen + AliBasiratUniversity of Copenhagen Navid BaradaranHemmati 13960-13971 Universal Dependencies (UD) is a global initiative to create a standard annotation for the dependency syntax of human languages. Addressing its deviation from typological principles, this study presents an empirical investigation of a typologically motivated transformation of UD proposed by William Croft. Our findings underscore the significance of the transformations across diverse languages and highlight their advantages and limitations. @@ -10784,11 +10784,11 @@ <fixed-case>TR</fixed-case>o<fixed-case>TR</fixed-case>: A Framework for Evaluating the Re-contextualization of Text Reuse - FrancescoPeritiUniversity of Milan - PierluigiCassottiGöteborg University - StefanoMontanelliUniversity of Milan - NinaTahmasebiGöteborg University - DominikSchlechtwegInstitute for Natural Language Processing, University of Stuttgart + FrancescoPeritiUniversity of Milan + PierluigiCassottiGöteborg University + StefanoMontanelliUniversity of Milan + NinaTahmasebiGöteborg University + DominikSchlechtwegInstitute for Natural Language Processing, University of Stuttgart 13972-13990 Current approaches for detecting text reuse do not focus on recontextualization, i.e., how the new context(s) of a reused text differs from its original context(s). In this paper, we propose a novel framework called TRoTR that relies on the notion of topic relatedness for evaluating the diachronic change of context in which text is reused. TRoTR includes two NLP tasks: TRiC and TRaC. TRiC is designed to evaluate the topic relatedness between a pair of recontextualizations. TRaC is designed to evaluate the overall topic variation within a set of recontextualizations. We also provide a curated TRoTR benchmark of biblical text reuse, human-annotated with topic relatedness. The benchmark exhibits an inter-annotator agreement of .811. We evaluate multiple, established SBERT models on the TRoTR tasks and find that they exhibit greater sensitivity to textual similarity than topic relatedness. Our experiments show that fine-tuning these models can mitigate such a kind of sensitivity. 2024.emnlp-main.774 @@ -10801,11 +10801,11 @@ Structured Optimal Brain Pruning for Large Language Models JiatengWei QuanLuMashang Financial Institution - NingJiangMashang Consumer Finance Co, Ltd + NingJiangMashang Consumer Finance Co, Ltd SiqiLiZhejiang University JingyangXiang - JunChenZhejiang Normal University - YongLiuZhejiang University + JunChenZhejiang Normal University + YongLiuZhejiang University 13991-14007 The massive parameters and computational demands hinder the widespread application of Large Language Models (LLMs). Network pruning provides a practical solution to this problem. 
However, existing pruning works for LLMs mainly focus on unstructured pruning or necessitate post-pruning fine-tuning. The former relies on special hardware to accelerate computation, while the latter may need substantial computational resources. In this paper, we introduce a retraining-free structured pruning method called SoBP (Structured Optimal Brain Pruning). It leverages global first-order information to select pruning structures, then refines them with a local greedy approach, and finally adopts module-wise reconstruction to mitigate information loss. We assess the effectiveness of SoBP across 14 models from 3 LLM families on 8 distinct datasets. Experimental results demonstrate that SoBP outperforms current state-of-the-art methods. 2024.emnlp-main.775 @@ -10814,9 +10814,9 @@ Automatically Generated Definitions and their utility for Modeling Word Meaning - FrancescoPeritiUniversity of Milan + FrancescoPeritiUniversity of Milan DavidAlfterGöteborg University - NinaTahmasebiGöteborg University + NinaTahmasebiGöteborg University 14008-14026 Modeling lexical semantics is a challenging task, often suffering from interpretability pitfalls. In this paper, we delve into the generation of dictionary-like sense definitions and explore their utility for modeling word meaning. We fine-tuned two Llama models and include an existing T5-based model in our evaluation. Firstly, we evaluate the quality of the generated definitions on existing English benchmarks, setting new state-of-the-art results for the Definition Generation task. Next, we explore the use of definitions generated by our models as intermediate representations subsequently encoded as sentence embeddings. We evaluate this approach on lexical semantics tasks such as the Word-in-Context, Word Sense Induction, and Lexical Semantic Change, setting new state-of-the-art results in all three tasks when compared to unsupervised baselines. 2024.emnlp-main.776 @@ -10829,9 +10829,9 @@ How Do Your Code <fixed-case>LLM</fixed-case>s perform? Empowering Code Instruction Tuning with Really Good Data YejieWang KeqingHeMeituan Group - DayuanFu + DayuanFu ZhuomaGongQue - HeyangXu + HeyangXu YanxuChen ZhexuWang YujiaFu @@ -10840,7 +10840,7 @@ JingangWangMeituan MengdiZhang XunliangCai - WeiranXu + WeiranXu 14027-14043 Recently, there has been a growing interest in studying how to construct better code instruction tuning data. However, we observe Code models trained with these datasets exhibit high performance on HumanEval but perform worse on other benchmarks such as LiveCodeBench. Upon further investigation, we find that many datasets suffer from severe data leakage. After cleaning up most of the leaked data, some well-known high-quality datasets perform poorly. This discovery reveals a new challenge: identifying which dataset genuinely qualify as high-quality code instruction data. To address this, we propose an efficient code data pruning strategy for selecting good samples. Our approach is based on three dimensions: instruction complexity, response quality, and instruction diversity. Based on our selected data, we present XCoder, a family of models finetuned from LLaMA3. Our experiments show Xcoder achieves new state-of-the-art performance using fewer training data, which verify the effectiveness of our data strategy. Moreover, we perform a comprehensive analysis on the data composition and find existing code datasets have different characteristics according to their construction methods, which provide new insights for future code LLMs. 
2024.emnlp-main.777
@@ -10853,11 +10853,11 @@
ZhengliangShi
Wu JiuLong
LingyongYanBaidu Inc.
- XinyuMaBaidu
+ XinyuMaBaidu
YidingLiuBaidu
MinCao
DaweiYinBaidu
- ZhaochunRenLeiden University
+ ZhaochunRenLeiden University
14044-14067
Recent information retrieval (IR) models are pre-trained and instruction-tuned on massive datasets and tasks, enabling them to perform well on a wide range of tasks and potentially generalize to unseen tasks with instructions. However, existing IR benchmarks focus on a limited scope of tasks, making them insufficient for evaluating the latest IR models. In this paper, we propose MAIR (Massive Instructed Retrieval Benchmark), a heterogeneous IR benchmark that includes 126 distinct IR tasks across 6 domains, collected from existing datasets. We benchmark state-of-the-art instruction-tuned text embedding models and re-ranking models. Our experiments reveal that instruction-tuned models generally achieve superior performance compared to non-instruction-tuned models on MAIR. Additionally, our results suggest that current instruction-tuned text embedding models and re-ranking models still lack effectiveness in specific long-tail tasks. MAIR is publicly available at https://github.com/sunnweiwei/Mair.
2024.emnlp-main.778
@@ -10870,7 +10870,7 @@
LemaoLiuTencent
MoYuWeChat AI, Tencent
YueYuNational University of Defense Technology and PengCheng Lab
- XiangAoUniversity of the Chinese Academy of Sciences and Institute of Computing Technology, Chinese Academy of Sciences
+ XiangAoUniversity of the Chinese Academy of Sciences and Institute of Computing Technology, Chinese Academy of Sciences
14068-14082
In-context learning (ICL) has demonstrated excellent performance across various downstream NLP tasks, especially when synergized with powerful large language models (LLMs). Existing studies evaluate ICL methods primarily based on downstream task performance. This evaluation protocol overlooks the significant cost associated with the demonstration configuration process, i.e., tuning the demonstration as the ICL prompt. However, in this work, we point out that the evaluation protocol leads to unfair comparisons and potentially biased evaluation, because we surprisingly find a correlation between the configuration costs and task performance. Then we call for a two-dimensional evaluation paradigm that considers both of these aspects, facilitating a fairer comparison. Finally, based on our empirical finding that the optimized demonstration on one language model generalizes across language models of different sizes, we introduce a simple yet efficient strategy that can be applied to any ICL method as a plugin, yielding a better trade-off between the two dimensions according to the proposed evaluation paradigm.
2024.emnlp-main.779
@@ -10906,7 +10906,7 @@
Enhancing Training Data Attribution for Large Language Models with Fitting Error Consideration
KangxiWuInstitute of Computing Technology, Chinese Academy of Sciences
- LiangPangInstitute of Computing Technology, Chinese Academy of Sciences
+ LiangPangInstitute of Computing Technology, Chinese Academy of Sciences
HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences
XueqiCheng, Chinese Academy of Sciences
14131-14143
@@ -10917,10 +10917,10 @@
Where am <fixed-case>I</fixed-case>?
Large Language Models Wandering between Semantics and Structures in Long Contexts
- SeonminKooKorea University
- JinsungKim
+ SeonminKooKorea University
+ JinsungKim
YoungJoonJangKorea University and Hongik University
- ChanjunParkUpstage
+ ChanjunParkUpstage
HeuiseokLimKorea University
14144-14160
As the utilization of Large Language Models (LLMs) becomes more widespread, there is a growing demand for their ability to handle more complex and longer external knowledge across various use cases. Most existing evaluations of the open-domain question answering (ODQA) task, which necessitates the use of external knowledge, focus solely on whether the model provides the correct answer. However, even when LLMs answer correctly, they often fail to provide an obvious source for their responses. Therefore, it is necessary to jointly evaluate and verify the correctness of the answers and the appropriateness of grounded evidence in complex external contexts. To address this issue, we examine the phenomenon of discrepancies in abilities across two distinct tasks—QA and evidence selection—when performed simultaneously, from the perspective of task alignment. To verify LLMs’ task alignment, we introduce a verification framework and resources considering both semantic relevancy and structural diversity of the given long context knowledge. Through extensive experiments and detailed analysis, we provide insights into the task misalignment between QA and evidence selection. Our code and resources will be available upon acceptance.
@@ -10933,7 +10933,7 @@
MatthewShu
NishantBalepur
ShiFengGeorge Washington University
- Jordan LeeBoyd-GraberUniversity of Maryland, College Park
+ Jordan LeeBoyd-GraberUniversity of Maryland, College Park
14161-14178
Flashcard schedulers rely on 1) *student models* to predict the flashcards a student knows; and 2) *teaching policies* to pick which cards to show next via these predictions. Prior student models, however, just use study data like the student’s past responses, ignoring the text on cards. We propose **content-aware scheduling**, the first schedulers exploiting flashcard content. To give the first evidence that such schedulers enhance student learning, we build KARL, a simple but effective content-aware student model employing deep knowledge tracing (DKT), retrieval, and BERT to predict student recall. We train KARL by collecting a new dataset of 123,143 study logs on diverse trivia questions. KARL bests existing student models in AUC and calibration error. To ensure our improved predictions lead to better student learning, we create a novel delta-based teaching policy to deploy KARL online. Based on 32 study paths from 27 users, KARL improves learning efficiency over SOTA, showing KARL’s strength and encouraging researchers to look beyond historical study data to fully capture student abilities.
2024.emnlp-main.784
@@ -10943,18 +10943,18 @@
Large Language Models Can Be Contextual Privacy Protection Learners
- YijiaXiao
- YiqiaoJin
+ YijiaXiao
+ YiqiaoJin
YushiBai
YueWu
- XianjunYang
- XiaoLuoUniversity of California, Los Angeles
+ XianjunYang
+ XiaoLuoUniversity of California, Los Angeles
WenchaoYuUniversity of California, Los Angeles
XujiangZhaoNEC Labs America
YanchiLiuNEC-Labs
QuanquanGuUniversity of California, Los Angeles
HaifengChenNEC-Labs
- WeiWangUniversity of California, Los Angeles
+ WeiWangUniversity of California, Los Angeles
WeiChengNEC-Labs
14179-14201
The proliferation of Large Language Models (LLMs) has driven considerable interest in fine-tuning them with domain-specific data to create specialized language models. Nevertheless, such domain-specific fine-tuning data often contains contextually sensitive personally identifiable information (PII). Directly fine-tuning LLMs on this data without privacy protection poses a risk of data leakage of sensitive PII during inference time. To address this challenge, we introduce Contextual Privacy Protection Language Models (CPPLM), a novel paradigm for fine-tuning LLMs that effectively injects domain-specific knowledge while safeguarding inference-time data privacy. Our work offers a theoretical analysis for model design and delves into various techniques such as corpus curation, penalty-based unlikelihood in training loss, and instruction-based tuning. Extensive experiments across diverse datasets and scenarios demonstrate the effectiveness of our approaches. In particular, instruction tuning with both positive and negative examples stands out as a promising method, effectively protecting private data while enhancing the model’s knowledge. Our work underscores the potential for Large Language Models as robust contextual privacy protection learners.
@@ -10970,7 +10970,7 @@
AlisonRobeySUNY Empire State University
ShiFengGeorge Washington University
SeraphinaGoldfarb-Tarrant
- Jordan LeeBoyd-GraberUniversity of Maryland, College Park
+ Jordan LeeBoyd-GraberUniversity of Maryland, College Park
14202-14225
Keyword mnemonics are memorable explanations that link new terms to simpler keywords. Prior work generates mnemonics for students, but they do not train models using mnemonics students prefer and that aid learning. We build SMART, a mnemonic generator trained on feedback from real students learning new terms. To train SMART, we first fine-tune LLaMA-2 on a curated set of user-written mnemonics. We then use LLM alignment to enhance SMART: we deploy mnemonics generated by SMART in a flashcard app to find preferences on mnemonics students favor. We gather 2684 preferences from 45 students across two types: **expressed** (inferred from ratings) and **observed** (inferred from student learning), yielding three key findings. First, expressed and observed preferences disagree; what students *think* is helpful does not always capture what is *truly* helpful. Second, Bayesian models can synthesize complementary data from multiple preference types into a single effectiveness signal. SMART is tuned via Direct Preference Optimization on this signal, which resolves ties and missing labels in the typical method of pairwise comparisons, augmenting data for LLM output quality gains. Third, mnemonic experts assess SMART as matching GPT-4 at much lower deployment costs, showing the utility of capturing diverse student feedback to align LLMs in education.
2024.emnlp-main.786 @@ -10983,7 +10983,7 @@ Mixture-of-Skills: Learning to Optimize Data Usage for Fine-Tuning Large Language Models MinghaoWu Thuy-TrangVuMonash University - LizhenQuMonash University + LizhenQuMonash University RezaHafMonash University 14226-14240 Large language models (LLMs) are typically fine-tuned on diverse and extensive datasets sourced from various origins to develop a comprehensive range of skills, such as writing, reasoning, chatting, coding, and more. Each skill has unique characteristics, and these datasets are often heterogeneous and imbalanced, making the fine-tuning process highly challenging. Balancing the development of each skill while ensuring the model maintains its overall performance requires sophisticated techniques and careful dataset curation. In this work, we propose a general, model-agnostic, reinforcement learning framework, Mixture-of-Skills (MoS), that learns to optimize data usage automatically during the fine-tuning process. This framework ensures the optimal comprehensive skill development of LLMs by dynamically adjusting the focus on different datasets based on their current learning state. To validate the effectiveness of MoS, we conduct extensive experiments using three diverse LLM backbones on two widely used benchmarks and demonstrate that MoS substantially enhances model performance. Building on the success of MoS, we propose MoSpec, an adaptation for task-specific fine-tuning, which harnesses the utilities of various datasets for a specific purpose. Our work underlines the significance of dataset rebalancing and present MoS as a powerful, general solution for optimizing data usage in the fine-tuning of LLMs for various purposes. @@ -10993,11 +10993,11 @@ <fixed-case>M</fixed-case>ol<fixed-case>TRES</fixed-case>: Improving Chemical Language Representation Learning for Molecular Property Prediction - Jun-HyungParkHankuk University of Foreign Studies + Jun-HyungParkHankuk University of Foreign Studies YeachanKimKorea University MingyuLeeKorea University HyuntaePark - SangKeunLeeKorea University + SangKeunLeeKorea University 14241-14254 Chemical representation learning has gained increasing interest due to the limited availability of supervised data in fields such as drug and materials design. This interest particularly extends to chemical language representation learning, which involves pre-training Transformers on SMILES sequences - textual descriptors of molecules. Despite its success in molecular property prediction, current practices often lead to overfitting and limited scalability due to early convergence. In this paper, we introduce a novel chemical language representation learning framework, called MolTRES, to address these issues. MolTRES incorporates generator-discriminator training, allowing the model to learn from more challenging examples that require structural understanding. In addition, we enrich molecular representations by transferring knowledge from scientific literature by integrating external materials embedding. Experimental results show that our model outperforms existing state-of-the-art models on popular molecular property prediction tasks. 
2024.emnlp-main.788 @@ -11012,7 +11012,7 @@ ShusakuSoneNational Institute of Technology, Ichinoseki College and Tohoku University MasayaTaniguchi KeisukeSakaguchiTohoku University - KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University + KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University 14255-14271 Explicit multi-step reasoning, such as chain-of-thought, is widely adopted in the community to explore the better performance of language models (LMs). We report on the systematic strategy that LMs use in this process.Our controlled experiments reveal that LMs rely more heavily on heuristics, such as lexical overlap, in the earlier stages of reasoning when more steps are required to reach an answer. Conversely, their reliance on heuristics decreases as LMs progress closer to the final answer. This suggests that LMs track only a limited number of future steps and dynamically combine heuristic strategies with rational ones in solving tasks involving multi-step reasoning. 2024.emnlp-main.789 @@ -11025,7 +11025,7 @@ JiminSun So YeonMin YingshanChang - YonatanBiskMeta and Carnegie Mellon University + YonatanBiskMeta and Carnegie Mellon University 14272-14289 Tools have become a mainstay of LLMs, allowing them to retrieve knowledge not in their weights, to perform tasks on the web, and even to control robots. However, most ontologies and surveys of tool-use have assumed the core challenge for LLMs is choosing the tool. Instead, we introduce a framework for tools more broadly which guides us to explore a model’s ability to detect “silent” tool errors, and reflect on how to plan. This more directly aligns with the increasingly popular use of models as tools. We provide an initial approach to failure recovery with promising results both on a controlled calculator setting and embodied agent planning. 2024.emnlp-main.790 @@ -11035,7 +11035,7 @@ Pcc-tuning: Breaking the Contrastive Learning Ceiling in Semantic Textual Similarity BowenZhang - ChunpingLiTsinghua University, Tsinghua University + ChunpingLiTsinghua University, Tsinghua University 14290-14302 Semantic Textual Similarity (STS) constitutes a critical research direction in computational linguistics and serves as a key indicator of the encoding capabilities of embedding models. Driven by advances in pre-trained language models and contrastive learning, leading sentence representation methods have reached an average Spearman’s correlation score of approximately 86 across seven STS benchmarks in SentEval. However, further progress has become increasingly marginal, with no existing method attaining an average score higher than 86.5 on these tasks. This paper conducts an in-depth analysis of this phenomenon and concludes that the upper limit for Spearman’s correlation scores under contrastive learning is 87.5. To transcend this ceiling, we propose an innovative approach termed Pcc-tuning, which employs Pearson’s correlation coefficient as a loss function to refine model performance beyond contrastive learning. Experimental results demonstrate that Pcc-tuning can markedly surpass previous state-of-the-art strategies with only a minimal amount of fine-grained annotated samples. 
2024.emnlp-main.791
@@ -11047,7 +11047,7 @@
Cross-lingual Back-Parsing: Utterance Synthesis from Meaning Representation for Zero-Resource Semantic Parsing
DeokhyungKangPohang University of Science and Technology
- SeonjeongHwangPohang University of Science and Technology
+ SeonjeongHwangPohang University of Science and Technology
YunsuKimaiXplain, Inc.
GaryLee
14303-14317
@@ -11060,8 +11060,8 @@
Shaking Up <fixed-case>VLM</fixed-case>s: Comparing Transformers and Structured State Space Models for Vision & Language Modeling
GeorgiosPantazopoulos
MalvinaNikandrouHeriot-Watt University
- AlessandroSugliaHeriot-Watt University
- OliverLemonHeriot-Watt University
+ AlessandroSugliaHeriot-Watt University
+ OliverLemonHeriot-Watt University
ArashEshghiHeriot-Watt University
14318-14337
This study explores replacing Transformers in Visual Language Models (VLMs) with Mamba, a recent structured state space model (SSM) that demonstrates promising performance in sequence modeling. We test models up to 3B parameters under controlled conditions, showing that Mamba-based VLMs outperform Transformer-based VLMs in captioning, question answering, and reading comprehension. However, we find that Transformers achieve greater performance in visual grounding and the performance gap widens with scale. We explore two hypotheses to explain this phenomenon: 1) the effect of task-agnostic visual encoding on the updates of the hidden states, and 2) the difficulty in performing visual grounding from the perspective of in-context multimodal retrieval. Our results indicate that a task-aware encoding yields minimal performance gains on grounding, however, Transformers significantly outperform Mamba at in-context multimodal retrieval. Overall, Mamba shows promising performance on tasks where the correct output relies on a summary of the image but struggles when retrieval of explicit information from the context is required.
@@ -11071,9 +11071,9 @@
Are <fixed-case>LLM</fixed-case>s Good Zero-Shot Fallacy Classifiers?
- FengjunPan
+ FengjunPan
XiaobaoWu
- ZongruiLiNanyang Technological University
+ ZongruiLiNanyang Technological University
Anh TuanLuuNanyang Technological University
14338-14364
Fallacies are defective arguments with faulty reasoning. Detecting and classifying them is a crucial NLP task to prevent misinformation, manipulative claims, and biased decisions. However, existing fallacy classifiers are limited by the requirement for sufficient labeled data for training, which hinders their out-of-distribution (OOD) generalization abilities. In this paper, we focus on leveraging Large Language Models (LLMs) for zero-shot fallacy classification. To elicit fallacy-related knowledge and reasoning abilities of LLMs, we propose diverse single-round and multi-round prompting schemes, applying different task-specific instructions such as extraction, summarization, and Chain-of-Thought reasoning. With comprehensive experiments on benchmark datasets, we suggest that LLMs could be potential zero-shot fallacy classifiers. In general, LLMs under single-round prompting schemes have achieved acceptable zero-shot performance compared to the best full-shot baselines and can outperform them in all OOD inference scenarios and some open-domain tasks. Our novel multi-round prompting schemes can effectively bring about more improvements, especially for small LLMs. Our analysis further underlines future research on zero-shot fallacy classification.
Codes and data are available at: https://github.com/panFJCharlotte98/Fallacy_Detection. @@ -11088,7 +11088,7 @@ YanzhengXiang HanqiYan LinGuiKing’s College London, University of London - YulanHeKing’s College London, University of London + YulanHeKing’s College London, University of London 14365-14378 Understanding in-context learning (ICL) capability that enables large language models (LLMs) to excel in proficiency through demonstration examples is of utmost importance. This importance stems not only from the better utilization of this capability across various tasks, but also from the proactive identification and mitigation of potential risks, including concerns regarding truthfulness, bias, and toxicity, that may arise alongside the capability. In this paper, we present a thorough survey on the interpretation and analysis of in-context learning. First, we provide a concise introduction to the background and definition of in-context learning. Then, we give an overview of advancements from two perspectives: 1) a theoretical perspective, emphasizing studies on mechanistic interpretability and delving into the mathematical foundations behind ICL; and 2) an empirical perspective, concerning studies that empirically analyze factors associated with ICL. We conclude by discussing open questions and the challenges encountered, and suggesting potential avenues for future research. We believe that our work establishes the basis for further exploration into the interpretation of in-context learning. To aid this effort, we have created a repository containing resources that will be continually updated. 2024.emnlp-main.795 @@ -11097,12 +11097,12 @@ More <fixed-case>DWUG</fixed-case>s: Extending and Evaluating Word Usage Graph Datasets in Multiple Languages - DominikSchlechtwegInstitute for Natural Language Processing, University of Stuttgart - PierluigiCassottiGöteborg University - BillNobleGöteborg University and University of Gothenburg + DominikSchlechtwegInstitute for Natural Language Processing, University of Stuttgart + PierluigiCassottiGöteborg University + BillNobleGöteborg University and University of Gothenburg DavidAlfterGöteborg University - SabineSchulte Im WaldeUniversity of Stuttgart - NinaTahmasebiGöteborg University + SabineSchulte Im WaldeUniversity of Stuttgart + NinaTahmasebiGöteborg University 14379-14393 Word Usage Graphs (WUGs) represent human semantic proximity judgments for pairs of word uses in a weighted graph, which can be clustered to infer word sense clusters from simple pairwise word use judgments, avoiding the need for word sense definitions. SemEval-2020 Task 1 provided the first and to date largest manually annotated, diachronic WUG dataset. In this paper, we check the robustness and correctness of the annotations by continuing the SemEval annotation algorithm for two more rounds and comparing against an established annotation paradigm. Further, we test the reproducibility by resampling a new, smaller set of word uses from the SemEval source corpora and annotating them. Our work contributes to a better understanding of the problems and opportunities of the WUG annotation paradigm and points to future improvements. 
2024.emnlp-main.796 @@ -11116,7 +11116,7 @@ ChenxinLiThe Chinese University of Hong Kong LiuzhuozhengLi NieLin - MasashiSugiyamaRIKEN and The University of Tokyo + MasashiSugiyamaRIKEN and The University of Tokyo 14394-14410 Recent advances in fine-tuning Vision-Language Models (VLMs) have witnessed the success of prompt tuning and adapter tuning, while the classic model fine-tuning on inherent parameters seems to be overlooked. It is believed that fine-tuning the parameters of VLMs with few-shot samples corrupts the pre-trained knowledge since fine-tuning the CLIP model even degrades performance. In this paper, we revisit this viewpoint, and propose a new perspective: fine-tuning the specific parameters instead of all will uncover the power of classic model fine-tuning on VLMs. Through our meticulous study, we propose ClipFit, a simple yet effective method to fine-tune CLIP without introducing any overhead of extra parameters. We demonstrate that by only fine-tuning the specific bias terms and normalization layers, ClipFit can improve the performance of zero-shot CLIP by 7.27% average harmonic mean accuracy. Lastly, to understand how fine-tuning in CLIPFit affects the pre-trained models, we conducted extensive experimental analyses w.r.t. changes in internal parameters and representations. We found that low-level text bias layers and the first layer normalization layer change much more than other layers. The code will be released. 2024.emnlp-main.797 @@ -11127,7 +11127,7 @@ <fixed-case>ECIS</fixed-case>-<fixed-case>VQG</fixed-case>: Generation of Entity-centric Information-seeking Questions from Videos ArpanPhukan ManishGuptaMicrosoft - AsifEkbalIndian Institute of Technology, Jodhpur + AsifEkbalIndian Institute of Technology, Jodhpur 14411-14436 Previous studies on question generation from videos have mostly focused on generating questions about common objects and attributes and hence are not entity-centric. In this work, we focus on the generation of entity-centric information-seeking questions from videos. Such a system could be useful for video-based learning, recommending “People Also Ask” questions, video-based chatbots, and fact-checking. Our work addresses three key challenges: identifying question-worthy information, linking it to entities, and effectively utilizing multimodal signals. Further, to the best of our knowledge, there does not exist a large-scale dataset for this task. Most video question generation datasets are on TV shows, movies, or human activities or lack entity-centric information-seeking questions. Hence, we contribute a diverse dataset of YouTube videos, VideoQuestions, consisting of 411 videos with 2265 manually annotated questions. We further propose a model architecture combining Transformers, rich context signals (titles, transcripts, captions, embeddings), and a combination of cross-entropy and contrastive loss function to encourage entity-centric question generation. Our best method yields BLEU, ROUGE, CIDEr, and METEOR scores of 71.3, 78.6, 7.31, and 81.9, respectively, demonstrating practical usability. We make the code and dataset publicly available. 
2024.emnlp-main.798
@@ -11137,8 +11137,8 @@
Distractor Generation in Multiple-Choice Tasks: A Survey of Methods, Datasets, and Evaluation
ElafAlhazmi
- Quan Z.ShengMacquarie University
- Wei EmmaZhangThe University of Adelaide
+ Quan Z.ShengMacquarie University
+ Wei EmmaZhangThe University of Adelaide
MunazzaZaib
AhoudAlhazmiUmm Al-Qura University
14437-14458
@@ -11150,7 +11150,7 @@
Evaluating <tex-math>n</tex-math>-Gram Novelty of Language Models Using Rusty-<fixed-case>DAWG</fixed-case>
WilliamMerrillNew York University
- Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
+ Noah A.SmithUniversity of Washington and Allen Institute for Artificial Intelligence
YanaiElazarAllen Institute for Artificial Intelligence and Department of Computer Science
14459-14473
How novel are texts generated by language models (LMs) relative to their training corpora? In this work, we investigate the extent to which modern LMs generate n-grams from their training data, evaluating both (i) the probability LMs assign to complete training n-grams and (ii) n-novelty, the proportion of n-grams generated by an LM that did not appear in the training data (for arbitrarily large n). To enable arbitrary-length n-gram search over a corpus in constant time w.r.t. corpus size, we develop Rusty-DAWG, a novel search tool inspired by indexing of genomic data. We compare the novelty of LM-generated text to human-written text and explore factors that affect generation novelty, focusing on the Pythia models. We find that, for n > 4, LM-generated text is less novel than human-written text, though it is more novel for smaller n. Larger LMs and more constrained decoding strategies both decrease novelty. Finally, we show that LMs complete n-grams with lower loss if they are more frequent in the training data. Overall, our results reveal factors influencing the novelty of LM-generated text, and we release Rusty-DAWG to facilitate further pretraining data research.
@@ -11160,13 +11160,13 @@
<fixed-case>ASL</fixed-case> <fixed-case>STEM</fixed-case> <fixed-case>W</fixed-case>iki: Dataset and Benchmark for Interpreting <fixed-case>STEM</fixed-case> Articles
KayoYinUniversity of California, Berkeley
ChinmaySinghMicrosoft
- Fyodor OMinakov
+ Fyodor OMinakov
VanessaMilan
HalDaumé IiiUniversity of Maryland - College Park, University of Maryland, College Park and Microsoft
CyrilZhangMicrosoft
- Alex XijieLuMicrosoft Research
+ Alex XijieLuMicrosoft Research
DanielleBraggMicrosoft Research
14474-14490
Deaf and hard-of-hearing (DHH) students face significant barriers in accessing science, technology, engineering, and mathematics (STEM) education, notably due to the scarcity of STEM resources in signed languages. To help address this, we introduce ASL STEM Wiki: a parallel corpus of 254 Wikipedia articles on STEM topics in English, interpreted into over 300 hours of American Sign Language (ASL). ASL STEM Wiki is the first continuous signing dataset focused on STEM, facilitating the development of AI resources for STEM education in ASL. We identify several use cases of ASL STEM Wiki with human-centered applications. For example, because this dataset highlights the frequent use of fingerspelling for technical concepts, which inhibits DHH students’ ability to learn, we develop models to identify fingerspelled words—which can later be used to query for appropriate ASL signs to suggest to interpreters.
@@ -11178,7 +11178,7 @@ Can Automatic Metrics Assess High-Quality Translations? SwetaAgrawalInstituto de Telecomunicações AntónioFarinhasInstituto Superior Técnico - RicardoReiUnbabel + RicardoReiUnbabel AndreMartinsInstituto Superior Técnico and Unbabel 14491-14502 Automatic metrics for evaluating translation quality are typically validated by measuring how well they correlate with human assessments. However, correlation methods tend to capture only the ability of metrics to differentiate between good and bad source-translation pairs, overlooking their reliability in distinguishing alternative translations for the same source. In this paper, we confirm that this is indeed the case by showing that current metrics are insensitive to nuanced differences in translation quality. This effect is most pronounced when the quality is high and the variance among alternatives is low. Given this finding, we shift towards detecting high-quality correct translations, an important problem in practical decision-making scenarios where a binary check of correctness is prioritized over a nuanced evaluation of quality. Using the MQM framework as the gold standard, we systematically stress-test the ability of current metrics to identify translations with no errors as marked by humans. Our findings reveal that current metrics often over or underestimate translation quality, indicating significant room for improvement in machine translation evaluation. @@ -11189,8 +11189,8 @@ Modeling User Preferences with Automatic Metrics: Creating a High-Quality Preference Dataset for Machine Translation SwetaAgrawalInstituto de Telecomunicações - José G. C.De SouzaUnbabel - RicardoReiUnbabel + José G. C.De SouzaUnbabel + RicardoReiUnbabel AntónioFarinhasInstituto Superior Técnico GonçaloFariaInstituto de Telecomunicações, Portugal PatrickFernandes @@ -11218,13 +11218,13 @@ <fixed-case>K</fixed-case>now<fixed-case>T</fixed-case>uning: Knowledge-aware Fine-tuning for Large Language Models YougangLyu LingyongYanBaidu Inc. - ShuaiqiangWang + ShuaiqiangWang HaiboShi DaweiYinBaidu PengjieRenShandong University ZhuminChenShandong University Maartende RijkeUniversity of Amsterdam - ZhaochunRenLeiden University + ZhaochunRenLeiden University 14535-14556 Despite their success at many natural language processing (NLP) tasks, large language models still struggle to effectively leverage knowledge for knowledge-intensive tasks, manifesting limitations such as generating incomplete, non-factual, or illogical answers. These limitations stem from inadequate knowledge awareness of LLMs during vanilla fine-tuning. To address these problems, we propose a knowledge-aware fine-tuning (KnowTuning) method to improve fine-grained and coarse-grained knowledge awareness of LLMs. We devise a fine-grained knowledge augmentation stage to train LLMs to identify difficult fine-grained knowledge in answers. We also propose a coarse-grained knowledge comparison stage to train LLMs to distinguish between reliable and unreliable knowledge, in three aspects: completeness, factuality, and logicality. Extensive experiments on both generic and medical question answering (QA) datasets confirm the effectiveness of KnowTuning, through automatic and human evaluations, across various sizes of LLMs. We further verify that KnowTuning generates more facts with less factual error rate under fine-grained facts evaluation. 
2024.emnlp-main.805
@@ -11235,7 +11235,7 @@

 <fixed-case>S</fixed-case>ec<fixed-case>C</fixed-case>oder: Towards Generalizable and Robust Secure Code Generation
 BoyuZhang
- TianyuDuZhejiang University
+ TianyuDuZhejiang University
 JunkaiTong
 XuhongZhangZhejiang University
 KingsumChowZhejiang University
@@ -11252,10 +11252,10 @@

 Nash <fixed-case>C</fixed-case>o<fixed-case>T</fixed-case>: Multi-Path Inference with Preference Equilibrium
- ZiqiZhang
+ ZiqiZhang
 CunxiangWang
 XiaoXiong
- YueZhangWestlake University
+ YueZhangWestlake University
 DonglinWang
 14572-14587
 Chain of thought (CoT) is a reasoning framework that can enhance the performance of large language models (LLMs) on complex inference tasks. In particular, among various studies related to CoT, multi-path inference stands out as a simple yet effective improvement. However, there is no optimal setting for the number of inference paths. Therefore, we have to increase the number of inference paths to obtain better results, which in turn increases the inference cost. To address this limitation, we can utilize question-related role templates to guide LLMs into relevant roles, thereby increasing the possibility of correct inferences for each path and further reducing dependence on the number of inference paths while improving reasoning accuracy. However, placing LLMs into specific roles may reduce their reasoning diversity and performance on a few tasks where role dependence is low. To alleviate the excessive immersion of the LLM into a specific role, we propose Nash CoT, which constructs a competitive system on each path that balances generation from role-specific LLMs against generation from general LLMs, thereby ensuring both effective role adoption and diversity in LLM generation; this maintains the performance of multi-path inference while reducing the required number of inference paths. We evaluate Nash CoT across various inference tasks, including Arabic Reasoning, Commonsense Question Answering, and Symbolic Inference, achieving results that are comparable to or better than those of multi-path CoT with an equal number of inference paths.
@@ -11282,12 +11282,12 @@

 Small Agent Can Also Rock! Empowering Small Language Models as Hallucination Detector
 XiaoxueCheng
 JunyiLi
- XinZhaoRenmin University of China
+ XinZhaoRenmin University of China
 HongzhiZhangKuaishou Technology
 FuzhengZhang
 DiZhangKuaishou Technology
 KunGai
- Ji-RongWenRenmin University of China
+ Ji-RongWenRenmin University of China
 14600-14615
 Hallucination detection is a challenging task for large language models (LLMs), and existing studies heavily rely on powerful closed-source LLMs such as GPT-4. In this paper, we propose an autonomous LLM-based agent framework, called HaluAgent, which enables relatively smaller LLMs (e.g. Baichuan2-Chat 7B) to actively select suitable tools for detecting multiple hallucination types such as text, code, and mathematical expression. In HaluAgent, we integrate the LLM with a multi-functional toolbox and design a fine-grained three-stage detection framework along with a memory mechanism. To facilitate the effectiveness of HaluAgent, we leverage existing Chinese and English datasets to synthesize detection trajectories for fine-tuning, which endows HaluAgent with the capability for bilingual hallucination detection.
Extensive experiments demonstrate that, using only 2K samples for tuning LLMs, HaluAgent can perform hallucination detection on various types of tasks and datasets, achieving performance comparable to or even higher than GPT-4 without tool enhancements on both in-domain and out-of-domain datasets.
 2024.emnlp-main.809
@@ -11299,7 +11299,7 @@

 WeiLiUniversity of Science and Technology of China
 ZhenHuang
 XinmeiTianUniversity of Science and Technology of China
- LeLuAlibaba Group
+ LeLuAlibaba Group
 HouqiangLi
 XuShenAlibaba Group
 JiepingYeAlibaba Group
@@ -11313,7 +11313,7 @@

 <fixed-case>LLM</fixed-case> Task Interference: An Initial Study on the Impact of Task-Switch in Conversational History
 AkashGupta
 IvaxiShethCISPA, Saarland University, Saarland Informatics Campus
- VyasRaina
+ VyasRaina
 MarkGalesUniversity of Cambridge
 MarioFritzCISPA Helmholtz Center for Information Security and Saarland University
 14633-14652
@@ -11325,10 +11325,10 @@

 Social Bias Probing: Fairness Benchmarking for Language Models
- MartaMarchiori Manerba
- KarolinaStanczakMila - Quebec Artificial Intelligence Institute and McGill University, McGill University
- RiccardoGuidottiUniversity of Pisa
- IsabelleAugensteinUniversity of Copenhagen
+ MartaMarchiori Manerba
+ KarolinaStanczakMila - Quebec Artificial Intelligence Institute and McGill University, McGill University
+ RiccardoGuidottiUniversity of Pisa
+ IsabelleAugensteinUniversity of Copenhagen
 14653-14671
 While the impact of social biases in language models has been recognized, prior methods for bias evaluation have been limited to binary association tests on small datasets, limiting our understanding of bias complexities. This paper proposes a novel framework for probing language models for social biases by assessing disparate treatment, which involves treating individuals differently according to their affiliation with a sensitive demographic group. We curate SoFa, a large-scale benchmark designed to address the limitations of existing fairness collections. SoFa expands the analysis beyond the binary comparison of stereotypical versus anti-stereotypical identities to include a diverse range of identities and stereotypes. Comparing our methodology with existing benchmarks, we reveal that biases within language models are more nuanced than acknowledged, indicating a broader scope of encoded biases than previously recognized. Benchmarking LMs on SoFa, we expose how identities expressing different religions lead to the most pronounced disparate treatments across all models. Finally, our findings indicate that real-life adversities faced by various groups such as women and people with disabilities are mirrored in the behavior of these models.
 2024.emnlp-main.812
@@ -11344,7 +11344,7 @@

 PeixinCao
 KaixinMaTencent AI Lab
 JianLiTencent
- HongweiWangTencent AI Lab
+ HongweiWangTencent AI Lab
 DongYuTencent AI Lab
 14672-14685
 Retrieval-augmented language model (RALM) represents a significant advancement in mitigating factual hallucination by leveraging external knowledge sources. However, the reliability of the retrieved information is not always guaranteed, and the retrieval of irrelevant data can mislead the response generation. Moreover, standard RALMs frequently neglect their intrinsic knowledge due to the interference from retrieved information.
In instances where the retrieved information is irrelevant, RALMs should ideally utilize their intrinsic knowledge or, in the absence of both intrinsic and retrieved knowledge, opt to respond with “unknown” to avoid hallucination. In this paper, we introduce Chain-of-Note (CoN), a novel approach to improving the robustness of RALMs when facing noisy, irrelevant documents and when handling unknown scenarios. The core idea of CoN is to generate sequential reading notes for each retrieved document, enabling a thorough evaluation of their relevance to the given question and integrating this information to formulate the final answer. Our experimental results show that GPT-4, when equipped with CoN, outperforms the Chain-of-Thought approach. Besides, we utilized GPT-4 to create 10K CoN data, on which smaller models like OPT and LLaMa-2 were subsequently trained. Our experiments across four open-domain QA benchmarks show that fine-tuned RALMs equipped with CoN significantly outperform standard fine-tuned RALMs.
 2024.emnlp-main.813


 <fixed-case>D</fixed-case>yna<fixed-case>T</fixed-case>hink: Fast or Slow? A Dynamic Decision-Making Framework for Large Language Models
 JiabaoPan
 YanZhangTencent
- ChenZhangNational University of Singapore
- ZuozhuLiuZhejiang University
+ ChenZhangNational University of Singapore
+ ZuozhuLiuZhejiang University
 HongweiWangZhejiang University
- HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore
+ HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore
 14686-14695
 Large language models (LLMs) have demonstrated emergent capabilities across diverse reasoning tasks via popular Chains-of-Thought (COT) prompting. However, such a simple and fast COT approach often encounters limitations in dealing with complicated problems, while a thorough method, which considers multiple reasoning pathways and verifies each step carefully, results in slower inference. This paper addresses the challenge of enabling LLMs to autonomously select between fast and slow inference methods, thereby optimizing both efficiency and effectiveness. We introduce a dynamic decision-making framework that categorizes tasks into two distinct pathways: ‘Fast,’ designated for tasks where the LLM quickly identifies a high-confidence solution, and ‘Slow,’ allocated for tasks that the LLM perceives as complex and for which it has low confidence in immediate solutions and which require more reasoning paths to verify. Experiments on five popular reasoning benchmarks demonstrated the superiority of DynaThink over baselines. For example, when compared to a strong COT-with-self-consistency baseline on the complicated MATH dataset, DynaThink achieved a more than 3% increase in accuracy with lower cost. The code will be made available upon publication.
 2024.emnlp-main.814
@@ -11384,7 +11384,7 @@

 Italo Luis DaSilva
 HanqiYan
 LinGuiKing’s College London, University of London
- YulanHeKing’s College London, University of London
+ YulanHeKing’s College London, University of London
 14707-14719
 The inherent ambiguity of cause and effect boundaries poses a challenge in evaluating causal event extraction tasks. Traditional metrics like Exact Match and BertScore poorly reflect model performance, so we trained evaluation models to approximate human evaluation, achieving high agreement.
We used them to perform Reinforcement Learning with extraction models to align them with human preference, prioritising semantic understanding. We successfully explored our approach through multiple datasets, including transferring an evaluator trained on one dataset to another as a way to decrease the reliance on human-annotated data. In that vein, we also propose a weak-to-strong supervision method that uses a fraction of the annotated data to train an evaluation model while still achieving high performance in training an RL model. 2024.emnlp-main.816 @@ -11400,9 +11400,9 @@ ZhenwenLiang WenhaoYuTencent AI Lab DianYuTencent AI Lab - MengzhaoJiaUniversity of Notre Dame + MengzhaoJiaUniversity of Notre Dame DongYuTencent AI Lab - MengJiangUniversity of Notre Dame + MengJiangUniversity of Notre Dame 14720-14738 Supervised fine-tuning enhances the problem-solving abilities of language models across various mathematical reasoning tasks. To maximize such benefits, existing research focuses on *broadening* the training set with various data augmentation techniques, which is effective for standard single-round question-answering settings. Our work introduces a novel technique aimed at cultivating a *deeper* understanding of the training problems at hand, enhancing performance not only in standard settings but also in more complex scenarios that require reflective thinking. Specifically, we propose **reflective augmentation**, a method that embeds problem reflection into each training instance. It trains the model to consider alternative perspectives and engage with abstractions and analogies, thereby fostering a thorough comprehension through reflective reasoning. Extensive experiments validate the achievement of our aim, underscoring the unique advantages of our method and its complementary nature relative to existing augmentation techniques. 2024.emnlp-main.817 @@ -11443,9 +11443,9 @@ <fixed-case>B</fixed-case>ias<fixed-case>A</fixed-case>lert: A Plug-and-play Tool for Social Bias Detection in <fixed-case>LLM</fixed-case>s ZhitingFan - RuizheChen + RuizheChen RuilingXu - ZuozhuLiuZhejiang University + ZuozhuLiuZhejiang University 14778-14790 Evaluating the bias of LLMs becomes more crucial with their rapid development. However, existing evaluation approaches rely on fixed-form outputs and cannot adapt to the flexible open-text generation scenarios of LLMs (e.g., sentence completion and question answering). To address this, we introduce BiasAlert, a plug-and-play tool designed to detect social bias in open-text generations of LLMs. BiasAlert integrates external human knowledge with its inherent reasoning capabilities to detect bias reliably. Extensive experiments demonstrate that BiasAlert significantly outperforms existing state-of-the-art methods like GPT-4-as-Judge in detecting bias. Furthermore, through application studies, we showcase the utility of BiasAlert in reliable LLM fairness evaluation and bias mitigation across various scenarios. Model and code will be publicly released. 
2024.emnlp-main.820
@@ -11457,7 +11457,7 @@

 JiliangHu
 ZuchaoLi
 PingWangWuhan University
- HaojunAiWuhan University
+ HaojunAiWuhan University
 LefeiZhangWuhan University
 HaiZhaoShanghai Jiao Tong University
 14791-14804
@@ -11472,7 +11472,7 @@

 JosefValvoda
 TianyuLiuETHZ - ETH Zurich
 AnejSvete
- YanxiaQinSingapore University of Technology and Design
+ YanxiaQinSingapore University of Technology and Design
 Min-YenKanNational University of Singapore
 RyanCotterellSwiss Federal Institute of Technology
 14805-14829
@@ -11483,11 +11483,11 @@

 Bridging Local Details and Global Context in Text-Attributed Graphs
 YaokeWang
- YunZhu
+ YunZhu
 WenqiaoZhang
 YuetingZhuang
 LiyunfeiLiyunfei
- SiliangTangZhejiang University
+ SiliangTangZhejiang University
 14830-14841
 Representation learning on text-attributed graphs (TAGs) is vital for real-world applications, as they combine semantic textual and contextual structural information. Research in this field generally consists of two main perspectives: local-level encoding and global-level aggregating, which respectively refer to textual node information unification (e.g., using Language Models) and structure-augmented modeling (e.g., using Graph Neural Networks). Most existing works focus on combining different information levels but overlook the interconnections, i.e., the contextual textual information among nodes, which provides semantic insights to bridge local and global levels. In this paper, we propose GraphBridge, a multi-granularity integration framework that bridges local and global perspectives by leveraging contextual textual information, enhancing fine-grained understanding of TAGs. Besides, to tackle scalability and efficiency challenges, we introduce a graph-aware token reduction module. Extensive experiments across various models and datasets show that our method achieves state-of-the-art performance, while our graph-aware token reduction module significantly enhances efficiency and solves scalability issues. Codes are available at https://github.com/wykk00/GraphBridge.
 2024.emnlp-main.823
@@ -11497,9 +11497,9 @@

 Building Resources for Emakhuwa: Machine Translation and News Classification Benchmarks
- Felermino D. M. A.Ali
- HenriqueLopes CardosoUniversidade do Porto
- RuiSousa-Silva
+ Felermino D. M. A.Ali
+ HenriqueLopes CardosoUniversidade do Porto
+ RuiSousa-Silva
 14842-14857
 This paper introduces a comprehensive collection of NLP resources for Emakhuwa, Mozambique’s most widely spoken language. The resources include the first manually translated news bitext corpus between Portuguese and Emakhuwa, news topic classification datasets, and monolingual data. We detail the process and challenges of acquiring this data and present benchmark results for machine translation and news topic classification tasks. Our evaluation examines the impact of different data types—originally clean text, post-corrected OCR, and back-translated data—and the effects of fine-tuning from pre-trained models, including those focused on African languages. Our benchmarks demonstrate good performance in news topic classification and promising results in machine translation. We fine-tuned multilingual encoder-decoder models using real and synthetic data and evaluated them on our test set and the FLORES evaluation sets. The results highlight the importance of incorporating more data and the potential for future improvements. All models, code, and datasets are available in the https://huggingface.co/LIACC repository under the CC BY 4.0 license.
2024.emnlp-main.824
@@ -11533,7 +11533,7 @@

 A Closer Look at Multidimensional Online Political Incivility
 SagiPendzel
 NirLotan
- AlonZoiznerUniversity of Haifa
+ AlonZoiznerUniversity of Haifa
 EinatMinkovUniversity of Haifa
 14881-14896
 Toxic online political discourse has become prevalent, and scholars debate its impact on democratic processes. This work presents a large-scale study of political incivility on Twitter. In line with theories of political communication, we differentiate between harsh ‘impolite’ style and intolerant substance. We present a dataset of 13K political tweets in the U.S. context, which we collected and labeled by those categories using crowd sourcing. Our dataset and results shed light on hostile political discourse focused on partisan conflicts in the U.S. The evaluation of state-of-the-art classifiers illustrates the challenges involved in political incivility detection, which often requires high-level semantic and social understanding. Nevertheless, performing incivility detection at scale, we are able to characterise its distribution across individual users and geopolitical regions, where our findings align with and extend existing theories of political communication. In particular, we find that roughly 80% of the uncivil tweets are authored by 20% of the users, where users who are politically engaged are more inclined to use uncivil language. We further find that political incivility exhibits network homophily, and that incivility is more prominent in highly competitive geopolitical regions. Our results apply to both uncivil style and substance.
@@ -11546,7 +11546,7 @@

 ZetongLi
 QinliangSuSUN YAT-SEN UNIVERSITY
 ShijingSi
- JianxingYuSUN YAT-SEN UNIVERSITY
+ JianxingYuSUN YAT-SEN UNIVERSITY
 14897-14913
 BERT and TFIDF features excel in capturing rich semantics and important words, respectively. Since most existing clustering methods are solely based on the BERT model, they often fall short in utilizing keyword information, which, however, is very useful in clustering short texts. In this paper, we propose a **CO**-**T**raining **C**lustering (**COTC**) framework to make use of the collective strengths of BERT and TFIDF features. Specifically, we develop two modules responsible for the clustering of BERT and TFIDF features, respectively. We use the deep representations and cluster assignments from the TFIDF module outputs to guide the learning of the BERT module, seeking to align them at both the representation and cluster levels. Reversely, we also use the BERT module outputs to train the TFIDF module, thus leading to mutual promotion. We then show that the alternating co-training framework can be placed under a unified joint training objective, which allows the two modules to be connected tightly and the training signals to be propagated efficiently. Experiments on eight benchmark datasets show that our method outperforms current SOTA methods significantly.
2024.emnlp-main.828
@@ -11579,10 +11579,10 @@

 <fixed-case>S</fixed-case>parse<fixed-case>G</fixed-case>rad: A Selective Method for Efficient Fine-tuning of <fixed-case>MLP</fixed-case> Layers
- Viktoriia A.Chekalina
+ Viktoriia A.Chekalina
 AnnaRudenko
 GlebMezentsevSkolkovo Institute of Science and Technology
- AleksandrMikhalevSkolkovo Institute of Science and Technology
+ AleksandrMikhalevSkolkovo Institute of Science and Technology
 AlexanderPanchenkoSkoltech
 IvanOseledetsArtificial Intelligence Research Institute and Skolkovo Institute of Science and Technology
 14929-14939
@@ -11593,9 +11593,9 @@

 <fixed-case>M</fixed-case>o<fixed-case>C</fixed-case>o<fixed-case>KGC</fixed-case>: Momentum Contrast Entity Encoding for Knowledge Graph Completion
- QingyangLi
+ QingyangLi
 YanruZhongGuilin University Of Electronic Technology
- YuchuQinUniversity of Huddersfield
+ YuchuQinUniversity of Huddersfield
 14940-14952
 In recent years, numerous studies have sought to enhance the capabilities of pretrained language models (PLMs) for Knowledge Graph Completion (KGC) tasks by integrating structural information from knowledge graphs. However, existing approaches have not effectively combined the structural attributes of knowledge graphs with the textual descriptions of entities to generate robust entity encodings. To address this issue, this paper proposes MoCoKGC (Momentum Contrast Entity Encoding for Knowledge Graph Completion), which incorporates three primary encoders: the entity-relation encoder, the entity encoder, and the momentum entity encoder. Momentum contrastive learning not only provides more negative samples but also allows for the gradual updating of entity encodings. Consequently, we reintroduce the generated entity encodings into the encoder to incorporate the graph’s structural information. Additionally, MoCoKGC enhances the inferential capabilities of the entity-relation encoder through deep prompts of relations. On the standard evaluation metric, Mean Reciprocal Rank (MRR), the MoCoKGC model demonstrates superior performance, achieving a 7.1% improvement on the WN18RR dataset and an 11% improvement on the Wikidata5M dataset, while also surpassing the current best model on the FB15k-237 dataset. Through a series of experiments, this paper thoroughly examines the role and contribution of each component and parameter of the model.
 2024.emnlp-main.832
@@ -11622,12 +11622,12 @@

 Shortcuts Arising from Contrast: Towards Effective and Lightweight Clean-Label Attacks in Prompt-Based Learning
 XiaopengXie
- MingYan
+ MingYan
 XiwenZhou
 ChenlongZhao
 SuliWang
 YongZhangBeijing University of Posts and Telecommunications
- Joey TianyiZhouA*STAR Centre for Frontier AI Research
+ Joey TianyiZhouA*STAR Centre for Frontier AI Research
 14966-14977
 The prompt-based learning paradigm has been shown to be vulnerable to backdoor attacks. Current clean-label attacks, which employ a specific prompt as the trigger, can succeed without external triggers while ensuring correct labeling of poisoned samples, and are thus stealthier than poisoned-label attacks; on the other hand, they face significant issues with false activations and pose greater challenges, necessitating a higher rate of poisoning. Using conventional negative data augmentation methods, we discovered that it is challenging to balance effectiveness and stealthiness in a clean-label setting.
In addressing this issue, we are inspired by the notion that a backdoor acts as a shortcut, and posit that this shortcut stems from the contrast between the trigger and the data utilized for poisoning. In this study, we propose a method named Contrastive Shortcut Injection (CSI), by leveraging activation values, integrates trigger design and data selection strategies to craft stronger shortcut features. With extensive experiments on full-shot and few-shot text classification tasks, we empirically validate CSI’s high effectiveness and high stealthiness at low poisoning rates. 2024.emnlp-main.834 @@ -11652,7 +11652,7 @@ ChaoyiWuShanghai Jiaotong University XiaomanZhangHarvard Medical School, Harvard University YaZhangShanghai Jiao Tong University - YanfengWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University WeidiXieShanghai Jiaotong University 15004-15019 This paper introduces a novel, entity-aware metric, termed as Radiological Report (Text) Evaluation (RaTEScore), to assess the quality of medical reports generated by AI models. RaTEScore emphasizes crucial medical entities such as diagnostic outcomes and anatomical details, and is robust against complex medical synonyms and sensitive to negation expressions. Technically, we developed a comprehensive medical NER dataset, RaTE-NER, and trained an NER model specifically for this purpose. This model enables the decomposition of complex radiological reports into constituent medical entities. The metric itself is derived by comparing the similarity of entity embeddings, obtained from a language model, based on their types and relevance to clinical significance. Our evaluations demonstrate that RaTEScore aligns more closely with human preference than existing metrics, validated both on established public benchmarks and our newly proposed RaTE-Eval benchmark. @@ -11705,7 +11705,7 @@ AlexisChevalier TanyaGoyalCornell University DanqiChenDepartment of Computer Science, Princeton University - TianyuGao + TianyuGao 15068-15083 Literature search questions, such as “where can I find research on the evaluation of consistency in generated summaries?” pose significant challenges for modern search engines and retrieval systems. These questions often require a deep understanding of research concepts and the ability to reason over entire articles. In this work, we introduce LitSearch, a retrieval benchmark comprising 597 realistic literature search queries about recent ML and NLP papers. LitSearch is constructed using a combination of (1) questions generated by GPT-4 based on paragraphs containing inline citations from research papers and (2) questions about recently published papers, manually written by their authors. All LitSearch questions were manually examined or edited by experts to ensure high quality. We extensively benchmark state-of-the-art retrieval models and also evaluate two LLM-based reranking pipelines. We find a significant performance gap between BM25 and state-of-the-art dense retrievers, with a 24.8% difference in absolute recall@5. The LLM-based reranking strategies further improve the best-performing dense retriever by 4.4%. Additionally, commercial search engines and research tools like Google Search perform poorly on LitSearch, lagging behind the best dense retriever by 32 points. Taken together, these results show that LitSearch is an informative new testbed for retrieval systems while catering to a real-world use case. 
2024.emnlp-main.840
@@ -11715,10 +11715,10 @@

 Open-world Multi-label Text Classification with Extremely Weak Supervision
 XintongLi
- JinyaJiang
+ JinyaJiang
 RiaDharmani
 JayanthSrinivasa
- GaowenLiu
+ GaowenLiu
 JingboShangUniversity of California, San Diego
 15084-15096
 We study open-world multi-label text classification under extremely weak supervision (XWS), where the user only provides a brief description for classification objectives without any labels or ground-truth label space. Similar single-label XWS settings have been explored recently; however, these methods cannot be easily adapted for multi-label. We observe that (1) most documents have a dominant class covering the majority of content and (2) long-tail labels would appear in some documents as a dominant class. Therefore, we first utilize the user description to prompt a large language model (LLM) for dominant keyphrases of a subset of raw documents, and then construct an (initial) label space via clustering. We further apply a zero-shot multi-label classifier to locate the documents with small top predicted scores, so we can revisit their dominant keyphrases for more long-tail labels. We iterate this process to discover a comprehensive label space and construct a multi-label classifier as a novel method, X-MLClass. X-MLClass exhibits a remarkable increase in ground-truth label space coverage on various datasets, for example, a 40% improvement on the AAPD dataset over topic modeling and keyword extraction methods. Moreover, X-MLClass achieves the best end-to-end multi-label classification accuracy.
@@ -11728,10 +11728,10 @@

 <fixed-case>LLM</fixed-case>s learn governing principles of dynamical systems, revealing an in-context neural scaling law
- Toni J.b.Liu
- NicolasBoulleImperial College London
- RaphaëlSarfati
- ChristopherEarlsCornell University
+ Toni J.b.Liu
+ NicolasBoulleImperial College London
+ RaphaëlSarfati
+ ChristopherEarlsCornell University
 15097-15117
 We study LLMs’ ability to extrapolate the behavior of various dynamical systems, including stochastic, chaotic, continuous, and discrete systems, whose evolution is governed by principles of physical interest. Our results show that LLaMA-2, a language model trained on text, achieves accurate predictions of dynamical system time series without fine-tuning or prompt engineering. Moreover, the accuracy of the learned physical rules increases with the length of the input context window, revealing an in-context version of a neural scaling law. Along the way, we present a flexible and efficient algorithm for extracting probability density functions of multi-digit numbers directly from LLMs.
 2024.emnlp-main.842
@@ -11757,7 +11757,7 @@

 AkariAsaiPaul G. Allen School of Computer Science & Engineering, University of Washington
 NiloofarMireshghallahUniversity of Washington
 SewonMinUniversity of California, Berkeley and Allen Institute for Artificial Intelligence
- JamesGrimmelmannCornell University
+ JamesGrimmelmannCornell University
 YejinChoiDepartment of Computer Science, University of Washington
 HannanehHajishirziUniversity of Washington, University of Washington, Allen Institute for Artificial Intelligence and University of Washington, Seattle
 LukeZettlemoyerUniversity of Washington, Facebook and Meta
@@ -11771,7 +11771,7 @@

 Dense <fixed-case>X</fixed-case> Retrieval: What Retrieval Granularity Should We Use?
TongChen - HongweiWangTencent AI Lab + HongweiWangTencent AI Lab SihaoChen WenhaoYuTencent AI Lab KaixinMaTencent AI Lab @@ -11792,7 +11792,7 @@ AzureZhouStanford University JiaaoChen WeiyanShi - WeiWangUniversity of California, Los Angeles + WeiWangUniversity of California, Los Angeles DiyiYangStanford University 15178-15194 Susceptibility to misinformation describes the degree of belief in unverifiable claims, a latent aspect of individuals’ mental processes that is not observable. Existing susceptibility studies heavily rely on self-reported beliefs, which can be subject to bias, expensive to collect, and challenging to scale for downstream applications. To address these limitations, in this work, we propose a computational approach to efficiently model users’ latent susceptibility levels. As shown in previous work, susceptibility is influenced by various factors (e.g., demographic factors, political ideology), and directly influences people’s reposting behavior on social media. To represent the underlying mental process, our susceptibility modeling incorporates these factors as inputs, guided by the supervision of people’s sharing behavior. Using COVID-19 as a testbed, our experiments demonstrate a significant alignment between the susceptibility scores estimated by our computational modeling and human judgments, confirming the effectiveness of this latent modeling approach. Furthermore, we apply our model to annotate susceptibility scores on a large-scale dataset and analyze the relationships between susceptibility with various factors. Our analysis reveals that political leanings and other psychological factors exhibit varying degrees of association with susceptibility to COVID-19 misinformation, and shows that susceptibility is unevenly distributed across different professional and geographical backgrounds. @@ -11803,7 +11803,7 @@ Layer by Layer: Uncovering Where Multi-Task Learning Happens in Instruction-Tuned Large Language Models ZhengZhaoUniversity of Edinburgh, University of Edinburgh - YftahZiserNVIDIA + YftahZiserNVIDIA Shay BCohenUniversity of Edinburgh 15195-15214 Fine-tuning pre-trained large language models (LLMs) on a diverse array of tasks has become a common approach for building models that can solve various natural language processing (NLP) tasks. However, where and to what extent these models retain task-specific knowledge remains largely unexplored. This study investigates the task-specific information encoded in pre-trained LLMs and the effects of instruction tuning on their representations across a diverse set of over 60 NLP tasks. We use a set of matrix analysis tools to examine the differences between the way pre-trained and instruction-tuned LLMs store task-specific information. Our findings reveal that while some tasks are already encoded within the pre-trained LLMs, others greatly benefit from instruction tuning. Additionally, we pinpointed the layers in which the model transitions from high-level general representations to more task-oriented representations. This finding extends our understanding of the governing mechanisms of LLMs and facilitates future research in the fields of parameter-efficient transfer learning and multi-task learning. 
Our code is available at: https://github.com/zsquaredz/layer_by_layer/
@@ -11837,7 +11837,7 @@

 Control Large Language Models via Divide and Conquer
- BingxuanLi
+ BingxuanLi
 YiweiWangUniversity of California, Merced
 TaoMeng
 Kai-WeiChangUniversity of California, Los Angeles
@@ -11850,13 +11850,13 @@

 Joint Pre-Encoding Representation and Structure Embedding for Efficient and Low-Resource Knowledge Graph Completion
- ChenyuQiu
- PengjiangQianJiangnan University
- ChuangWang
- JianYao
- LiLiuJiangnan University
+ ChenyuQiu
+ PengjiangQianJiangnan University
+ ChuangWang
+ JianYao
+ LiLiuJiangnan University
 FangWeiJiangnan University
- Eddie Y.k.Eddie
+ Eddie Y.k.Eddie
 15257-15269
 Knowledge graph completion (KGC) aims to infer missing or incomplete parts in a knowledge graph. The existing models are generally divided into structure-based and description-based models, among which description-based models often require longer training and inference times as well as increased memory usage. In this paper, we propose the Pre-Encoded Masked Language Model (PEMLM) to efficiently solve the KGC problem. By encoding textual descriptions into semantic representations before training, the necessary resources are significantly reduced. Furthermore, we introduce a straightforward but effective fusion framework to integrate structural embedding with pre-encoded semantic description, which enhances the model’s prediction performance on 1-N relations. The experimental results demonstrate that our proposed strategy attains state-of-the-art performance on the WN18RR (MRR +5.4% and Hits@1 +6.4%) and UMLS datasets. Compared to existing models, we have increased inference speed by 30x and reduced training memory by approximately 60%.
 2024.emnlp-main.851
@@ -11871,7 +11871,7 @@

 BinghaiWang
 SenjieJin
 CaishuangHuang
- JunjieYe
+ JunjieYe
 ZhihaoZhang
 YuhaoZhou
 ZhihengXi
@@ -11886,7 +11886,7 @@

 <fixed-case>R</fixed-case>o<fixed-case>CEL</fixed-case>: Advancing Table Entity Linking through Distinctive Row and Column Contexts
- YuanzhengWangInstitute of Computing Technology, Chinese Academy of Sciences and University of the Chinese Academy of Sciences
+ YuanzhengWangInstitute of Computing Technology, Chinese Academy of Sciences and University of the Chinese Academy of Sciences
 YixingFan
 JiafengGuoInstitute of Computing Technology, Chinese Academy of Sciences
 RuqingZhang
@@ -11902,7 +11902,7 @@

 Zi’ouZhengQueen’s University
 ChristopherMalonNEC Laboratories America
 Martin RenqiangMinNEC Laboratories America
- XiaodanZhuQueen’s University
+ XiaodanZhuQueen’s University
 15299-15312
 When performing complex multi-step reasoning tasks, the ability of Large Language Models (LLMs) to derive structured intermediate proof steps is important for ensuring that the models truly perform the desired reasoning and for improving models’ explainability. This paper is centred around a focused study: whether the current state-of-the-art generalist LLMs can leverage the structures in a few examples to better construct the proof structures with in-context learning. Our study specifically focuses on structure-aware demonstration and structure-aware pruning. We demonstrate that they both help improve performance. A detailed analysis is provided to help understand the results.
2024.emnlp-main.854
@@ -11915,7 +11915,7 @@

 PeeratLimkonchotiwatAI Singapore
 PotsaweeManakulSCB 10X
 CanUdomcharoenchaikitVidyasirimedhi Institute of Science and Technology (VISTEC)
- EkapolChuangsuwanichChulalongkorn University
+ EkapolChuangsuwanichChulalongkorn University
 SaranaNutanong
 15313-15321
 Entity disambiguation (ED) is crucial in natural language processing (NLP) for tasks such as question-answering and information extraction. A major challenge in ED is handling overshadowed entities—uncommon entities sharing mention surfaces with common entities. The current approach to enhance performance on these entities involves reasoning over facts in a knowledge base (KB), increasing computational overhead during inference. We argue that the ED performance on overshadowed entities can be enhanced during training by addressing shortcut learning, which does not add computational overhead at inference. We propose a simple yet effective debiasing technique to prevent models from shortcut learning during training. Experiments on a range of ED datasets show that our method achieves state-of-the-art performance without compromising inference speed. Our findings suggest a new research direction for improving entity disambiguation via shortcut learning mitigation.
@@ -11925,14 +11925,14 @@

 <fixed-case>A</fixed-case>pp<fixed-case>B</fixed-case>ench: Planning of Multiple <fixed-case>API</fixed-case>s from Various <fixed-case>APP</fixed-case>s for Complex User Instruction
- HongruWangThe Chinese University of Hong Kong
+ HongruWangThe Chinese University of Hong Kong
 RuiWang
 BoyangXue
- HemingXia
+ HemingXia
 JingtaoCao
- ZemingLiu
- Jeff Z.PanUniversity of Edinburgh, University of Edinburgh
- Kam-FaiWongThe Chinese University of Hong Kong
+ ZemingLiu
+ Jeff Z.PanUniversity of Edinburgh, University of Edinburgh
+ Kam-FaiWongThe Chinese University of Hong Kong
 15322-15336
 Large Language Models (LLMs) can interact with the real world by connecting with versatile external APIs, resulting in better problem-solving and task automation capabilities. Previous research primarily either focuses on APIs with limited arguments from a single source or overlooks the complex dependency relationship between different APIs. However, it is essential to utilize multiple APIs collaboratively from various sources, especially for complex user instructions. In this paper, we introduce AppBench, the first benchmark to evaluate LLMs’ ability to plan and execute multiple APIs from various sources in order to complete the user’s task. Specifically, we consider two significant challenges in multiple APIs: 1) graph structures: some APIs can be executed independently while others need to be executed one by one, resulting in graph-like execution order; and 2) permission constraints: which source is authorized to execute the API call. We have experimental results on 9 distinct LLMs; e.g., GPT-4o achieves only a 2.0% success rate at the most complex instruction, revealing that the existing state-of-the-art LLMs still cannot perform well in this situation even with the help of in-context learning and finetuning. Our code and data are publicly available at https://github.com/ruleGreen/AppBench.
2024.emnlp-main.856
@@ -11943,9 +11943,9 @@

 Not Everything is All You Need: Toward Low-Redundant Optimization for Large Language Model Alignment
 ZhipengChen
 KunZhouUniversity of California, San Diego
- XinZhaoRenmin University of China
- JingyuanWangBeijing University of Aeronautics and Astronautics
- Ji-RongWenRenmin University of China
+ XinZhaoRenmin University of China
+ JingyuanWangBeijing University of Aeronautics and Astronautics
+ Ji-RongWenRenmin University of China
 15337-15351
 Large language models (LLMs) still struggle to align with human preferences in complex tasks and scenarios. They are prone to overfitting to unexpected patterns or superficial styles in the training data. We conduct an empirical study that only selects the top-10% most updated parameters in LLMs for alignment training, and see improvements in the convergence process and final performance. This indicates the existence of redundant neurons in LLMs for alignment training. To reduce their influence, we propose a low-redundant alignment method named **ALLO**, focusing on optimizing the most related neurons with the most useful supervised signals. Concretely, we first identify the neurons that are related to the human preference data by a gradient-based strategy, then identify the alignment-related key tokens by reward models for computing loss. Besides, we also decompose the alignment process into the forgetting and learning stages, where we first forget the tokens with unaligned knowledge and then learn aligned knowledge, by updating different ratios of neurons, respectively. Experimental results on 10 datasets have shown the effectiveness of ALLO. Our code and data will be publicly released.
 2024.emnlp-main.857
@@ -11955,17 +11955,17 @@

 <fixed-case>A</fixed-case>udio<fixed-case>VSR</fixed-case>: Enhancing Video Speech Recognition with Audio Data
- XiaodaYang
- XizeCheng
- JiaqiDuanQingdao University
- HongshunQiu
+ XiaodaYang
+ XizeCheng
+ JiaqiDuanQingdao University
+ HongshunQiu
 MinjieHong
- MinghuiFang
- ShengpengJi
- JialongZuo
+ MinghuiFang
+ ShengpengJi
+ JialongZuo
 ZhiqingHong
- ZhimengZhangZhejiang University and Zhejiang University
+ ZhimengZhangZhejiang University and Zhejiang University
- TaoJin
+ TaoJin
 15352-15361
 Visual Speech Recognition (VSR) aims to predict spoken content by analyzing lip movements in videos. Recently reported state-of-the-art results in VSR often rely on increasingly large amounts of video data, while the publicly available transcribed video datasets are insufficient compared to the audio data. To further enhance the VSR model using the audio data, we employed a generative model for data inflation, integrating the synthetic data with the authentic visual data. Essentially, the generative model incorporates another insight, which enhances the capabilities of the recognition model. For the cross-language issue, previous work has shown poor performance with non-Indo-European languages. We trained a multi-language-family modal fusion model, AudioVSR. Leveraging the concept of modal transfer, we achieved significant results in downstream VSR tasks under conditions of data scarcity. To the best of our knowledge, AudioVSR represents the first work on cross-language-family audio-lip alignment, achieving a new SOTA in the cross-language scenario.
2024.emnlp-main.858 @@ -11987,11 +11987,11 @@ Ladder: A Model-Agnostic Framework Boosting <fixed-case>LLM</fixed-case>-based Machine Translation to the Next Level - ZhaopengFengZhejiang University - RuizheChen + ZhaopengFengZhejiang University + RuizheChen YanZhangTencent - ZijieMeng - ZuozhuLiuZhejiang University + ZijieMeng + ZuozhuLiuZhejiang University 15377-15393 General-purpose Large Language Models (LLMs) like GPT-4 have achieved remarkable advancements in machine translation (MT) by leveraging extensive web content. On the other hand, translation-specific LLMs are built by pre-training on domain-specific monolingual corpora and fine-tuning with human-annotated translation data. Despite the superior performance, these methods either demand an unprecedented scale of computing and data or substantial human editing and annotation efforts. In this paper, we develop MT-Ladder, a novel model-agnostic and cost-effective tool to refine the performance of general LLMs for MT. MT-Ladder is trained on pseudo-refinement triplets which can be easily obtained from existing LLMs without additional human cost. During training, we propose a hierarchical fine-tuning strategy with an easy-to-hard schema, improving MT-Ladder’s refining performance progressively. The trained MT-Ladder can be seamlessly integrated with any general-purpose LLMs to boost their translation performance. By utilizing Gemma-2B/7B as the backbone, MT-Ladder-2B can elevate raw translations to the level of top-tier open-source models (e.g., refining BigTranslate-13B with +6.91 BLEU and +3.52 COMET for XX→En), and MT-Ladder-7B can further enhance model performance to be on par with the state-of-the-art GPT-4. Extensive ablation and analysis corroborate the effectiveness of MT-Ladder in diverse settings. 2024.emnlp-main.860 @@ -12014,9 +12014,9 @@ Effective Synthetic Data and Test-Time Adaptation for <fixed-case>OCR</fixed-case> Correction ShuhaoGuan - ChengXu + ChengXu MouleLin - DerekGreeneUniversity College Dublin + DerekGreeneUniversity College Dublin 15412-15425 Post-OCR technology is used to correct errors in the text produced by OCR systems. This study introduces a method for constructing post-OCR synthetic data with different noise levels using weak supervision. We define Character Error Rate (CER) thresholds for “effective” and “ineffective” synthetic data, allowing us to create more useful multi-noise level synthetic datasets. Furthermore, we propose Self-Correct-Noise Test-Time Adaptation (SCN-TTA), which combines self-correction and noise generation mechanisms. SCN-TTA allows a model to dynamically adjust to test data without relying on labels, effectively handling proper nouns in long texts and further reducing CER. In our experiments we evaluate a range of models, including multiple PLMs and LLMs. Results indicate that our method yields models that are effective across diverse text types. 
Notably, the ByT5 model achieves a CER reduction of 68.67% without relying on manually annotated data.
 2024.emnlp-main.862


 <fixed-case>SRF</fixed-case>: Enhancing Document-Level Relation Extraction with a Novel Secondary Reasoning Framework
- FuZhangNortheastern University
- QiMiao
- JingweiChengNortheastern University, China
- HongsenYu
- YiYan
- XinLiNortheastern University
- YongxueWu
+ FuZhangNortheastern University
+ QiMiao
+ JingweiChengNortheastern University, China
+ HongsenYu
+ YiYan
+ XinLiNortheastern University
+ YongxueWu
 15426-15439
 Document-level Relation Extraction (DocRE) aims to extract relations between entity pairs in a document and poses many challenges as it involves multiple mentions of entities and cross-sentence inference. However, several aspects that are important for DocRE have not been considered and explored. Existing work ignores bidirectional mention interaction when generating relational features for entity pairs. Also, sophisticated neural networks are typically designed for cross-sentence evidence extraction to further enhance DocRE. More interestingly, we reveal a noteworthy finding: If a model has predicted a relation between an entity and other entities, this relation information may help infer and predict more relations between the entity’s adjacent entities and these other entities. Nonetheless, none of the existing methods leverage secondary reasoning to exploit results of relation prediction. To this end, we propose a novel Secondary Reasoning Framework (SRF) for DocRE. In SRF, we initially propose a DocRE model that incorporates bidirectional mention fusion and a simple yet effective evidence extraction module (incurring only an additional learnable parameter overhead) for relation prediction. Further, for the first time, we elaborately design and propose a novel secondary reasoning method to discover more relations by exploring the results of the first relation prediction. Extensive experiments show that SRF achieves SOTA performance and our secondary reasoning method is both effective and general when integrated into existing models.
 2024.emnlp-main.863
@@ -12053,7 +12053,7 @@

 Exploring the Learning Capabilities of Language Models using <fixed-case>LEVERWORLDS</fixed-case>
 EitanWagnerHebrew University of Jerusalem
- AmirFederColumbia University and Google
+ AmirFederColumbia University and Google
 OmriAbendHebrew University of Jerusalem
 15458-15468
 Learning a model of a stochastic setting often involves learning both general structure rules and specific properties of the instance. This paper investigates the interplay between learning the general and the specific in various learning methods, with emphasis on sample efficiency. We design a framework called LEVERWORLDS, which allows the generation of simple physics-inspired worlds that follow a similar generative process with different distributions, and their instances can be expressed in natural language. These worlds allow for controlled experiments to assess the sample complexity of different learning methods. We experiment with classic learning algorithms as well as Transformer language models, both with fine-tuning and In-Context Learning (ICL). Our general finding is that (1) Transformers generally succeed in the task; but (2) they are considerably less sample efficient than classic methods that make stronger assumptions about the structure, such as Maximum Likelihood Estimation and Logistic Regression.
This finding is in tension with the recent tendency to use Transformers as general-purpose estimators. We propose an approach that leverages the ICL capabilities of contemporary language models to apply simple algorithms for this type of data. Our experiments show that models currently struggle with the task but show promising potential.
 2024.emnlp-main.864


 <fixed-case>CONTESTS</fixed-case>: a Framework for Consistency Testing of Span Probabilities in Language Models
 EitanWagnerHebrew University of Jerusalem
- YuliSlavutsky
+ YuliSlavutsky
 OmriAbendHebrew University of Jerusalem
 15469-15484
 Although language model scores are often treated as probabilities, their reliability as probability estimators has mainly been studied through calibration, overlooking other aspects. In particular, it is unclear whether language models produce the same value for different ways of assigning joint probabilities to word spans. Our work introduces a novel framework, ConTestS (Consistency Testing over Spans), involving statistical tests to assess score consistency across interchangeable completion and conditioning orders. We conduct experiments on post-release real and synthetic data to eliminate training effects. Our findings reveal that both Masked Language Models (MLMs) and autoregressive models exhibit inconsistent predictions, with autoregressive models showing larger discrepancies. Larger MLMs tend to produce more consistent predictions, while autoregressive models show the opposite trend. Moreover, for both model types, prediction entropies offer insights into the true word span likelihood and therefore can aid in selecting optimal decoding strategies. The inconsistencies revealed by our analysis, as well as their connection to prediction entropies and differences between model types, can serve as useful guides for future research on addressing these limitations.
 2024.emnlp-main.865


 <fixed-case>D</fixed-case>oc<fixed-case>E</fixed-case>dit-v2: Document Structure Editing Via Multimodal <fixed-case>LLM</fixed-case> Grounding
 MananSuri
 PuneetMathurAdobe Systems
- FranckDernoncourtAdobe Systems
+ FranckDernoncourtAdobe Systems
 RajivJainAdobe Systems
 Vlad IMorariuAdobe
 RamitSawhneyGeorgia Institute of Technology
- PreslavNakovMohamed bin Zayed University of Artificial Intelligence
- DineshManochaUniversity of Maryland, College Park
+ PreslavNakovMohamed bin Zayed University of Artificial Intelligence
+ DineshManochaUniversity of Maryland, College Park
 15485-15505
 Document structure editing involves manipulating localized textual, visual, and layout components in document images based on the user’s requests. Past works have shown that multimodal grounding of user requests in the document image and identifying the accurate structural components and their associated attributes remain key challenges for this task. To address these, we introduce the DocEditAgent, a novel framework that performs end-to-end document editing by leveraging Large Multimodal Models (LMMs). It consists of three novel components – (1) Doc2Command to simultaneously localize edit regions of interest (RoI) and disambiguate user edit requests into edit commands. (2) LLM-based Command Reformulation prompting to tailor edit commands originally intended for specialized software into edit instructions suitable for generalist LMMs.
(3) Moreover, DocEditAgent processes these outputs via Large Multimodal Models like GPT-4V and Gemini, to parse the document layout, execute edits on grounded Region of Interest (RoI), and generate the edited document image. Extensive experiments on the DocEdit dataset show that DocEditAgent significantly outperforms strong baselines on edit command generation (2-33%), RoI bounding box detection (12-31%), and overall document editing (1-12%) tasks.
 2024.emnlp-main.867
@@ -12103,7 +12103,7 @@

 Understanding Slang with <fixed-case>LLM</fixed-case>s: Modelling Cross-Cultural Nuances through Paraphrasing
 IfeoluwaWuraola
- NinaDethlefsUniversity of Hull
+ NinaDethlefsUniversity of Hull
 DanielMarciniak
 15525-15531
 In the realm of social media discourse, the integration of slang enriches communication, reflecting the sociocultural identities of users. This study investigates the capability of large language models (LLMs) to paraphrase slang within climate-related tweets from Nigeria and the UK, with a focus on identifying emotional nuances. Using DistilRoBERTa as the baseline model, we observe its limited comprehension of slang. To improve cross-cultural understanding, we gauge the effectiveness of leading LLMs ChatGPT 4, Gemini, and LLaMA3 in slang paraphrasing. While ChatGPT 4 and Gemini demonstrate comparable effectiveness in slang paraphrasing, LLaMA3 shows less coverage, with all LLMs exhibiting limitations in coverage, especially of Nigerian slang. Our findings underscore the necessity for culturally sensitive LLM development in emotion classification, particularly in non-anglocentric regions.
@@ -12117,7 +12117,7 @@

 SemihYavuzSalesForce.com
 JinQuSalesforce AI Research
 JiachengXuSalesForce.com
- RuiMengSalesForce Research
+ RuiMengSalesForce Research
 CaimingXiongSalesforce Research
 YingboZhouSalesforce Research
 15532-15548
@@ -12132,8 +12132,8 @@

 ChongyangTaoBeihang University
 TaoShenOracle
 CanXuMicrosoft and Peking University
- HongboXuInstitute of Information Engineering
- GuodongLongUniversity of Technology Sydney
+ HongboXuInstitute of Information Engineering
+ GuodongLongUniversity of Technology Sydney
 Jian-GuangLouMicrosoft
 ShuaiMaBeihang University
 15549-15575
@@ -12145,7 +12145,7 @@

 Adaptive Axes: A Pipeline for In-domain Social Stereotype Analysis
 QingchengZengNorthwestern University, Northwestern University
- MingyuJinRutgers University
+ MingyuJinRutgers University
 RobVoigtNorthwestern University
 15576-15593
 Prior work has explored the possibility of using the semantic information obtained from embedding representations to quantify social stereotypes, leveraging techniques such as word embeddings combined with a list of traits (Garg et al., 2018; Charlesworth et al., 2022) or semantic axes (An et al., 2018; Lucy et al., 2022). However, these approaches have struggled to fully capture the variability in stereotypes across different conceptual domains for the same social group (e.g., black in science, health, and art), in part because the identity of a word and the associations formed during pre-training can dominate its contextual representation (Field and Tsvetkov, 2019). This study explores the ability to recover stereotypes from the contexts surrounding targeted entities by utilizing state-of-the-art text embedding models and adaptive semantic axes enhanced by large language models (LLMs).
Our results indicate that the proposed pipeline not only surpasses token-based methods in capturing in-domain framing but also effectively tracks stereotypes over time and along domain-specific semantic axes for in-domain texts. Our research highlights the potential of employing text embedding models to achieve a deeper understanding of nuanced social stereotypes.
@@ -12169,7 +12169,7 @@
 Human-<fixed-case>LLM</fixed-case> Hybrid Text Answer Aggregation for Crowd Annotations
- JiyiLiUniversity of Yamanashi
+ JiyiLiUniversity of Yamanashi
 15609-15622
 Quality is a crucial issue for crowd annotations. Answer aggregation is an important type of solution. The aggregated answers estimated from multiple crowd answers to the same instance are the annotations that are eventually collected, rather than the individual crowd answers themselves. Recently, the capability of Large Language Models (LLMs) on data annotation tasks has attracted interest from researchers. Most of the existing studies mainly focus on the average performance of individual crowd workers; several recent works studied the scenarios of aggregation on categorical labels and LLMs used as label creators. However, the scenario of aggregation on text answers and the role of LLMs as aggregators are not yet well-studied. In this paper, we investigate the capability of LLMs as aggregators in the scenario of close-ended crowd text answer aggregation. We propose a human-LLM hybrid text answer aggregation method with a Creator-Aggregator Multi-Stage (CAMS) crowdsourcing framework. We conduct experiments on public crowdsourcing datasets. The results show the effectiveness of our approach based on the collaboration of crowd workers and LLMs.
 2024.emnlp-main.874
@@ -12190,8 +12190,8 @@
 Revisiting Supervised Contrastive Learning for Microblog Classification
- JunboHuangUniversität Hamburg
- RicardoUsbeckLeuphana Universität Lüneburg
+ JunboHuangUniversität Hamburg
+ RicardoUsbeckLeuphana Universität Lüneburg
 15644-15653
 Microblog content (e.g., Tweets) is noisy due to its informal use of language and its lack of contextual information within each post. To tackle these challenges, state-of-the-art microblog classification models rely on pre-training language models (LMs). However, pre-training dedicated LMs is resource-intensive and not suitable for small labs. Supervised contrastive learning (SCL) has shown its effectiveness with small, available resources. In this work, we examine the effectiveness of fine-tuning transformer-based language models, regularized with an SCL loss, for English microblog classification. Despite its simplicity, the evaluation on two English microblog classification benchmarks (TweetEval and Tweet Topic Classification) shows an improvement over baseline models. The results show that, across all subtasks, our proposed method has a performance gain of up to 11.9 percentage points. All our models are open source.
2024.emnlp-main.876
@@ -12215,7 +12215,7 @@
 Images Speak Louder than Words: Understanding and Mitigating Bias in Vision-Language Model from a Causal Mediation Perspective
 ZhaotianWeng
- ZijunGaoUniversity of Southern California
+ ZijunGaoUniversity of Southern California
 JeroneAndrewsSony AI
 JieyuZhaoUniversity of Southern California
 15669-15680
@@ -12226,11 +12226,11 @@
 Mitigating the Language Mismatch and Repetition Issues in <fixed-case>LLM</fixed-case>-based Machine Translation via Model Editing
- WeichuanWang
+ WeichuanWang
 ZhaoyiLiCity University of Hong Kong and University of Science and Technology of China
- DefuLianUniversity of Science and Technology of China
- ChenMaCity University of Hong Kong
- LinqiSongCity University of Hong Kong
+ DefuLianUniversity of Science and Technology of China
+ ChenMaCity University of Hong Kong
+ LinqiSongCity University of Hong Kong
 YingWeiNanyang Technological University
 15681-15700
 Large Language Models (LLMs) have recently revolutionized the NLP field, while they still fall short in some specific downstream tasks. In this work, we focus on utilizing LLMs to perform machine translation, where we observe that two patterns of errors frequently occur and drastically affect the translation quality: language mismatch and repetition. This work sets out to explore the potential for mitigating these two issues by leveraging model editing methods, e.g., by locating the Feed-Forward Network (FFN) neurons or other components that are responsible for the errors and deactivating them at inference time. We find that directly applying such methods either has limited effect on the targeted errors or has a significant negative side-effect on the general translation quality, indicating that the located components may also be crucial for keeping machine translation with LLMs on the rails. To this end, we propose to refine the located components by fetching the intersection of the locating results under different language settings, filtering out the aforementioned information that is irrelevant to the targeted errors. The experimental results empirically demonstrate that our methods can effectively reduce the language mismatch and repetition ratios and meanwhile enhance or keep the general translation quality in most cases.
@@ -12242,13 +12242,13 @@
 <fixed-case>S</fixed-case>ci<fixed-case>A</fixed-case>gent: Tool-augmented Language Models for Scientific Reasoning
 YuboMaSchool of Computer Science and Engineering, Nanyang Technological University
 ZhibinGou
- JunhengHaoMicrosoft
+ JunhengHaoMicrosoft
 RuochenXuMicrosoft
 ShuohangWang
 LiangmingPanUniversity of Arizona
- YujiuYangGraduate School at Shenzhen,Tsinghua University
+ YujiuYangGraduate School at Shenzhen,Tsinghua University
 YixinCaoFudan University
- AixinSunNanyang Technological University
+ AixinSunNanyang Technological University
 15701-15736
 Scientific reasoning poses an excessive challenge for even the most advanced Large Language Models (LLMs). To make this task more practical and solvable for LLMs, we introduce a new task setting named tool-augmented scientific reasoning. This setting supplements LLMs with scalable toolsets, and shifts the focus from pursuing an omniscient problem solver to a proficient tool-user. To facilitate research on this setting, we construct a tool-augmented training corpus named MathFunc which encompasses over 30,000 samples and roughly 6,000 tools. Building on MathFunc, we develop SciAgent to retrieve, understand and, if necessary, use tools for scientific problem solving.
Additionally, we craft a benchmark, SciToolBench, spanning five scientific domains to evaluate LLMs’ abilities with tool assistance. Extensive experiments on SciToolBench confirm the effectiveness of SciAgent. Notably, SciAgent-Llama3-8B surpasses other LLMs of comparable size by more than 8.0% in absolute accuracy. Furthermore, SciAgent-DeepMath-7B shows far superior performance to ChatGPT.
 2024.emnlp-main.880
@@ -12258,10 +12258,10 @@
 Global Reward to Local Rewards: Multimodal-Guided Decomposition for Improving Dialogue Agents
 Dong WonLee
- Hae WonParkAmazon and Massachusetts Institute of Technology
+ Hae WonParkAmazon and Massachusetts Institute of Technology
 YoonKimMassachusetts Institute of Technology
- CynthiaBreazeal
- Louis-PhilippeMorencyCarnegie Mellon University
+ CynthiaBreazeal
+ Louis-PhilippeMorencyCarnegie Mellon University
 15737-15762
 We describe an approach for aligning an LLM-based dialogue agent for long-term social dialogue, where there is only a single global score given by the user at the end of the session. In this paper, we propose the use of denser, naturally occurring multimodal communicative signals as local implicit feedback to improve the turn-level utterance generation. Therefore, our approach (dubbed GELI) learns a local, turn-level reward model by decomposing the human-provided Global Explicit (GE) session-level reward, using Local Implicit (LI) multimodal reward signals to crossmodally shape the reward decomposition step. This decomposed reward model is then used as part of the RLHF pipeline to improve an LLM-based dialog agent. We run quantitative and qualitative human studies on two large-scale datasets to evaluate the performance of our GELI approach, and find that it shows consistent improvements across various conversational metrics compared to baseline methods.
 2024.emnlp-main.881
@@ -12286,19 +12286,19 @@
 <fixed-case>ESC</fixed-case>-Eval: Evaluating Emotion Support Conversations in Large Language Models
- HaiquanZhao
- LingyuLi
+ HaiquanZhao
+ LingyuLi
 ShisongChen
- ShuqiKong
- JiaanWangTencent
+ ShuqiKong
+ JiaanWangTencent
 KexinHuang
- TianleGu
+ TianleGu
 YixuWang
 JianWangShanghai University
- LiangDandan
+ LiangDandan
 ZhixuLi
- YanTengShanghai Artificial Intelligence Laboratory
- YanghuaXiaoFudan University
+ YanTengShanghai Artificial Intelligence Laboratory
+ YanghuaXiaoFudan University
 YingchunWangShanghai Artificial Intelligence Laboratory
 15785-15810
 Emotion Support Conversation (ESC) is a crucial application, which aims to reduce human stress, offer emotional guidance, and ultimately enhance human mental and physical well-being. With the advancement of Large Language Models (LLMs), many researchers have employed LLMs as the ESC models. However, the evaluation of these LLM-based ESCs remains uncertain. To address this, we first re-organize 2,801 role-playing cards from seven existing datasets to define the roles of the role-playing agent. Second, we train a specific role-playing model called ESC-Role which behaves more like a confused person than GPT-4. Third, through ESC-Role and organized role cards, we systematically conduct experiments using 14 LLMs as the ESC models, including general AI-assistant LLMs (e.g., ChatGPT) and ESC-oriented LLMs (e.g., ExTES-Llama). We conduct comprehensive human annotations on interactive multi-turn dialogues of different ESC models. The results show that ESC-oriented LLMs exhibit superior ESC abilities compared to general AI-assistant LLMs, but they still lag behind human performance.
Moreover, to automate the scoring process for future ESC models, we developed ESC-RANK, which is trained on the annotated data and achieves a scoring performance surpassing that of GPT-4 by 35 points.
@@ -12323,9 +12323,9 @@
 Text Fluoroscopy: Detecting <fixed-case>LLM</fixed-case>-Generated Text through Intrinsic Features
 XiaoYuUniversity of Science and Technology of China
- KejiangChenUniversity of Science and Technology of China
+ KejiangChenUniversity of Science and Technology of China
 QiYangShanghai University
- WeimingZhangUniversity of Science and Technology of China
+ WeimingZhangUniversity of Science and Technology of China
 NenghaiYuUniversity of Science and Technology of China
 15838-15846
 Large language models (LLMs) have revolutionized the domain of natural language processing because of their excellent performance on various tasks. Despite their impressive capabilities, LLMs also have the potential to generate texts that pose risks of misuse. Consequently, detecting LLM-generated text has become increasingly important. Previous LLM-generated text detection methods use semantic features, which are stored in the last layer. This leads to methods that overfit the training set domain and exhibit shortcomings in generalization. Therefore, we argue that utilizing intrinsic features rather than semantic features for detection results in better performance. In this work, we design Text Fluoroscopy, a black-box method with better generalizability for detecting LLM-generated text by mining the intrinsic features of the text to be detected. Our method captures the text’s intrinsic features by identifying the layer with the largest distribution difference from the last and first layers when projected to the vocabulary space. Our method achieves 7.36% and 2.84% average improvement in detection performance compared to the baselines in detecting texts from different domains generated by GPT-4 and Claude3, respectively.
@@ -12341,7 +12341,7 @@
 SahajpreetSinghIIT Delhi
 ViktorHangyaThe Center for Information and Language Processing, University of Munich
 AlexanderFraserTechnical University of Munich
- TanmoyChakrabortyIndian Institute of Technology, Delhi
+ TanmoyChakrabortyIndian Institute of Technology, Delhi
 15847-15863
 For subjective tasks such as hate detection, where people perceive hate differently, the Large Language Model’s (LLM) ability to represent diverse groups is unclear. By including additional context in prompts, we comprehensively analyze LLM’s sensitivity to geographical priming, persona attributes, and numerical information to assess how well the needs of various groups are reflected. Our findings on two LLMs, five languages, and six datasets reveal that mimicking persona-based attributes leads to annotation variability. Meanwhile, incorporating geographical signals leads to better regional alignment. We also find that the LLMs are sensitive to numerical anchors, indicating the ability to leverage community-based flagging efforts and exposure to adversaries. Our work provides preliminary guidelines and highlights the nuances of applying LLMs in culturally sensitive cases.
2024.emnlp-main.886 @@ -12355,7 +12355,7 @@ AshutoshBajpaiIndian Institute of Technology, Delhi AaryanGoyal AtifAnwer - TanmoyChakrabortyIndian Institute of Technology, Delhi + TanmoyChakrabortyIndian Institute of Technology, Delhi 15864-15881 The prolific use of Large Language Models (LLMs) as an alternate knowledge base requires them to be factually consistent, necessitating both correctness and consistency traits for paraphrased queries. Recently, significant attempts have been made to benchmark datasets and metrics to evaluate LLMs for these traits. However, structural simplicity (subject-relation-object) and contemporary association in their query formulation limit the broader definition of factuality and consistency. In this study, we introduce TeCFaP, a novel Temporally Consistent Factuality Probe task to expand the consistent factuality probe in the temporal dimension. To this end, we propose TEMP-COFAC, a high-quality dataset of prefix-style English query paraphrases. Subsequently, we extend the definitions of existing metrics to represent consistent factuality across temporal dimension. We experiment with a diverse set of LLMs and find most of them performing poorly on TeCFaP. Next, we propose a novel solution CoTSeLF (Consistent-Time-Sensitive Learning Framework) combining multi-task instruction tuning (MT-IT) with consistent-time-sensitive reinforcement learning (CTSRL) to improve temporally consistent factuality in LLMs. Our experiments demonstrate the efficacy of CoTSeLF over several baselines. 2024.emnlp-main.887 @@ -12366,9 +12366,9 @@ A Comparison of Language Modeling and Translation as Multilingual Pretraining Objectives ZihaoLiUniversity of Helsinki ShaoxiongJiUniversity of Helsinki - TimotheeMickusUniversity of Helsinki + TimotheeMickusUniversity of Helsinki VincentSegonneUniversité de Bretagne Sud - JörgTiedemannUniversity of Helsinki + JörgTiedemannUniversity of Helsinki 15882-15894 Pretrained language models (PLMs) display impressive performances and have captured the attention of the NLP community.Establishing best practices in pretraining has, therefore, become a major focus of NLP research, especially since insights gained from monolingual English models may not necessarily apply to more complex multilingual models.One significant caveat of the current state of the art is that different works are rarely comparable: they often discuss different parameter counts, training data, and evaluation methodology.This paper proposes a comparison of multilingual pretraining objectives in a controlled methodological environment. We ensure that training data and model architectures are comparable, and discuss the downstream performances across 6 languages that we observe in probing and fine-tuning scenarios.We make two key observations: (1) the architecture dictates which pretraining objective is optimal; (2) multilingual translation is a very effective pretraining objective under the right conditions.We make our code, data, and model weights available at https://github.com/Helsinki-NLP/lm-vs-mt. 2024.emnlp-main.888 @@ -12378,9 +12378,9 @@ Can <fixed-case>LLM</fixed-case>s replace Neil de<fixed-case>G</fixed-case>rasse Tyson? 
Evaluating the Reliability of <fixed-case>LLM</fixed-case>s as Science Communicators
 PrasoonBajpai
- NiladriChatterjee
+ NiladriChatterjee
 SubhabrataDuttaTechnische Universität Darmstadt
- TanmoyChakrabortyIndian Institute of Technology, Delhi
+ TanmoyChakrabortyIndian Institute of Technology, Delhi
 15895-15912
 Large Language Models (LLMs) and AI assistants driven by these models are experiencing exponential growth in usage among both expert and amateur users. In this work, we focus on evaluating the reliability of current LLMs as science communicators. Unlike existing benchmarks, our approach emphasizes assessing these models on scientific question-answering tasks that require a nuanced understanding and awareness of answerability. We introduce a novel dataset, SCiPS-QA, comprising 742 Yes/No queries embedded in complex scientific concepts, along with a benchmarking suite that evaluates LLMs for correctness and consistency across various criteria. We benchmark three proprietary LLMs from the OpenAI GPT family and 13 open-access LLMs from the Meta Llama-2, Llama-3, and Mistral families. While most open-access models significantly underperform compared to GPT-4 Turbo, our experiments identify Llama-3-70B as a strong competitor, often surpassing GPT-4 Turbo in various evaluation aspects. We also find that even the GPT models exhibit a general incompetence in reliably verifying LLM responses. Moreover, we observe an alarming trend where human evaluators are deceived by incorrect responses from GPT-4 Turbo.
 2024.emnlp-main.889
@@ -12389,10 +12389,10 @@
 <fixed-case>LL</fixed-case>a<fixed-case>MA</fixed-case>-<fixed-case>M</fixed-case>o<fixed-case>E</fixed-case>: Building Mixture-of-Experts from <fixed-case>LL</fixed-case>a<fixed-case>MA</fixed-case> with Continual Pre-Training
- TongZhuSoochow University, China
+ TongZhuSoochow University, China
 XiaoyeQuShanghai Artificial Intelligence Laboratory
- DaizeDongShanghai Artificial Intelligence Laboratory
- JiachengRuan
+ DaizeDongShanghai Artificial Intelligence Laboratory
+ JiachengRuan
 JingqiTong
 ConghuiHeShanghai AI Lab
 YuChengThe Chinese University of Hong Kong
@@ -12404,7 +12404,7 @@
 Themis: A Reference-free <fixed-case>NLG</fixed-case> Evaluation Language Model with Flexibility and Interpretability
- XinyuHuPeking University
+ XinyuHuPeking University
 LiLin
 MingqiGao
 XunjianYin
@@ -12433,7 +12433,7 @@
 Generating Demonstrations for In-Context Compositional Generalization in Grounded Language Learning
 SamSpilsbury
- PekkaMarttinenAalto University
+ PekkaMarttinenAalto University
 AlexanderIlinAalto University
 15960-15991
 In-context learning and few-shot prompting are viable methods for compositional output generation. However, these methods can be very sensitive to the choice of support examples used. Retrieving good supports from the training data for a given test query is already a difficult problem, but in some cases solving this may not even be enough. We consider the setting of grounded language learning problems where finding relevant supports in the same or similar states as the query may be difficult. We design an agent which instead generates possible support inputs and targets from the current state of the world, then uses them for in-context learning to solve the test query. We show substantially improved performance on a previously unsolved compositional generalization test without a loss of performance in other areas. The approach is general and can even scale to instructions expressed in natural language.
@@ -12445,7 +12445,7 @@ <fixed-case>FAME</fixed-case>: Towards Factual Multi-Task Model Editing LiZeng YingyuShan - ZemingLiu + ZemingLiu JiashuYao YuhangGuo 15992-16011 @@ -12459,12 +12459,12 @@ RenjiePi TianyangHan JianshuZhang - YueqiXie - RuiPanUniversity of Illinois at Urbana-Champaign + YueqiXie + RuiPanUniversity of Illinois at Urbana-Champaign QingLianThe Hong Kong University of Science and Technology HanzeDongSalesForce JipengZhang - TongZhangUIUC + TongZhangUIUC 16012-16027 The deployment of multimodal large language models (MLLMs) has brought forth a unique vulnerability: susceptibility to malicious attacks through visual inputs. This paper investigates the novel challenge of defending MLLMs against such attacks. Compared to large language models (LLMs), MLLMs include an additional image modality. We discover that images act as a “foreign language” that is not considered during safety alignment, making MLLMs more prone to producing harmful responses. Unfortunately, unlike the discrete tokens considered in text-based LLMs, the continuous nature of image signals presents significant alignment challenges, which poses difficulty to thoroughly cover all possible scenarios. This vulnerability is exacerbated by the fact that most state-of-the-art MLLMs are fine-tuned on limited image-text pairs that are much fewer than the extensive text-based pretraining corpus, which makes the MLLMs more prone to catastrophic forgetting of their original abilities during safety fine-tuning. To tackle these challenges, we introduce MLLM-Protector, a plug-and-play strategy that solves two subtasks: 1) identifying harmful responses via a lightweight harm detector, and 2) transforming harmful responses into harmless ones via a detoxifier. This approach effectively mitigates the risks posed by malicious visual inputs without compromising the original performance of MLLMs. Our results demonstrate that MLLM-Protector offers a robust solution to a previously unaddressed aspect of MLLM security. 2024.emnlp-main.895 @@ -12473,7 +12473,7 @@ Leveraging Large Language Models for <fixed-case>NLG</fixed-case> Evaluation: Advances and Challenges - ZhenLi + ZhenLi XiaohanXu TaoShenOracle CanXuMicrosoft and Peking University @@ -12505,7 +12505,7 @@ ChengyuWangAlibaba Group KunzheHuang JunHuang - LianwenJinSouth China University of Technology + LianwenJinSouth China University of Technology 16061-16075 Contrastive Language-Image Pre-training (CLIP) has been widely studied and applied in numerous applications. However, the emphasis on brief summary texts during pre-training prevents CLIP from understanding long descriptions. This issue is particularly acute regarding videos given that videos often contain abundant detailed contents. In this paper, we propose the VideoCLIP-XL (eXtra Length) model, which aims to unleash the long-description understanding capability of video CLIP models. Firstly, we establish an automatic data collection system and gather a large-scale VILD pre-training dataset with VIdeo and Long-Description pairs. Then, we propose Text-similarity-guided Primary Component Matching (TPCM) to better learn the distribution of feature space while expanding the long description capability. We also introduce two new tasks namely Detail-aware Description Ranking (DDR) and Hallucination-aware Description Ranking (HDR) for further understanding improvement. Finally, we construct a Long Video Description Ranking (LVDR) benchmark for evaluating the long-description capability more comprehensively. 
Extensive experimental results on widely-used text-video retrieval benchmarks with both short and long descriptions and our LVDR benchmark can fully demonstrate the effectiveness of our method. 2024.emnlp-main.898 @@ -12516,7 +12516,7 @@ <fixed-case>C</fixed-case>orr<fixed-case>S</fixed-case>ynth - A Correlated Sampling Method for Diverse Dataset Generation from <fixed-case>LLM</fixed-case>s - Suhas SKowshikAmazon + Suhas SKowshikAmazon AbhishekDivekar VijitMalikAmazon 16076-16095 @@ -12528,8 +12528,8 @@ Defining Knowledge: Bridging Epistemology and Large Language Models ConstanzaFierroCopenhagen University - RuchiraDhar - FilipposStamatiouCopenhagen University and University of Stellenbosch + RuchiraDhar + FilipposStamatiouCopenhagen University and University of Stellenbosch NicolasGarneau AndersSøgaardCopenhagen University 16096-16111 @@ -12540,7 +12540,7 @@ <fixed-case>TKGT</fixed-case>: Redefinition and A New Way of Text-to-Table Tasks Based on Real World Demands and Knowledge Graphs Augmented <fixed-case>LLM</fixed-case>s - PeiwenJiang + PeiwenJiang XinboLinShanghai Jiaotong University ZiboZhao RuhuiMaShanghai Jiao Tong University @@ -12585,12 +12585,12 @@ The Instinctive Bias: Spurious Images lead to Illusion in <fixed-case>MLLM</fixed-case>s TianyangHan QingLianThe Hong Kong University of Science and Technology - RuiPanUniversity of Illinois at Urbana-Champaign + RuiPanUniversity of Illinois at Urbana-Champaign RenjiePi JipengZhang ShizheDiaoHong Kong University of Science and Technology YongLin - TongZhangUIUC + TongZhangUIUC 16163-16177 Large language models (LLMs) have recently experienced remarkable progress, where the advent of multi-modal large language models (MLLMs) has endowed LLMs with visual capabilities, leading to impressive performances in various multi-modal tasks. However, those powerful MLLMs such as GPT-4V still fail spectacularly when presented with certain image and text inputs. In this paper, we identify a typical class of inputs that baffles MLLMs, which consist of images that are highly relevant but inconsistent with answers, causing MLLMs to suffer from visual illusion. To quantify the effect, we propose CorrelationQA, the first benchmark that assesses the visual illusion level given spurious images. This benchmark contains 7,308 text-image pairs across 13 categories. Based on the proposed CorrelationQA, we conduct a thorough analysis on 9 mainstream MLLMs, illustrating that they universally suffer from this instinctive bias to varying degrees. We hope that our curated benchmark and evaluation results aid in better assessments of the MLLMs’ robustness in the presence of misleading images. The code and datasets are available at https://github.com/MasaiahHan/CorrelationQA. 2024.emnlp-main.904 @@ -12600,7 +12600,7 @@ Rationale-Aware Answer Verification by Pairwise Self-Evaluation AkiraKawabataThe Asahi Shimbun Company - SakuSugawaraNational Institute of Informatics + SakuSugawaraNational Institute of Informatics 16178-16196 Answer verification identifies correct solutions among candidates generated by large language models (LLMs). Current approaches typically train verifier models by labeling solutions as correct or incorrect based solely on whether the final answer matches the gold answer. However, this approach neglects any flawed rationale in the solution yielding the correct answer, undermining the verifier’s ability to distinguish between sound and flawed rationales. 
We empirically show that in StrategyQA, only 19% of LLM-generated solutions with correct answers have valid rationales, thus leading to an unreliable verifier. Furthermore, we demonstrate that training a verifier on valid rationales significantly improves its ability to distinguish valid and flawed rationale. To make a better verifier without extra human supervision, we introduce REPS (Rationale Enhancement through Pairwise Selection), a method for selecting valid rationales from candidates by iteratively applying pairwise self-evaluation using the same LLM that generates the solutions. Verifiers trained on solutions selected by REPS outperform those trained using conventional training methods on three reasoning benchmarks (ARC-Challenge, DROP, and StrategyQA). Our results suggest that training reliable verifiers requires ensuring the validity of rationales in addition to the correctness of the final answers, which would be critical for models assisting humans in solving complex reasoning tasks. 2024.emnlp-main.905 @@ -12609,13 +12609,13 @@ On the Robustness of Editing Large Language Models - XinbeiMa + XinbeiMa TianjieJu JiyangQiu - ZhuoshengZhangShanghai Jiao Tong University + ZhuoshengZhangShanghai Jiao Tong University HaiZhaoShanghai Jiao Tong University - LifengLiu - YulongWang + LifengLiu + YulongWang 16197-16216 Large language models (LLMs) have played a pivotal role in building communicative AI, yet they encounter the challenge of efficient updates. Model editing enables the manipulation of specific knowledge memories and the behavior of language generation without retraining. However, the robustness of model editing remains an open question. This work seeks to understand the strengths and limitations of editing methods, facilitating practical applications of communicative AI. We focus on three key research questions. RQ1: Can edited LLMs behave consistently resembling communicative AI in realistic situations? RQ2: To what extent does the rephrasing of prompts lead LLMs to deviate from the edited knowledge memory? RQ3: Which knowledge features are correlated with the performance and robustness of editing? Our empirical studies uncover a substantial disparity between existing editing methods and the practical application of LLMs. On rephrased prompts that are flexible but common in realistic applications, the performance of editing experiences a significant decline. Further analysis shows that more popular knowledge is memorized better, easier to recall, and more challenging to edit effectively. 2024.emnlp-main.906 @@ -12637,9 +12637,9 @@ Distract Large Language Models for Automatic Jailbreak Attack - ZeguanXiao + ZeguanXiao YanYang - GuanhuaChenSouthern University of Science and Technology + GuanhuaChenSouthern University of Science and Technology YunChenShanghai University of Finance and Economics 16230-16244 Extensive efforts have been made before the public release of Large language models (LLMs) to align their behaviors with human values. However, even meticulously aligned LLMs remain vulnerable to malicious manipulations such as jailbreaking, leading to unintended behaviors. In this work, we propose a novel black-box jailbreak framework for automated red teaming of LLMs. We designed malicious content concealing and memory reframing with an iterative optimization algorithm to jailbreak LLMs, motivated by the research about the distractibility and over-confidence phenomenon of LLMs. 
Extensive experiments of jailbreaking both open-source and proprietary LLMs demonstrate the superiority of our framework in terms of effectiveness, scalability and transferability. We also evaluate the effectiveness of existing jailbreak defense methods against our attack and highlight the crucial need to develop more effective and practical defense strategies.
@@ -12660,7 +12660,7 @@
 <fixed-case>W</fixed-case>orry<fixed-case>W</fixed-case>ords: Norms of Anxiety Association for over 44k <fixed-case>E</fixed-case>nglish Words
- Saif M.MohammadNational Research Council Canada
+ Saif M.MohammadNational Research Council Canada
 16261-16278
 Anxiety, the anticipatory unease about a potential negative outcome, is a common and beneficial human emotion. However, there is still much that is not known about anxiety, such as how it relates to our body and how it manifests in language; these questions are especially pertinent given the increasing impact of related disorders. In this work, we introduce WorryWords, the first large-scale repository of manually derived word–anxiety associations for over 44,450 English words. We show that the anxiety associations are highly reliable. We use WorryWords to study the relationship between anxiety and other emotion constructs, as well as the rate at which children acquire anxiety words with age. Finally, we show that using WorryWords alone, one can accurately track the change of anxiety in streams of text. WorryWords enables a wide variety of anxiety-related research in psychology, NLP, public health, and social sciences. WorryWords (and its translations to over 100 languages) is freely available. http://saifmohammad.com/worrywords.html
 2024.emnlp-main.910
@@ -12687,8 +12687,8 @@
 CanZu
 XuHao
 YiLu
- WeiHeFudan University
- YiwenDing
+ WeiHeFudan University
+ YiwenDing
 TaoGuiFudan University
 QiZhangFudan University
 XuanjingHuangFudan University
@@ -12701,9 +12701,9 @@
 <fixed-case>A</fixed-case>uto<fixed-case>P</fixed-case>ersuade: A Framework for Evaluating and Explaining Persuasive Arguments
 Till RaphaelSaenger
- MusashiHinckIntel
+ MusashiHinckIntel
 JustinGrimmerStanford University
- Brandon M.Stewart
+ Brandon M.Stewart
 16325-16342
 We introduce a three-part framework for constructing persuasive messages, AutoPersuade. First, we curate a large collection of arguments and gather human evaluations of their persuasiveness. Next, we introduce a novel topic model to identify the features of these arguments that influence persuasion. Finally, we use the model to predict the persuasiveness of new arguments and to assess the causal effects of argument components, offering an explanation of the results. We demonstrate the effectiveness of AutoPersuade in an experimental study on arguments for veganism, validating our findings through human studies and out-of-sample predictions.
 2024.emnlp-main.913
@@ -12742,7 +12742,7 @@
 Scaling Laws for Linear Complexity Language Models
- XuyangShenShanghai AI Lab
+ XuyangShenShanghai AI Lab
 DongLi
 RuitaoLengAustralian National University
 ZhenQinTapTap
@@ -12756,8 +12756,8 @@
 Autoregressive Multi-trait Essay Scoring via Reinforcement Learning with Scoring-aware Multiple Rewards
- HeejinDoPohang University of Science and Technology
- SangwonRyuPohang University of Science and Technology
+ HeejinDoPohang University of Science and Technology
+ SangwonRyuPohang University of Science and Technology
 GaryLee
 16427-16438
 Recent advances in automated essay scoring (AES) have shifted towards evaluating multiple traits to provide enriched feedback.
Like typical AES systems, multi-trait AES employs the quadratic weighted kappa (QWK) to measure agreement with human raters, aligning closely with the rating schema; however, its non-differentiable nature prevents its direct use in neural network training. In this paper, we propose Scoring-aware Multi-reward Reinforcement Learning (SaMRL), which integrates actual evaluation schemes into the training process by designing QWK-based rewards with a mean-squared error penalty for multi-trait AES. Existing reinforcement learning (RL) applications in AES are limited to classification models despite associated performance degradation, as RL requires probability distributions; instead, we adopt an autoregressive score generation framework to leverage token generation probabilities for robust multi-trait score predictions. Empirical analyses demonstrate that SaMRL facilitates model training, notably enhancing scoring of previously inferior prompts.
@@ -12779,9 +12779,9 @@
 <fixed-case>ATAP</fixed-case>: Automatic Template-Augmented Commonsense Knowledge Graph Completion via Pre-Trained Language Models
- FuZhangNortheastern University
+ FuZhangNortheastern University
 YifanDing
- JingweiChengNortheastern University, China
+ JingweiChengNortheastern University, China
 16456-16472
 The mission of commonsense knowledge graph completion (CKGC) is to infer missing facts from known commonsense knowledge. CKGC methods can be roughly divided into two categories: triple-based methods and text-based methods. Due to the imbalanced distribution of entities and limited structural information, triple-based methods struggle with long-tail entities. Text-based methods alleviate this issue, but require extensive training and fine-tuning of language models, which reduces efficiency. To alleviate these problems, we propose ATAP, the first CKGC framework that utilizes automatically generated continuous prompt templates combined with pre-trained language models (PLMs). Moreover, ATAP uses a carefully designed new prompt template training strategy, guiding PLMs to generate optimal prompt templates for CKGC tasks. Combining the rich knowledge of PLMs with the template automatic augmentation strategy, ATAP effectively mitigates the long-tail problem and enhances CKGC performance. Results on benchmark datasets show that ATAP achieves state-of-the-art performance overall.
 2024.emnlp-main.919
@@ -12792,7 +12792,7 @@
 <fixed-case>LM</fixed-case>2: A Simple Society of Language Models Solves Complex Reasoning
 GurushaJuneja
 SubhabrataDuttaTechnische Universität Darmstadt
- TanmoyChakrabortyIndian Institute of Technology, Delhi
+ TanmoyChakrabortyIndian Institute of Technology, Delhi
 16473-16484
 Despite demonstrating emergent reasoning abilities, Large Language Models (LLMs) often lose track of complex, multi-step reasoning. Existing studies show that providing guidance via decomposing the original question into multiple subproblems elicits more robustness in LLM reasoning – a decomposer generates the subproblems, and a solver solves each of these subproblems. However, these techniques fail to accommodate coordination between the decomposer and the solver modules (either in a single model or different specialized ones) – the decomposer does not keep track of the ability of the solver to follow the decomposed reasoning. In this paper, we propose LM2 to address these challenges. LM2 modularizes the decomposition, solution, and verification into three different language models.
The decomposer module identifies the key concepts necessary to solve the problem and generates step-by-step subquestions according to the reasoning requirement. The solver model generates the solution to the subproblems that are then checked by the verifier module; depending upon the feedback from the verifier, the reasoning context is constructed using the subproblems and the solutions. These models are trained to coordinate using policy learning. Exhaustive experimentation suggests the superiority of LM2 over existing methods on in- and out-domain reasoning problems, outperforming the best baselines by 8.1% on MATH, 7.71% on JEEBench, and 9.7% on MedQA problems (code available at https://github.com/LCS2-IIITD/Language_Model_Multiplex).
 2024.emnlp-main.920
@@ -12801,7 +12801,7 @@
 Towards a Similarity-adjusted Surprisal Theory
- ClaraMeister
+ ClaraMeister
 MarioGiulianelliDepartment of Computer Science, ETHZ - ETH Zurich
 TiagoPimentelDepartment of Computer Science, ETHZ - ETH Zurich
 16485-16498
@@ -12813,9 +12813,9 @@
 Multi-Level Information Retrieval Augmented Generation for Knowledge-based Visual Question Answering
 OmarAdjaliCEA
- OlivierFerretCEA
- SaharGhannayUniversuté paris saclay
- HervéLe BorgneCEA
+ OlivierFerretCEA
+ SaharGhannayUniversuté paris saclay
+ HervéLe BorgneCEA
 16499-16513
 The Knowledge-Aware Visual Question Answering about Entity task aims to disambiguate entities using textual and visual information, as well as knowledge. It usually relies on two independent steps, information retrieval followed by reading comprehension, which do not benefit each other. Retrieval Augmented Generation (RAG) offers a solution by using generated answers as feedback for retrieval training. RAG usually relies solely on pseudo-relevant passages retrieved from external knowledge bases which can lead to ineffective answer generation. In this work, we propose a multi-level information RAG approach that enhances answer generation through entity retrieval and query expansion. We formulate a joint-training RAG loss such that answer generation is conditioned on both entity and passage retrievals. Through experiments, we show new state-of-the-art performance on the VIQuAE KB-VQA benchmark and demonstrate that our approach can help retrieve more truly relevant knowledge to generate accurate answers.
 2024.emnlp-main.922
@@ -12826,12 +12826,12 @@
 Can We Trust the Performance Evaluation of Uncertainty Estimation Methods in Text Summarization?
 JianfengHeVirginia Tech
 RuningYang
- LinlinYu
+ LinlinYu
 ChangbinLi
 RuoxiJiaVirginia Tech
 FengChenUniversity of Texas, Dallas
 MingJinVirginia Tech
- Chang-TienLuVirginia Tech
+ Chang-TienLuVirginia Tech
 16514-16575
 Text summarization, a key natural language generation (NLG) task, is vital in various domains. However, the high cost of inaccurate summaries in risk-critical applications, particularly those involving human-in-the-loop decision-making, raises concerns about the reliability of uncertainty estimation on text summarization (UE-TS) evaluation methods. This concern stems from the dependency of uncertainty model metrics on diverse and potentially conflicting NLG metrics. To address this issue, we introduce a comprehensive UE-TS benchmark incorporating 31 NLG metrics across four dimensions. The benchmark evaluates the uncertainty estimation capabilities of two large language models and one pre-trained language model on three datasets, with human-annotation analysis incorporated where applicable.
We also assess the performance of 14 common uncertainty estimation methods within this benchmark. Our findings emphasize the importance of considering multiple uncorrelated NLG metrics and diverse uncertainty estimation methods to ensure reliable and efficient evaluation of UE-TS techniques. Our code and data are available: https://github.com/he159ok/Benchmark-of-Uncertainty-Estimation-Methods-in-Text-Summarization. 2024.emnlp-main.923 @@ -12856,7 +12856,7 @@ <fixed-case>BPE</fixed-case> Gets Picky: Efficient Vocabulary Refinement During Tokenizer Training PavelChizhov - CatherineArnett + CatherineArnett ElizavetaKorotkovaUniversity of Tartu Ivan P.YamshchikovTechnical University of Applied Sciences Würzburg-Schweinfurt and ISEG, University of Lisbon 16587-16604 @@ -12869,7 +12869,7 @@ <fixed-case>SEGMENT</fixed-case>+: Long Text Processing with Short-Context Language Models WeiShi - ShuangLi + ShuangLi KerunYu JingleiChenANT GROUP ZujieLiangAnt Group @@ -12879,7 +12879,7 @@ BoZhengAlibaba Group JiaqingLiangFudan University JiangjieChenByteDance Inc. - YanghuaXiaoFudan University + YanghuaXiaoFudan University 16605-16617 There is a growing interest in expanding the input capacity of language models (LMs) across various domains. However, simply increasing the context window does not guarantee robust performance across diverse long-input processing tasks, such as understanding extensive documents and extracting detailed information from lengthy and noisy data. In response, we introduce Segment+, a general framework that enables LMs to handle extended inputs within limited context windows efficiently. Segment+ utilizes structured notes and a filtering module to manage information flow, resulting in a system that is both controllable and interpretable. Our extensive experiments across various model sizes, focusing on long-document question-answering and Needle-in-a-Haystack tasks, demonstrate the effectiveness of Segment+ in improving performance. 2024.emnlp-main.926 @@ -12889,12 +12889,12 @@ Explicit Memory Learning with Expectation Maximization ZhangyueYin - QiushiSunUniversity of Hong Kong + QiushiSunUniversity of Hong Kong QipengGuoShanghai AI Laboratory ZhiyuanZeng QinyuanCheng - XipengQiuFudan University - XuanjingHuangFudan University + XipengQiuFudan University + XuanjingHuangFudan University 16618-16635 2024.emnlp-main.927 yin-etal-2024-explicit @@ -12919,7 +12919,7 @@ WeizhouShen ChenliangLi HongzhanChenSUN YAT-SEN UNIVERSITY - MingYan + MingYan XiaojunQuanSUN YAT-SEN UNIVERSITY HehongChen JiZhangAlibaba Group @@ -12949,7 +12949,7 @@ Shantanu DeepakPatankar MadhumithaChandrasekaranLearning Disability Clinic, PND Centre, Lokmanya Tilak Municipal General Hospital SnehaD’silva - Jemima S.Jacob + Jemima S.Jacob RashmiGupta 16698-16721 In this study, we introduce ANGST, a novel, first of its kind benchmark for depression-anxiety comorbidity classification from social media posts. Unlike contemporary datasets that often oversimplify the intricate interplay between different mental health disorders by treating them as isolated conditions, ANGST enables multi-label classification, allowing each post to be simultaneously identified as indicating depression and/or anxiety. Comprising 2876 meticulously annotated posts by expert psychologists and an additional 7667 silver-labeled posts, ANGST posits a more representative sample of online mental health discourse. Moreover, we benchmark ANGST using various state-of-the-art language models, ranging from Mental-BERT to GPT-4. 
Our results provide significant insights into the capabilities and limitations of these models in complex diagnostic scenarios. While GPT-4 generally outperforms other models, none achieve an F1 score exceeding 72% in multi-class comorbid classification, underscoring the ongoing challenges in applying language models to mental health diagnostics.
@@ -12961,8 +12961,8 @@
 The Odyssey of Commonsense Causality: From Foundational Benchmarks to Cutting-Edge Reasoning
 ShaoboCuiEPFL - EPF Lausanne
- ZhijingJin
- BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
+ ZhijingJin
+ BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
 BoiFaltings
 16722-16763
 Understanding commonsense causality is a unique mark of intelligence for humans. It helps people understand the principles of the real world better and benefits the decision-making process related to causation. For instance, commonsense causality is crucial in judging whether a defendant’s action causes the plaintiff’s loss in determining legal liability. Despite its significance, a systematic exploration of this topic is notably lacking. Our comprehensive survey bridges this gap by focusing on taxonomies, benchmarks, acquisition methods, qualitative reasoning, and quantitative measurements in commonsense causality, synthesizing insights from over 200 representative articles. Our work aims to provide a systematic overview, update scholars on recent advancements, provide a practical guide for beginners, and highlight promising future research directions in this vital field. A summary of the related literature is available at https://github.com/cui-shaobo/causality-papers.
@@ -12972,10 +12972,10 @@
 Investigating Large Language Models for Complex Word Identification in Multilingual and Multidomain Setups
- Răzvan-AlexandruSmădu, University Politehnica of Bucharest
+ Răzvan-AlexandruSmădu, University Politehnica of Bucharest
 David-GabrielIon
- Dumitru-ClementinCercelNational University of Science and Technology POLITEHNICA Bucharest
- FlorinPop, University Politehnica of Bucharest
+ Dumitru-ClementinCercelNational University of Science and Technology POLITEHNICA Bucharest
+ FlorinPop, University Politehnica of Bucharest
 Mihaela-ClaudiaCercel
 16764-16800
 Complex Word Identification (CWI) is an essential step in the lexical simplification task and has recently become a task on its own. Some variations of this binary classification task have emerged, such as lexical complexity prediction (LCP) and complexity evaluation of multi-word expressions (MWE). Large language models (LLMs) recently became popular in the Natural Language Processing community because of their versatility and capability to solve unseen tasks in zero/few-shot settings. Our work investigates LLM usage, specifically open-source models such as Llama 2, Llama 3, and Vicuna v1.5, and closed-source models, such as ChatGPT-3.5-turbo and GPT-4o, in the CWI, LCP, and MWE settings. We evaluate zero-shot, few-shot, and fine-tuning settings and show that LLMs struggle in certain conditions or achieve results comparable to existing methods. In addition, we provide some views on meta-learning combined with prompt learning. In the end, we conclude that current LLMs can barely, if at all, outperform existing methods, which are usually much smaller.
@@ -13003,8 +13003,8 @@ DivyaPatel PathikPatel AnkushChander - SourishDasguptaDhirubhai Ambani Institute of Information & Communication Technology - TanmoyChakrabortyIndian Institute of Technology, Delhi + SourishDasguptaDhirubhai Ambani Institute of Information & Communication Technology + TanmoyChakrabortyIndian Institute of Technology, Delhi 16820-16842 2024.emnlp-main.935 patel-etal-2024-large @@ -13016,7 +13016,7 @@ GoonjanSaha Rocktim JyotiDasMohamed bin Zayed University of Artificial Intelligence DineshRaghuIBM Research - New Delhi - Mausam.Indian Institute of Technology Delhi + Mausam.Indian Institute of Technology Delhi 16843-16877 Medical task-oriented dialogue systems can assist doctors by collecting patient medical history, aiding in diagnosis, or guiding treatment selection, thereby reducing doctor burnout and expanding access to medical services. However, doctor-patient dialogue datasets are not readily available, primarily due to privacy regulations. Moreover, existing datasets lack comprehensive annotations involving medical slots and their different attributes, such as symptoms and their onset, progression, and severity. These comprehensive annotations are crucial for accurate diagnosis. Finally, most existing datasets are non-English, limiting their utility for the larger research community.In response, we introduce MediTOD, a new dataset of doctor-patient dialogues in English for the medical history-taking task. Collaborating with doctors, we devise a questionnaire-based labeling scheme tailored to the medical domain. Then, medical professionals create the dataset with high-quality comprehensive annotations, capturing medical slots and their attributes. We establish benchmarks in supervised and few-shot settings on MediTOD for natural language understanding, policy learning, and natural language generation subtasks, evaluating models from both TOD and biomedical domains. We make MediTOD publicly available for future research. 2024.emnlp-main.936 @@ -13026,7 +13026,7 @@ ***<fixed-case>Y</fixed-case>es<fixed-case>B</fixed-case>ut***: A High-Quality Annotated Multimodal Dataset for evaluating Satire Comprehension capability of Vision-Language Models - AbhilashNandyIndian Institute of Technology Kharagpur + AbhilashNandyIndian Institute of Technology Kharagpur YashAgarwal AshishPatwa Millon MadhurDas @@ -13046,7 +13046,7 @@ ChunhuiZhangDartmouth College YirenJianByteDance Inc. ZhongyuOuyang - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College 16896-16922 This study explores the inherent limitations of large language models (LLMs) from a scaling perspective, focusing on the upper bounds of their cognitive capabilities. We integrate insights from cognitive science to quantitatively examine how LLMs perform on n-back tasks—a benchmark used to assess working memory, which involves temporarily holding and manipulating information. Our findings reveal that despite the increased model size, LLMs still face significant challenges in holding and processing information effectively, especially under complex task conditions. We also assess various prompting strategies, revealing their diverse impacts on LLM performance. The results highlight the struggle of current LLMs to autonomously discover optimal problem-solving patterns without heavily relying on manually corrected prompts. To move beyond these constraints, fundamental improvements in the planning and search of LLMs are essential for them to reason autonomously. 
Improving these capabilities will reduce the reliance on external corrections and enable LLMs to become more autonomous in their problem-solving processes.
 2024.emnlp-main.938
@@ -13055,7 +13055,7 @@
 <fixed-case>RAFT</fixed-case>: Realistic Attacks to Fool Text Detectors
- James LiyuanWangColumbia University
+ James LiyuanWangColumbia University
 RanLiColumbia University
 JunfengYangColumbia University
 ChengzhiMaoGoogle
@@ -13097,7 +13097,7 @@
 <fixed-case>LLM</fixed-case>-based Code-Switched Text Generation for Grammatical Error Correction
 TomPotterThomson Reuters
- ZhengYuanKing’s College London, University of London
+ ZhengYuanKing’s College London, University of London
 16957-16965
 With the rise of globalisation, code-switching (CSW) has become a ubiquitous part of multilingual conversation, posing new challenges for natural language processing (NLP), especially in Grammatical Error Correction (GEC). This work explores the complexities of applying GEC systems to CSW texts. Our objectives include evaluating the performance of state-of-the-art GEC systems on an authentic CSW dataset from English as a Second Language (ESL) learners, exploring synthetic data generation as a solution to data scarcity, and developing a model capable of correcting grammatical errors in monolingual and CSW texts. We generated synthetic CSW GEC data, resulting in one of the first substantial datasets for this task, and showed that a model trained on this data is capable of significant improvements over existing systems. This work targets ESL learners, aiming to provide educational technologies that aid in the development of their English grammatical correctness without constraining their natural multilingualism.
 2024.emnlp-main.942
@@ -13106,8 +13106,8 @@
 Deciphering the Interplay of Parametric and Non-parametric Memory in Retrieval-augmented Language Models
- MehrdadFarahani
- RichardJohanssonChalmers University of Technology and Göteborg University
+ MehrdadFarahani
+ RichardJohanssonChalmers University of Technology and Göteborg University
 16966-16977
 Generative language models often struggle with specialized or less-discussed knowledge. A potential solution is found in Retrieval-Augmented Generation (RAG) models, which retrieve information before generating responses. In this study, we explore how the Atlas approach, a RAG model, decides between what it already knows (parametric) and what it retrieves (non-parametric). We use causal mediation analysis and controlled experiments to examine how internal representations influence information processing. Our findings disentangle the effects of parametric knowledge and the retrieved context. They indicate that in cases where the model can choose between both types of information (parametric and non-parametric), it relies more on the context than the parametric knowledge. Furthermore, the analysis investigates the computations involved in how the model uses the information from the context. We find that multiple mechanisms are active within the model and can be detected with mediation analysis: first, the decision of whether the context is relevant, and second, how the encoder computes output representations to support copying when relevant.
2024.emnlp-main.943
@@ -13128,11 +13128,11 @@
 Community-Cross-Instruct: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities
- ZihaoHe
+ ZihaoHe
 Minh DucChu
 RebeccaDorn
- SiyiGuo
- KristinaLermanUniversity of Southern California and USC Information Sciences Institute
+ SiyiGuo
+ KristinaLermanUniversity of Southern California and USC Information Sciences Institute
 17001-17019
 Social scientists use surveys to probe the opinions and beliefs of populations, but these methods are slow, costly, and prone to biases. Recent advances in large language models (LLMs) enable the creation of computational representations or “digital twins” of populations that generate human-like responses mimicking the population’s language, styles, and attitudes. We introduce Community-Cross-Instruct, an unsupervised framework for aligning LLMs to online communities to elicit their beliefs. Given a corpus of a community’s online discussions, Community-Cross-Instruct uses an advanced LLM to automatically generate instruction-output pairs to (1) finetune a foundational LLM to faithfully represent that community, and (2) evaluate the alignment of the finetuned model to the community. We demonstrate the method’s utility in accurately representing political and diet communities on Reddit. Unlike prior methods requiring human-authored instructions, Community-Cross-Instruct generates instructions in a fully unsupervised manner, enhancing scalability and generalization across domains. This work enables cost-effective and automated surveying of diverse online communities.
 2024.emnlp-main.945
@@ -13166,7 +13166,7 @@
 One Thousand and One Pairs: A “novel” challenge for long-context language models
- MarzenaKarpinska
+ MarzenaKarpinska
 KatherineThaiUniversity of Massachusetts at Amherst
 KyleLoAllen Institute for Artificial Intelligence
 TanyaGoyalCornell University
@@ -13215,10 +13215,10 @@
 How Susceptible are Large Language Models to Ideological Manipulation?
 KaiChen
- ZihaoHe
+ ZihaoHe
 JunYanGoogle
 TaiweiShi
- KristinaLermanUniversity of Southern California and USC Information Sciences Institute
+ KristinaLermanUniversity of Southern California and USC Information Sciences Institute
 17140-17161
 Large Language Models (LLMs) possess the potential to exert substantial influence on public perceptions and interactions with information. This raises concerns about the societal impact that could arise if the ideologies within these models can be easily manipulated. In this work, we investigate how effectively LLMs can learn and generalize ideological biases from their instruction-tuning data. Our findings reveal a concerning vulnerability: exposure to only a small amount of ideologically driven samples significantly alters the ideology of LLMs. Notably, LLMs demonstrate a startling ability to absorb ideology from one topic and generalize it to even unrelated ones. The ease with which LLMs’ ideologies can be skewed underscores the risks associated with training data intentionally poisoned by malicious actors, or with biases inadvertently introduced by data annotators. It also emphasizes the imperative for robust safeguards to mitigate the influence of ideological manipulations on LLMs.
 2024.emnlp-main.952
@@ -13247,7 +13247,7 @@
 JinZhao
 JingxuanTu
 HanDu
- NianwenXueBrandeis University
+ NianwenXueBrandeis University
 17197-17210
 Framing is used to present selected aspects of an issue and make them more salient, with the aim of promoting certain values, interpretations, or solutions (Entman, 1993).
This study investigates the nuances of media framing on public perception and understanding by examining how events are presented within news articles. Unlike previous research that primarily focused on word choice as a framing device, this work explores the comprehensive narrative construction through events and their relations. Our method integrates event extraction, cross-document event coreference, and causal relationship mapping among events to extract framing devices employed by media to assess their role in framing the narrative. We evaluate our approach with a media attitude detection task and show that the use of event mentions, event cluster descriptors, and their causal relations effectively captures the subtle nuances of framing, thereby providing deeper insights into the attitudes conveyed by news articles. The experimental results show the framing device models surpass the baseline models and offers a more detailed and explainable analysis of media framing effects. We make the source code and dataset publicly available. 2024.emnlp-main.954 @@ -13256,9 +13256,9 @@ Fill In The Gaps: Model Calibration and Generalization with Synthetic Data - YangBa - Michelle VMancenidoArizona State University - RongPanArizona State University + YangBa + Michelle VMancenidoArizona State University + RongPanArizona State University 17211-17225 As machine learning models continue to swiftly advance, calibrating their performance has become a major concern prior to practical and widespread implementation. Most existing calibration methods often negatively impact model accuracy due to the lack of diversity of validation data, resulting in reduced generalizability. To address this, we propose a calibration method that incorporates synthetic data without compromising accuracy. We derive the expected calibration error (ECE) bound using the Probably Approximately Correct (PAC) learning framework. Large language models (LLMs), known for their ability to mimic real data and generate text with mixed class labels, are utilized as a synthetic data generation strategy to lower the ECE bound and improve model accuracy on real test data. Additionally, we propose data generation mechanisms for efficient calibration. Testing our method on four different natural language processing tasks, we observed an average up to 34% increase in accuracy and 33% decrease in ECE. 2024.emnlp-main.955 @@ -13267,8 +13267,8 @@ Adaptive Question Answering: Enhancing Language Model Proficiency for Addressing Knowledge Conflicts with Source Citations - SagiShaier - AriKobrenOracle Labs + SagiShaier + AriKobrenOracle Labs Philip V.OgrenOracle 17226-17239 Resolving knowledge conflicts is a crucial challenge in Question Answering (QA) tasks, as the internet contains numerous conflicting facts and opinions. While some research has made progress in tackling ambiguous settings where multiple valid answers exist, these approaches often neglect to provide source citations, leaving users to evaluate the factuality of each answer. On the other hand, existing work on citation generation has focused on unambiguous settings with single answers, failing to address the complexity of real-world scenarios. Despite the importance of both aspects, no prior research has combined them, leaving a significant gap in the development of QA systems. In this work, we bridge this gap by proposing the novel task of QA with source citation in ambiguous settings, where multiple valid answers exist. 
To facilitate research in this area, we create a comprehensive framework consisting of: (1) five novel datasets, obtained by augmenting three existing reading comprehension datasets with citation meta-data across various ambiguous settings, such as distractors and paraphrasing; (2) the first ambiguous multi-hop QA dataset featuring real-world, naturally occurring contexts; (3) two new metrics to evaluate models’ performances; and (4) several strong baselines using rule-based, prompting, and finetuning approaches over five large language models. We hope that this new task, datasets, metrics, and baselines will inspire the community to push the boundaries of QA research and develop more trustworthy and interpretable systems. @@ -13303,7 +13303,7 @@ <fixed-case>M</fixed-case>eme<fixed-case>CLIP</fixed-case>: Leveraging <fixed-case>CLIP</fixed-case> Representations for Multimodal Meme Classification Siddhant BikramShahNortheastern University - ShuvamShiwakoti + ShuvamShiwakoti MaheepChaudhary HaohanWangUniversity of Illinois at Urbana-Champaign 17320-17332 @@ -13329,14 +13329,14 @@ <fixed-case>S</fixed-case>tory<fixed-case>S</fixed-case>park<fixed-case>QA</fixed-case>: Expert-Annotated <fixed-case>QA</fixed-case> Pairs with Real-World Knowledge for Children’s Story-Based Learning JiajuChen - YuxuanLu - ShaoZhangShanghai Jiao Tong University - BingshengYaoNortheastern University + YuxuanLu + ShaoZhangShanghai Jiao Tong University + BingshengYaoNortheastern University YuanzheDongStanford University YingXuUniversity of Michigan - Ann Arbor YunyaoLiAdobe Systems - QianwenWangUniversity of Minnesota - Twin Cities - DakuoWangNortheastern University + QianwenWangUniversity of Minnesota - Twin Cities + DakuoWangNortheastern University YulingSunEast China Normal University 17351-17370 Interactive story reading is common in early childhood education, where teachers expect to teach both language skills and real-world knowledge beyond the story. While many story reading systems have been developed for this activity, they often fail to infuse real-world knowledge into the conversation. This limitation can be attributed to the existing question-answering (QA) datasets used for children’s education, upon which the systems are built, failing to capture the nuances of how education experts think when conducting interactive story reading activities. To bridge this gap, we design an annotation framework, empowered by existing knowledge graph to capture experts’ annotations and thinking process, and leverage this framework to construct StorySparkQA dataset, which comprises 5, 868 expert-annotated QA pairs with real-world knowledge. We conduct automated and human expert evaluations across various QA pair generation settings to demonstrate that our StorySparkQA can effectively support models in generating QA pairs that target real-world knowledge beyond story content. StorySparkQA is available at https://huggingface.co/datasets/NEU-HAI/StorySparkQA. @@ -13349,8 +13349,8 @@ JiaxiangLiu YuanWang JiaweiDu - Joey TianyiZhouA*STAR Centre for Frontier AI Research - ZuozhuLiuZhejiang University + Joey TianyiZhouA*STAR Centre for Frontier AI Research + ZuozhuLiuZhejiang University 17371-17389 Artificial intelligence has advanced in Medical Visual Question Answering (Med-VQA), but prevalent research tends to focus on the accuracy of the answers, often overlooking the reasoning paths and interpretability, which are crucial in clinical settings. 
Besides, current Med-VQA algorithms, typically reliant on singular models, lack the robustness needed for real-world medical diagnostics which usually require collaborative expert evaluation. To address these shortcomings, this paper presents MedCoT, a novel hierarchical expert verification reasoning chain method designed to enhance interpretability and accuracy in biomedical imaging inquiries. MedCoT is predicated on two principles: The necessity for explicit reasoning paths in Med-VQA and the requirement for multi-expert review to formulate accurate conclusions. The methodology involves an Initial Specialist proposing diagnostic rationales, followed by a Follow-up Specialist who validates these rationales, and finally, a consensus is reached through a vote among a sparse Mixture of Experts within the locally deployed Diagnostic Specialist, which then provides the definitive diagnosis. Experimental evaluations on four standard Med-VQA datasets demonstrate that MedCoT surpasses existing state-of-the-art approaches, providing significant improvements in performance and interpretability. 2024.emnlp-main.962 @@ -13398,7 +13398,7 @@ A Simple yet Effective Training-free Prompt-free Approach to <fixed-case>C</fixed-case>hinese Spelling Correction Based on Large Language Models - HouquanZhouSoochow University, China + HouquanZhouSoochow University, China ZhenghuaLiSoochow University, China BoZhang ChenLi @@ -13416,7 +13416,7 @@ Representational Analysis of Binding in Language Models QinDaiTohoku University BenjaminHeinzerlingRIKEN and Tohoku University - KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University + KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University 17468-17493 Entity tracking is essential for complex reasoning. To perform in-context entity tracking, language models (LMs) must bind an entity to its attribute (e.g., bind a container to its content) to recall attribute for a given entity. For example, given a context mentioning “The coffee is in Box Z, the stone is in Box M, the map is in Box H”, to infer “Box Z contains the coffee” later, LMs must bind “Box Z” to “coffee”. To explain the binding behaviour of LMs, existing research introduces a Binding ID mechanism and states that LMs use a abstract concept called Binding ID (BI) to internally mark entity-attribute pairs. However, they have not directly captured the BI information from entity activations. In this work, we provide a novel view of the Binding ID mechanism by localizing the BI information. Specifically, we discover that there exists a low-rank subspace in the hidden state (or activation) of LMs, that primarily encodes BIs. To identify this subspace, we take principle component analysis as our first attempt and it is empirically proven to be effective. Moreover, we also discover that when editing representations along directions in the subspace, LMs tend to bind a given entity to other attributes accordingly. For example, by patching activations along the BI encoding direction we can make the LM to infer “Box Z contains the stone” and “Box Z contains the map”. 
2024.emnlp-main.967 @@ -13427,10 +13427,10 @@ <fixed-case>C</fixed-case>o<fixed-case>S</fixed-case>afe: Evaluating Large Language Model Safety in Multi-Turn Dialogue Coreference ErxinYuHong Kong Polytechnic University - JingLiThe Hong Kong Polytechnic University + JingLiThe Hong Kong Polytechnic University MingLiao - SiqiWang - GaoZuchen + SiqiWang + GaoZuchen FeiMi LanqingHongHuawei Technologies Ltd. 17494-17508 @@ -13441,7 +13441,7 @@ <fixed-case>C</fixed-case>lim<fixed-case>R</fixed-case>etrieve: A Benchmarking Dataset for Information Retrieval from Corporate Climate Disclosures - TobiasSchimanski + TobiasSchimanski JingweiNiETHZ - ETH Zurich Roberto SpaceyMartínUniversity of Oxford NicolaRanger @@ -13454,10 +13454,10 @@ Context-Aware Adapter Tuning for Few-Shot Relation Learning in Knowledge Graphs - LiuRan - ZhongzhouLiuSingapore Management University - XiaoliLi - YuanFangSingapore Management University + LiuRan + ZhongzhouLiuSingapore Management University + XiaoliLi + YuanFangSingapore Management University 17525-17537 Knowledge graphs (KGs) are instrumental in various real-world applications, yet they often suffer from incompleteness due to missing relations. To predict instances for novel relations with limited training examples, few-shot relation learning approaches have emerged, utilizing techniques such as meta-learning. However, the assumption is that novel relations in meta-testing and base relations in meta-training are independently and identically distributed, which may not hold in practice. To address the limitation, we propose RelAdapter, a context-aware adapter for few-shot relation learning in KGs designed to enhance the adaptation process in meta-learning. First, RelAdapter is equipped with a lightweight adapter module that facilitates relation-specific, tunable adaptation of meta-knowledge in a parameter-efficient manner. Second, RelAdapter is enriched with contextual information about the target relation, enabling enhanced adaptation to each distinct relation. Extensive experiments on three benchmark KGs validate the superiority of RelAdapter over state-of-the-art methods. 2024.emnlp-main.970 @@ -13468,7 +13468,7 @@ Zero-Shot Detection of <fixed-case>LLM</fixed-case>-Generated Text using Token Cohesiveness - ShixuanMaBeijing University of Posts and Telecommunications + ShixuanMaBeijing University of Posts and Telecommunications QuanWangBeijing University of Posts and Telecommunications 17538-17553 The increasing capability and widespread usage of large language models (LLMs) highlight the desirability of automatic detection of LLM-generated text. Zero-shot detectors, due to their training-free nature, have received considerable attention and notable success. In this paper, we identify a new feature, token cohesiveness, that is useful for zero-shot detection, and we demonstrate that LLM-generated text tends to exhibit higher token cohesiveness than human-written text. Based on this observation, we devise TOCSIN, a generic dual-channel detection paradigm that uses token cohesiveness as a plug-and-play module to improve existing zero-shot detectors. To calculate token cohesiveness, TOCSIN only requires a few rounds of random token deletion and semantic difference measurement, making it particularly suitable for a practical black-box setting where the source model used for generation is not accessible. 
Extensive experiments with four state-of-the-art base detectors on various datasets, source models, and evaluation settings demonstrate the effectiveness and generality of the proposed approach. Code available at: https://github.com/Shixuan-Ma/TOCSIN. @@ -13521,7 +13521,7 @@ Shishir GPatilUniversity of California, Berkeley ZiyangWu TianjunZhangUniversity of California Berkeley - KurtKeutzerUniversity of California Berkeley + KurtKeutzerUniversity of California Berkeley Joseph E.GonzalezUniversity of California - Berkeley, University of California-Berkeley and UC Berkeley, University of California Berkeley Raluca AdaPopaUniversity of California, Berkeley 17605-17621 @@ -13549,7 +13549,7 @@ Mentor-<fixed-case>KD</fixed-case>: Making Small Language Models Better Multi-step Reasoners HojaeLee JunhoKimKorea University - SangKeunLeeKorea University + SangKeunLeeKorea University 17643-17658 Large Language Models (LLMs) have displayed remarkable performances across various complex tasks by leveraging Chain-of-Thought (CoT) prompting. Recently, studies have proposed a Knowledge Distillation (KD) approach, reasoning distillation, which transfers such reasoning ability of LLMs through fine-tuning language models of multi-step rationales generated by LLM teachers. However, they have inadequately considered two challenges regarding insufficient distillation sets from the LLM teacher model, in terms of 1) data quality and 2) soft label provision. In this paper, we propose Mentor-KD, which effectively distills the multi-step reasoning capability of LLMs to smaller LMs while addressing the aforementioned challenges. Specifically, we exploit a mentor, intermediate-sized task-specific fine-tuned model, to augment additional CoT annotations and provide soft labels for the student model during reasoning distillation. We conduct extensive experiments and confirm Mentor-KD’s effectiveness across various models and complex reasoning tasks. 2024.emnlp-main.977 @@ -13560,11 +13560,11 @@ Are Large Language Models Capable of Generating Human-Level Narratives? YufeiTian TenghaoHuang - MiriLiuUniversity of California, Los Angeles - DerekJiang + MiriLiuUniversity of California, Los Angeles + DerekJiang AlexanderSpangherUniversity of Southern California - MuhaoChenUniversity of California, Davis and University of Southern California - JonathanMayUniversity of Southern California and USC/ISI + MuhaoChenUniversity of California, Davis and University of Southern California + JonathanMayUniversity of Southern California and USC/ISI NanyunPengUniversity of California, Los Angeles 17659-17681 As daily reliance on large language models (LLMs) grows, assessing their generation quality is crucial to understanding how they might impact on our communications. This paper investigates the capability of LLMs in storytelling, focusing on narrative development and plot progression. We introduce a novel computational framework to analyze narratives through three discourse-level aspects: i) story arcs, ii) turning points, and iii) affective dimensions, including arousal and valence. By leveraging expert and automatic annotations, we uncover significant discrepancies between the LLM- and human- written stories. While human-written stories are suspenseful, arousing, and diverse in narrative structures, LLM stories are homogeneously positive and lack tension. Next, we measure narrative reasoning skills as a precursor to generative capacities, concluding that most LLMs fall short of human abilities in discourse understanding. 
Finally, we show that explicit integration of aforementioned discourse features can enhance storytelling, as is demonstrated by over 40% improvement in neural storytelling in terms of diversity, suspense, and arousal. Such advances promise to facilitate greater and more natural roles LLMs in human communication. @@ -13577,8 +13577,8 @@ <fixed-case>MP</fixed-case>2<fixed-case>D</fixed-case>: An Automated Topic Shift Dialogue Generation Framework Leveraging Knowledge Graphs YerinHwangSeoul National University YongilKimSeoul National University - YunahJang - JeesooBangLG AI Research + YunahJang + JeesooBangLG AI Research HyunkyungBaeLG AI Research KyominJung 17682-17702 @@ -13591,8 +13591,8 @@ Can Large Language Models Enhance Predictions of Disease Progression? Investigating Through Disease Network Link Prediction - HaohuiLuUniversity of Sydney, University of Sydney - UsmanNaseemMacquarie University + HaohuiLuUniversity of Sydney, University of Sydney + UsmanNaseemMacquarie University 17703-17715 Large Language Models (LLMs) have made significant strides in various tasks, yet their effectiveness in predicting disease progression remains relatively unexplored. To fill this gap, we use LLMs and employ advanced graph prompting and Retrieval-Augmented Generation (RAG) to predict disease comorbidity within disease networks. Specifically, we introduce a disease Comorbidity prediction model using LLM, named ComLLM, which leverages domain knowledge to enhance the prediction performance. Based on the comprehensive experimental results, ComLLM consistently outperforms conventional models, such as Graph Neural Networks, achieving average area under the curve (AUC) improvements of 10.70% and 6.07% over the best baseline models in two distinct disease networks. ComLLM is evaluated across multiple settings for disease progression prediction, employing various prompting strategies, including zero-shot, few-shot, Chain-of-Thought, graph prompting and RAG. Our results show that graph prompting and RAG enhance LLM performance in disease progression prediction tasks. ComLLM exhibits superior predictive capabilities and serves as a proof-of-concept for LLM-based systems in disease progression prediction, highlighting its potential for broad applications in healthcare. 2024.emnlp-main.980 @@ -13601,7 +13601,7 @@ Searching for Best Practices in Retrieval-Augmented Generation - XiaohuaWang + XiaohuaWang ZhenghuaWang XuanGao FeiranZhang @@ -13613,7 +13613,7 @@ QiQian RuichengYin ChangzeLv - XiaoqingZheng + XiaoqingZheng XuanjingHuangFudan University 17716-17736 Retrieval-augmented generation (RAG) techniques have proven to be effective in integrating up-to-date information, mitigating hallucinations, and enhancing response quality, particularly in specialized domains. While many RAG approaches have been proposed to enhance large language models through query-dependent retrievals, these approaches still suffer from their complex implementation and prolonged response times. Typically, a RAG workflow involves multiple processing steps, each of which can be executed in various ways. Here, we investigate existing RAG approaches and their potential combinations to identify optimal RAG practices. Through extensive experiments, we suggest several strategies for deploying RAG that balance both performance and efficiency. 
Moreover, we demonstrate that multimodal retrieval techniques can significantly enhance question-answering capabilities about visual inputs and accelerate the generation of multimodal content using a “retrieval as generation” strategy. @@ -13638,7 +13638,7 @@ The Zeno’s Paradox of ‘Low-Resource’ Languages Hellina HailuNigatuMohamed bin Zayed University of Artificial Intelligence and Electrical Engineering & Computer Science Department, University of California, Berkeley - Atnafu LambeboTonjaMohamed bin Zayed University of Artificial Intelligence and Instituto Politécnico Nacional + Atnafu LambeboTonjaMohamed bin Zayed University of Artificial Intelligence and Instituto Politécnico Nacional BenjaminRosmanUniversity of the Witwatersrand ThamarSolorioMohamed bin Zayed University of Artificial Intelligence and University of Houston MonojitChoudhuryMohamed bin Zayed University of Artificial Intelligence @@ -13652,8 +13652,8 @@ Knowledge Planning in Large Language Models for Domain-Aligned Counseling Summarization AseemSrivastavaIndraprastha Institute of Information Technology, Delhi - SmritiJoshi - TanmoyChakrabortyIndian Institute of Technology, Delhi + SmritiJoshi + TanmoyChakrabortyIndian Institute of Technology, Delhi Md ShadAkhtarIndraprastha Institute of Information Technology, Delhi 17775-17789 In mental health counseling, condensing dialogues into concise and relevant summaries (aka counseling notes) holds pivotal significance. Large Language Models (LLMs) exhibit remarkable capabilities in various generative tasks; however, their adaptation to domain-specific intricacies remains challenging, especially within mental health contexts. Unlike standard LLMs, mental health experts first plan to apply domain knowledge in writing summaries. Our work enhances LLMs’ ability by introducing a novel planning engine to orchestrate structuring knowledge alignment. To achieve high-order planning, we divide knowledge encapsulation into two major phases: (i) holding dialogue structure and (ii) incorporating domain-specific knowledge. We employ a planning engine on Llama-2, resulting in a novel framework, PIECE. Our proposed system employs knowledge filtering-cum-scaffolding to encapsulate domain knowledge. Additionally, PIECE leverages sheaf convolution learning to enhance its understanding of the dialogue’s structural nuances. We compare PIECE with 14 baseline methods and observe a significant improvement across ROUGE and Bleurt scores. Further, expert evaluation and analyses validate the generation quality to be effective, sometimes even surpassing the gold standard. We further benchmark PIECE with other LLMs and report improvement, including Llama-2 (+2.72%), Mistral (+2.04%), and Zephyr (+1.59%), to justify the generalizability of the planning engine. @@ -13677,9 +13677,9 @@ From Descriptive Richness to Bias: Unveiling the Dark Side of Generative Image Caption Enrichment YusukeHirotaOsaka University - RyoHachiumaNVIDIA - Chao-Han HuckYangNVIDIA Research - YutaNakashimaOsaka University + RyoHachiumaNVIDIA + Chao-Han HuckYangNVIDIA Research + YutaNakashimaOsaka University 17807-17816 Large language models (LLMs) have enhanced the capacity of vision-language models to caption visual text. This generative approach to image caption enrichment further makes textual captions more descriptive, improving alignment with the visual context. However, while many studies focus on the benefits of generative caption enrichment (GCE), are there any negative side effects? 
We compare standard-format captions and recent GCE processes from the perspectives of gender bias and hallucination, showing that enriched captions suffer from increased gender bias and hallucination. Furthermore, models trained on these enriched captions amplify gender bias by an average of 30.9% and increase hallucination by 59.5%. This study serves as a caution against the trend of making captions more descriptive. 2024.emnlp-main.986 @@ -13711,8 +13711,8 @@ Embedded Named Entity Recognition using Probing Classifiers - NicholasPopovicKarlsruher Institut für Technologie, Karlsruher Institut für Technologie and Karlsruher Institut für Technologie - MichaelFärberTechnische Universität Dresden + NicholasPopovicKarlsruher Institut für Technologie, Karlsruher Institut für Technologie and Karlsruher Institut für Technologie + MichaelFärberTechnische Universität Dresden 17830-17850 Streaming text generation, has become a common way of increasing the responsiveness of language model powered applications such as chat assistants. At the same time, extracting semantic information from generated text is a useful tool for applications such as automated fact checking or retrieval augmented generation. Currently, this requires either separate models during inference, which increases computational cost, or destructive fine-tuning of the language model. Instead, we propose an approach called EMBER which enables streaming named entity recognition in decoder-only language models without fine-tuning them and while incurring minimal additional computational cost at inference time. Specifically, our experiments show that EMBER maintains high token generation rates, with only a negligible decrease in speed of around 1% compared to a 43.64% slowdown measured for a baseline. We make our code and data available online, including a toolkit for training, testing, and deploying efficient token classification models optimized for streaming text generation. 2024.emnlp-main.988 @@ -13723,9 +13723,9 @@ Unleashing the Power of Emojis in Texts via Self-supervised Graph Pre-Training ZhouZhang DongzengTanAlibaba Group - JiaanWangTencent + JiaanWangTencent YilongChen - JiarongXuFudan University + JiarongXuFudan University 17851-17863 Emojis have gained immense popularity on social platforms, serving as a common means to supplement or replace text. However, existing data mining approaches generally either completely ignore or simply treat emojis as ordinary Unicode characters, which may limit the model’s ability to grasp the rich semantic information in emojis and the interaction between emojis and texts. Thus, it is necessary to release the emoji’s power in social media data mining. To this end, we first construct a heterogeneous graph consisting of three types of nodes, i.e. post, word and emoji nodes to improve the representation of different elements in posts. The edges are also well-defined to model how these three elements interact with each other. To facilitate the sharing of information among post, word and emoji nodes, we propose a graph pre-train framework for text and emoji co-modeling, which contains two graph pre-training tasks: node-level graph contrastive learning and edge-level link reconstruction learning. Extensive experiments on the Xiaohongshu and Twitter datasets with two types of downstream tasks demonstrate that our approach proves significant improvement over previous strong baseline methods. 
2024.emnlp-main.989 @@ -13734,8 +13734,8 @@ Data Contamination Can Cross Language Barriers - FengYao - YufanZhuangUniversity of California, San Diego + FengYao + YufanZhuangUniversity of California, San Diego ZihaoSun SunanXu AnimeshKumar @@ -13750,7 +13750,7 @@ Automated Essay Scoring: A Reflection on the State of the Art - ShengjieLiUniversity of Texas at Dallas + ShengjieLiUniversity of Texas at Dallas VincentNgUniversity of Texas at Dallas 17876-17888 While steady progress has been made on the task of automated essay scoring (AES) in the past decade, much of the recent work in this area has focused on developing models that beat existing models on a standard evaluation dataset. While improving performance numbers remains an important goal in the short term, such a focus is not necessarily beneficial for the long-term development of the field. We reflect on the state of the art in AES research, discussing issues that we believe can encourage researchers to think bigger than improving performance numbers with the ultimate goal of triggering discussion among AES researchers on how we should move forward. @@ -13761,12 +13761,12 @@ Encouraging Divergent Thinking in Large Language Models through Multi-Agent Debate TianLiangTencent AI Lab - ZhiweiHeShanghai Jiao Tong University + ZhiweiHeShanghai Jiao Tong University WenxiangJiaoTencent AI Lab - XingWangTencent AI Lab + XingWangTencent AI Lab YanWangTencent - RuiWangShanghai Jiao Tong University - YujiuYangGraduate School at Shenzhen,Tsinghua University + RuiWangShanghai Jiao Tong University + YujiuYangGraduate School at Shenzhen,Tsinghua University ShumingShiTencent AI Lab ZhaopengTuTencent AI Lab 17889-17904 @@ -13782,7 +13782,7 @@ YiwenGuo HaojieWei ZhanqiuZhang - PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh + PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh RuotianMa TaoGuiFudan University QiZhangFudan University @@ -13795,10 +13795,10 @@ <fixed-case>CURE</fixed-case>: Context- and Uncertainty-Aware Mental Disorder Detection - MigyeongKangSungkyunkwan University + MigyeongKangSungkyunkwan University GounChoi HyolimJeon - Ji HyunAnSamsung + Ji HyunAnSamsung DaejinChoi JinyoungHanSungkyunkwan University 17924-17940 @@ -13810,10 +13810,10 @@ <fixed-case>P</fixed-case>ep<fixed-case>R</fixed-case>ec: Progressive Enhancement of Prompting for Recommendation - YakunYu - Shi-angQiUniversity of Alberta - BaochunLi - DiNiuUniversity of Alberta and University of Alberta + YakunYu + Shi-angQiUniversity of Alberta + BaochunLi + DiNiuUniversity of Alberta and University of Alberta 17941-17953 With large language models (LLMs) achieving remarkable breakthroughs in natural language processing (NLP) domains, recent researchers have actively explored the potential of LLMs for recommendation systems by converting the input data into textual sentences through prompt templates. Although semantic knowledge from LLMs can help enrich the content information of items, to date it is still hard for them to achieve comparable performance to traditional deep learning recommendation models, partly due to a lack of ability to leverage collaborative filtering. In this paper, we propose a novel training-free prompting framework, PepRec, which aims to capture knowledge from both content-based filtering and collaborative filtering to boost recommendation performance with LLMs, while providing interpretation for the recommendation. 
Experiments based on two real-world datasets from different domains show that PepRec significantly outperforms various traditional deep learning recommendation models and prompt-based recommendation systems. 2024.emnlp-main.995 @@ -13825,7 +13825,7 @@ ChuanhaoLi ChenchenJing ZhenLiBeijing Institute of Technology - MingliangZhai + MingliangZhai YuweiWu YundeJia 17954-17966 @@ -13877,7 +13877,7 @@ WenduanXuQuantinuum StephenClarkQuantinuum DouglasBrown - GabrielMatosQuantinuum + GabrielMatosQuantinuum KonstantinosMeichanetzidis 18020-18027 We develop quantum RNNs with cells based on Parametrised Quantum Circuits (PQCs). PQCs can provide a form of hybrid quantum-classical computation where the input and the output is in the form of classical data. The previous “hidden” state is the quantum state from the previous time-step, and an angle encoding is used to define a (non-linear) mapping from a classical word embedding into the quantum Hilbert space. Measurements of the quantum state provide classical statistics which are used for classification. We report results which are competitive with various RNN baselines on the Rotten Tomatoes dataset, as well as emulator results which demonstrate the feasibility of running such models on quantum hardware. @@ -13888,8 +13888,8 @@ Tree of Problems: Improving structured problem solving with compositionality Armel RandyZebazeINRIA - BenoîtSagotInria - RachelBawdenInria + BenoîtSagotInria + RachelBawdenInria 18028-18047 Large Language Models (LLMs) have demonstrated remarkable performance across multipletasks through in-context learning. For complex reasoning tasks that require step-by-step thinking, Chain-of-Thought (CoT) prompting has given impressive results, especially when combined with self-consistency. Nonetheless, some tasks remain particularly difficult for LLMs to solve. Tree of Thoughts (ToT) and Graph of Thoughts (GoT) emerged as alternatives, dividing the complex problem into paths of subproblems. In this paper, we propose Tree of Problems (ToP), a simpler version of ToT, which we hypothesise can work better for complex tasks that can be divided into identical subtasks. Our empirical results show that our approach outperforms ToT and GoT, and in addition per forms better than CoT on complex reasoning tasks. All code for this paper will be made available. 2024.emnlp-main.1001 @@ -13898,10 +13898,10 @@ What the Harm? Quantifying the Tangible Impact of Gender Bias in Machine Translation with a Human-centered Study - BeatriceSavoldi - SaraPapi - MatteoNegriFondazione Bruno Kessler - AnaGuerberof-ArenasUniversity of Groningen + BeatriceSavoldi + SaraPapi + MatteoNegriFondazione Bruno Kessler + AnaGuerberof-ArenasUniversity of Groningen LuisaBentivogliFondazione Bruno Kessler 18048-18076 Gender bias in machine translation (MT) is recognized as an issue that can harm people and society. And yet, advancements in the field rarely involve people, the final MT users, or inform how they might be impacted by biased technologies. Current evaluations are often restricted to automatic methods, which offer an opaque estimate of what the downstream impact of gender disparities might be. We conduct an extensive human-centered study to examine if and to what extent bias in MT brings harms with tangible costs, such as quality of service gaps across women and men. To this aim, we collect behavioral data from ~90 participants, who post-edited MT outputs to ensure correct gender translation. 
Across multiple datasets, languages, and types of users, our study shows that feminine post-editing demands significantly more technical and temporal effort, also corresponding to higher financial costs. Existing bias measurements, however, fail to reflect the found disparities. Our findings advocate for human-centered approaches that can inform the societal impact of bias. @@ -13913,10 +13913,10 @@ <fixed-case>S</fixed-case>eg2<fixed-case>A</fixed-case>ct: Global Context-aware Action Generation for Document Logical Structuring ZichaoLiInstitute of Software, Chinese Academy of Sciences - ShaojieHe + ShaojieHe MengLiao XuanangChen - YaojieLuInstitute of Software, Chinese Academy of Sciences + YaojieLuInstitute of Software, Chinese Academy of Sciences HongyuLinInstitute of Software, Chinese Academy of Sciences YanxiongLu XianpeiHanInstitute of Software, CAS @@ -13958,7 +13958,7 @@ A Survey of Ontology Expansion for Conversational Understanding JingguiLiangSingapore Management University YuxiaWuSingapore Management University - YuanFangSingapore Management University + YuanFangSingapore Management University HaoFeiNational University of Singapore LiziLiaoSingapore Management University 18111-18127 @@ -13972,7 +13972,7 @@ JohnathanXie Annie SChenStanford University YoonhoLeeStanford University - EricMitchell + EricMitchell ChelseaFinnStanford University and Google 18128-18138 The effectiveness of large language models (LLMs) is not only measured by their ability to generate accurate outputs but also by their calibration—how well their confidence scores reflect the probability of their outputs being correct. While unsupervised pre-training has been shown to yield LLMs with well-calibrated conditional probabilities, recent studies have shown that after fine-tuning with reinforcement learning from human feedback (RLHF), the calibration of these models degrades significantly. In this work, we introduce Adaptive Temperature Scaling (ATS), a post-hoc calibration method that predicts a temperature scaling parameter for each token prediction. The predicted temperature values adapt based on token-level features and are fit over a standard supervised fine-tuning (SFT) dataset. The adaptive nature of ATS addresses the varying degrees of calibration shift that can occur after RLHF fine-tuning. ATS improves calibration by over 10-50% across three downstream natural language evaluation benchmarks compared to prior calibration methods and does not impede performance improvements from RLHF. @@ -13986,7 +13986,7 @@ TakeshiKojimaThe University of Tokyo AndrewGambardellaThe University of Tokyo, Tokyo University QiCaoThe University of Tokyo - YusukeIwasawaThe University of Tokyo + YusukeIwasawaThe University of Tokyo YutakaMatsuoThe University of Tokyo and The University of Tokyo 18139-18149 Recent large language models (LLMs) have demonstrated remarkable generalization abilities in mathematics and logical reasoning tasks.Prior research indicates that LLMs pre-trained with programming language data exhibit high mathematical and reasoning abilities; however, this causal relationship has not been rigorously tested. Our research aims to verify which programming languages and features during pre-training affect logical inference performance. Specifically, we pre-trained decoder-based language models from scratch using datasets from ten programming languages (e.g., Python, C, Java) and three natural language datasets (Wikipedia, Fineweb, C4) under identical conditions. 
Thereafter, we evaluated the trained models in a few-shot in-context learning setting on logical reasoning tasks: FLD and bAbi, which do not require commonsense or world knowledge. The results demonstrate that nearly all models trained with programming languages consistently outperform those trained with natural languages, indicating that programming languages contain factors that elicit logic inference performance. In addition, we found that models trained with programming languages exhibit a better ability to follow instructions compared to those trained with natural languages. Further analysis reveals that the depth of Abstract Syntax Trees representing parsed results of programs also affects logical reasoning performance. These findings will offer insights into the essential elements of pre-training for acquiring the foundational abilities of LLMs. @@ -13996,8 +13996,8 @@ Why do objects have many names? A study on word informativeness in language use and lexical systems - EleonoraGualdoni - GemmaBoledaICREA and Universitat Pompeu Fabra + EleonoraGualdoni + GemmaBoledaICREA and Universitat Pompeu Fabra 18150-18163 Human lexicons contain many different words that speakers can use to refer to the same object, e.g., *purple* or *magenta* for the same shade of color. On the one hand, studies on language use have explored how speakers adapt their referring expressions to successfully communicate in context, without focusing on properties of the lexical system. On the other hand, studies in language evolution have discussed how competing pressures for informativeness and simplicity shape lexical systems, without tackling in-context communication. We aim at bridging the gap between these traditions, and explore why a soft mapping between referents and words is a good solution for communication, by taking into account both in-context communication and the structure of the lexicon. We propose a simple measure of informativeness for words and lexical systems, grounded in a visual space, and analyze color naming data for English and Mandarin Chinese. We conclude that optimal lexical systems are those where multiple words can apply to the same referent, conveying different amounts of information. Such systems allow speakers to maximize communication accuracy and minimize the amount of information they convey when communicating about referents in contexts. 2024.emnlp-main.1009 @@ -14006,7 +14006,7 @@ Dual-Space Knowledge Distillation for Large Language Models - SongmingZhangBeijing Jiaotong University + SongmingZhangBeijing Jiaotong University XueZhangBeijing Jiaotong University ZengkuiSun YufengChen @@ -14043,7 +14043,7 @@ <fixed-case>P</fixed-case>air<fixed-case>D</fixed-case>istill: Pairwise Relevance Distillation for Dense Retrieval - Chao-WeiHuangNational Taiwan University + Chao-WeiHuangNational Taiwan University Yun-NungChenDepartment of Computer Science and Informational Engineering, National Taiwan University 18225-18237 Effective information retrieval (IR) from vast datasets relies on advanced techniques to extract relevant information in response to queries. Recent advancements in dense retrieval have showcased remarkable efficacy compared to traditional sparse retrieval methods. To further enhance retrieval performance, knowledge distillation techniques, often leveraging robust cross-encoder rerankers, have been extensively explored. 
However, existing approaches primarily distill knowledge from pointwise rerankers, which assign absolute relevance scores to documents, thus facing challenges related to inconsistent comparisons. This paper introduces Pairwise Relevance Distillation (PairDistill) to leverage pairwise reranking, offering fine-grained distinctions between similarly relevant documents to enrich the training of dense retrieval models. Our experiments demonstrate that PairDistill outperforms existing methods, achieving new state-of-the-art results across multiple benchmarks. This highlights the potential of PairDistill in advancing dense retrieval techniques effectively. Our source code and trained models are released at https://github.com/MiuLab/PairDistill @@ -14067,12 +14067,12 @@ <fixed-case>H</fixed-case>i<fixed-case>FT</fixed-case>: A Hierarchical Full Parameter Fine-Tuning Strategy - YongKangLiu + YongKangLiu YiqunZhangNortheastern University - QianLiShandong University + QianLiShandong University TongLiuLudwig-Maximilians-Universität München - ShiFengNortheastern University, China - DalingWangNortheastern University + ShiFengNortheastern University, China + DalingWangNortheastern University YifeiZhangNortheastern University HinrichSchuetze 18266-18287 @@ -14107,7 +14107,7 @@ <fixed-case>T</fixed-case>ool<fixed-case>P</fixed-case>lanner: A Tool Augmented <fixed-case>LLM</fixed-case> for Multi Granularity Instructions with Path Planning and Feedback - QinzhuoWu + QinzhuoWu WeiLiuxiaomi JianLuan BinWangAI Lab, Xiaomi Inc. @@ -14131,7 +14131,7 @@ How to Compute the Probability of a Word TiagoPimentelDepartment of Computer Science, ETHZ - ETH Zurich - ClaraMeister + ClaraMeister 18358-18375 Language models (LMs) estimate a probability distribution over strings in a natural language; these distributions are crucial for computing perplexity and surprisal in linguistics research. While we are usually concerned with measuring these values for words, most LMs operate over subwords. Despite seemingly straightforward, accurately computing probabilities over one unit given probabilities over the other requires care. Indeed, we show here that many recent linguistic studies have been incorrectly computing these values. This paper derives the correct methods for computing word probabilities, highlighting issues when relying on language models that use beginning-of-word (bow)-marking tokenisers, e.g., the GPT family. Empirically, we show that correcting the widespread bug in probability computations affects measured outcomes in sentence comprehension and lexical optimisation analyses. 2024.emnlp-main.1020 @@ -14140,10 +14140,10 @@ A linguistically-motivated evaluation methodology for unraveling model’s abilities in reading comprehension tasks - ElieAntoineUniversité d’Aix-Marseille - FredericBechetAcadémie d’Aix-Marseille + ElieAntoineUniversité d’Aix-Marseille + FredericBechetAcadémie d’Aix-Marseille GéraldineDamnatiOrange Innovation - PhilippeLanglaisUniversité de Montréal + PhilippeLanglaisUniversité de Montréal 18376-18392 We introduce an evaluation methodology for reading comprehension tasks based on the intuition that certain examples, by the virtue of their linguistic complexity, consistently yield lower scores regardless of model size or architecture. We capitalize on semantic frame annotation for characterizing this complexity, and study seven complexity factors that may account for model’s difficulty. 
We first deploy this methodology on a carefully annotated French reading comprehension benchmark showing that two of those complexity factors are indeed good predictors of models’ failure, while others are less so. We further deploy our methodology on a well studied English benchmark by using chatGPT as a proxy for semantic annotation.Our study reveals that fine-grained linguistically-motivated automatic evaluation of a reading comprehension task is not only possible, but helps understand models’ abilities to handle specific linguistic characteristics of input examples. It also shows that current state-of-the-art models fail with some for those characteristics which suggests that adequately handling them requires more than merely increasing model size. 2024.emnlp-main.1021 @@ -14152,7 +14152,7 @@ <fixed-case>G</fixed-case>uard<fixed-case>B</fixed-case>ench: A Large-Scale Benchmark for Guardrail Models - EliasBassaniEuropean Commission, Joint Research Centre + EliasBassaniEuropean Commission, Joint Research Centre IgnacioSanchez 18393-18409 Generative AI systems powered by Large Language Models have become increasingly popular in recent years. Lately, due to the risk of providing users with unsafe information, the adoption of those systems in safety-critical domains has raised significant concerns. To respond to this situation, input-output filters, commonly called guardrail models, have been proposed to complement other measures, such as model alignment. Unfortunately, the lack of a standard benchmark for guardrail models poses significant evaluation issues and makes it hard to compare results across scientific publications. To fill this gap, we introduce GuardBench, a large-scale benchmark for guardrail models comprising 40 safety evaluation datasets. To facilitate the adoption of GuardBench, we release a Python library providing an automated evaluation pipeline built on top of it. With our benchmark, we also share the first large-scale prompt moderation datasets in German, French, Italian, and Spanish. To assess the current state-of-the-art, we conduct an extensive comparison of recent guardrail models and show that a general-purpose instruction-following model of comparable size achieves competitive results without the need for specific fine-tuning. @@ -14165,9 +14165,9 @@ YaoXu ShizhuHeInstitute of automation, Chinese academy of science, Chinese Academy of Sciences JiabeiChenInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - ZihaoWang + ZihaoWang YangqiuSongThe Hong Kong University of Science and Technology - HanghangTong + HanghangTong GuangLiu JunZhaoInstitute of automation, Chinese academy of science KangLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences @@ -14180,7 +14180,7 @@ Language models and brains align due to more than next-word prediction and word-level information GabrieleMerlinMPI-SWS - MariyaTonevaMax Planck Institute for Software Systems + MariyaTonevaMax Planck Institute for Software Systems 18431-18454 Pretrained language models have been shown to significantly predict brain recordings of people comprehending language. Recent work suggests that the prediction of the next word is a key mechanism that contributes to this alignment. What is not yet understood is whether prediction of the next word is necessary for this observed alignment or simply sufficient, and whether there are other shared mechanisms or information that are similarly important. 
In this work, we take a step towards understanding the reasons for brain alignment via two simple perturbations in popular pretrained language models. These perturbations help us design contrasts that can control for different types of information. By contrasting the brain alignment of these differently perturbed models, we show that improvements in alignment with brain recordings are due to more than improvements in next-word prediction and word-level information. 2024.emnlp-main.1024 @@ -14191,9 +14191,9 @@ <fixed-case>LLME</fixed-case>dge<fixed-case>R</fixed-case>efine: Enhancing Text Clustering with <fixed-case>LLM</fixed-case>-Based Boundary Point Refinement ZijinFengChinese University of Hong Kong LuyangLinThe Chinese University of Hong Kong - LingzhiWangThe Chinese University of Hong Kong + LingzhiWangThe Chinese University of Hong Kong HongChengThe Chinese University of Hong Kong - Kam-FaiWongThe Chinese University of Hong Kong + Kam-FaiWongThe Chinese University of Hong Kong 18455-18462 Text clustering is a fundamental task in natural language processing with numerous applications. However, traditional clustering methods often struggle with domain-specific fine-tuning and the presence of outliers. To address these challenges, we introduce LLMEdgeRefine, an iterative clustering method enhanced by large language models (LLMs), focusing on edge points refinement. LLMEdgeRefine enhances current clustering methods by creating super-points to mitigate outliers and iteratively refining clusters using LLMs for improved semantic coherence. Our method demonstrates superior performance across multiple datasets, outperforming state-of-the-art techniques, and offering robustness, adaptability, and cost-efficiency for diverse text clustering applications. 2024.emnlp-main.1025 @@ -14204,10 +14204,10 @@ <fixed-case>C</fixed-case>asi<fixed-case>M</fixed-case>edicos-Arg: A Medical Question Answering Dataset Annotated with Explanatory Argumentative Structures EkaterinaSviridovaUniversité Côte d’Azur AnarYeginbergen - AinaraEstarronaUniversidad del País Vasco - ElenaCabrioUniversité Côte d’Azur + AinaraEstarronaUniversidad del País Vasco + ElenaCabrioUniversité Côte d’Azur SerenaVillataCNRS - RodrigoAgerriUniversity of the Basque Country + RodrigoAgerriUniversity of the Basque Country 18463-18475 Explaining Artificial Intelligence (AI) decisions is a major challenge nowadays in AI, in particular when applied to sensitive scenarios like medicine and law. However, the need to explain the rationale behind decisions is a main issues also for human-based deliberation as it is important to justify why a certain decision has been taken. Resident medical doctors for instance are required not only to provide a (possibly correct) diagnosis, but also to explain how they reached a certain conclusion. Developing new tools to aid residents to train their explanation skills is therefore a central objective of AI in education. In this paper, we follow this direction, and we present, to the best of our knowledge, the first multilingual dataset for Medical Question Answering where correct and incorrect diagnoses for a clinical case are enriched with a natural language explanation written by doctors. These explanations have been manually annotated with argument components (i.e., premise, claim) and argument relations (i.e., attack, support). 
The Multilingual CasiMedicos-arg dataset consists of 558 clinical cases (English, Spanish, French, Italian) with explanations, where we annotated 5021 claims, 2313 premises, 2431 support relations, and 1106 attack relations. We conclude by showing how competitive baselines perform over this challenging dataset for the argument mining task. 2024.emnlp-main.1026 @@ -14216,10 +14216,10 @@ A Simple and Effective <tex-math>L\_2</tex-math> Norm-Based Strategy for <fixed-case>KV</fixed-case> Cache Compression - AlessioDevoto - YuZhaoUniversity of Edinburgh - SimoneScardapaneSapienza University of Rome - PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh + AlessioDevoto + YuZhaoUniversity of Edinburgh + SimoneScardapaneSapienza University of Rome + PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh 18476-18499 The deployment of large language models (LLMs) is often hindered by the extensive memory requirements of the Key-Value (KV) cache, especially as context lengths increase. Existing approaches to reduce the KV cache size involve either fine-tuning the model to learn a compression strategy or leveraging attention scores to reduce the sequence length. We analyse the attention distributions in decoder-only Transformers-based models and observe that attention allocation patterns stay consistent across most layers. Surprisingly, we find a clear correlation between the L_2 norm and the attention scores over cached KV pairs, where a low L_2 norm of a key embedding usually leads to a high attention score during decoding. This finding indicates that the influence of a KV pair is potentially determined by the key embedding itself before being queried. Based on this observation, we compress the KV cache based on the L_2 norm of key embeddings. Our experimental results show that this simple strategy can reduce the KV cache size by 50% on language modelling and needle-in-a-haystack tasks and 90% on passkey retrieval tasks without losing accuracy. Moreover, without relying on the attention scores, this approach remains compatible with FlashAttention, enabling broader applicability. 2024.emnlp-main.1027 @@ -14229,8 +14229,8 @@ <fixed-case>GOME</fixed-case>: Grounding-based Metaphor Binding With Conceptual Elaboration For Figurative Language Illustration - LinhaoZhang - JintaoLiu + LinhaoZhang + JintaoLiu LiJin HaoWangNorth China University of Technology KaiwenWeiChongqing University @@ -14244,7 +14244,7 @@ <fixed-case>D</fixed-case>3<fixed-case>CODE</fixed-case>: Disentangling Disagreements in Data across Cultures on Offensiveness Detection and Evaluation AidaMostafazadeh DavaniResearch, Google - MarkDiazGoogle + MarkDiazGoogle Dylan KBakerDistributed AI Research Institute VinodkumarPrabhakaranGoogle 18511-18526 @@ -14256,9 +14256,9 @@ <fixed-case>PALM</fixed-case>: Few-Shot Prompt Learning for Audio Language Models AsifHanifMohamed bin Zayed University of Artificial Intelligence - Maha TufailAgro + Maha TufailAgro Mohammad AreebQazi - HananAldarmakiMohamed bin Zayed University of Artificial Intelligence + HananAldarmakiMohamed bin Zayed University of Artificial Intelligence 18527-18536 Audio-Language Models (ALMs) have recently achieved remarkable success in zero-shot audio recognition tasks, which match features of audio waveforms with class-specific text prompt features, inspired by advancements in Vision-Language Models (VLMs). 
Given the sensitivity of zero-shot performance to the choice of hand-crafted text prompts, many prompt learning techniques have been developed for VLMs. We explore the efficacy of these approaches in ALMs and propose a novel method, Prompt Learning in Audio Language Models (PALM), which optimizes the feature space of the text encoder branch. Unlike existing methods that work in the input space, our approach results in greater training efficiency. We demonstrate the effectiveness of our approach on 11 audio recognition datasets, encompassing a variety of speech-processing tasks, and compare the results with three baselines in a few-shot learning setup. Our method is either on par with or outperforms other approaches while being computationally less demanding. Our code is publicly available at https://asif-hanif.github.io/palm/. 2024.emnlp-main.1030 @@ -14267,10 +14267,10 @@ Annotator-Centric Active Learning for Subjective <fixed-case>NLP</fixed-case> Tasks - Michielvan der Meer + Michielvan der Meer NeeleFalk Pradeep K.Murukannaiah - EnricoLiscio + EnricoLiscio 18537-18555 Active Learning (AL) addresses the high costs of collecting human annotations by strategically annotating the most informative samples. However, for subjective NLP tasks, incorporating a wide range of perspectives in the annotation process is crucial to capture the variability in human judgments. We introduce Annotator-Centric Active Learning (ACAL), which incorporates an annotator selection strategy following data sampling. Our objective is two-fold: (1) to efficiently approximate the full diversity of human judgments, and (2) to assess model performance using annotator-centric metrics, which value minority and majority perspectives equally. We experiment with multiple annotator selection strategies across seven subjective NLP tasks, employing both traditional and novel, human-centered evaluation metrics. Our findings indicate that ACAL improves data efficiency and excels in annotator-centric performance evaluations. However, its success depends on the availability of a sufficiently large and diverse pool of annotators to sample from. 2024.emnlp-main.1031 @@ -14282,9 +14282,9 @@ On the Proper Treatment of Tokenization in Psycholinguistics MarioGiulianelliDepartment of Computer Science, ETHZ - ETH Zurich LucaMalaguttiDepartment of Computer Science, ETHZ - ETH Zurich - Juan LuisGastaldi - BrianDuSellDepartment of Computer Science, ETHZ - ETH Zurich - TimVieiraJohns Hopkins University + Juan LuisGastaldi + BrianDuSellDepartment of Computer Science, ETHZ - ETH Zurich + TimVieiraJohns Hopkins University RyanCotterellSwiss Federal Institute of Technology 18556-18572 Language models are widely used in computational psycholinguistics to test theories that relate the negative log probability (the surprisal) of a region of interest (a substring of characters) under a language model to its cognitive cost experienced by readers, as operationalized, for example, by gaze duration on the region. However, the application of modern language models to psycholinguistic studies is complicated by the practice of using tokenization as an intermediate step in training a model. Doing so results in a language model over *token* strings rather than one over character strings. Vexingly, regions of interest are generally misaligned with these token strings. 
The paper argues that token-level language models should be (approximately) marginalized into character-level language models before they are used in psycholinguistic studies to compute the surprisal of a region of interest; then, the marginalized character-level language model can be used to compute the surprisal of an arbitrary character substring, which we term a focal area, that the experimenter may wish to use as a predictor. Our proposal of marginalizing a token-level model into a character-level one solves this misalignment issue independently of the tokenization scheme. Empirically, we discover various focal areas whose surprisal is a better psychometric predictor than the surprisal of the region of interest itself.
@@ -14308,7 +14308,7 @@
Jailbreaking <fixed-case>LLM</fixed-case>s with <fixed-case>A</fixed-case>rabic Transliteration and <fixed-case>A</fixed-case>rabizi
Mansour Al Ghanim (University of Central Florida)
Saleh Almohaimeed (University of Central Florida)
Mengxin Zheng (University of Central Florida)
Yan Solihin (University of Central Florida)
@@ -14350,10 +14350,10 @@
Recurrent Alignment with Hard Attention for Hierarchical Text Rating
Chenxi Lin
Ren Jiayu
Guoxiu He (East China Normal University)
Zhuoren Jiang (Zhejiang University)
Haiyan Yu
Xiaomin Zhu (National University of Defense Technology)
18643-18657
@@ -14364,7 +14364,7 @@
<fixed-case>CHESS</fixed-case>: Optimizing <fixed-case>LLM</fixed-case> Inference via Channel-Wise Thresholding and Selective Sparsification
Junhui He (Wuhan University)
Shangyu Wu (City University of Hong Kong)
Weidong Wen (Wuhan University)
Chun Jason Xue (Mohamed bin Zayed University of Artificial Intelligence)
@@ -14380,7 +14380,7 @@
Yongjing Yin
Junran Ding
Kai Song
Yue Zhang (Westlake University)
18669-18680
Next-token prediction serves as the dominant component in current neural language models. During the training phase, the model employs teacher forcing, which predicts tokens based on all preceding ground truth tokens. However, this approach has been found to create shortcuts, utilizing the revealed prefix to spuriously fit future tokens, potentially compromising the accuracy of the next-token predictor. In this paper, we introduce Semformer, a novel method of training a Transformer language model that explicitly models the semantic planning of response. Specifically, we incorporate a sequence of planning tokens into the prefix, guiding the planning token representations to predict the latent semantic representations of the response, which are induced by an autoencoder. In a minimal planning task (i.e., graph path-finding), our model exhibits near-perfect performance and effectively mitigates shortcut learning, a feat that standard training methods and baseline models have been unable to accomplish. Furthermore, we pretrain Semformer from scratch with 125M parameters, demonstrating its efficacy through measures of perplexity, in-context learning, and fine-tuning on summarization tasks.
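The Semformer abstract above is concrete enough to illustrate. Below is a minimal sketch of its core training signal, not the authors' code: every size, the GRU stand-in for the Transformer, and the toy data are assumptions made for a self-contained example.

import torch
import torch.nn as nn

# Toy version of the objective: planning tokens sit between the prefix and
# the response, and their hidden states must regress onto a frozen
# autoencoder's latent summary of the response, alongside next-token loss.
V, D, N_PLAN, P = 100, 32, 4, 10   # vocab, hidden size, plan tokens, prefix len

class ToyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(V + N_PLAN, D)      # extra ids for plan tokens
        self.rnn = nn.GRU(D, D, batch_first=True)   # causal stand-in for a Transformer
        self.lm_head = nn.Linear(D, V)
        self.plan_head = nn.Linear(D, D)            # predicts response latents

    def forward(self, ids):
        h, _ = self.rnn(self.emb(ids))
        return self.lm_head(h), self.plan_head(h)

class ResponseAE(nn.Module):                        # assumed pretrained, frozen
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(V, D)
        self.enc = nn.GRU(D, D, batch_first=True)

    def encode(self, ids):
        _, h = self.enc(self.emb(ids))
        return h.squeeze(0)                         # (batch, D) latent summary

lm, ae = ToyLM(), ResponseAE()
prefix = torch.randint(0, V, (8, P))
response = torch.randint(0, V, (8, 12))
plan = torch.arange(V, V + N_PLAN).repeat(8, 1)     # planning-token ids

inp = torch.cat([prefix, plan, response[:, :-1]], dim=1)
logits, plan_pred = lm(inp)

# Next-token loss on the response region (last plan position onward).
resp_logits = logits[:, P + N_PLAN - 1 :, :]
lm_loss = nn.functional.cross_entropy(resp_logits.reshape(-1, V), response.reshape(-1))

# Planning loss: the last planning position predicts the response latent.
with torch.no_grad():
    target = ae.encode(response)
plan_loss = nn.functional.mse_loss(plan_pred[:, P + N_PLAN - 1], target)

(lm_loss + plan_loss).backward()

The point of the sketch is only the shape of the objective: the planning positions receive a dense regression target summarizing the whole response, so the prefix cannot be fit by purely local shortcuts.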
2024.emnlp-main.1039
@@ -14390,10 +14390,10 @@
<fixed-case>D</fixed-case>oc<fixed-case>CG</fixed-case>en: Document-based Controlled Code Generation
Sameer Pimparkhede
Mehant Kammakomati (International Business Machines)
Srikanth G. Tamilselvam (International Business Machines)
Prince Kumar (International Business Machines)
Ashok Pon Kumar (IBM Research India)
Pushpak Bhattacharyya (Indian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology)
18681-18697
Recent developments show that Large Language Models (LLMs) produce state-of-the-art performance on natural language (NL) to code generation for resource-rich general-purpose languages like C++, Java, and Python. However, their practical usage for structured domain-specific languages (DSLs) such as YAML and JSON is limited due to domain-specific schema, grammar, and customizations generally unseen by LLMs during pre-training. Efforts have been made to mitigate this challenge via in-context learning through relevant examples or by fine-tuning. However, it suffers from problems, such as limited DSL samples and prompt sensitivity, but enterprises maintain good documentation of the DSLs. Therefore, we propose DocCGen, a framework that can leverage such rich knowledge by breaking the NL-to-Code generation task for structured code languages into a two-step process. First, it detects the correct libraries using the library documentation that best matches the NL query. Then, it utilizes schema rules extracted from the documentation of these libraries to constrain the decoding. We evaluate our framework for two complex structured languages, Ansible YAML and Bash command, consisting of two settings: Out-of-domain (OOD) and In-domain (ID). Our extensive experiments show that DocCGen consistently improves different sized language models across all six evaluation metrics, reducing syntactic and semantic errors in structured code.
@@ -14416,7 +14416,7 @@
The Emergence of Compositional Languages in Multi-entity Referential Games: from Image to Graph Representations
Daniel Akkerman
Phong Le (Amazon)
Raquel G. Alhama (University of Amsterdam, University of Amsterdam)
18713-18723
@@ -14432,7 +14432,7 @@
Matanel Oren (Microsoft and Hebrew University, Hebrew University of Jerusalem)
Michael Hassid
Nir Yarden (Hebrew University of Jerusalem)
Yossi Adi (Hebrew University of Jerusalem and Facebook)
Roy Schwartz (Hebrew University, Hebrew University of Jerusalem)
18724-18741
2024.emnlp-main.1043
@@ -14442,7 +14442,7 @@
Evaluating Large Language Models along Dimensions of Language Variation: A Systematik Invesdigatiom uv Cross-lingual Generalization
Niyati Bafna (Johns Hopkins University)
Kenton Murray (Johns Hopkins University)
David Yarowsky (Johns Hopkins University)
18742-18762
While large language models exhibit certain cross-lingual generalization capabilities, they suffer from performance degradation (PD) on unseen closely-related languages (CRLs) and dialects relative to their high-resource language neighbour (HRLN). However, we currently lack a fundamental understanding of what kinds of linguistic distances contribute to PD, and to what extent.
Furthermore, studies of cross-lingual generalization are confounded by unknown quantities of CRL language traces in the training data, and by the frequent lack of availability of evaluation data in lower-resource related languages and dialects. To address these issues, we model phonological, morphological, and lexical distance as Bayesian noise processes to synthesize artificial languages that are controllably distant from the HRLN. We analyse PD as a function of underlying noise parameters, offering insights on model robustness to isolated and composed linguistic phenomena, and the impact of task and HRL characteristics on PD. We calculate parameter posteriors on real CRL-HRLN pair data and show that they follow computed trends of artificial languages, demonstrating the viability of our noisers. Our framework offers a cheap solution for estimating task performance on an unseen CRL given HRLN performance using its posteriors, as well as for diagnosing observed PD on a CRL in terms of its linguistic distances from its HRLN, and opens doors to principled methods of mitigating performance degradation.
@@ -14503,8 +14503,8 @@
Are Data Augmentation Methods in Named Entity Recognition Applicable for Uncertainty Estimation?
Wataru Hashimoto (Nara Institute of Science and Technology, Japan)
Hidetaka Kamigaito (Nara Institute of Science and Technology)
Taro Watanabe (Nara Institute of Science and Technology, Japan)
18852-18867
This work investigates the impact of data augmentation on confidence calibration and uncertainty estimation in Named Entity Recognition (NER) tasks. For the future advance of NER in safety-critical fields like healthcare and finance, it is essential to achieve accurate predictions with calibrated confidence when applying Deep Neural Networks (DNNs), including Pre-trained Language Models (PLMs), as a real-world application. However, DNNs are prone to miscalibration, which limits their applicability. Moreover, existing methods for calibration and uncertainty estimation are computationally expensive. Our investigation in NER found that data augmentation improves calibration and uncertainty in cross-genre and cross-lingual settings, especially the in-domain setting. Furthermore, we showed that the calibration for NER tends to be more effective when the perplexity of the sentences generated by data augmentation is lower, and that increasing the size of the augmentation further improves calibration and uncertainty.
2024.emnlp-main.1049
@@ -14514,11 +14514,11 @@
<fixed-case>N</fixed-case>euro<fixed-case>T</fixed-case>rial<fixed-case>NER</fixed-case>: An Annotated Corpus for Neurological Diseases and Therapies in Clinical Trial Registries
Simona Emilova Doneva
Tilia Ellendorff (University of Zurich and University of Zurich)
Beate Sick (University of Zurich and ZHAW - Zürcher Hochschule für Angewandte Wissenschaften)
Jean-Philippe Goldman
Amelia Elaine Cannon
Gerold Schneider (University of Zurich)
Benjamin Victor Ineichen
18868-18890
Extracting and aggregating information from clinical trial registries could provide invaluable insights into the drug development landscape and advance the treatment of neurologic diseases. However, achieving this at scale is hampered by the volume of available data and the lack of an annotated corpus to assist in the development of automation tools.
Thus, we introduce NeuroTrialNER, a new and fully open corpus for named entity recognition (NER). It comprises 1093 clinical trial summaries sourced from ClinicalTrials.gov, annotated for neurological diseases, therapeutic interventions, and control treatments. We describe our data collection process and the corpus in detail. We demonstrate its utility for NER using large language models and achieve close-to-human performance. By bridging the gap in data resources, we hope to foster the development of text processing tools that help researchers navigate clinical trials data more easily.
@@ -14530,12 +14530,12 @@
Fool Me Once? Contrasting Textual and Visual Explanations in a Clinical Decision-Support Setting
Maxime Guillaume Kayser (Memorial Sloan Kettering Cancer Centre and University of Oxford)
Bayar Menzat (Technische Universität Wien)
Cornelius Emde
Bogdan Alexandru Bercean (Polytechnic University of Timisoara and Rayscape)
Alex Novak
Abdalá Trinidad Espinosa Morgado (Oxford University Hospitals NHS Foundation Trust)
Bartlomiej Papiez (University of Oxford and University of Oxford)
Susanne Gaube (University College London, University of London)
Thomas Lukasiewicz (Institute of Logic and Computation, Technische Universität Wien and Department of Computer Science, University of Oxford)
Oana-Maria Camburu (Department of Computer Science, University College London, University of London)
18891-18919
@@ -14548,8 +14548,8 @@
Towards Faithful Knowledge Graph Explanation Through Deep Alignment in Commonsense Question Answering
Weihe Zhai
Arkaitz Zubiaga (Queen Mary University of London)
Bingquan Liu (Harbin Institute of Technology)
Chengjie Sun (Harbin Institute of Technology)
Yalong Zhao
18920-18930
@@ -14574,8 +14574,8 @@
Argument Relation Classification through Discourse Markers and Adversarial Training
Michele Luca Contalbo
Francesco Guerra
Matteo Paganelli
18949-18954
Argument relation classification (ARC) identifies supportive, contrasting and neutral relations between argumentative units. The current approaches rely on transformer architectures which have proven to be more effective than traditional methods based on hand-crafted linguistic features. In this paper, we introduce DISARM, which advances the state of the art with a training procedure combining multi-task and adversarial learning strategies. By jointly solving the ARC and discourse marker detection tasks and aligning their embedding spaces into a unified latent space, DISARM outperforms the accuracy of existing approaches.
2024.emnlp-main.1054
@@ -14598,7 +14598,7 @@
Dae Yon Hwang (Amazon AGI)
Bilal Taha
Harshit Pande
Yaroslav Nechaev (Amazon)
18971-18982
Despite the recent advancements in information retrieval (IR), zero-shot IR remains a significant challenge, especially when dealing with new domains, languages, and newly-released use cases that lack historical query traffic from existing users. For such cases, it is common to use query augmentations followed by fine-tuning pre-trained models on the document data paired with synthetic queries.
In this work, we propose a novel Universal Document Linking (UDL) algorithm, which links similar documents to enhance synthetic query generation across multiple datasets with different characteristics. UDL leverages entropy for the choice of similarity models and named entity recognition (NER) for the link decision of documents using similarity scores. Our empirical studies demonstrate the effectiveness and universality of UDL across diverse datasets and IR models, surpassing state-of-the-art methods in zero-shot cases. The code is available for reproducibility at https://github.com/eoduself/UDL
2024.emnlp-main.1056
@@ -14643,7 +14643,7 @@
Unsupervised Extraction of Dialogue Policies from Conversations
Makesh Narsimhan Sreedhar (NVIDIA)
Traian Rebedea (NVIDIA and University Politehnica of Bucharest)
Christopher Parisien
19029-19045
Dialogue policies play a crucial role in developing task-oriented dialogue systems, yet their development and maintenance are challenging and typically require substantial effort from experts in dialogue modeling. While in many situations, large amounts of conversational data are available for the task at hand, people lack an effective solution able to extract dialogue policies from this data. In this paper, we address this gap by first illustrating how Large Language Models (LLMs) can be instrumental in extracting dialogue policies from datasets, through the conversion of conversations into a unified intermediate representation consisting of canonical forms. We then propose a novel method for generating dialogue policies utilizing a controllable and interpretable graph-based methodology. By combining canonical forms across conversations into a flow network, we find that running graph traversal algorithms helps in extracting dialogue flows. These flows are a better representation of the underlying interactions than flows extracted by prompting LLMs. Our technique focuses on giving conversation designers greater control, offering a productivity tool to improve the process of developing dialogue policies.
@@ -14653,10 +14653,10 @@
<fixed-case>GRIZAL</fixed-case>: Generative Prior-guided Zero-Shot Temporal Action Localization
Onkar Kishor Susladkar
Gayatri Sudhir Deshmukh (Indian Institute of Technology, Roorkee)
Vandan Gorade (Northwestern University)
Sparsh Mittal (Indian Institute of Technology Roorkee)
19046-19059
Zero-shot temporal action localization (TAL) aims to temporally localize actions in videos without prior training examples. To address the challenges of TAL, we offer GRIZAL, a model that uses multimodal embeddings and dynamic motion cues to localize actions effectively. GRIZAL achieves sample diversity by using large-scale generative models such as GPT-4 for generating textual augmentations and DALL-E for generating image augmentations. Our model integrates vision-language embeddings with optical flow insights, optimized through a blend of supervised and self-supervised loss functions. On ActivityNet, Thumos14 and Charades-STA datasets, GRIZAL greatly outperforms state-of-the-art zero-shot TAL models, demonstrating its robustness and adaptability across a wide range of video content. We will make all the models and code publicly available by open-sourcing them.
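The dialogue-policy entry above (Unsupervised Extraction of Dialogue Policies from Conversations) describes combining canonical forms across conversations into a flow network and traversing it. A minimal sketch of that idea follows; the canonical forms are invented, and the greedy walk is a deliberate simplification standing in for the paper's traversal method.

from collections import Counter, defaultdict

# Conversations as sequences of canonical forms (assumed already extracted,
# e.g. by an LLM). We build a weighted flow network and greedily follow the
# most frequent transitions to read off one dominant dialogue flow.
conversations = [
    ["greet", "ask_account_type", "verify_identity", "resolve", "goodbye"],
    ["greet", "verify_identity", "ask_account_type", "resolve", "goodbye"],
    ["greet", "ask_account_type", "verify_identity", "escalate", "goodbye"],
]

edges = Counter()
for conv in conversations:
    for a, b in zip(conv, conv[1:]):
        edges[(a, b)] += 1

successors = defaultdict(list)
for (a, b), w in edges.items():
    successors[a].append((w, b))

def dominant_flow(start="greet", max_len=10):
    """Greedy walk over the flow network, always taking the heaviest edge."""
    flow, node, seen = [start], start, {start}
    while successors[node] and len(flow) < max_len:
        _, nxt = max(successors[node])
        if nxt in seen:            # avoid trivial cycles in this toy version
            break
        flow.append(nxt)
        seen.add(nxt)
        node = nxt
    return flow

print(dominant_flow())  # ['greet', 'ask_account_type', 'verify_identity', 'resolve', 'goodbye']

A real extractor would keep edge weights for pruning and read off multiple branches rather than a single path; the sketch only shows why a flow network makes the extracted policy inspectable.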
2024.emnlp-main.1061
@@ -14667,7 +14667,7 @@
Preserving Multi-Modal Capabilities of Pre-trained <fixed-case>VLM</fixed-case>s for Improving Vision-Linguistic Compositionality
Youngtaek Oh (Korea Advanced Institute of Science and Technology)
Jae Won Cho (Sejong University)
Dong-Jin Kim (Hanyang University)
In So Kweon
Junmo Kim (Korea Advanced Institute of Science and Technology)
19060-19076
@@ -14679,16 +14679,16 @@
<fixed-case>F</fixed-case>oodie<fixed-case>QA</fixed-case>: A Multimodal Dataset for Fine-Grained Understanding of <fixed-case>C</fixed-case>hinese Food Culture
Wenyan Li
Crystina Zhang (University of Waterloo)
Jiaang Li
Qiwei Peng
Raphael Tang (Comcast)
Li Zhou (The Chinese University of Hong Kong)
Weijia Zhang
Guimin Hu
Yifei Yuan (Copenhagen University)
Anders Søgaard (Copenhagen University)
Daniel Hershcovich (University of Copenhagen)
Desmond Elliott (University of Copenhagen)
19077-19095
Food is a rich and varied dimension of cultural heritage, crucial to both individuals and social groups. To bridge the gap in the literature on the often-overlooked regional diversity in this domain, we introduce FoodieQA, a manually curated, fine-grained image-text dataset capturing the intricate features of food cultures across various regions in China. We evaluate vision-language models (VLMs) and large language models (LLMs) on newly collected, unseen food images and corresponding questions. FoodieQA comprises three multiple-choice question-answering tasks where models need to answer questions based on multiple images, a single image, and text-only descriptions, respectively. While LLMs excel at text-based question answering, surpassing human accuracy, the open-sourced VLMs still fall short by 41% on multi-image and 21% on single-image VQA tasks, although closed-weights models perform closer to human levels (within 10%). Our findings highlight that understanding food and its cultural implications remains a challenging and under-explored direction.
@@ -14700,7 +14700,7 @@
A Two-Step Approach for Data-Efficient <fixed-case>F</fixed-case>rench Pronunciation Learning
Hoyeon Lee (NAVER Cloud)
Hyeeun Jang (Université de Strasbourg)
Jonghwan Kim (NAVER Cloud)
Jaemin Kim (NAVER)
19096-19103
Recent studies have addressed intricate phonological phenomena in French, relying on either extensive linguistic knowledge or a significant amount of sentence-level pronunciation data. However, creating such resources is expensive and non-trivial. To this end, we propose a novel two-step approach that encompasses two pronunciation tasks: grapheme-to-phoneme and post-lexical processing. We then investigate the efficacy of the proposed approach with a notably limited amount of sentence-level pronunciation data. Our findings demonstrate that the proposed two-step approach effectively mitigates the lack of extensive labeled data, and serves as a feasible solution for addressing French phonological phenomena even under resource-constrained environments.
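The French pronunciation entry above names a two-step shape: word-level grapheme-to-phoneme conversion, then post-lexical processing over the whole sentence. A minimal sketch of that shape follows; the tiny lexicon, the placeholder phoneme strings, and the single liaison rule are invented for illustration, not the paper's data or rules.

# Step 1: word-level grapheme-to-phoneme (toy lexicon lookup).
# Step 2: post-lexical processing over the sentence, e.g. French liaison:
# a normally silent word-final consonant surfaces before a vowel-initial word.
LEXICON = {"les": "le", "amis": "ami", "petits": "p@ti"}   # toy phoneme strings
LIAISON = {"les": "z", "petits": "z"}                      # latent final consonant
VOWELS = set("aeio@")

def g2p(word: str) -> str:
    return LEXICON.get(word, word)             # fall back to the spelling

def post_lexical(words: list[str]) -> list[str]:
    phones = [g2p(w) for w in words]
    for i, w in enumerate(words[:-1]):
        nxt = phones[i + 1]
        if w in LIAISON and nxt and nxt[0] in VOWELS:
            phones[i] += LIAISON[w]            # realize the liaison consonant
    return phones

print(post_lexical(["les", "amis"]))            # ['lez', 'ami']
print(post_lexical(["les", "petits", "amis"]))  # ['le', 'p@tiz', 'ami']

The split matters for data efficiency: the word-level step can be learned from a dictionary, while only the (much rarer) sentence-level phenomena need sentence-level pronunciation data.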
@@ -14712,7 +14712,7 @@
Exploring Intra and Inter-language Consistency in Embeddings with <fixed-case>ICA</fixed-case>
Rongzhi Li
Takeru Matsuda (The University of Tokyo and RIKEN)
Hitomi Yanaka (the University of Tokyo)
19104-19111
Word embeddings represent words as multidimensional real vectors, facilitating data analysis and processing, but are often challenging to interpret. Independent Component Analysis (ICA) creates clearer semantic axes by identifying independent key features. Previous research has shown ICA’s potential to reveal universal semantic axes across languages. However, it lacked verification of the consistency of independent components within and across languages. We investigated the consistency of semantic axes in two ways: both within a single language and across multiple languages. We first probed into intra-language consistency, focusing on the reproducibility of axes by performing ICA multiple times and clustering the outcomes. Then, we statistically examined inter-language consistency by verifying those axes’ correspondences using statistical tests. We newly applied statistical methods to establish a robust framework that ensures the reliability and universality of semantic axes.
2024.emnlp-main.1065
@@ -14721,9 +14721,9 @@
<fixed-case>D</fixed-case>etox<fixed-case>LLM</fixed-case>: A Framework for Detoxification with Explanations
Md Tawkat Islam Khondaker (University of British Columbia)
Muhammad Abdul-Mageed (University of British Columbia)
Laks V. S. Lakshmanan (University of British Columbia)
19112-19139
Prior works on detoxification are scattered in the sense that they do not cover all aspects of detoxification needed in a real-world scenario. Notably, prior works restrict the task of developing detoxification models to only a seen subset of platforms, leaving the question of how the models would perform on unseen platforms unexplored. Additionally, these works do not address non-detoxifiability, a phenomenon whereby the toxic text cannot be detoxified without altering the meaning. We propose DetoxLLM, the first comprehensive end-to-end detoxification framework, which attempts to alleviate the aforementioned limitations. We first introduce a cross-platform pseudo-parallel corpus applying multi-step data processing and generation strategies leveraging ChatGPT. We then train a suite of detoxification models with our cross-platform corpus. We show that our detoxification models outperform the SoTA model trained with human-annotated parallel corpus. We further introduce explanation to promote transparency and trustworthiness. DetoxLLM additionally offers a unique paraphrase detector especially dedicated for the detoxification task to tackle the non-detoxifiable cases. Through experimental analysis, we demonstrate the effectiveness of our cross-platform corpus and the robustness of DetoxLLM against adversarial toxicity.
2024.emnlp-main.1066
@@ -14732,9 +14732,9 @@
Comparing a <fixed-case>BERT</fixed-case> Classifier and a <fixed-case>GPT</fixed-case> classifier for Detecting Connective Language Across Multiple Social Media
Josephine Lukito (University of Texas at Austin)
Bin Chen
Gina M. Masullo (University of Texas at Austin)
Natalie Jomini Stroud (University of Texas at Austin)
19140-19153
This study presents an approach for detecting connective language—defined as language that facilitates engagement, understanding, and conversation—from social media discussions. We developed and evaluated two types of classifiers: BERT and GPT-3.5 turbo. Our results demonstrate that the BERT classifier significantly outperforms GPT-3.5 turbo in detecting connective language. Furthermore, our analysis confirms that connective language is distinct from related concepts measuring discourse qualities, such as politeness and toxicity. We also explore the potential of BERT-based classifiers for platform-agnostic tools. This research advances our understanding of the linguistic dimensions of online communication and proposes practical tools for detecting connective language across diverse digital environments.
@@ -14747,10 +14747,10 @@
<fixed-case>S</fixed-case>hadow<fixed-case>LLM</fixed-case>: Predictor-based Contextual Sparsity for Large Language Models
Yash Akhauri
Ahmed F. AbouElhamayed
Jordan Dotzel
Zhiru Zhang
Alexander M. Rush
Safeen Huda
Mohamed S. Abdelfattah
19154-19167
@@ -14762,10 +14762,10 @@
Emotion Granularity from Text: An Aggregate-Level Indicator of Mental Health
Krishnapriya Vishnubhotla
Daniela Teodorescu
Mallory J Feldman
Kristen Lindquist (University of North Carolina at Chapel Hill)
Saif M. Mohammad (National Research Council Canada)
19168-19185
We are united in how emotions are central to shaping our experiences; yet, individuals differ greatly in how we each identify, categorize, and express emotions. In psychology, variation in the ability of individuals to differentiate between emotion concepts is called emotion granularity (determined through self-reports of one’s emotions). High emotion granularity has been linked with better mental and physical health; whereas low emotion granularity has been linked with maladaptive emotion regulation strategies and poor health outcomes. In this work, we propose computational measures of emotion granularity derived from temporally-ordered speaker utterances in social media (in lieu of self reports that suffer from various biases). We then investigate the effectiveness of such text-derived measures of emotion granularity in functioning as markers of various mental health conditions (MHCs). We establish baseline measures of emotion granularity derived from textual utterances, and show that, at an aggregate level, emotion granularities are significantly lower for people self-reporting as having an MHC than for the control population. This paves the way towards a better understanding of the MHCs, and specifically the role emotions play in our well-being.
2024.emnlp-main.1069
@@ -14821,7 +14821,7 @@
<fixed-case>D</fixed-case>ata<fixed-case>N</fixed-case>arrative: Automated Data-Driven Storytelling with Visualizations and Texts
Mohammed Saidul Islam (York University)
Md Tahmid Rahman Laskar (Dialpad Inc.)
Md Rizwan Parvez (Qatar Computing Research Institute and Bosch)
Enamul Hoque (York University)
Shafiq Joty (SalesForce.com and Nanyang Technological University)
19253-19286
@@ -14835,7 +14835,7 @@
Dhananjay Ram (Amazon)
Aditya Rawal (Amazon)
Momchil Hardalov (AWS AI Labs)
Nikolaos Pappas (AWS AI Labs)
Sheng Zha (Amazon)
19287-19301
Training with mixed data distributions is a common and important part of creating multi-task and instruction-following models. The diversity of the data distributions and cost of joint training makes the optimization procedure extremely challenging. Data mixing methods partially address this problem, albeit having a sub-optimal performance across data sources and require multiple expensive training runs. In this paper, we propose a simple and efficient alternative for better optimization of the data sources by combining models individually trained on each data source with the base model using basic element-wise vector operations. The resulting model, namely Distribution Edited Model (DEM), is cheaper than standard data mixing and outperforms strong baselines on a variety of benchmarks, yielding up to 6.2% improvement on MMLU, 11.5% on BBH, 16.1% on DROP, 6% on MathQA, and 9.3% on HELM with models of size 3B to 13B. Notably, DEM does not require full re-training when modifying a single data-source, thus making it very flexible and scalable for training with diverse data sources. The code is available at https://github.com/amazon-science/dem-distribution-edited-model.
@@ -14852,12 +14852,12 @@
Po-Yao Huang (Meta)
Xiaoqing Tan (Meta AI)
Ching-Feng Yeh (Facebook)
Jacob Kahn (Facebook AI Research and Meta AI)
Christine Jou (Facebook)
Gargi Ghosh (Meta AI)
Omer Levy (Facebook)
Luke Zettlemoyer (University of Washington, Facebook and Meta)
Wen-tau Yih (Meta Platforms, Inc.)
Shang-Wen Li (Facebook)
Saining Xie (New York University)
Christoph Feichtenhofer (Facebook)
@@ -14884,7 +14884,7 @@
Vanya Cohen
Nathanael Chambers (US Naval Academy)
Niranjan Balasubramanian (State University of New York, Stony Brook)
Ray Mooney (University of Texas, Austin)
19336-19354
Understanding the abilities of LLMs to reason about natural language plans, such as instructional text and recipes, is critical to reliably using them in decision-making systems. A fundamental aspect of plans is the temporal order in which their steps need to be executed, which reflects the underlying causal dependencies between them. We introduce CaT-Bench, a benchmark of Step Order Prediction questions, which test whether a step must necessarily occur before or after another in cooking recipe plans. We use this to evaluate how well frontier LLMs understand causal and temporal dependencies. We find that SOTA LLMs are underwhelming (best zero-shot is only 0.59 in F1), and are biased towards predicting dependence more often, perhaps relying on temporal order of steps as a heuristic. While prompting for explanations and using few-shot examples improve performance, the best F1 result is only 0.73. Further, human evaluation of explanations along with answer correctness show that, on average, humans do not agree with model reasoning. Surprisingly, we also find that explaining after answering leads to better performance than normal chain-of-thought prompting, and LLM answers are not consistent across questions about the same step pairs.
Overall, results show that LLMs’ ability to detect dependence between steps has significant room for improvement.
2024.emnlp-main.1077
@@ -14920,7 +14920,7 @@
Investigating the Role of Instruction Variety and Task Difficulty in Robotic Manipulation Tasks
Amit Parekh
Nikolas Vitsakis
Alessandro Suglia (Heriot-Watt University)
Ioannis Konstas (Heriot-Watt University)
19389-19424
Evaluating the generalisation capabilities of multimodal models based solely on their performance on out-of-distribution data fails to capture their true robustness. This work introduces a comprehensive evaluation framework that systematically examines the role of instructions and inputs in the generalisation abilities of such models, considering architectural design, input perturbations across language and vision modalities, and increased task complexity. The proposed framework uncovers the resilience of multimodal models to extreme instruction perturbations and their vulnerability to observational changes, raising concerns about overfitting to spurious correlations. By employing this evaluation framework on current Transformer-based multimodal models for robotic manipulation tasks, we uncover limitations and suggest future advancements should focus on architectural and training innovations that better integrate multimodal inputs, enhancing a model’s generalisation prowess by prioritising sensitivity to input content over incidental correlations.
@@ -14944,10 +14944,10 @@
Xinyi He
Jiaru Zou
Yun Lin
Mengyu Zhou (Microsoft Research)
Shi Han (Microsoft Research Asia)
Zejian Yuan (Xi’an Jiaotong University)
Dongmei Zhang (Microsoft and Microsoft)
19433-19451
Large Language Models have revolutionized code generation ability by converting natural language descriptions into executable code. However, generating complex code within real-world scenarios remains challenging due to intricate structures, subtle bugs, understanding of advanced data types, and lack of supplementary contents. To address these challenges, we introduce the CoCoST framework, which enhances complex code generation by online searching for more information with planned queries and correctness testing for code refinement. Moreover, CoCoST serializes the complex inputs and outputs to improve comprehension and generates test cases to ensure the adaptability for real-world applications. CoCoST is validated through rigorous experiments on the DS-1000 and ClassEval datasets. Experimental results show that CoCoST substantially improves the quality of complex code generation, highlighting its potential to enhance the practicality of LLMs in generating complex code.
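Both DocCGen (earlier in this file) and CoCoST above lean on decoding constrained by schema rules. A minimal sketch of that single component follows: logits for tokens the schema forbids are excluded before picking the next token. The token inventory and the toy "grammar" are invented for illustration, not either paper's rules.

import math

# Toy constrained decoding: at each step, only tokens allowed by the schema
# are eligible, so only schema-legal code can be produced.
TOKENS = ["name:", "hosts:", "tasks:", "copy:", "src=", "dest="]
SCHEMA = {                      # allowed successors per token (toy grammar,
    None:     {"name:"},        # loosely YAML-flavoured)
    "name:":  {"hosts:"},
    "hosts:": {"tasks:"},
    "tasks:": {"copy:"},
    "copy:":  {"src="},
    "src=":   {"dest="},
    "dest=":  set(),
}

def constrained_argmax(logits, prev):
    allowed = SCHEMA[prev]
    best, best_score = None, -math.inf
    for tok, score in zip(TOKENS, logits):
        if tok in allowed and score > best_score:
            best, best_score = tok, score
    return best

# Pretend model scores (these would come from an LLM). The raw argmax would
# be "hosts:" every step, which is illegal almost everywhere; the constrained
# pick always follows the schema.
prev, out = None, []
for logits in [[0.1, 2.0, 0.3, 0.1, 0.2, 0.9]] * 6:
    tok = constrained_argmax(logits, prev)
    if tok is None:
        break
    out.append(tok)
    prev = tok
print(" ".join(out))  # name: hosts: tasks: copy: src= dest=

In a real system the mask would be applied to the model's logits before sampling and the schema would be derived from library documentation, but the masking step itself is this simple.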
2024.emnlp-main.1082
@@ -14957,9 +14957,9 @@
Sequential <fixed-case>API</fixed-case> Function Calling Using <fixed-case>G</fixed-case>raph<fixed-case>QL</fixed-case> Schema
Avirup Saha (International Business Machines)
Lakshmi Mandal
Balaji Ganesan (IBM Research India)
Sambit Ghosh (International Business Machines)
Renuka Sindhgatta (International Business Machines)
Carlos Eberhardt (International Business Machines)
@@ -14976,12 +14976,12 @@
The Illusion of Competence: Evaluating the Effect of Explanations on Users’ Mental Models of Visual Question Answering Systems
Judith Sieker (Universität Bielefeld)
Simeon Junker (Universität Bielefeld)
Ronja Utescher (Universität Bielefeld)
Nazia Attari (Bielefeld University)
Heiko Wersing (Honda Research Institute)
Hendrik Buschmeier (Universität Bielefeld)
Sina Zarrieß (Bielefeld University)
19459-19475
We examine how users perceive the limitations of an AI system when it encounters a task that it cannot perform perfectly and whether providing explanations alongside its answers aids users in constructing an appropriate mental model of the system’s capabilities and limitations. We employ a visual question answer and explanation task where we control the AI system’s limitations by manipulating the visual inputs: during inference, the system either processes full-color or grayscale images. Our goal is to determine whether participants can perceive the limitations of the system. We hypothesize that explanations will make limited AI capabilities more transparent to users. However, our results show that explanations do not have this effect. Instead of allowing users to more accurately assess the limitations of the AI system, explanations generally increase users’ perceptions of the system’s competence – regardless of its actual performance.
2024.emnlp-main.1084
@@ -14990,14 +14990,14 @@
Re-Evaluating Evaluation for Multilingual Summarization
Jessica Zosa Forde (Brown University)
Ruochen Zhang (Brown University)
Lintang Sutawika (EleutherAI)
Alham Fikri Aji (Mohamed bin Zayed University of Artificial Intelligence and Amazon)
Samuel Cahyawijaya
Genta Indra Winata (Capital One)
Minghao Wu
Carsten Eickhoff (Eberhard-Karls-Universität Tübingen)
Stella Biderman (EleutherAI and Booz Allen Hamilton)
Ellie Pavlick (Brown University)
19476-19493
@@ -15010,9 +15010,9 @@
Video-Text Prompting for Weakly Supervised Spatio-Temporal Video Grounding
Heng Zhao (Institute of High Performance Computing, Singapore, A*STAR)
Zhao Yinjie
Bihan Wen
Yew-Soon Ong (Nanyang Technological University)
Joey Tianyi Zhou (A*STAR Centre for Frontier AI Research)
19494-19505
Weakly-supervised Spatio-Temporal Video Grounding (STVG) aims to localize a target object tube given a text query, without densely annotated training data. Existing methods extract each candidate tube feature independently by cropping objects from the video frame feature, discarding all contextual information such as position change and inter-entity relationship. In this paper, we propose Video-Text Prompting (VTP) to construct the candidate feature. Instead of cropping the tube region from the feature map, we draw visual markers (e.g.
red circle) over object tubes as video prompts; a corresponding text prompt (e.g. in red circle) is also inserted after the subject word of the query text to highlight its presence. Nevertheless, each candidate feature may look similar without cropping. To address this, we further propose Contrastive VTP (CVTP) by introducing negative contrastive samples whose candidate object is erased instead of being highlighted; by comparing the difference between the VTP candidate and the contrastive sample, the gap in matching score between the correct candidate and the rest is enlarged. Extensive experiments and ablations are conducted on several STVG datasets and our results surpass existing weakly-supervised methods by a great margin, demonstrating the effectiveness of our proposed methods.
2024.emnlp-main.1086
@@ -15032,11 +15032,11 @@
Factuality of Large Language Models: A Survey
Yuxia Wang
Minghan Wang (Monash University)
Muhammad Arslan Manzoor
Fei Liu
Georgi Nenkov Georgiev
Rocktim Jyoti Das (Mohamed bin Zayed University of Artificial Intelligence)
Preslav Nakov (Mohamed bin Zayed University of Artificial Intelligence)
19519-19529
Large language models (LLMs), especially when instruction-tuned for chat, have become part of our daily lives, freeing people from the process of searching, extracting, and integrating information from multiple sources by offering a straightforward answer to a variety of questions in a single place. Unfortunately, in many cases, LLM responses are factually incorrect, which limits their applicability in real-world scenarios. As a result, research on evaluating and improving the factuality of LLMs has attracted a lot of research attention recently. In this survey, we critically analyze existing work with the aim to identify the major challenges and their associated causes, pointing out to potential solutions for improving the factuality of LLMs, and analyzing the obstacles to automated factuality evaluation for open-ended text generation. We further offer an outlook on where future research should go.
2024.emnlp-main.1088
@@ -15045,9 +15045,9 @@
Discovering Biases in Information Retrieval Models Using Relevance Thesaurus as Global Explanation
Youngwoo Kim (Pohang University of Science and Technology)
Razieh Rahimi (University of Massachusetts Amherst)
James Allan (University of Massachusetts, Amherst)
19530-19547
Most of the efforts in interpreting neural relevance models have been on local explanations, which explain the relevance of a document to a query. However, local explanations are not effective in predicting the model’s behavior on unseen texts. We aim at explaining a neural relevance model by providing lexical explanations that can be globally generalized. Specifically, we construct a relevance thesaurus containing semantically relevant query term and document term pairs, which can augment BM25 scoring functions to better approximate the neural model’s predictions. We propose a novel method for relevance thesaurus construction. Our method involves training a neural relevance model which can score the relevance for partial segments of query and documents. The trained model is used to identify relevant terms over the vocabulary space. The resulting thesaurus explanation is evaluated based on ranking effectiveness and fidelity to the targeted neural ranking model.
Finally, our thesaurus reveals the existence of brand name bias in ranking models, which further supports the utility of our explanation method.
2024.emnlp-main.1089
@@ -15059,8 +15059,8 @@
Rongchen Guo
Isar Nejadgholi (National Research Council Canada and University of Ottawa)
Hillary Dawkins (National Research Council Canada)
Kathleen C. Fraser (National Research Council Canada)
Svetlana Kiritchenko (National Research Council Canada)
19548-19564
This work provides an explanatory view of how LLMs can apply moral reasoning to both criticize and defend sexist language. We assessed eight large language models, all of which demonstrated the capability to provide explanations grounded in varying moral perspectives for both critiquing and endorsing views that reflect sexist assumptions. With both human and automatic evaluation, we show that all eight models produce comprehensible and contextually relevant text, which is helpful in understanding diverse views on how sexism is perceived. Also, through analysis of moral foundations cited by LLMs in their arguments, we uncover the diverse ideological perspectives in models’ outputs, with some models aligning more with progressive or conservative views on gender roles and sexism. Based on our observations, we caution against the potential misuse of LLMs to justify sexist language. We also highlight that LLMs can serve as tools for understanding the roots of sexist beliefs and designing well-informed interventions. Given this dual capacity, it is crucial to monitor LLMs and design safety mechanisms for their use in applications that involve sensitive societal topics, such as sexism.
2024.emnlp-main.1090
@@ -15082,7 +15082,7 @@
Soumya Suvra Ghosal (University of Maryland, College Park)
Samyadeep Basu
Soheil Feizi (University of Maryland, College Park)
Dinesh Manocha (University of Maryland, College Park)
19584-19601
Image-text contrastive models such as CLIP learn transferable and robust representations for zero-shot transfer to a variety of downstream tasks. However, to obtain strong downstream performances, prompts need to be carefully curated, which can be a tedious engineering task. To address the issue of manual prompt engineering, prompt-tuning is used where a set of contextual vectors are learned by leveraging information from the training data. Despite their effectiveness, existing prompt-tuning frameworks often lack interpretability, thus limiting their ability to understand the compositional nature of images. In this work, we first identify that incorporating compositional attributes (e.g., a “green” tree frog) in the design of manual prompts can significantly enhance image-text alignment scores. Building upon this observation, we propose a novel and interpretable prompt-tuning method named IntCoOp, which learns to jointly align attribute-level inductive biases and class embeddings during prompt-tuning. To assess the effectiveness of our approach, we evaluate IntCoOp across two representative tasks in a few-shot learning setup: generalization to novel classes, and unseen domain shifts. Through extensive experiments across 10 downstream datasets on CLIP, we find that introducing attribute-level inductive biases leads to superior performance against state-of-art prompt tuning frameworks. Notably, in a 16-shot setup, IntCoOp improves CoOp by 7.35% in average performance across 10 diverse datasets.
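The IntCoOp entry above builds on CoOp-style prompt tuning, where only a few context vectors are learned against frozen encoders. The sketch below shows that base mechanism only, under stated assumptions: frozen random projections stand in for CLIP's text and image towers, and IntCoOp's attribute-level alignment is not implemented.

import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy CoOp-style prompt tuning: the context vectors ctx are the only
# learnable parameters; everything else is frozen.
D, N_CTX, N_CLS = 16, 4, 3
torch.manual_seed(0)

class_word_emb = torch.randn(N_CLS, D)        # frozen class-name embeddings
ctx = nn.Parameter(torch.zeros(N_CTX, D))     # learnable context ("a photo of a ...")
text_proj = torch.randn(D * (N_CTX + 1), D)   # frozen stand-in text "encoder"

def class_embeddings():
    prompts = torch.cat(
        [ctx.expand(N_CLS, N_CTX, D), class_word_emb.unsqueeze(1)], dim=1
    )                                         # (N_CLS, N_CTX + 1, D)
    return F.normalize(prompts.flatten(1) @ text_proj, dim=-1)

opt = torch.optim.Adam([ctx], lr=0.1)
images = F.normalize(torch.randn(32, D), dim=-1)   # stand-in image features
labels = torch.randint(0, N_CLS, (32,))

for _ in range(100):
    logits = images @ class_embeddings().T / 0.07  # temperature-scaled cosine sim
    loss = F.cross_entropy(logits, labels)
    opt.zero_grad(); loss.backward(); opt.step()

IntCoOp's contribution, per the abstract, is an additional alignment between such context vectors and attribute-level information; the sketch only makes clear how little of the model is trainable in this family of methods.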
2024.emnlp-main.1092
@@ -15106,9 +15106,9 @@
The Generation Gap: Exploring Age Bias in the Value Systems of Large Language Models
Siyang Liu
Trisha Maturi
Bowen Yi
Siqi Shen (University of Michigan - Ann Arbor)
Rada Mihalcea (University of Michigan)
19617-19634
We explore the alignment of values in Large Language Models (LLMs) with specific age groups, leveraging data from the World Value Survey across thirteen categories. Through a diverse set of prompts tailored to ensure response robustness, we find a general inclination of LLM values towards younger demographics, especially when compared to the US population. Although a general inclination can be observed, we also found that this inclination toward younger groups can be different across different value categories. Additionally, we explore the impact of incorporating age identity information in prompts and observe challenges in mitigating value discrepancies with different age cohorts. Our findings highlight the age bias in LLMs and provide insights for future work. Materials for our analysis will be available via https://github.com/anonymous
2024.emnlp-main.1094
@@ -15119,7 +15119,7 @@
<fixed-case>T</fixed-case>empo<fixed-case>F</fixed-case>ormer: A Transformer for Temporally-aware Representations in Change Detection
Talia Tseriotou
Adam Tsakalidis (Cedefop and Alan Turing Institute)
Maria Liakata (Queen Mary University London)
19635-19653
Dynamic representation learning plays a pivotal role in understanding the evolution of linguistic content over time. On this front both context and time dynamics as well as their interplay are of prime importance. Current approaches model context via pre-trained representations, which are typically temporally agnostic. Previous work on modelling context and temporal dynamics has used recurrent methods, which are slow and prone to overfitting. Here we introduce TempoFormer, the first task-agnostic transformer-based and temporally-aware model for dynamic representation learning. Our approach is jointly trained on inter and intra context dynamics and introduces a novel temporal variation of rotary positional embeddings. The architecture is flexible and can be used as the temporal representation foundation of other models or applied to different transformer-based architectures. We show new SOTA performance on three different real-time change detection tasks.
2024.emnlp-main.1095
@@ -15129,9 +15129,9 @@
Pron vs Prompt: Can Large Language Models already Challenge a World-Class Fiction Author at Creative Text Writing?
Guillermo Marco
Julio Gonzalo (Universidad Nacional de Educación a Distancia)
M. Teresa Mateo-Girona (Universidad Complutense de Madrid)
Ramón Del Castillo Santos (Universidad Nacional de Educación a Distancia)
19654-19670
Are LLMs ready to compete in creative writing skills with a top (rather than average) novelist? To provide an initial answer to this question, we have carried out a contest between Patricio Pron (an awarded novelist, considered one of the best of his generation) and GPT-4 (one of the top-performing LLMs), in the spirit of AI-human duels such as DeepBlue vs Kasparov and AlphaGo vs Lee Sedol. We asked Pron and GPT-4 to provide thirty titles each, and then to write short stories for both their titles and their opponent's.
Then, we prepared an evaluation rubric inspired by Boden's definition of creativity, and we collected several detailed expert assessments of the texts, provided by literature critics and scholars. The results of our experimentation indicate that LLMs are still far from challenging a top human creative writer. We also observed that GPT-4 writes more creatively using Pron's titles than its own titles (which is an indication of the potential for human-machine co-creation). Additionally, we found that GPT-4 has a more creative writing style in English than in Spanish.
2024.emnlp-main.1096
@@ -15142,7 +15142,7 @@
Evaluating Diversity in Automatic Poetry Generation
Yanran Chen
Hannes Gröner
Sina Zarrieß (Bielefeld University)
Steffen Eger (University of Technology Nuremberg)
19671-19692
Natural Language Generation (NLG), and more generally generative AI, are among the currently most impactful research fields. Creative NLG, such as automatic poetry generation, is a fascinating niche in this area. While most previous research has focused on forms of the Turing test when evaluating automatic poetry generation — can humans distinguish between automatic and human generated poetry — we evaluate the diversity of automatically generated poetry (with a focus on quatrains), by comparing distributions of generated poetry to distributions of human poetry along structural, lexical, semantic and stylistic dimensions, assessing different model types (word vs. character-level, general purpose LLMs vs. poetry-specific models), including the very recent LLaMA3-8B, and types of fine-tuning (conditioned vs. unconditioned). We find that current automatic poetry systems are considerably underdiverse along multiple dimensions — they often do not rhyme sufficiently, are semantically too uniform and even do not match the length distribution of human poetry. Our experiments reveal, however, that style-conditioning and character-level modeling clearly increases diversity across virtually all dimensions we explore. Our identified limitations may serve as the basis for more genuinely diverse future poetry generation models.
@@ -15152,8 +15152,8 @@
Evaluating Short-Term Temporal Fluctuations of Social Biases in Social Media Data and Masked Language Models
Yi Zhou (Cardiff University)
Danushka Bollegala (Amazon and University of Liverpool)
Jose Camacho-Collados (Cardiff University)
19693-19708
Social biases such as gender or racial biases have been reported in language models (LMs), including Masked Language Models (MLMs). Given that MLMs are continuously trained with increasing amounts of additional data collected over time, an important yet unanswered question is how the social biases encoded with MLMs vary over time. In particular, the number of social media users continues to grow at an exponential rate, and it is a valid concern for the MLMs trained specifically on social media data whether their social biases (if any) would also amplify over time. To empirically analyse this problem, we use a series of MLMs pretrained on chronologically ordered temporal snapshots of corpora. Our analysis reveals that, although social biases are present in all MLMs, most types of social bias remain relatively stable over time (with a few exceptions). To further understand the mechanisms that influence social biases in MLMs, we analyse the temporal corpora used to train the MLMs.
Our findings show that some demographic groups, such as males, consistently obtain higher preference over others, such as females, in the training corpora.
@@ -15164,7 +15164,7 @@
Delving into Qualitative Implications of Synthetic Data for Hate Speech Detection
Camilla Casula
Sebastiano Vecellio Salto
Alan Ramponi
Sara Tonelli
@@ -15188,9 +15188,9 @@
Threshold-driven Pruning with Segmented Maximum Term Weights for Approximate Cluster-based Sparse Retrieval
Yifan Qiao (Apple)
Parker Carlson
Shanxiu He (University of California, Santa Barbara)
Yingrui Yang (University of California, Santa Barbara)
Tao Yang (University of California, Santa Barbara)
19742-19757
@@ -15213,11 +15213,11 @@
<fixed-case>MIPD</fixed-case>: Exploring Manipulation and Intention In a Novel Corpus of <fixed-case>P</fixed-case>olish Disinformation
Arkadiusz Modzelewski
Giovanni Da San Martino (University of Padua)
Pavel Savov (Polsko-Japońska Akademia Technik Komputerowych)
Magdalena Anna Wilczyńska (Polsko-Japońska Akademia Technik Komputerowych and Polska Akademia Nauk)
Adam Wierzbicki (Polish-Japanese Institute of Information Technology in Warsaw)
19769-19785
This study presents a novel corpus of 15,356 Polish web articles, including articles identified as containing disinformation. Our dataset enables a multifaceted understanding of disinformation. We present a distinctive multilayered methodology for annotating disinformation in texts. What sets our corpus apart is its focus on uncovering hidden intent and manipulation in disinformative content. A team of experts annotated each article with multiple labels indicating both disinformation creators' intents and the manipulation techniques employed. Additionally, we set new baselines for binary disinformation detection and two multiclass multilabel classification tasks: manipulation techniques and intention types classification.
2024.emnlp-main.1103
@@ -15229,7 +15229,7 @@
Unsupervised Discrete Representations of <fixed-case>A</fixed-case>merican <fixed-case>S</fixed-case>ign <fixed-case>L</fixed-case>anguage
Artem Abzaliev
Rada Mihalcea (University of Michigan)
19786-19793
Many modalities are naturally represented as continuous signals, making it difficult to use them with models that expect discrete units, such as LLMs. In this paper, we explore the use of audio compression techniques for the discrete representation of the gestures used in sign language. We train a tokenizer for American Sign Language (ASL) fingerspelling, which discretizes sequences of fingerspelling signs into tokens. We also propose a loss function to improve the interpretability of these tokens such that they preserve both the semantic and the visual information of the signal. We show that the proposed method improves the performance of the discretized sequence on downstream tasks.
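The ASL entry above turns continuous gesture streams into discrete tokens. A minimal sketch of the quantization step alone follows: a plain k-means codebook over frame features, with nearest-codeword ids as tokens. The feature layout and sizes are invented, and the paper's interpretability loss is omitted.

import numpy as np

# Toy vector quantization: learn a small codebook over continuous frames,
# then represent each sequence as nearest-codeword ids ("tokens").
rng = np.random.default_rng(0)
frames = rng.normal(size=(500, 21 * 3))   # e.g. 21 hand keypoints in 3D (assumed)

K, iters = 16, 20
codebook = frames[rng.choice(len(frames), K, replace=False)]
for _ in range(iters):                    # plain k-means
    d = ((frames[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    assign = d.argmin(1)
    for k in range(K):
        members = frames[assign == k]
        if len(members):
            codebook[k] = members.mean(0)

def tokenize(seq):
    d = ((seq[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    return d.argmin(1)                    # one discrete token id per frame

print(tokenize(frames[:10]))

Audio codecs such as residual vector quantizers refine this with stacked codebooks and learned encoders, but the output contract is the same: a sequence of integer ids that a token-based model can consume.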
2024.emnlp-main.1104
@@ -15240,12 +15240,12 @@
Perceptions to Beliefs: Exploring Precursory Inferences for Theory of Mind in Large Language Models
Chani Jung (Korea Advanced Institute of Science & Technology)
Dongkwan Kim (Korea Advanced Institute of Science and Technology)
Jiho Jin (Korea Advanced Institute of Science and Technology)
Jiseon Kim (Korea Advanced Institute of Science and Technology)
Yeon Seonwoo (Amazon)
Yejin Choi (Department of Computer Science, University of Washington)
Alice Oh (Korea Advanced Institute of Science and Technology)
Hyunwoo Kim (Allen Institute for Artificial Intelligence)
19794-19809
While humans naturally develop theory of mind (ToM), the capability to understand other people's mental states and beliefs, state-of-the-art large language models (LLMs) underperform on simple ToM benchmarks. We posit that we can extend our understanding of LLMs' ToM abilities by evaluating key human ToM precursors, perception inference and perception-to-belief inference, in LLMs. We introduce two datasets, Percept-ToMi and Percept-FANToM, to evaluate these precursory inferences for ToM in LLMs by annotating characters' perceptions on ToMi and FANToM, respectively. Our evaluation of eight state-of-the-art LLMs reveals that the models generally perform well in perception inference while exhibiting limited capability in perception-to-belief inference (e.g., lack of inhibitory control). Based on these results, we present PercepToM, a novel ToM method leveraging LLMs' strong perception inference capability while supplementing their limited perception-to-belief inference. Experimental results demonstrate that PercepToM significantly enhances LLM's performance, especially in false belief scenarios.
2024.emnlp-main.1105
@@ -15256,10 +15256,10 @@
Towards Enhancing Coherence in Extractive Summarization: Dataset and Experiments with <fixed-case>LLM</fixed-case>s
Mihir Parmar
Hanieh Deilamsalehy (Adobe Systems)
Franck Dernoncourt (Adobe Systems)
Seunghyun Yoon (Adobe Research)
Ryan A. Rossi (Adobe Research)
Trung Bui (Adobe Research)
19810-19820
Extractive summarization plays a pivotal role in natural language processing due to its wide-range applications in summarizing diverse content efficiently, while also being faithful to the original content. Despite significant advancement achieved in extractive summarization by Large Language Models (LLMs), these summaries frequently exhibit incoherence. An important aspect of the coherent summary is its readability for intended users. Although there have been many datasets and benchmarks proposed for creating coherent extractive summaries, none of them currently incorporate user intent to improve coherence in extractive summarization. Motivated by this, we propose a systematically created human-annotated dataset consisting of coherent summaries for five publicly available datasets and natural language user feedback, offering valuable insights into how to improve coherence in extractive summaries. We utilize this dataset for aligning LLMs through supervised fine-tuning with natural language human feedback to enhance the coherence of their generated summaries. Preliminary experiments with Falcon-40B and Llama-2-13B show significant performance improvements (~10% Rouge-L) in terms of producing coherent summaries.
2024.emnlp-main.1106

@@ -15284,7 +15284,7 @@
Beyza Ermis (Cohere AI)
Mirco Ravanelli (Concordia University)
Cem Subakan (Université Laval)
Çağatay Yıldız (Eberhard-Karls-Universität Tübingen)
19834-19843
In the last decade, the generalization and adaptation abilities of deep learning models were typically evaluated on fixed training and test distributions. Contrary to traditional deep learning, large language models (LLMs) are (i) even more overparameterized, (ii) trained on unlabeled text corpora curated from the Internet with minimal human intervention, and (iii) trained in an online fashion. These stark contrasts prevent researchers from transferring lessons learned on model generalization and adaptation in deep learning contexts to LLMs. To this end, our short paper introduces empirical observations that aim to shed light on further training of already pretrained language models. Specifically, we demonstrate that training a model on a text domain could degrade its perplexity on the test portion of the same domain. We observe with our subsequent analysis that the performance degradation is positively correlated with the similarity between the additional and the original pretraining dataset of the LLM. Our further token-level perplexity analysis reveals that the perplexity degradation is due to a handful of tokens that are not informative about the domain. We hope these findings will guide us in determining when to adapt a model vs. when to rely on its foundational capabilities.
2024.emnlp-main.1108

Not All Contexts Are Equal: Teaching <fixed-case>LLM</fixed-case>s Credibility-aware Generation
Ruotong Pan
Boxi Cao
Hongyu Lin (Institute of Software, Chinese Academy of Sciences)
Xianpei Han (Institute of Software, CAS)

@@ -15314,7 +15314,7 @@
Suhong Moon
Marwa Abdulhai (University of California, Berkeley)
Minwoo Kang (University of California, Berkeley)
Joseph Suh
Widyadewi Soedarmadji
Eran Kohen Behar
David Chan (University of California, Berkeley)

@@ -15330,10 +15330,10 @@
Mihir Parmar
Mohith Kulkarni
Aswin Rrv
Nisarg Patel
Mutsumi Nakamura (Arizona State University)
Arindam Mitra (Research, Microsoft)
Chitta Baral (Arizona State University)
19898-19915
Solving grid puzzles involves a significant amount of logical reasoning. Hence, it is a good domain to evaluate the reasoning capability of a model, which can then guide us to improve the reasoning ability of models. However, most existing works evaluate only the final predicted answer of a puzzle, without delving into an in-depth analysis of the LLMs' reasoning chains (such as where they falter) or providing any finer metrics to evaluate them. Since LLMs may rely on simple heuristics or artifacts to predict the final answer, it is crucial to evaluate the generated reasoning chain beyond overall correctness measures, for accurately evaluating the reasoning abilities of LLMs. To this end, we first develop GridPuzzle, an evaluation dataset comprising 274 grid-based puzzles with different complexities. Second, we propose a new error taxonomy derived from manual analysis of reasoning chains from LLMs including GPT-4, Claude-3, Gemini, Mistral, and Llama-2. Then, we develop an LLM-based framework for large-scale subjective evaluation (i.e., identifying errors) and an objective metric, PuzzleEval, to evaluate the correctness of reasoning chains. Evaluating reasoning chains from LLMs leads to several interesting findings. We further show that existing prompting methods used for enhancing models' reasoning abilities do not improve performance on GridPuzzle. This highlights the importance of understanding fine-grained errors and presents a challenge for future research to enhance LLMs' puzzle-solving abilities by developing methods that address these errors.
2024.emnlp-main.1111

@@ -15359,8 +15359,8 @@
The Empirical Variability of Narrative Perceptions of Social Media Texts
Joel Mire
Maria Antoniak
Elliott Ash (Swiss Federal Institute of Technology)
Andrew Piper (McGill University)
Maarten Sap (Carnegie Mellon University)
19940-19968
Most NLP work on narrative detection has focused on prescriptive definitions of stories crafted by researchers, leaving open the questions: how do crowd workers perceive texts to be a story, and why? We investigate this by building StoryPerceptions, a dataset of 2,496 perceptions of storytelling in 502 social media texts from 255 crowd workers, including categorical labels along with free-text storytelling rationales, authorial intent, and more. We construct a fine-grained bottom-up taxonomy of crowd workers' varied and nuanced perceptions of storytelling by open-coding their free-text rationales. Through comparative analyses at the label and code level, we illuminate patterns of disagreement among crowd workers and across other annotation contexts, including prescriptive labeling from researchers and LLM-based predictions. Notably, plot complexity, references to generalized or abstract actions, and holistic aesthetic judgments (such as a sense of cohesion) are especially important in disagreements. Our empirical findings broaden understanding of the types, relative importance, and contentiousness of features relevant to narrative detection, highlighting opportunities for future work on reader-contextualized models of narrative reception.

@@ -15386,7 +15386,7 @@
Revealing Personality Traits: A New Benchmark Dataset for Explainable Personality Recognition on Dialogues
Lei Sun
Jinming Zhao
Qin Jin (Renmin University of China)
19988-20002
Personality recognition aims to identify the personality traits implied in user data such as dialogues and social media posts. Current research predominantly treats personality recognition as a classification task, failing to reveal the supporting evidence for the recognized personality. In this paper, we propose a novel task named Explainable Personality Recognition, aiming to reveal the reasoning process as supporting evidence of the personality trait. Inspired by personality theories, personality traits are made up of stable patterns of personality states, where the states are short-term characteristic patterns of thoughts, feelings, and behaviors in a concrete situation at a specific moment in time. We propose an explainable personality recognition framework called Chain-of-Personality-Evidence (CoPE), which involves a reasoning process from specific contexts to short-term personality states to long-term personality traits. Furthermore, based on the CoPE framework, we construct an explainable personality recognition dataset from dialogues, PersonalityEvd. We introduce two explainable personality state recognition and explainable personality trait recognition tasks, which require models to recognize the personality state and trait labels and their corresponding supporting evidence. Our extensive experiments based on Large Language Models on the two tasks show that revealing personality traits is very challenging, and we present some insights for future research. We will release our dataset and source code to facilitate further studies in this direction.
2024.emnlp-main.1115

@@ -15418,7 +15418,7 @@
<fixed-case>C</fixed-case>ode<fixed-case>J</fixed-case>udge: Evaluating Code Generation with Large Language Models
Weixi Tong
Tianyi Zhang (Purdue University)
20032-20051
Large Language Models (LLMs) have shown promising performance in code generation. However, how to reliably evaluate code generated by LLMs remains an unresolved problem. This paper presents CodeJudge, a code evaluation framework that leverages LLMs to evaluate the semantic correctness of generated code without the need for test cases. We investigate different ways to guide the LLM in performing "slow thinking" to arrive at an in-depth and reliable evaluation. We experimented with four LLMs as evaluators on four code generation datasets and five programming languages. The results show that CodeJudge significantly outperformed existing methods in most settings. Furthermore, compared with a SOTA GPT-3.5-based code evaluation method, CodeJudge achieved better results even when using a much smaller model, Llama-3-8B-Instruct. Our code and datasets are available on GitHub: https://github.com/VichyTong/CodeJudge.
2024.emnlp-main.1118

@@ -15429,9 +15429,9 @@
Self-Training Large Language and Vision Assistant for Medical Question Answering
Guohao Sun (Rochester Institute of Technology)
Can Qin (SalesForce.com)
Huazhu Fu (Institute of High Performance Computing, Singapore, A*STAR)
Linwei Wang (Rochester Institute of Technology)
Zhiqiang Tao (Rochester Institute of Technology)
20052-20060

@@ -15443,7 +15443,7 @@
<fixed-case>SYNFAC</fixed-case>-<fixed-case>EDIT</fixed-case>: Synthetic Imitation Edit Feedback for Factual Alignment in Clinical Summarization
Prakamya Mishra (AMD AI)
Zonghai Yao (University of Massachusetts at Amherst)
Parth Vashisht
Feiyun Ouyang
Beining Wang

@@ -15457,13 +15457,13 @@
Defending Jailbreak Prompts via In-Context Adversarial Game
Yujun Zhou (University of Notre Dame)
Yufei Han (INRIA)
Haomin Zhuang (University of Notre Dame)
Kehan Guo
Zhenwen Liang
Hongyan Bao (KAUST)
Xiangliang Zhang (University of Notre Dame)
20084-20105
Large Language Models (LLMs) demonstrate remarkable capabilities across diverse applications. However, concerns regarding their security, particularly the vulnerability to jailbreak attacks, persist. Drawing inspiration from adversarial training in deep learning and LLM agent learning processes, we introduce the In-Context Adversarial Game (ICAG) for defending against jailbreaks without the need for fine-tuning. ICAG leverages agent learning to conduct an adversarial game, aiming to dynamically extend knowledge to defend against jailbreaks. Unlike traditional methods that rely on static datasets, ICAG employs an iterative process to enhance both the defense and attack agents. This continuous improvement process strengthens defenses against newly generated jailbreak prompts. Our empirical studies affirm ICAG's efficacy, where LLMs safeguarded by ICAG exhibit significantly reduced jailbreak success rates across various attack scenarios. Moreover, ICAG demonstrates remarkable transferability to other LLMs, indicating its potential as a versatile defense mechanism. The code is available at https://github.com/YujunZhou/In-Context-Adversarial-Game.
2024.emnlp-main.1121

@@ -15472,11 +15472,11 @@
Detecting Online Community Practices with Large Language Models: A Case Study of Pro-<fixed-case>U</fixed-case>krainian Publics on <fixed-case>T</fixed-case>witter
Kateryna Kasianenko
Shima Khanehzar (CSIRO)
Stephen Wan (CSIRO)
Ehsan Dehghan (Queensland University of Technology)
Axel Bruns (Queensland University of Technology)
20106-20135
Communities on social media display distinct patterns of linguistic expression and behaviour, collectively referred to as practices. These practices can be traced in textual exchanges, and reflect the intentions, knowledge, values, and norms of users and communities. This paper introduces a comprehensive methodological workflow for computational identification of such practices within social media texts. By focusing on supporters of Ukraine during the Russia-Ukraine war in (1) the activist collective NAFO and (2) the Eurovision Twitter community, we present a gold-standard data set capturing their unique practices. Using this corpus, we perform practice prediction experiments with both open-source baseline models and OpenAI's large language models (LLMs). Our results demonstrate that closed-source models, especially GPT-4, achieve superior performance, particularly with prompts that incorporate salient features of practices, or utilize Chain-of-Thought prompting. This study provides a detailed error analysis and offers valuable insights into improving the precision of practice identification, thereby supporting context-sensitive moderation and advancing the understanding of online community dynamics.
2024.emnlp-main.1122

@@ -15499,13 +15499,13 @@
<fixed-case>MT</fixed-case>-Eval: A Multi-Turn Capabilities Evaluation Benchmark for Large Language Models
Wai-Chung Kwan
Xingshan Zeng (Huawei Technologies Ltd.)
Yuxin Jiang
Yufei Wang
Liangyou Li (Huawei Noah's Ark Lab)
Lifeng Shang (Huawei Technologies Ltd.)
Xin Jiang
Qun Liu (Huawei Noah's Ark Lab)
Kam-Fai Wong (The Chinese University of Hong Kong)
20153-20177
Large language models (LLMs) are increasingly used for complex multi-turn conversations across diverse real-world applications. However, existing benchmarks mainly focus on single-turn evaluations, overlooking the models' capabilities in multi-turn interactions. To address this gap, we introduce MT-Eval, a comprehensive benchmark to evaluate the multi-turn conversational abilities of LLMs. By analyzing human-LLM conversations, we categorize interaction patterns into four types: recollection, expansion, refinement, and follow-up. We construct multi-turn queries for each category either by augmenting existing datasets or creating new examples using GPT-4 with a human-in-the-loop process to avoid data leakage. To study the factors impacting multi-turn abilities, we create single-turn versions of the 1170 multi-turn queries and compare performance. Our evaluation of 10 well-known LLMs shows that while closed-source models generally surpass open-source ones, certain open-source models exceed GPT-3.5-Turbo in specific tasks. We observe significant performance degradation in multi-turn settings compared to single-turn settings in most models, which is not correlated with the models' fundamental capabilities. Moreover, we identify the distance to relevant content and susceptibility to error propagation as the key factors influencing multi-turn performance.
2024.emnlp-main.1124

@@ -15517,7 +15517,7 @@
Amir Zur
Elisa Kreiss (University of California, Los Angeles)
Karel D'Oosterlinck
Christopher Potts (Stanford University)
Atticus Geiger (Pr(Ai)²R Group)
20178-20187
Although CLIPScore is a powerful generic metric that captures the similarity between a text and an image, it fails to distinguish between a caption that is meant to complement the information in an image and a description that is meant to replace an image entirely, e.g., for accessibility. We address this shortcoming by updating the CLIP model with the Concadia dataset to assign higher scores to descriptions than captions using parameter efficient fine-tuning and a loss objective derived from work on causal interpretability. This model correlates with the judgements of blind and low-vision people while preserving transfer capabilities and has interpretable structure that sheds light on the caption–description distinction.

@@ -15530,7 +15530,7 @@
Sian-Yao Huang (Cycraft Inc.)
Cheng-Lin Yang (University of Edinburgh)
Che-Yu Lin (CyCraft Technology Corporation)
Chun-Ying Huang (National Yang Ming Chiao Tung University)
20188-20206
This research addresses command-line embedding in cybersecurity, a field obstructed by the lack of comprehensive datasets due to privacy and regulation concerns. We propose the first dataset of similar command lines, named CyPHER, for training and unbiased evaluation. The training set is generated using a set of large language models (LLMs) and comprises 28,520 similar command-line pairs. Our testing dataset consists of 2,807 similar command-line pairs sourced from authentic command-line data. In addition, we propose a command-line embedding model named CmdCaliper, enabling the computation of semantic similarity with command lines. Performance evaluations demonstrate that the smallest version of CmdCaliper (30 million parameters) surpasses state-of-the-art (SOTA) sentence embedding models with ten times more parameters across various tasks (e.g., malicious command-line detection and similar command-line retrieval). Our study explores the feasibility of data generation using LLMs in the cybersecurity domain. Furthermore, we release our proposed command-line dataset, embedding models' weights and all program codes to the public. This advancement paves the way for more effective command-line embedding for future researchers.
2024.emnlp-main.1126

@@ -15542,7 +15542,7 @@
Back to School: Translation Using Grammar Books
Jonathan Hus (George Mason University)
Antonios Anastasopoulos (Athena Research Center and George Mason University)
20207-20219
Machine translation systems for high resource languages perform exceptionally well and produce high quality translations. Unfortunately, the vast majority of languages are not considered high resource and lack the quantity of parallel sentences needed to train such systems. These under-represented languages are not without resources, however, and bilingual dictionaries and grammar books are available as linguistic reference material. With current large language models (LLMs) supporting near book-length contexts, we can begin to use the available material to ensure advancements are shared among all of the world's languages. In this paper, we demonstrate incorporating grammar books in the prompt of GPT-4 to improve machine translation, and evaluate the performance on 16 typologically diverse low-resource languages, using a combination of reference material to show that the machine translation performance of LLMs can be improved using this method.
2024.emnlp-main.1127

@@ -15584,10 +15584,10 @@
<fixed-case>AMPO</fixed-case>: Automatic Multi-Branched Prompt Optimization
Sheng Yang
Yurong Wu
Yan Gao
Zineng Zhou
Bin Benjamin Zhu (Microsoft Research)
Xiaodi Sun
Jian-Guang Lou (Microsoft)
Zhiming Ding (Institute of Software Chinese Academy of Sciences)

@@ -15606,11 +15606,11 @@
<fixed-case>D</fixed-case>e<fixed-case>MPT</fixed-case>: Decoding-enhanced Multi-phase Prompt Tuning for Making <fixed-case>LLM</fixed-case>s Be Better Context-aware Translators
Xinglin Lyu (Soochow University)
Junhui Li (Soochow University, China)
Yanqing Zhao (Huawei Technologies Ltd.)
Min Zhang (Huawei Technologies Ltd.)
Daimeng Wei
Shimin Tao (Huawei Technologies Ltd.)
Hao Yang
Min Zhang (Harbin Institute of Technology, Shenzhen)
20280-20295
2024.emnlp-main.1131

@@ -15631,10 +15631,10 @@
Unveiling Multi-level and Multi-modal Semantic Representations in the Human Brain using Large Language Models
Yuko Nakagi (Osaka University)
Takuya Matsuyama (Osaka University, Graduate School of Frontier Biosciences)
Naoko Koide-Majima (National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology)
Hiroto Q. Yamaguchi (National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology)
Rieko Kubo (Tokyo Medical and Dental University)
Shinji Nishimoto (Osaka University)
Yu Takagi (Osaka University)
20313-20338
In recent studies, researchers have used large language models (LLMs) to explore semantic representations in the brain; however, they have typically assessed different levels of semantic content, such as speech, objects, and stories, separately. In this study, we recorded brain activity using functional magnetic resonance imaging (fMRI) while participants viewed 8.3 hours of dramas and movies. We annotated these stimuli at multiple semantic levels, which enabled us to extract latent representations of LLMs for this content. Our findings demonstrate that LLMs predict human brain activity more accurately than traditional language models, particularly for complex background stories. Furthermore, we identify distinct brain regions associated with different semantic representations, including multi-modal vision-semantic representations, which highlights the importance of modeling multi-level and multi-modal semantic representations simultaneously. We will make our fMRI dataset publicly available to facilitate further research on aligning LLMs with human brain function.

"They are uncultured": Unveiling Covert Harms and Social Threats in <fixed-case>LLM</fixed-case> Generated Conversations
Preetam Prabhu Srikar Dammu (University of Washington)
Hayoung Jung
Anjali Singh
Monojit Choudhury (Mohamed bin Zayed University of Artificial Intelligence)

Anh Tuan Luu (Nanyang Technological University)
Kenji Kawaguchi (National University of Singapore)
Min-Yen Kan (National University of Singapore)
Nancy F. Chen
20370-20401
We present Multi-expert Prompting, a novel enhancement of ExpertPrompting (Xu et al., 2023), designed to improve the large language model (LLM) generation. Specifically, it guides an LLM to fulfill an input instruction by simulating multiple experts, aggregating their responses, and selecting the best among individual and aggregated responses. This process is performed in a single chain of thoughts through our seven carefully designed subtasks derived from the Nominal Group Technique (Ven and Delbecq, 1974), a well-established decision-making framework. Our evaluations demonstrate that Multi-expert Prompting significantly outperforms ExpertPrompting and comparable baselines in enhancing the truthfulness, factuality, informativeness, and usefulness of responses while reducing toxicity and hurtfulness. It further achieves state-of-the-art truthfulness by outperforming the best baseline by 8.69% with ChatGPT. Multi-expert Prompting is efficient, explainable, and highly adaptable to diverse scenarios, eliminating the need for manual prompt construction.
2024.emnlp-main.1135

Eliciting In-Context Learning in Vision-Language Models for Videos Through Curated Data Distributional Properties
Keunwoo Peter Yu (University of Michigan - Ann Arbor)
Zheyuan Zhang
Fengyuan Hu
Shane Storks (University of Michigan)
Joyce Chai (University of Michigan)
20416-20431
2024.emnlp-main.1137
yu-etal-2024-eliciting

Waterfall: Scalable Framework for Robust Text Watermarking and Provenance for <fixed-case>LLM</fixed-case>s
Gregory Kang Ruey Lau (National University of Singapore)
Xinyuan Niu (National University of Singapore)
Hieu Dao (National University of Singapore)
Jiangwei Chen (National University of Singapore)
Chuan-Sheng Foo (Centre for Frontier AI Research, A*STAR and Institute for Infocomm Research, A*STAR)
Bryan Kian Hsiang Low (National University of Singapore)
20432-20466
Protecting intellectual property (IP) of text such as articles and code is increasingly important, especially as sophisticated attacks become possible, such as paraphrasing by large language models (LLMs) or even unauthorized training of LLMs on copyrighted text to infringe such IP. However, existing text watermarking methods are not robust enough against such attacks nor scalable to millions of users for practical implementation. In this paper, we propose Waterfall, the first training-free framework for robust and scalable text watermarking applicable across multiple text types (e.g., articles, code) and languages supportable by LLMs, for general text and LLM data provenance. Waterfall comprises several key innovations, such as being the first to use LLMs as paraphrasers for watermarking, along with a novel combination of techniques that are surprisingly effective in achieving robust verifiability and scalability. We empirically demonstrate that Waterfall achieves significantly better scalability, robust verifiability, and computational efficiency compared to SOTA article-text watermarking methods, and also show how it could be directly applied to the watermarking of code.

<fixed-case>MASIVE</fixed-case>: Open-Ended Affective State Identification in <fixed-case>E</fixed-case>nglish and <fixed-case>S</fixed-case>panish
Nicholas Deas (Columbia University)
Elsbeth Turcan
Ivan Ernesto Perez Mejia
Kathleen McKeown
20467-20485

Saptarashmi Bandyopadhyay (University of Maryland, College Park)
Hao Zou
Abhranil Chandra
Jordan Lee Boyd-Graber (University of Maryland, College Park)
20486-20510
Training question-answering (QA) and information retrieval systems for web queries requires large, expensive datasets that are difficult to annotate and time-consuming to gather. Moreover, while natural datasets of information-seeking questions are often ambiguous or ill-formed, there are troves of freely available, carefully crafted question datasets for many languages. Thus, we automatically generate shorter, information-seeking questions, resembling web queries in the style of the Natural Questions (NQ) dataset, from longer trivia data. Training a QA system on these transformed questions is a viable alternative to more expensive training setups, showing an F1 score difference of less than six points when contrasting the final systems.
2024.emnlp-main.1140

<fixed-case>A</fixed-case>lpha<fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case>: Assigning <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> Experts Based on Layer Training Quality
Peijun Qing (Dartmouth College)
Chongyang Gao
Yefan Zhou (Dartmouth College)
Xingjian Diao
Yaoqing Yang (Dartmouth College)
Soroush Vosoughi (Dartmouth College)
20511-20523
Parameter-efficient fine-tuning methods, such as Low-Rank Adaptation (LoRA), are known to enhance training efficiency in Large Language Models (LLMs). Due to the limited parameters of LoRA, recent studies seek to combine LoRA with Mixture-of-Experts (MoE) to boost performance across various tasks. However, inspired by the observed redundancy in traditional MoE structures, prior studies find that LoRA experts within the MoE architecture also exhibit redundancy, suggesting a need to vary the allocation of LoRA experts across different layers. In this paper, we leverage Heavy-Tailed Self-Regularization (HT-SR) Theory to design a fine-grained allocation strategy. Our analysis reveals that the number of experts per layer correlates with layer training quality, which exhibits significant variability across layers. Based on this, we introduce AlphaLoRA, a theoretically principled and training-free method for allocating LoRA experts to reduce redundancy further. Experiments on three models across ten language processing and reasoning benchmarks demonstrate that AlphaLoRA achieves comparable or superior performance over all baselines. Our code is available at https://github.com/morelife2017/alphalora.
2024.emnlp-main.1141

Wenzhi Wang (Tohoku University, Tokyo Institute of Technology)
Shoichi Naito (Tohoku University, Tokyo Institute of Technology)
Jungmin Choi (RIKEN)
Kentaro Inui (Mohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University)
20524-20540
Prior research in computational argumentation has mainly focused on scoring the quality of arguments, with less attention on explicating logical errors. In this work, we introduce four sets of explainable templates for common informal logical fallacies designed to explicate a fallacy's implicit logic. Using our templates, we conduct an annotation study on top of 400 fallacious arguments taken from the LOGIC dataset and achieve a high agreement score (Krippendorff's α of 0.54) and reasonable coverage (83%). Finally, we conduct an experiment for detecting the structure of fallacies and discover that state-of-the-art language models struggle with detecting fallacy templates (0.47 accuracy). To facilitate research on fallacies, we make our dataset and guidelines publicly available.
2024.emnlp-main.1142

Advancing Social Intelligence in <fixed-case>AI</fixed-case> Agents: Technical Challenges and Open Questions
Leena Mathur (Carnegie Mellon University)
Paul Pu Liang (Massachusetts Institute of Technology)
Louis-Philippe Morency (Carnegie Mellon University)
20541-20560
Building socially-intelligent AI agents (Social-AI) is a multidisciplinary, multimodal research goal that involves creating agents that can sense, perceive, reason about, learn from, and respond to affect, behavior, and cognition of other agents (human or artificial). Progress towards Social-AI has accelerated in the past decade across several computing communities, including natural language processing, machine learning, robotics, human-machine interaction, computer vision, and speech. Natural language processing, in particular, has been prominent in Social-AI research, as language plays a key role in constructing the social world. In this position paper, we identify a set of underlying technical challenges and open questions for researchers across computing communities to advance Social-AI. We anchor our discussion in the context of social intelligence concepts and prior progress in Social-AI research.
2024.emnlp-main.1143

<fixed-case>RA</fixed-case>t: Injecting Implicit Bias for Text-To-Image Prompt Refinement Models
Ziyi Kou (Facebook)
Shichao Pei (University of Massachusetts Boston)
Meng Jiang (University of Notre Dame)
Xiangliang Zhang (University of Notre Dame)
20561-20570
Text-to-image prompt refinement (T2I-Refine) aims to rephrase or extend an input prompt with more descriptive details that can be leveraged to generate images with higher quality. In this paper, we study an adversarial prompt attacking problem for T2I-Refine, where the goal is to implicitly inject specific concept bias into the input prompts during the refinement process so that the generated images, still of higher quality, are explicitly biased to the target group. Our study is motivated by the limitation of current T2I-Refine research, which lacks exploration of the potential capacity of T2I-Refine models to provide prompt refinement service in a biased or advertising manner. To address the limitations, we develop RAt, a prompt refinement and attacking framework that attacks input prompts with intentionally selected adversarial replacements by optimizing a token distribution matrix based on the text-to-image finetuning strategy with a token-level bias obfuscation loss as regularization. We evaluate RAt on a large-scale text-to-image dataset with various concepts as targets in both in-domain and transfer-domain scenarios. The evaluation results demonstrate that, compared to other T2I-Refine schemes, RAt is well capable of implicitly attacking input prompts to generate images with higher quality and explicit visual bias towards a specific concept group.
2024.emnlp-main.1144

Can <fixed-case>LLM</fixed-case> Generate Culturally Relevant Commonsense <fixed-case>QA</fixed-case> Data? Case Study in <fixed-case>I</fixed-case>ndonesian and <fixed-case>S</fixed-case>undanese
Rifki Afina Putri (Korea Advanced Institute of Science & Technology)
Faiz Ghifari Haznitrama (Korea Advanced Institute of Science & Technology)
Dea Adhista
Alice Oh (Korea Advanced Institute of Science and Technology)
20571-20590

Can Language Models Induce Grammatical Knowledge from Indirect Evidence?
Miyu Oba
Yohei Oseki (University of Tokyo)
Akiyo Fukatsu (Tokyo University, Tokyo Institute of Technology)
Akari Haga
Hiroki Ouchi (NAIST)
Taro Watanabe (Nara Institute of Science and Technology, Japan)
Saku Sugawara (National Institute of Informatics)
20591-20603
What kinds of and how much data is necessary for language models to induce grammatical knowledge to judge sentence acceptability? Recent language models still have much room for improvement in their data efficiency compared to humans. This paper investigates whether language models efficiently use indirect data (indirect evidence), from which they infer sentence acceptability. In contrast, humans use indirect evidence efficiently, which is considered one of the inductive biases contributing to efficient language acquisition. To explore this question, we introduce the Wug InDirect Evidence Test (WIDET), a dataset consisting of training instances inserted into the pre-training data and evaluation instances. We inject synthetic instances with newly coined wug words into pretraining data and explore the model's behavior on evaluation data that assesses grammatical acceptability regarding those words. We prepare the injected instances by varying their levels of indirectness and quantity. Our experiments surprisingly show that language models do not induce grammatical knowledge even after repeated exposure to instances with the same structure but differing only in lexical items from evaluation instances in certain language phenomena. Our findings suggest a potential direction for future research: developing models that use latent indirect evidence to induce grammatical knowledge.
2024.emnlp-main.1146

Jialiang Xu
Shenglan Li
Zhaozhuo Xu (Stevens Institute of Technology)
Denghui Zhang (Stevens Institute of Technology)
20604-20619
Prior studies show that LLMs sometimes generate content that violates copyright. In this paper, we study another important yet underexplored problem, i.e., will LLMs respect copyright information in user input, and behave accordingly? The research problem is critical, as a negative answer would imply that LLMs will become the primary facilitator and accelerator of copyright infringement behavior. We conducted a series of experiments using a diverse set of language models, user prompts, and copyrighted materials, including books, news articles, API documentation, and movie scripts. Our study offers a conservative evaluation of the extent to which language models may infringe upon copyrights when processing user input containing protected material. This research emphasizes the need for further investigation and the importance of ensuring LLMs respect copyright regulations when handling user input to prevent unauthorized use or reproduction of protected content. We also release a benchmark dataset serving as a test bed for evaluating infringement behaviors by LLMs and stress the need for future alignment.
2024.emnlp-main.1147

<fixed-case>S</fixed-case>pec<fixed-case>H</fixed-case>ub: Provable Acceleration to Multi-Draft Speculative Decoding
Ryan Sun
Tianyi Zhou (University of Maryland, College Park)
Xun Chen (Samsung Research America)
Lichao Sun (Lehigh University)
20620-20641

Interventional Speech Noise Injection for <fixed-case>ASR</fixed-case> Generalizable Spoken Language Understanding
YeonJoon Jung (Seoul National University)
Jaeseong Lee (Seoul National University)
Seungtaek Choi (Yanolja)
Dohyeon Lee (Seoul National University)
Minsoo Kim (Seoul National University)
Seung-won Hwang (Seoul National University)

Visual Text Matters: Improving Text-<fixed-case>KVQA</fixed-case> with Visual Text Entity Knowledge-aware Large Multimodal Assistant
Abhirama Subramanyam Penamakuri (Indian Institute of Technology, Jodhpur)
Anand Mishra (Indian Institute of Technology, Jodhpur)
20675-20688
We revisit knowledge-aware text-based visual question answering, also known as Text-KVQA, in the light of modern advancements in large multimodal models (LMMs), and make the following contributions: (i) We propose VisTEL – a principled approach to perform visual text entity linking. The proposed VisTEL module harnesses a state-of-the-art visual text recognition engine and the power of a large multimodal model to jointly reason using textual and visual context obtained using surrounding cues in the image to link the visual text entity to the correct knowledge base entity. (ii) We present KaLMA – a knowledge-aware large multimodal assistant that augments an LMM with knowledge associated with the visual text entity in the image to arrive at an accurate answer. Further, we provide a comprehensive experimental analysis and comparison of our approach with traditional visual question answering, pre-large multimodal models, and large multimodal models, as well as prior top-performing approaches. Averaging over three splits of Text-KVQA, our proposed approach surpasses the previous best approach by a substantial 23.3% on an absolute scale and establishes a new state of the art. We make our implementation publicly available.

Beyond Correlation: Interpretable Evaluation of Machine Translation Metrics
Stefano Perrella
Lorenzo Proietti
Pere-Lluís Huguet Cabot
Edoardo Barba (University of Roma "La Sapienza")
Roberto Navigli (Sapienza University of Rome)
20689-20714
Machine Translation (MT) evaluation metrics assess translation quality automatically. Recently, researchers have employed MT metrics for various new use cases, such as data filtering and translation re-ranking. However, most MT metrics return assessments as scalar scores that are difficult to interpret, posing a challenge to making informed design choices. Moreover, MT metrics' capabilities have historically been evaluated using correlation with human judgment, which, despite its efficacy, falls short of providing intuitive insights into metric performance, especially in terms of new metric use cases. To address these issues, we introduce an interpretable evaluation framework for MT metrics. Within this framework, we evaluate metrics in two scenarios that serve as proxies for the data filtering and translation re-ranking use cases. Furthermore, by measuring the performance of MT metrics using Precision, Recall, and F-score, we offer clearer insights into their capabilities than correlation with human judgments. Finally, we raise concerns regarding the reliability of manually curated data following the Direct Assessments+Scalar Quality Metrics (DA+SQM) guidelines, reporting a notably low agreement with Multidimensional Quality Metrics (MQM) annotations.
2024.emnlp-main.1152

<fixed-case>IFC</fixed-case>ap: Image-like Retrieval and Frequency-based Entity Filtering for Zero-shot Captioning
Soeun Lee
Si-Woo Kim
Taewhan Kim
Dong-Jin Kim (Hanyang University)
20715-20727
Recent advancements in image captioning have explored text-only training methods to overcome the limitations of paired image-text data. However, existing text-only training methods often overlook the modality gap between using text data during training and employing images during inference. To address this issue, we propose a novel approach called Image-like Retrieval, which aligns text features with visually relevant features to mitigate the modality gap. Our method further enhances the accuracy of generated captions by designing a fusion module that integrates retrieved captions with input features. Additionally, we introduce a Frequency-based Entity Filtering technique that significantly improves caption quality. We integrate these methods into a unified framework, which we refer to as IFCap (**I**mage-like Retrieval and **F**requency-based Entity Filtering for Zero-shot **Cap**tioning). Through extensive experimentation, our straightforward yet powerful approach has demonstrated its efficacy, outperforming the state-of-the-art methods by a significant margin in both image captioning and video captioning compared to zero-shot captioning based on text-only training.
2024.emnlp-main.1153

Encoding Spreadsheets for Large Language Models
Haoyu Dong
Jianbo Zhao
Yuzhang Tian
Junyu Xiong
Mengyu Zhou (Microsoft Research)
Yun Lin
José Cambronero
Yeye He (Microsoft)
Shi Han (Microsoft Research Asia)
Dongmei Zhang (Microsoft)
20728-20748
Spreadsheets are characterized by their extensive two-dimensional grids, flexible layouts, and varied formatting options, which pose significant challenges for large language models (LLMs). In response, we introduce SheetEncoder, pioneering an efficient encoding method designed to unleash and optimize LLMs' powerful understanding and reasoning capability on spreadsheets. Initially, we propose a vanilla serialization approach that incorporates cell addresses, values, and formats. However, this approach was limited by LLMs' token constraints, making it impractical for most applications. To tackle this challenge, three innovative modules are proposed to compress spreadsheets effectively: structural-anchor-based compression, inverse index translation, and data-format-aware aggregation. It significantly improves performance in the spreadsheet table detection task, outperforming the vanilla approach by 25.6% in GPT-4's in-context learning setting. Moreover, a fine-tuned LLM with SheetEncoder has an average compression ratio of 25×, yet achieves a state-of-the-art 78.9% F1 score, surpassing the best existing models by 12.3%, demonstrating that SheetEncoder greatly boosts LLMs' performance on spreadsheet data.
2024.emnlp-main.1154

Let's discuss! Quality Dimensions and Annotated Datasets for Computational Argument Quality Assessment
Rositsa V Ivanova (Universität St. Gallen)
Thomas Huber (Universität St. Gallen)
Christina Niklaus (Universität St. Gallen)
20749-20779

Automatic sentence segmentation of clinical record narratives in real-world data
Dongfang Xu (Cedars Sinai Medical Center)
Davy Weissenbacher
Karen O'Connor (University of Pennsylvania)
Siddharth Rawal (Amazon)
Graciela Gonzalez Hernandez (Cedars Sinai Medical Center)
20780-20793
Sentence segmentation is a linguistic task and is widely used as a pre-processing step in many NLP applications. The need for sentence segmentation is particularly pronounced in clinical notes, where ungrammatical and fragmented texts are common. We propose a straightforward and effective sequence labeling classifier to predict sentence spans using a dynamic sliding window based on the prediction of each input sequence. This sliding window algorithm allows our approach to segment long text sequences on the fly. To evaluate our approach, we annotated 90 clinical notes from the MIMIC-III dataset. Additionally, we tested our approach on five other datasets to assess its generalizability and compared its performance against state-of-the-art systems on these datasets. Our approach outperformed all the systems, achieving an F1 score that is 15% higher than the next best-performing system on the clinical dataset.
2024.emnlp-main.1156

<fixed-case>B</fixed-case>ayesian Example Selection Improves In-Context Learning for Speech, Text and Visual Modalities
Siyin Wang (Tsinghua University)
Chao-Han Huck Yang (NVIDIA Research)
Ji Wu
Chao Zhang (Tsinghua University and University College London)
20812-20828

Investigating Multilingual Instruction-Tuning: Do Polyglot Models Demand for Multilingual Instructions?
Alexander Arno Weber (Fraunhofer IAIS)
Klaudia Thellmann (TU Dresden)
Jan Ebert (Forschungszentrum Jülich GmbH)
Nicolas Flores-Herr (Max-Planck Institute and Fraunhofer IAIS)
Jens Lehmann (Amazon, Technische Universität Dresden, University of Bonn and Fraunhofer IAIS)
Michael Fromm (Fraunhofer IAIS)
Mehdi Ali (Fraunhofer IAIS)
20829-20855
The adaptation of multilingual pre-trained LLMs into eloquent and helpful assistants is essential to facilitate their use across different language regions. In that spirit, we are the first to conduct an extensive study of the performance of multilingual models instruction-tuned on different language compositions on parallel instruction-tuning benchmarks across a selection of the most spoken Indo-European languages. We systematically examine the effects of language and instruction dataset size on a mid-sized and a large multilingual LLM by instruction-tuning them on parallel instruction-tuning datasets. Our results demonstrate that instruction-tuning on parallel instead of monolingual corpora benefits cross-lingual instruction-following capabilities by up to 9.9%. Furthermore, we show that the Superficial Alignment Hypothesis does not hold in general, as the investigated multilingual 7B parameter model presents a counter-example requiring large-scale instruction-tuning datasets. Finally, we conduct a human annotation study to understand the alignment between human-based and GPT-4-based evaluation within multilingual chat scenarios.

Multi-<fixed-case>L</fixed-case>ogi<fixed-case>E</fixed-case>val: Towards Evaluating Multi-Step Logical Reasoning Ability of Large Language Models
Nisarg Patel
Mohith Kulkarni
Mihir Parmar
Aashna Budhiraja
Mutsumi Nakamura (Arizona State University)
Neeraj Varshney
Chitta Baral (Arizona State University)
20856-20879
As Large Language Models (LLMs) continue to exhibit remarkable performance in natural language understanding tasks, there is a crucial need to measure their ability for human-like multi-step logical reasoning. Existing logical reasoning evaluation benchmarks often focus primarily on simplistic single-step or multi-step reasoning with a limited set of inference rules. Furthermore, the lack of datasets for evaluating non-monotonic reasoning represents a crucial gap, since it aligns more closely with human-like reasoning. To address these limitations, we propose Multi-LogiEval, a comprehensive evaluation dataset encompassing multi-step logical reasoning with various inference rules and depths. Multi-LogiEval covers three logic types (propositional, first-order, and non-monotonic), consisting of more than 30 inference rules and more than 60 of their combinations with various depths. Leveraging this dataset, we conduct evaluations on a range of LLMs such as GPT-4, ChatGPT, Gemini-Pro, Orca, and Mistral, employing zero-shot chain-of-thought prompting. Experimental results show that there is a significant drop in the performance of LLMs as the reasoning steps/depth increases (average accuracy of ~68% at depth-1 to ~43% at depth-5). We further conduct a thorough investigation of the reasoning chains generated by LLMs, which reveals several important findings. We believe that Multi-LogiEval facilitates future research for evaluating and enhancing the logical reasoning ability of LLMs.
2024.emnlp-main.1160

Linear Layer Extrapolation for Fine-Grained Emotion Classification
Mayukh Sharma
Sean O'Brien (University of California, San Diego)
Julian McAuley (University of California, San Diego)
20880-20888
Certain abilities of Transformer-based language models consistently emerge in their later layers. Previous research has leveraged this phenomenon to improve factual accuracy through self-contrast, penalizing early-exit predictions based on the premise that later-layer updates are more factually reliable than earlier-layer associations. We observe a similar pattern for fine-grained emotion classification in text, demonstrating that self-contrast can enhance encoder-based text classifiers. Additionally, we reinterpret self-contrast as a form of linear extrapolation, which motivates a refined approach that dynamically adjusts the contrastive strength based on the selected intermediate layer. Experiments across multiple models and emotion classification datasets show that our method outperforms standard classification techniques in fine-grained emotion classification tasks.
2024.emnlp-main.1161

Qiang Lou
Yi Liu
Shao-Lun Huang (Tsinghua University)
Jian Jiao (Microsoft)
20889-20907
Large Language Models (LLMs) have shown superior performance in various applications and fields. To achieve better performance on specialized domains such as law and advertisement, LLMs are often continually pre-trained on in-domain data. However, existing approaches suffer from two major issues. First, in-domain data are scarce compared with general domain-agnostic data. Second, data used for continual pre-training are not task-aware, such that they may not be helpful to downstream applications. We propose TRAIT, a task-oriented in-domain data augmentation framework. Our framework is divided into two parts: in-domain data selection and task-oriented synthetic passage generation. The data selection strategy identifies and selects a large amount of in-domain data from general corpora, and thus significantly enriches domain knowledge in the continual pre-training data. The synthetic passages contain guidance on how to use domain knowledge to answer questions about downstream tasks. By training on such passages, the model aligns with the needs of downstream applications. We adapt LLMs to two domains: advertisement and math. On average, TRAIT improves LLM performance by 8% in the advertisement domain and 7.5% in the math domain.
2024.emnlp-main.1162

<fixed-case>S</fixed-case>ci<fixed-case>DQA</fixed-case>: A Deep Reading Comprehension Dataset over Scientific Papers
Shruti Singh (IIT Gandhinagar)
Nandan Sarkar
Arman Cohan (Yale University and Allen Institute for Artificial Intelligence)
20908-20923

Mixture-of-Modules: Reinventing Transformers as Dynamic Assemblies of Modules
Zhuocheng Gong
Ang Lv
Jian Guan
Wei Wu (Ant Research)
Huishuai Zhang (Peking University)
Minlie Huang
Dongyan Zhao (Peking University)
Rui Yan (Renmin University of China)
20924-20938
Is it always necessary to compute tokens from shallow to deep layers in Transformers? The continued success of vanilla Transformers and their variants suggests an undoubted "yes". In this work, however, we attempt to break the depth-ordered convention by proposing a novel architecture dubbed mixture-of-modules (MoM), which is motivated by an intuition that any layer, regardless of its position, can be used to compute a token as long as it possesses the needed processing capabilities. The construction of MoM starts from a finite set of modules defined by multi-head attention and feed-forward networks, each distinguished by its unique parameterization. Two routers then iteratively select attention modules and feed-forward modules from the set to process a token. The selection dynamically expands the computation graph in the forward pass of the token, culminating in an assembly of modules. We show that MoM provides not only a unified framework for Transformers and their numerous variants but also a flexible and learnable approach for reducing redundancy in Transformer parameterization. We pre-train various MoMs using OpenWebText. Empirical results demonstrate that MoMs, of different sizes, consistently outperform vanilla transformers. More interestingly, after removing 50% of the multi-head attention modules and 25% of the feed-forward modules, an MoM model still holds comparable performance. Additionally, by properly adjusting the number of modules and compressing the model depth, one can have an MoM that achieves comparable performance to GPT-2 (774M) while saving 16% TFLOPs and 42% memory usage during forward computation.
2024.emnlp-main.1164

No Culture Left Behind: <fixed-case>A</fixed-case>rt<fixed-case>EL</fixed-case>ingo-28, a Benchmark of <fixed-case>W</fixed-case>iki<fixed-case>A</fixed-case>rt with Captions in 28 Languages
Youssef Mohamed
Runjia Li
Ibrahim Said Ahmad (Northeastern University)
Kilichbek Haydarov (King Abdullah University of Science and Technology)
Philip Torr (University of Oxford)
Kenneth Church (Northeastern University)
Mohamed Elhoseiny (KAUST)
20939-20962
Research in vision and language has made considerable progress thanks to benchmarks such as COCO. COCO captions focused on unambiguous facts in English; ArtEmis introduced subjective emotions and ArtELingo introduced some multilinguality (Chinese and Arabic). However, we believe there should be more multilinguality. Hence, we present ArtELingo-28, a vision-language benchmark that spans 28 languages and encompasses approximately 200,000 annotations (140 annotations per image). Traditionally, vision research focused on unambiguous class labels, whereas ArtELingo-28 emphasizes diversity of opinions over languages and cultures. The challenge is to build machine learning systems that assign emotional captions to images. Baseline results will be presented for three novel conditions: Zero-Shot, Few-Shot and One-vs-All Zero-Shot. We find that cross-lingual transfer is more successful for culturally related languages. Data and code will be made publicly available.
2024.emnlp-main.1165

<fixed-case>PREDICT</fixed-case>: Multi-Agent-based Debate Simulation for Generalized Hate Speech Detection
Someen Park
Jaehoon Kim (Hanyang University)
Seungwan Jin
Sohyun Park
Kyungsik Han (Hanyang University)
20963-20987
While a few public benchmarks have been proposed for training hate speech detection models, the differences in labeling criteria between these benchmarks pose challenges for generalized learning, limiting the applicability of the models. Previous research has presented methods to generalize models through data integration or augmentation, but overcoming the differences in labeling criteria between datasets remains a limitation. To address these challenges, we propose PREDICT, a novel framework that uses a multi-agent approach for hate speech detection. PREDICT consists of two phases: (1) PRE (Perspective-based REasoning): multiple agents are created based on the induced labeling criteria of given datasets, and each agent generates stances and reasons; (2) DICT (Debate using InCongruenT references): agents representing hate and non-hate stances conduct the debate, and a judge agent classifies hate or non-hate and provides a balanced reason. Experiments on five representative public benchmarks show that PREDICT achieves superior cross-evaluation performance compared to methods that focus on specific labeling criteria or majority voting methods. Furthermore, we validate that PREDICT effectively mediates differences between agents' opinions and appropriately incorporates minority opinions to reach a consensus. Our code is available at https://github.com/Hanyang-HCC-Lab/PREDICT
2024.emnlp-main.1166

<fixed-case>T</fixed-case>oken<fixed-case>V</fixed-case>erse: Towards Unifying Speech and <fixed-case>NLP</fixed-case> Tasks via Transducer-based <fixed-case>ASR</fixed-case>
Shashi Kumar (EPFL - EPF Lausanne)
Srikanth Madikeri (University of Zurich)
Juan Pablo Zuluaga Gomez
Iuliia Thorbecke
Esaú Villatoro-tello (Idiap Research Institute)
Sergio Burdisso (Idiap Research Institute)
Petr Motlicek
Karthik Pandia DS
Aravind Ganapathiraju

<fixed-case>A</fixed-case>pi<fixed-case>Q</fixed-case>: Finetuning of 2-Bit Quantized Large Language Model
Baohao Liao
Christian Herold (Rheinisch Westfälische Technische Hochschule Aachen)
Shahram Khadivi (eBay Research)
Christof Monz (University of Amsterdam)

Bo Wang (Fudan University)
Yunhua Zhou (Shanghai Artificial Intelligence Laboratory)
Linlin Li
Qun Liu (Huawei Noah's Ark Lab)
Xipeng Qiu (Fudan University)
21021-21034
The evolution of Large Language Models (LLMs) has led to significant advancements, with models like Claude and Gemini capable of processing contexts up to 1 million tokens. However, efficiently handling long sequences remains challenging, particularly during the prefilling stage when input lengths exceed GPU memory capacity. Traditional methods often segment the sequence into chunks and compress them iteratively with fixed-size memory. However, our empirical analysis shows that fixed-size memory results in wasted computational and GPU memory resources. Therefore, we introduce Incremental Memory (IM), a method that starts with a small memory size and gradually increases it, optimizing computational efficiency. Additionally, we propose Decremental Chunk based on Incremental Memory (IMDC), which reduces chunk size while increasing memory size, ensuring stable and lower GPU memory usage. Our experiments demonstrate that IMDC is consistently faster (1.45x) and reduces GPU memory consumption by 23.3% compared to fixed-size memory, achieving comparable performance on the LongBench Benchmark.
Traditional methods often segment sequences into chunks and compress them iteratively with fixed-size memory. However, our empirical analysis shows that the fixed-size memory results in wasted computational and GPU memory resources. Therefore, we introduce Incremental Memory (IM), a method that starts with a small memory size and gradually increases it, optimizing computational efficiency. Additionally, we propose Decremental Chunk based on Incremental Memory (IMDC), which reduces chunk size while increasing memory size, ensuring stable and lower GPU memory usage. Our experiments demonstrate that IMDC is consistently faster (1.45x) and reduces GPU memory consumption by 23.3% compared to fixed-size memory, achieving comparable performance on the LongBench Benchmark.
      2024.emnlp-main.1169

      @@ -16146,7 +16146,7 @@

          <fixed-case>I</fixed-case> love pineapple on pizza != <fixed-case>I</fixed-case> hate pineapple on pizza: Stance-Aware Sentence Transformers for Opinion Mining
-          VahidGhafouriUniversidad Carlos III de Madrid and IMDEA Networks Institute
+          VahidGhafouriUniversidad Carlos III de Madrid and IMDEA Networks Institute
          JoseSuchUniversidad Politécnica de Valencia and King’s College London
          GuillermoSuarez-TangilIMDEA Networks Institute
          21046-21058

      @@ -16157,9 +16157,9 @@

          <fixed-case>B</fixed-case>ias<fixed-case>W</fixed-case>ipe: Mitigating Unintended Bias in Text Classifiers through Model Interpretability
-          MamtaMamtaIndian Institute of Technology, Patna
+          MamtaMamtaIndian Institute of Technology, Patna
          RishikantChigrupaatii
-          AsifEkbalIndian Institute of Technology, Jodhpur
+          AsifEkbalIndian Institute of Technology, Jodhpur
          21059-21070
          Toxic content detection plays a vital role in addressing the misuse of social media platforms to harm people or groups due to their race, gender or ethnicity. However, due to the nature of the datasets, systems develop an unintended bias due to the over-generalization of the model to the training data. This compromises the fairness of the systems, which can impact certain groups due to their race, gender, etc. Existing methods mitigate bias using data augmentation, adversarial learning, etc., which require re-training and adding extra parameters to the model. In this work, we present a robust and generalizable technique, BiasWipe, to mitigate unintended bias in language models. BiasWipe utilizes model interpretability using Shapley values, which achieves fairness by pruning the neuron weights responsible for unintended bias. It first identifies the neuron weights responsible for unintended bias and then achieves fairness by pruning them without loss of original performance. It does not require re-training or adding extra parameters to the model. To show the effectiveness of our proposed technique for bias unlearning, we perform extensive experiments on toxic content detection for BERT, RoBERTa, and GPT models.
          2024.emnlp-main.1172

      @@ -16168,11 +16168,11 @@

          <fixed-case>A</fixed-case>r<fixed-case>M</fixed-case>eme: Propagandistic Content in <fixed-case>A</fixed-case>rabic Memes
-          FirojAlamQatar Computing Research Institute
-          AbulHasnat
+          FirojAlamQatar Computing Research Institute
+          AbulHasnat
          FatemaAhmadHamad Bin Khalifa University
-          Md. AridHasanUniversity of New Brunswick
-          MaramHasanainQatar Computing Research Institute
+          Md. AridHasanUniversity of New Brunswick
+          MaramHasanainQatar Computing Research Institute
          21071-21090
          With the rise of digital communication, memes have become a significant medium for cultural and political expression that is often used to mislead audiences. Identification of such misleading and persuasive multimodal content has become more important among various stakeholders, including social media platforms, policymakers, and the broader society, as it often causes harm to individuals, organizations, and/or society. While there have been efforts to develop AI-based automatic systems for resource-rich languages (e.g., English), relatively little to none exist for medium- to low-resource languages. In this study, we focused on developing an Arabic memes dataset with manual annotations of propagandistic content. We annotated ~6K Arabic memes collected from various social media platforms, which is a first resource for Arabic multimodal research. We provide a comprehensive analysis aiming to develop computational tools for their detection. We made the dataset publicly available for the community.
          2024.emnlp-main.1173

      @@ -16184,10 +16184,10 @@

          Language is Scary when Over-Analyzed: Unpacking Implied Misogynistic Reasoning with Argumentation Theory-Driven Prompts
          AriannaMuti
-          FedericoRuggeriUniversity of Bologna
-          Khalid AlKhatibUniversity of Groningen
-          AlbertoBarrón-CedeñoUniversità di Bologna
-          TommasoCaselliUniversity of Groningen
+          FedericoRuggeriUniversity of Bologna
+          Khalid AlKhatibUniversity of Groningen
+          AlbertoBarrón-CedeñoUniversità di Bologna
+          TommasoCaselliUniversity of Groningen
          21091-21107
          We propose misogyny detection as an Argumentative Reasoning task and we investigate the capacity of large language models (LLMs) to understand the implicit reasoning used to convey misogyny in both Italian and English. The central aim is to generate the missing reasoning link between a message and the implied meanings encoding the misogyny. Our study uses argumentation theory as a foundation to form a collection of prompts in both zero-shot and few-shot settings. These prompts integrate different techniques, including chain-of-thought reasoning and augmented knowledge. Our findings show that LLMs fall short on reasoning capabilities about misogynistic comments and that they mostly rely on their implicit knowledge derived from internalized common stereotypes about women to generate implied assumptions, rather than on inductive reasoning.
          2024.emnlp-main.1174

          Thoughts to Target: Enhance Planning for Target-driven Conversation
-          ZhonghuaZheng
+          ZhonghuaZheng
          LiziLiaoSingapore Management University
          YangDengSingapore Management University
-          Ee-PengLimSingapore Management University
+          Ee-PengLimSingapore Management University
          MinlieHuang
-          LiqiangNieHarbin Institute of Technology (Shenzhen)
+          LiqiangNieHarbin Institute of Technology (Shenzhen)
          21108-21124
          In conversational AI, large-scale models excel in various tasks but struggle with target-driven conversation planning. Current methods, such as chain-of-thought reasoning and tree-search policy learning techniques, either neglect plan rationality or require extensive human simulation procedures. Addressing this, we propose a novel two-stage framework, named EnPL, to improve the LLMs’ capability in planning conversations towards designated targets, including (1) distilling natural language plans from a target-driven conversation corpus and (2) generating new plans with demonstration-guided in-context learning.
Specifically, we first propose a filter approach to distill a high-quality plan dataset, ConvPlan (Resources of this paper can be found at https://github.com/pandazzh2020/ConvPlan). With the aid of corresponding conversational data and support from relevant knowledge bases, we validate the quality and rationality of these plans. Then, these plans are leveraged to help guide LLMs to further plan for new targets. Empirical results demonstrate that our method significantly improves the planning ability of LLMs, especially in target-driven conversations. Furthermore, EnPL is demonstrated to be quite effective in collecting target-driven conversation datasets and enhancing response generation, paving the way for constructing extensive target-driven conversational models. 2024.emnlp-main.1175 @@ -16211,13 +16211,13 @@ Scalable Data Ablation Approximations for Language Models through Modular Training and Merging - ClaraNaCarnegie Mellon University + ClaraNaCarnegie Mellon University IanMagnusson Ananya HarshJhaAllen Institute for Artificial Intelligence - TomSherborneCohere + TomSherborneCohere EmmaStrubellAllen Institute for Artificial Intelligence and Carnegie Mellon University JesseDodgeAllen Institute for Artificial Intelligence - PradeepDasigiAllen Institute for Artificial Intelligence + PradeepDasigiAllen Institute for Artificial Intelligence 21125-21141 Training data compositions for Large Language Models (LLMs) can significantly affect their downstream performance. However, a thorough data ablation study exploring large sets of candidate data mixtures is typically prohibitively expensive since the full effect is seen only after training the models; this can lead practitioners to settle for sub-optimal data mixtures. We propose an efficient method for approximating data ablations which trains individual models on subsets of a training corpus and reuses them across evaluations of combinations of subsets.In continued pre-training experiments, we find that, given an arbitrary evaluation set, the perplexity score of a single model trained on a candidate set of data is strongly correlated with perplexity scores of parameter averages of models trained on distinct partitions of that data. From this finding, we posit that researchers and practitioners can conduct inexpensive simulations of data ablations by maintaining a pool of models that were each trained on partitions of a large training corpus, and assessing candidate data mixtures by evaluating parameter averages of combinations of these models. This approach allows for substantial improvements in amortized training efficiency – scaling only linearly with respect to new data – by enabling reuse of previous training computation, opening new avenues for improving model performance through rigorous, incremental data assessment and mixing. 2024.emnlp-main.1176 @@ -16228,8 +16228,8 @@ Exploring Intrinsic Language-specific Subspaces in Fine-tuning Multilingual Neural Machine Translation ZheCaoNara Institute of Science and Technology, Japan ZhiQuNara Institute of Science and Technology, Japan and National Institute of Information and Communications Technology (NICT) - HidetakaKamigaitoNara Institute of Science and Technology - TaroWatanabeNara Institute of Science and Technology, Japan + HidetakaKamigaitoNara Institute of Science and Technology + TaroWatanabeNara Institute of Science and Technology, Japan 21142-21157 Multilingual neural machine translation models support fine-tuning hundreds of languages simultaneously. 
However, fine-tuning on full parameters alone is inefficient, potentially leading to negative interactions among languages. In this work, we demonstrate that the fine-tuning for a language occurs in its intrinsic language-specific subspace with a tiny fraction of the entire parameters. Thus, we propose language-specific LoRA to isolate intrinsic language-specific subspaces. Furthermore, we propose architecture learning techniques and introduce a gradual pruning schedule during fine-tuning to exhaustively explore the optimal setting and the minimal intrinsic subspaces for each language, resulting in a lightweight yet effective fine-tuning procedure. The experimental results on a 12-language subset and a 30-language subset of FLORES-101 show that our methods not only outperform full-parameter fine-tuning up to 2.25 spBLEU scores but also reduce trainable parameters to 0.4% for high and medium-resource languages and 1.6% for low-resource ones.
      2024.emnlp-main.1177

          Attention Score is not All You Need for Token Importance Indicator in <fixed-case>KV</fixed-case> Cache Reduction: Value Also Matters
          ZhiyuGuo
-          HidetakaKamigaitoNara Institute of Science and Technology
-          TaroWatanabeNara Institute of Science and Technology, Japan
+          HidetakaKamigaitoNara Institute of Science and Technology
+          TaroWatanabeNara Institute of Science and Technology, Japan
          21158-21166
          2024.emnlp-main.1178
          guo-etal-2024-attention

          JinyoungParkKorea University
          MinseokJooKorea University
          Joo-KyungKimAmazon AGI
-          Hyunwoo J.KimKorea University
+          Hyunwoo J.KimKorea University
          21167-21182
          Knowledge graph–grounded dialog generation requires retrieving a dialog-relevant subgraph from the given knowledge base graph and integrating it with the dialog history. Previous works typically represent the graph using an external encoder, such as graph neural networks, and retrieve relevant triplets based on the similarity between single-vector representations of triplets and the dialog history. However, these external encoders fail to leverage the rich knowledge of pretrained language models, and the retrieval process is also suboptimal due to the information bottleneck caused by the single-vector abstraction of the dialog history. In this work, we propose Dialog generation with Generative Subgraph Retrieval (DialogGSR), which retrieves relevant knowledge subgraphs by directly generating their token sequences on top of language models. For effective generative subgraph retrieval, we introduce two key methods: (i) structure-aware knowledge graph linearization with self-supervised graph-specific tokens and (ii) graph-constrained decoding utilizing graph structural proximity-based entity informativeness scores for valid and relevant generative retrieval. DialogGSR achieves state-of-the-art performance in knowledge graph–grounded dialog generation, as demonstrated on OpenDialKG and KOMODIS datasets.
          2024.emnlp-main.1179

          Adapters Mixup: Mixing Parameter-Efficient Adapters to Enhance the Adversarial Robustness of Fine-tuned Pre-trained Text Classifiers
          Tuc VanNguyen
-          ThaiLeIndiana University
+          ThaiLeIndiana University
          21183-21203
          Existing works show that augmenting the training data of pre-trained language models (PLMs) for classification tasks fine-tuned via parameter-efficient fine-tuning methods (PEFT) using both clean and adversarial examples can enhance their robustness under adversarial attacks.
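The language-specific subspace idea in 2024.emnlp-main.1177 above amounts to giving every language its own low-rank adapter around a frozen shared weight. A minimal sketch, assuming a plain linear layer, an illustrative rank of 8, and invented class and parameter names (the paper additionally learns the architecture and prunes the adapters):

import torch
import torch.nn as nn

# Language-specific LoRA sketch: the base weight is shared and frozen, while
# each language trains only its own low-rank A/B pair (its "subspace").
class LangLoRALinear(nn.Module):
    def __init__(self, d_in, d_out, languages, rank=8, alpha=16.0):
        super().__init__()
        self.base = nn.Linear(d_in, d_out)
        self.base.weight.requires_grad_(False)   # freeze shared weights
        self.base.bias.requires_grad_(False)
        self.scale = alpha / rank
        self.A = nn.ParameterDict(
            {lang: nn.Parameter(torch.randn(rank, d_in) * 0.01) for lang in languages}
        )
        self.B = nn.ParameterDict(
            {lang: nn.Parameter(torch.zeros(d_out, rank)) for lang in languages}
        )

    def forward(self, x, lang):
        delta = x @ self.A[lang].T @ self.B[lang].T   # low-rank update
        return self.base(x) + self.scale * delta

layer = LangLoRALinear(64, 64, languages=["de", "sw"])
print(layer(torch.randn(4, 64), lang="sw").shape)  # torch.Size([4, 64])

Because B starts at zero, every language initially reproduces the shared model exactly and only gradually carves out its own subspace.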
However, this adversarial training paradigm often leads to performance degradation on clean inputs and requires frequent re-training on the entire data to account for new, unknown attacks. To overcome these challenges while still harnessing the benefits of adversarial training and the efficiency of PEFT, this work proposes a novel approach, called AdpMixup, that combines two paradigms: (1) fine-tuning through adapters and (2) adversarial augmentation via mixup to dynamically leverage existing knowledge from a set of pre-known attacks for robust inference. Intuitively, AdpMixup fine-tunes PLMs with multiple adapters with both clean and pre-known adversarial examples and intelligently mixes them up in different ratios during prediction. Our experiments show AdpMixup achieves the best trade-off between training efficiency and robustness under both pre-known and unknown attacks, compared to existing baselines on five downstream tasks across six varied black-box attacks and 2 PLMs. The code is available at https://github.com/nguyentuc/adapters_mixup. 2024.emnlp-main.1180 @@ -16272,7 +16272,7 @@ Generalizing Clinical De-identification Models by Privacy-safe Data Augmentation using <fixed-case>GPT</fixed-case>-4 WoojinKimKorean National Police Agency SungeunHahm - JaejinLeeSeoul National University + JaejinLeeSeoul National University 21204-21218 De-identification (de-ID) refers to removing the association between a set of identifying data and the data subject. In clinical data management, the de-ID of Protected Health Information (PHI) is critical for patient confidentiality. However, state-of-the-art de-ID models show poor generalization on a new dataset. This is due to the difficulty of retaining training corpora. Additionally, labeling standards and the formats of patient records vary across different institutions. Our study addresses these issues by exploiting GPT-4 for data augmentation through one-shot and zero-shot prompts. Our approach effectively circumvents the problem of PHI leakage, ensuring privacy by redacting PHI before processing. To evaluate the effectiveness of our proposal, we conduct cross-dataset testing. The experimental result demonstrates significant improvements across three types of F1 scores. 2024.emnlp-main.1181 @@ -16297,15 +16297,15 @@ <fixed-case>G</fixed-case>ott<fixed-case>BERT</fixed-case>: a pure <fixed-case>G</fixed-case>erman Language Model - RaphaelScheibleTechnische Universität München - JohannFreiUniversität Augsburg + RaphaelScheibleTechnische Universität München + JohannFreiUniversität Augsburg FabianThomczykMedical Center - University of Freiburg - HenryHeTechnische Universität München - PatricTippmannAlbert-Ludwigs-Universität Freiburg - JochenKnaus + HenryHeTechnische Universität München + PatricTippmannAlbert-Ludwigs-Universität Freiburg + JochenKnaus VictorJaravine - FrankKramerUniversity of Augsburg, Universität Augsburg - MartinBoekerTechnische Universität München + FrankKramerUniversity of Augsburg, Universität Augsburg + MartinBoekerTechnische Universität München 21237-21250 2024.emnlp-main.1183 scheible-etal-2024-gottbert @@ -16314,7 +16314,7 @@ Computational Meme Understanding: A Survey - Khoi P. N.Nguyen + Khoi P. N.Nguyen VincentNgUniversity of Texas at Dallas 21251-21267 Computational Meme Understanding, which concerns the automated comprehension of memes, has garnered interest over the last four years and is facing both substantial opportunities and challenges. 
We survey this emerging area of research by first introducing a comprehensive taxonomy for memes along three dimensions – forms, functions, and topics. Next, we present three key tasks in Computational Meme Understanding, namely, classification, interpretation, and explanation, and conduct a comprehensive review of existing datasets and models, discussing their limitations. Finally, we highlight the key challenges and recommend avenues for future work. @@ -16327,9 +16327,9 @@ CostasMavromatis BalasubramaniamSrinivasanAmazon ZhengyuanShenAmazon - JianiZhangAmazon Web Services + JianiZhangAmazon Web Services HuzefaRangwalaAmazon and Computer Science, George Mason University - ChristosFaloutsosAmazon and Carnegie Mellon University + ChristosFaloutsosAmazon and Carnegie Mellon University GeorgeKarypisUniversity of Minnesota, Minneapolis 21268-21286 In-context learning (ICL) adapts Large Language Models (LLMs) to new tasks, without requiring any parameter updates, but few annotated examples as input. In this work, we investigate selective annotation for ICL, where there is a limited budget for annotating examples, similar to low-budget active learning (AL). Although uncertainty-based selection is unreliable with few annotated data, we present CoverICL, an adaptive graph-based selection algorithm, that effectively incorporates uncertainty sampling into selective annotation for ICL. First, CoverICL builds a nearest-neighbor graph based on the semantic similarity between candidate ICL examples. Then, CoverICL employs uncertainty estimation by the LLM to identify hard examples for the task. Selective annotation is performed over the active graph of the hard examples, adapting the process to the particular LLM used and the task tackled. CoverICL selects the most representative examples by solving a Maximum Coverage problem, approximating diversity-based sampling. Extensive experiments on ten datasets and seven LLMs show that, by incorporating uncertainty via coverage on the active graph, CoverICL (1) outperforms existing AL methods for ICL by 2–4.6% accuracy points, (2) is up to 2x more budget-efficient than SOTA methods for low-budget AL, and (3) generalizes better across tasks compared to non-graph alternatives. @@ -16340,8 +16340,8 @@ Retrieval-enriched zero-shot image classification in low-resource domains - NicolaDall’Asen - YimingWangFondazione Bruno Kessler + NicolaDall’Asen + YimingWangFondazione Bruno Kessler EnricoFiniApple ElisaRicciUniversity of Trento and University of Perugia 21287-21302 @@ -16352,13 +16352,13 @@ <fixed-case>I</fixed-case>-<fixed-case>AM</fixed-case>-<fixed-case>G</fixed-case>: Interest Augmented Multimodal Generator for Item Personalization - XianquanWang + XianquanWang LikangWu ShukangYinUniversity of Science and Technology of China - ZhiLiShenzhen International Graduate School, Tsinghua University - YanjiangChen + ZhiLiShenzhen International Graduate School, Tsinghua University + YanjiangChen HufengHufeng - YuSu + YuSu QiLiuUniversity of Science and Technology of China 21303-21317 The emergence of personalized generation has made it possible to create texts or images that meet the unique needs of users. Recent advances mainly focus on style or scene transfer based on given keywords. 
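The selection step in the CoverICL abstract above reduces to a Maximum Coverage problem over a nearest-neighbor graph, which the classic greedy approximation handles well. A small self-contained sketch with toy data; the neighborhood sets and "hard example" labels are stand-ins for the paper's semantic k-NN graph and LLM uncertainty estimates:

# Greedy maximum-coverage selection: pick a budget of candidates so that their
# k-NN neighborhoods cover as many hard examples as possible (illustrative).
def greedy_max_coverage(neighbors, hard, budget):
    """neighbors[i]: set of node ids adjacent to candidate i (incl. itself);
    hard: set of node ids flagged as hard; budget: number of annotations."""
    covered, picked = set(), []
    for _ in range(budget):
        best = max(
            (i for i in range(len(neighbors)) if i not in picked),
            key=lambda i: len((neighbors[i] & hard) - covered),
            default=None,
        )
        if best is None:
            break
        picked.append(best)
        covered |= neighbors[best] & hard
    return picked

neighbors = [{0, 1, 2}, {1, 3}, {2, 4, 5}, {3, 5}]
print(greedy_max_coverage(neighbors, hard={1, 2, 3, 4, 5}, budget=2))  # [2, 1]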
However, in e-commerce and recommender systems, it is almost an untouched area to explore user historical interactions, automatically mine user interests with semantic associations, and create item representations that closely align with user individual interests.In this paper, we propose a brand new framework called **I**nterest-**A**ugmented **M**ultimodal **G**enerator (**I-AM-G**). The framework first extracts tags from the multimodal information of items that the user has interacted with, and the most frequently occurred ones are extracted to rewrite the text description of the item. Then, the framework uses a decoupled text-to-text and image-to-image retriever to search for the top-K similar item text and image embeddings from the item pool. Finally, the Attention module for user interests fuses the retrieved information in a cross-modal manner and further guides the personalized generation process in collaboration with the rewritten text.We conducted extensive and comprehensive experiments to demonstrate that our framework can effectively generate results aligned with user preferences, which potentially provides a new paradigm of **Rewrite and Retrieve** for personalized generation. @@ -16369,9 +16369,9 @@ Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps GiuseppeAttanasioInstituto de Telecomunicações - BeatriceSavoldi - DennisFucci - DirkHovyBocconi University + BeatriceSavoldi + DennisFucci + DirkHovyBocconi University 21318-21340 Current automatic speech recognition (ASR) models are designed to be used across many languages and tasks without substantial changes. However, this broad language coverage hides performance gaps within languages, for example, across genders. Our study systematically evaluates the performance of two widely used multilingual ASR models on three datasets, encompassing 19 languages from eight language families and two speaking conditions. Our findings reveal clear gender disparities, with the advantaged group varying across languages and models. Surprisingly, those gaps are not explained by acoustic or lexical properties. However, probing internal model states reveals a correlation with gendered performance gap. That is, the easier it is to distinguish speaker gender in a language using probes, the more the gap reduces, favoring female speakers. Our results show that gender disparities persist even in state-of-the-art models. Our findings have implications for the improvement of multilingual ASR systems, underscoring the importance of accessibility to training data and nuanced evaluation to predict and mitigate gender gaps. We release all code and artifacts at https://github.com/g8a9/multilingual-asr-gender-gap. 
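The probing analysis mentioned in the multilingual ASR gender-gap abstract above boils down to asking how well a linear classifier can separate speaker gender from frozen encoder states. A minimal scikit-learn sketch; the random features are placeholders for real pooled encoder outputs, which is an assumption of this example:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Linear gender probe on (placeholder) ASR encoder states: higher cross-validated
# accuracy means gender is more linearly separable in that language's states.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 256))    # stand-in for pooled encoder states
y = rng.integers(0, 2, size=200)   # stand-in for speaker gender labels
probe = LogisticRegression(max_iter=1000)
acc = cross_val_score(probe, X, y, cv=5, scoring="accuracy").mean()
print(f"gender probe accuracy: {acc:.2f}")  # ~0.5 on random features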
2024.emnlp-main.1188 @@ -16393,12 +16393,12 @@ Contrastive Policy Gradient: Aligning <fixed-case>LLM</fixed-case>s on sequence-level scores in a supervised-friendly fashion YannisFlet-BerliacStanford University - NathanGrinsztajnCohere + NathanGrinsztajnCohere FlorianStrubDeepMind EugeneChoiCohere BillWuCohere ChrisCremer - ArashAhmadian + ArashAhmadian YashChandakComputer Science Department, Stanford University Mohammad GheshlaghiAzarCohere AI OlivierPietquinCohere and Earth Species Project @@ -16411,7 +16411,7 @@ Show and Guide: Instructional-Plan Grounded Vision and Language Model - DiogoGlória-SilvaUniversidade NOVA de Lisboa + DiogoGlória-SilvaUniversidade NOVA de Lisboa DavidSemedoUniversidade NOVA de Lisboa and Universidade NOVA de Lisboa JoaoMagalhaesUniversidade Nova de Lisboa 21371-21389 @@ -16422,11 +16422,11 @@ Beyond Turn-Based Interfaces: Synchronous <fixed-case>LLM</fixed-case>s as Full-Duplex Dialogue Agents - BandhavVeluriUniversity of Washington, Seattle + BandhavVeluriUniversity of Washington, Seattle Benjamin NPeloquinFacebook BokaiYuMeta AI HongyuGongFAIR at Meta - ShyamnathGollakotaDepartment of Computer Science, University of Washington + ShyamnathGollakotaDepartment of Computer Science, University of Washington 21390-21402 Despite broad interest in modeling spoken dialogue agents, most approaches are inherently “half-duplex” – restricted to turn-based interaction with responses requiring explicit prompting by the user or implicit tracking of interruption or silence events. Human dialogue, by contrast, is “full-duplex” allowing for rich synchronicity in the form of quick and dynamic turn-taking, overlapping speech, and backchanneling. Technically, the challenge of achieving full-duplex dialogue with LLMs lies in modeling synchrony as pre-trained LLMs do not have a sense of “time”. To bridge this gap, we propose Synchronous LLMs for full-duplex spoken dialogue modeling. We design a novel mechanism to integrate time information into Llama3-8b so that they run synchronously with the real-world clock. We also introduce a training recipe that uses 212k hours of synthetic spoken dialogue data generated from text dialogue data to create a model that generates meaningful and natural spoken dialogue, with just 2k hours of real-world spoken dialogue data. Synchronous LLMs outperform state-of-the-art in dialogue meaningfulness while maintaining naturalness. Finally, we demonstrate the model’s ability to participate in full-duplex dialogue by simulating interaction between two agents trained on different datasets, while considering Internet-scale latencies of up to 240 ms. 2024.emnlp-main.1192 @@ -16452,7 +16452,7 @@ ChanwoongYoonKorea University TaewhooLeeKorea University HyeonHwangKorea University - MinbyulJeongUpstage + MinbyulJeongUpstage JaewooKangKorea University 21424-21439 Retrieval-augmented generation supports language models to strengthen their factual groundings by providing external contexts. However, language models often face challenges when given extensive information, diminishing their effectiveness in solving questions. Context compression tackles this issue by filtering out irrelevant information, but current methods still struggle in realistic scenarios where crucial information cannot be captured with a single-step approach. To overcome this limitation, we introduce CompAct, a novel framework that employs an active strategy to condense extensive documents without losing key information. 
Our experiments demonstrate that CompAct brings significant improvements in both performance and compression rate on multi-hop question-answering benchmarks. CompAct flexibly operates as a cost-efficient plug-in module with various off-the-shelf retrievers or readers, achieving exceptionally high compression rates (47x). @@ -16489,7 +16489,7 @@ Local Contrastive Editing of Gender Stereotypes - MarleneLutzUniversität Mannheim + MarleneLutzUniversität Mannheim RochelleChoenni MarkusStrohmaierUniversität Mannheim AnneLauscherUniversität Hamburg @@ -16505,10 +16505,10 @@ Nicole CornehlLima Santiago PedrozaDiaz Amogh ManojJoshi - SiddharthBetala + SiddharthBetala Jamiu TundeSuleimanMachine Learning Collective YashMathur - Kaushal KumarPrajapatiNational Institute of Technology + Kaushal KumarPrajapatiNational Institute of Technology RamlaAlakraaUniversity of Michigan - Ann Arbor JunjieShen TemiOkotore @@ -16521,11 +16521,11 @@ <fixed-case>RAR</fixed-case>: Retrieval-augmented retrieval for code generation in low resource languages - AvikDuttaMicrosoft - MukulSinghMicrosoft + AvikDuttaMicrosoft + MukulSinghMicrosoft GustVerbruggenMicrosoft - SumitGulwaniResearch, Microsoft - VuLeMicrosoft + SumitGulwaniResearch, Microsoft + VuLeMicrosoft 21506-21515 Language models struggle in generating code for low-resource programming languages, since these are underrepresented in training data. Either examples or documentation are commonly used for improved code generation. We propose to use both types of information together and present retrieval augmented retrieval (RAR) as a two-step method for selecting relevant examples and documentation. Experiments on three low-resource languages (Power Query M, OfficeScript and Excel formulas) show that RAR outperforms independently example and grammar retrieval (+2.81–26.14%). Interestingly, we show that two-step retrieval selects better examples and documentation when used independently as well. 2024.emnlp-main.1199 @@ -16538,12 +16538,12 @@ <fixed-case>STAR</fixed-case>: <fixed-case>S</fixed-case>ocio<fixed-case>T</fixed-case>echnical Approach to Red Teaming Language Models LauraWeidingerDeepMind John F JMellor - Bernat GuillénPeguerolesGoogle - NahemaMarchalGoogle + Bernat GuillénPeguerolesGoogle + NahemaMarchalGoogle RavinKumar KristianLumTwitter CanferAkbulutGoogle DeepMind - MarkDiazGoogle + MarkDiazGoogle A. StevieBergmanGoogle Mikel D.Rodriguez VerenaRieserGoogle DeepMind @@ -16558,8 +16558,8 @@ Do great minds think alike? 
Investigating Human-<fixed-case>AI</fixed-case> Complementarity in Question Answering with <fixed-case>CAIMIRA</fixed-case>
          MaharshiGorUniversity of Maryland, College Park
          HalDaumé IiiUniversity of Maryland - College Park, University of Maryland, College Park and Microsoft
-          TianyiZhouUniversity of Maryland, College Park
-          Jordan LeeBoyd-GraberUniversity of Maryland, College Park
+          TianyiZhouUniversity of Maryland, College Park
+          Jordan LeeBoyd-GraberUniversity of Maryland, College Park
          21533-21564
          Recent advancements of large language models (LLMs) have led to claims of AI surpassing humans in natural language processing (NLP) tasks such as textual understanding and reasoning. This work investigates these assertions by introducing CAIMIRA, a novel framework rooted in item response theory (IRT) that enables quantitative assessment and comparison of problem-solving abilities in question-answering (QA) agents. Through analysis of over 300,000 responses from ~70 AI systems and 155 humans across thousands of quiz questions, CAIMIRA uncovers distinct proficiency patterns in knowledge domains and reasoning skills. Humans outperform AI systems in knowledge-grounded abductive and conceptual reasoning, while state-of-the-art LLMs like GPT-4 Turbo and Llama-3-70B demonstrate superior performance on targeted information retrieval and fact-based reasoning, particularly when information gaps are well-defined and addressable through pattern matching or data retrieval. These findings identify key areas for future QA tasks and model development, highlighting the critical need for questions that not only challenge higher-order reasoning and scientific thinking, but also demand nuanced linguistic and cross-contextual application.
          2024.emnlp-main.1201

          Memory-Efficient Fine-Tuning of Transformers via Token Selection
-          AntoineSimoulinMeta AI
+          AntoineSimoulinMeta AI
          NamyongParkMeta AI
          XiaoyiLiuNorthwestern University
          GreyYangMeta Platforms, Inc
          21565-21580
          2024.emnlp-main.1202

          Unveiling the mystery of visual attributes of concrete and abstract concepts: Variability, nearest neighbors, and challenging categories
          TarunTaterUniversität Stuttgart
-          SabineSchulte Im WaldeUniversity of Stuttgart
-          DiegoFrassinelliLudwig-Maximilians-Universität München
+          SabineSchulte Im WaldeUniversity of Stuttgart
+          DiegoFrassinelliLudwig-Maximilians-Universität München
          21581-21597
          The visual representation of a concept varies significantly depending on its meaning and the context where it occurs; this poses multiple challenges both for vision and multimodal models. Our study focuses on concreteness, a well-researched lexical-semantic variable, using it as a case study to examine the variability in visual representations. We rely on images associated with approximately 1,000 abstract and concrete concepts extracted from two different datasets: Bing and YFCC. Our goals are: (i) evaluate whether visual diversity in the depiction of concepts can reliably distinguish between concrete and abstract concepts; (ii) analyze the variability of visual features across multiple images of the same concept through a nearest neighbor analysis; and (iii) identify challenging factors contributing to this variability by categorizing and annotating images. Our findings indicate that for classifying images of abstract versus concrete concepts, a combination of basic visual features such as color and texture is more effective than features extracted by more complex models like Vision Transformer (ViT).
However, ViTs show better performances in the nearest neighbor analysis, emphasizing the need for a careful selection of visual features when analyzing conceptual variables through modalities other than text. 2024.emnlp-main.1203 @@ -16592,10 +16592,10 @@ Evaluating Large Language Models on Time Series Feature Understanding: A Comprehensive Taxonomy and Benchmark ElizabethFonsJ.P. Morgan Chase - RachneetKaurJ.P. Morgan Chase + RachneetKaurJ.P. Morgan Chase SohamPalandeJ.P. Morgan Chase - ZhenZengJ.P. Morgan Chase - TuckerBalchJ.P. Morgan Chase + ZhenZengJ.P. Morgan Chase + TuckerBalchJ.P. Morgan Chase ManuelaVelosoSchool of Computer Science, Carnegie Mellon University SvitlanaVyetrenkoJ.P. Morgan Chase 21598-21634 @@ -16607,13 +16607,13 @@ Can <fixed-case>LLM</fixed-case>s Learn Uncertainty on Their Own? Expressing Uncertainty Effectively in A Self-Training Manner - ShudongLiuUniversity of Macau + ShudongLiuUniversity of Macau ZhaocongLi XueboLiuHarbin Institute of Technolgy, Shenzhen RunzheZhanUniversity of Macau - Derek F.WongUniversity of Macau + Derek F.WongUniversity of Macau Lidia S.Chao - MinZhangHarbin Institute of Technology + MinZhangHarbin Institute of Technology 21635-21645 Large language models (LLMs) often exhibit excessive, random, and uninformative uncertainty, rendering them unsuitable for decision-making in human-computer interactions. In this paper, we aim to instigate a heightened awareness of self-uncertainty in LLMs, enabling them to express uncertainty more effectively. To accomplish this, we propose an uncertainty-aware instruction tuning (UaIT) method, aligning LLMs’ perception with the probabilistic uncertainty of the generation. We conducted experiments using LLaMA2 and Mistral on multiple free-form QA tasks. Experimental results revealed a surprising 45.2% improvement in the effectiveness of uncertainty expression by LLMs, accompanied by reasonably good out-of-domain generalization capabilities. Moreover, this uncertainty expression can serve as a valuable real-time basis for human decision-making, e.g., retrieving external documents and incorporating stronger LLMs. 2024.emnlp-main.1205 @@ -16633,9 +16633,9 @@ Metrics for What, Metrics for Whom: Assessing Actionability of Bias Evaluation Metrics in <fixed-case>NLP</fixed-case> - PieterDelobelleKU Leuven, KU Leuven + PieterDelobelleKU Leuven, KU Leuven GiuseppeAttanasioInstituto de Telecomunicações - DeboraNozzaBocconi University + DeboraNozzaBocconi University Su LinBlodgettMicrosoft ZeerakTalatMohamed bin Zayed University of Artificial Intelligence 21669-21691 @@ -16649,7 +16649,7 @@ XuhuiZhou ZheSu TiwalayoEisapeMassachusetts Institute of Technology - HyunwooKimAllen Institute for Artificial Intelligence + HyunwooKimAllen Institute for Artificial Intelligence MaartenSapCarnegie Mellon University 21692-21714 Recent advances in large language models (LLM) have enabled richer social simulations, allowing for the study of various social phenomena. However, most recent work has used a more omniscient perspective on these simulations (e.g., single LLM to generate all interlocutors), which is fundamentally at odds with the non-omniscient, information asymmetric interactions that involve humans and AI agents in the real world. To examine these differences, we develop an evaluation framework to simulate social interactions with LLMs in various settings (omniscient, non-omniscient). 
Our experiments show that LLMs perform better in unrealistic, omniscient simulation settings but struggle in ones that more accurately reflect real-world conditions with information asymmetry. Moreover, we illustrate the limitations inherent in learning from omniscient simulations. Our findings indicate that addressing information asymmetry remains a fundamental challenge for LLM-based agents. @@ -16661,7 +16661,7 @@ A Simple <fixed-case>LLM</fixed-case> Framework for Long-Range Video Question-Answering CeZhangUNC-Chapel Hill TaixiLu - Md MohaiminulIslamUniversity of North Carolina at Chapel Hill + Md MohaiminulIslamUniversity of North Carolina at Chapel Hill ZiyangWangDepartment of Computer Science, University of North Carolina at Chapel Hill ShoubinYu MohitBansalUniversity of North Carolina at Chapel Hill @@ -16686,33 +16686,33 @@ <fixed-case>C</fixed-case>asablanca: Data and Models for Multidialectal <fixed-case>A</fixed-case>rabic Speech Recognition - BasharTalafha - KarimaKadaouiMohamed bin Zayed University of Artificial Intelligence - Samar MohamedMagdyMohamed bin Zayed University of Artificial Intelligence + BasharTalafha + KarimaKadaouiMohamed bin Zayed University of Artificial Intelligence + Samar MohamedMagdyMohamed bin Zayed University of Artificial Intelligence MariemHabiboullahMohamed bin Zayed University of Artificial Intelligence Chafei MohamedChafei Ahmed OumarEl-Shangiti HibaZayedBirzeit University - Mohamedou CheikhTouradUniversité de Nouakchott - RahafAlhamouriJordan University of Science and Technology + Mohamedou CheikhTouradUniversité de Nouakchott + RahafAlhamouriJordan University of Science and Technology RwaaAssi AishaAlraeesi HourMohamed FakhraddinAlwajih - AbdelrahmanMohamed + AbdelrahmanMohamed AbdellahEl MekkiMohamed bin Zayed University of Artificial Intelligence El Moatez BillahNagoudiUniversity of British Columbia Benelhadj Djelloul MamaSaadiaUniversité des Sciences et de la Technologie Houari Boumediène - Hamzah A.Alsayadi - WalidAl-Dhabyani + Hamzah A.Alsayadi + WalidAl-Dhabyani SaraShatnawi YasirEch-chammakhy AmalMakouar YousraBerrachedi - MustafaJarrarBirzeit University - ShadyShehataMohamed bin Zayed University of Artificial Intelligence + MustafaJarrarBirzeit University + ShadyShehataMohamed bin Zayed University of Artificial Intelligence IsmailBerradaMohammed VI Polytechnic University - MuhammadAbdul-MageedUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia 21745-21758 In spite of the recent progress in speech processing, the majority of world languages and dialects remain uncovered. This situation only furthers an already wide technological divide, thereby hindering technological and socioeconomic inclusion. This challenge is largely due to the absence of datasets that can empower diverse speech systems. In this paper, we seek to mitigate this obstacle for a number of Arabic dialects by presenting Casablanca, a large-scale community-driven effort to collect and transcribe a multi-dialectal Arabic dataset. The dataset covers eight dialects: Algerian, Egyptian, Emirati, Jordanian, Mauritanian, Moroccan, Palestinian, and Yemeni, and includes annotations for transcription, gender, dialect, and code-switching. We also develop a number of strong baselines exploiting Casablanca. The project page for Casablanca is accessible at: www.dlnlp.ai/speech/casablanca. 
2024.emnlp-main.1211 @@ -16734,7 +16734,7 @@ Communicating with Speakers and Listeners of Different Pragmatic Levels KataNaszadiUniversity of Amsterdam - Frans AOliehoekDelft University of Technology + Frans AOliehoekDelft University of Technology ChristofMonzUniversity of Amsterdam, University of Amsterdam 21777-21783 This paper explores the impact of variable pragmatic competence on communicative success through simulating language learning and conversing between speakers and listeners with different levels of reasoning abilities. Through studying this interaction, we hypothesize that matching levels of reasoning between communication partners would create a more beneficial environment for communicative success and language learning. Our research findings indicate that learning from more explicit, literal language is advantageous, irrespective of the learner’s level of pragmatic competence. Furthermore, we find that integrating pragmatic reasoning during language learning, not just during evaluation, significantly enhances overall communication performance. This paper provides key insights into the importance of aligning reasoning levels and incorporating pragmatic reasoning in optimizing communicative interactions. @@ -16745,11 +16745,11 @@ <fixed-case>RECANTF</fixed-case>ormer: Referring Expression Comprehension with Varying Numbers of Targets - BhathiyaHemanthage - HakanBilenUniversity of Edinburgh, University of Edinburgh - PhilBartieHeriot-Watt University + BhathiyaHemanthage + HakanBilenUniversity of Edinburgh, University of Edinburgh + PhilBartieHeriot-Watt University ChristianDondrupHeriot-Watt University - OliverLemonHeriot-Watt University + OliverLemonHeriot-Watt University 21784-21798 The Generalized Referring Expression Comprehension (GREC) task extends classic REC by generating image bounding boxes for objects referred to in natural language expressions, which may indicate zero, one, or multiple targets. This generalization enhances the practicality of REC models for diverse real-world applications. However, the presence of varying numbers of targets in samples makes GREC a more complex task, both in terms of training supervision and final prediction selection strategy. Addressing these challenges, we introduce RECANTFormer, a one-stage method for GREC that combines a decoder-free (encoder-only) transformer architecture with DETR-like Hungarian matching. Our approach consistently outperforms baselines by significant margins in three GREC datasets. 2024.emnlp-main.1214 @@ -16773,7 +16773,7 @@ AlexanderSpangherUniversity of Southern California NanyunPengUniversity of California, Los Angeles SebastianGehrmannBloomberg - MarkDredzeDepartment of Computer Science, Whiting School of Engineering + MarkDredzeDepartment of Computer Science, Whiting School of Engineering 21814-21828 Journalists engage in multiple steps in the news writing process that depend on human creativity, like exploring different “angles” (i.e. the specific perspectives a reporter takes). These can potentially be aided by large language models (LLMs). By affecting planning decisions, such interventions can have an outsize impact on creative output. We advocate a careful approach to evaluating these interventions to ensure alignment with human values.In a case study of journalistic coverage of press releases, we assemble a large dataset of 250k press releases and 650k articles covering them. We develop methods to identify news articles that _challenge and contextualize_ press releases. 
Finally, we evaluate suggestions made by LLMs for these articles and compare these with decisions made by human journalists. Our findings are three-fold: (1) Human-written news articles that challenge and contextualize press releases more take more creative angles and use more informational sources. (2) LLMs align better with humans when recommending angles, compared with informational sources. (3) Both the angles and sources LLMs suggest are significantly less creative than humans. 2024.emnlp-main.1216 @@ -16785,9 +16785,9 @@ <fixed-case>T</fixed-case>-<fixed-case>FREE</fixed-case>: Subword Tokenizer-Free Generative <fixed-case>LLM</fixed-case>s via Sparse Representations for Memory-Efficient Embeddings BjörnDeiserothTechnische Universität Darmstadt and Aleph Alpha ManuelBrackGerman Research Center for AI and Technische Universität Darmstadt - PatrickSchramowskiGerman Research Center for AI - KristianKerstingGerman Research Center for AI, The Hessian Center for AI and TU Darmstadt - SamuelWeinbachAleph Alpha GmbH + PatrickSchramowskiGerman Research Center for AI + KristianKerstingGerman Research Center for AI, The Hessian Center for AI and TU Darmstadt + SamuelWeinbachAleph Alpha GmbH 21829-21851 Tokenizers are crucial for encoding information in Large Language Models, but their development has recently stagnated, and they contain inherent weaknesses. Major limitations include computational overhead, ineffective vocabulary use, and unnecessarily large embedding and head layers. Additionally, their performance is biased towards a reference corpus, leading to reduced effectiveness for underrepresented languages.To remedy these issues, we propose T-Free, which directly embeds words through sparse activation patterns over character triplets and does not require a reference corpus. T-Free inherently exploits morphological similarities and allows for strong compression of embedding layers. In our exhaustive experimental evaluation, we achieve competitive downstream performance with a parameter reduction of more than 85% on these layers. Further, T-Free shows significant improvements in cross-lingual transfer learning. 2024.emnlp-main.1217 @@ -16807,13 +16807,13 @@ Assessing and Verifying Task Utility in <fixed-case>LLM</fixed-case>-Powered Applications - NegarArabzadeh - SiqingHuoUniversity of Waterloo + NegarArabzadeh + SiqingHuoUniversity of Waterloo NikhilMehta QingyunWuPennsylvania State University ChiWangGoogle DeepMind Ahmed HassanAwadallahMicrosoft Research - Charles L. A.ClarkeUniversity of Waterloo + Charles L. A.ClarkeUniversity of Waterloo JuliaKiselevaResearch, Microsoft 21868-21888 The rapid development of Large Language Models (LLMs) has led to a surge in applications that facilitate collaboration among multiple agents, assisting humans in their daily tasks. However, a significant gap remains in assessing to what extent LLM-powered applications genuinely enhance user experience and task execution efficiency. This highlights the need to verify utility of LLM-powered applications, particularly by ensuring alignment between the application’s functionality and end-user needs. We introduce AgentEval, a novel framework designed to simplify the utility verification process by automatically proposing a set of criteria tailored to the unique purpose of any given application. This allows for a comprehensive assessment, quantifying the utility of an application against the suggested criteria. 
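The sparse character-triplet embedding of T-FREE (2024.emnlp-main.1217 above) can be approximated in a few lines: hash each padded trigram of a word into a fixed table and sum the activated rows, so no subword vocabulary is needed. Table size, hash function, and padding below are illustrative assumptions, not the paper's exact scheme:

import hashlib
import numpy as np

# T-FREE-style embedding sketch: a word activates a sparse set of rows chosen
# by hashing its character triplets; morphologically similar words overlap.
TABLE, DIM = 32768, 64
table = np.random.default_rng(0).normal(size=(TABLE, DIM)).astype(np.float32)

def trigrams(word):
    padded = f"_{word.lower()}_"
    return [padded[i:i + 3] for i in range(len(padded) - 2)]

def embed(word):
    idx = [int(hashlib.md5(t.encode()).hexdigest(), 16) % TABLE for t in trigrams(word)]
    return table[idx].sum(axis=0)   # sum of activated rows

# "Haus" and "Hauses" share triplets, hence embedding rows, without any tokenizer.
print(np.dot(embed("Haus"), embed("Hauses")))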
We present a comprehensive analysis of the effectiveness and robustness of AgentEval for two open-source datasets, including math problem solving and ALFWorld household-related tasks. For reproducibility purposes, we make the data, code and all the logs publicly available at https://github.com/Narabzad/AgentEval
          2024.emnlp-main.1219

          Dynamic Rewarding with Prompt Optimization Enables Tuning-free Self-Alignment of Language Models
          SomanshuSingla
-          ZhenWangUniversity of California, San Diego
-          TianyangLiuUniversity of California, San Diego
+          ZhenWangUniversity of California, San Diego
+          TianyangLiuUniversity of California, San Diego
          AbdullahAshfaq
          ZhitingHuUniversity of California, San Diego and Amazon
          Eric P.XingMohamed bin Zayed University of AI and School of Computer Science, Carnegie Mellon University
          21889-21909

          HarbaniJaggiUC Berkeley
          KashyapCoimbatore Murali
          EveFleisig
-          ErdemBiyikUniversity of Southern California
+          ErdemBiyikUniversity of Southern California
          21910-21917
          When annotators disagree, predicting the labels given by individual annotators can capture nuances overlooked by traditional label aggregation. We introduce three approaches to predict individual annotator ratings on the toxicity of text by incorporating individual annotator-specific information: a neural collaborative filtering (NCF) approach, an in-context learning (ICL) approach, and an intermediate embedding-based architecture. We also study the utility of demographic information for rating prediction. NCF showed limited utility; however, integrating annotator history, demographics, and survey information permits both the embedding-based architecture and ICL to substantially improve prediction accuracy, with the embedding-based architecture outperforming the other methods. We also find that, if demographics are predicted from survey information, using these imputed demographics as features performs comparably to using true demographic data. This suggests that demographics may not provide substantial information for modeling ratings beyond what is captured in survey responses. Our findings raise considerations about the relative utility of different types of annotator information and provide new approaches for modeling annotators in subjective NLP tasks.
          2024.emnlp-main.1221

          Adversarial Text Generation using Large Language Models for Dementia Detection
          YouxiangZhu
-          NanaLin
+          NanaLin
          Kiran SandilyaBalivadaUniversity of Massachusetts Boston
-          DanielHaehnUniversity of Massachusetts, Boston
+          DanielHaehnUniversity of Massachusetts, Boston
          XiaohuiLiangUniversity of Massachusetts Boston
          21918-21933
          Although large language models (LLMs) excel in various text classification tasks, regular prompting strategies (e.g., few-shot prompting) do not work well with dementia detection via picture description. The challenge lies in the fact that the language markers for dementia are unclear, and an LLM may struggle to relate its internal knowledge to dementia detection. In this paper, we present an accurate and interpretable classification approach by Adversarial Text Generation (ATG), a novel decoding strategy that could relate dementia detection with other tasks. We further develop a comprehensive set of instructions corresponding to various tasks and use them to guide ATG, achieving the best accuracy of 85%, a >10% improvement compared to the regular prompting strategies.
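The intermediate embedding-based architecture from the annotator-modeling abstract above (2024.emnlp-main.1221) is essentially a learned annotator embedding concatenated with a text representation and fed to a small regression head. A hedged sketch with invented dimensions; the random text vectors stand in for a real frozen sentence encoder:

import torch
import torch.nn as nn

# Predict an individual annotator's toxicity rating from a text representation
# plus a learned per-annotator embedding (illustrative, not the paper's code).
class AnnotatorRater(nn.Module):
    def __init__(self, n_annotators, text_dim=128, ann_dim=32):
        super().__init__()
        self.ann_emb = nn.Embedding(n_annotators, ann_dim)
        self.head = nn.Sequential(
            nn.Linear(text_dim + ann_dim, 64), nn.ReLU(), nn.Linear(64, 1)
        )

    def forward(self, text_vec, annotator_id):
        z = torch.cat([text_vec, self.ann_emb(annotator_id)], dim=-1)
        return self.head(z).squeeze(-1)   # one rating per (text, annotator) pair

model = AnnotatorRater(n_annotators=100)
text_vec = torch.randn(8, 128)            # stand-in for sentence embeddings
ann_ids = torch.randint(0, 100, (8,))
print(model(text_vec, ann_ids).shape)     # torch.Size([8])

Demographic or survey features could be concatenated into z the same way, which is how the paper compares true versus imputed demographics.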
In addition, we introduce feature context, a human-understandable text that reveals the underlying features of LLM used for classifying dementia. From feature contexts, we found that dementia detection can be related to tasks such as assessing attention to detail, language, and clarity with specific features of the environment, character, and other picture content or language-related features. Future work includes incorporating multi-modal LLMs to interpret speech and picture information. @@ -16878,7 +16878,7 @@ The Greatest Good Benchmark: Measuring <fixed-case>LLM</fixed-case>s’ Alignment with Utilitarian Moral Dilemmas Giovanni Franco GabrielMarraffini - AndrésCottonUniversidad Torcuato di Tella + AndrésCottonUniversidad Torcuato di Tella Noe FabianHsueh AxelFridmanUniversidad de San Andres JuanWisznia @@ -16916,10 +16916,10 @@ The Death and Life of Great Prompts: Analyzing the Evolution of <fixed-case>LLM</fixed-case> Prompts from the Structural Perspective YihanMaCISPA Helmholtz Center for Information Security XinyueShenCISPA Helmholtz Center for Information Security - YixinWu + YixinWu BoyangZhangCISPA Helmholtz Center for Information Security MichaelBackesCISPA Helmholtz Center for Information Security - YangZhangCISPA Helmholtz Center for Information Security + YangZhangCISPA Helmholtz Center for Information Security 21990-22001 Effective utilization of large language models (LLMs), such as ChatGPT, relies on the quality of input prompts. This paper explores prompt engineering, specifically focusing on the disparity between experimentally designed prompts and real-world “in-the-wild” prompts. We analyze 10,538 in-the-wild prompts collected from various platforms and develop a framework that decomposes the prompts into eight key components. Our analysis shows that and Requirement are the most prevalent two components. Roles specified in the prompts, along with their capabilities, have become increasingly varied over time, signifying a broader range of application scenarios for LLMs. However, from the response of GPT-4, there is a marginal improvement with a specified role, whereas leveraging less prevalent components such as Capability and Demonstration can result in a more satisfying response. Overall, our work sheds light on the essential components of in-the-wild prompts and the effectiveness of these components on the broader landscape of LLM prompt engineering, providing valuable guidelines for the LLM community to optimize high-quality prompts. 2024.emnlp-main.1227 @@ -16931,7 +16931,7 @@ MinqianLiuVirginia Polytechnic Institute and State University ZhiyangXu ZihaoLin - TrevorAshbyVirginia Polytechnic Institute and State University + TrevorAshbyVirginia Polytechnic Institute and State University JoyRimchala JiaxinZhangIntuit AI Research LifuHuangVirginia Tech @@ -16956,7 +16956,7 @@ LucySun AlexanderWardle-Solano HannahSzabó - EkaterinaZubova + EkaterinaZubova MatthewBurtell JonathanFan YixinLiuYale University @@ -16977,7 +16977,7 @@ CaimingXiongSalesforce Research RexYingYale University ArmanCohanYale University and Allen Institute for Artificial Intelligence - DragomirRadevYale University + DragomirRadevYale University 22017-22031 Large language models (LLMs) have achieved remarkable performance on a variety of natural language understanding tasks. However, existing benchmarks are inadequate in measuring the complex logical reasoning capabilities of a model. 
We present FOLIO, a human-annotated, logically complex and diverse dataset for reasoning in natural language (NL), equipped with first-order logic (FOL) annotations. FOLIO consists of 1,430 examples (unique conclusions), each paired with one of 487 sets of premises used to deductively reason for the validity of each conclusion. The logical correctness of the premises and conclusions is ensured by their FOL annotations, which are automatically verified by an FOL inference engine. In addition to the main NL reasoning task, NL-FOL pairs in FOLIO constitute a new NL-FOL translation dataset. Our experiments on FOLIO systematically evaluate the FOL reasoning ability of supervised fine-tuning on medium-sized language models. For both NL reasoning and NL-FOL translation, we benchmark multiple state-of-the-art language models. Our results show that a subset of FOLIO remains a challenge for one of the most capable Large Language Model (LLM) publicly available, GPT-4. 2024.emnlp-main.1229 @@ -16987,10 +16987,10 @@ The <fixed-case>LLM</fixed-case> Effect: Are Humans Truly Using <fixed-case>LLM</fixed-case>s, or Are They Being Influenced By Them Instead? - AlexanderChoi + AlexanderChoi Syeda SabrinaAkterGeorge Mason University - J.p.Singh - AntoniosAnastasopoulosAthena Research Center and George Mason University + J.p.Singh + AntoniosAnastasopoulosAthena Research Center and George Mason University 22032-22054 Large Language Models (LLMs) have shown capabilities close to human performance in various analytical tasks, leading researchers to use them for time and labor-intensive analyses. However, their capability to handle highly specialized and open-ended tasks in domains like policy studies remains in question. This paper investigates the efficiency and accuracy of LLMs in specialized tasks through a structured user study focusing on Human-LLM partnership. The study, conducted in two stages—Topic Discovery and Topic Assignment—integrates LLMs with expert annotators to observe the impact of LLM suggestions on what is usually human-only analysis. Results indicate that LLM-generated topic lists have significant overlap with human generated topic lists, with minor hiccups in missing document-specific topics. However, LLM suggestions may significantly improve task completion speed, but at the same time introduce anchoring bias, potentially affecting the depth and nuance of the analysis, raising a critical question about the trade-off between increased efficiency and the risk of biased analysis. 2024.emnlp-main.1230 @@ -17001,7 +17001,7 @@ Is Child-Directed Speech Effective Training Data for Language Models? Steven Y.Feng Noah D.Goodman - Michael C.Frank + Michael C.Frank 22055-22071 While high-performing language models are typically trained on hundreds of billions of words, human children become fluent language users with a much smaller amount of data. What are the features of the data they receive, and how do these features support language modeling objectives? To investigate this question, we train GPT-2 and RoBERTa models on 29M words of English child-directed speech and a new matched, synthetic dataset (TinyDialogues), comparing to OpenSubtitles, Wikipedia, and a heterogeneous blend of datasets from the BabyLM challenge. We evaluate the syntactic and semantic knowledge of these models using developmentally-inspired evaluations. 
Through pretraining experiments, we test whether the global developmental ordering or the local discourse ordering of children’s training data supports high performance relative to other datasets. The local properties of the data affect model results, but surprisingly, global properties do not. Further, child language input is not uniquely valuable for training language models. These findings support the hypothesis that, rather than proceeding from better data, the child’s learning algorithm is substantially more data-efficient than current language modeling techniques. 2024.emnlp-main.1231 @@ -17010,9 +17010,9 @@ <fixed-case>R</fixed-case>ev<fixed-case>MUX</fixed-case>: Data Multiplexing with Reversible Adapters for Efficient <fixed-case>LLM</fixed-case> Batch Inference - YigeXuNanyang Technological University - XuGuo - ZhiweiZengNational Technological University + YigeXuNanyang Technological University + XuGuo + ZhiweiZengNational Technological University ChunyanMiaoSchool of Computer Science and Engineering, Nanyang Technological University 22072-22087 Large language models (LLMs) have brought a great breakthrough to the natural language processing (NLP) community, while leading the challenge of handling concurrent customer queries due to their high throughput demands. Data multiplexing addresses this by merging multiple inputs into a single composite input, allowing more efficient inference through a shared forward pass. However, as distinguishing individuals from a composite input is challenging, conventional methods typically require training the entire backbone, yet still suffer from performance degradation. In this paper, we introduce RevMUX, a parameter-efficient data multiplexing framework that incorporates a reversible design in the multiplexer, which can be reused by the demultiplexer to perform reverse operations and restore individual samples for classification. Extensive experiments on four datasets and three types of LLM backbones demonstrate the effectiveness of RevMUX for enhancing LLM inference efficiency while retaining a satisfactory classification performance. @@ -17024,9 +17024,9 @@ Inference Helps <fixed-case>PLM</fixed-case>s’ Conceptual Understanding: Improving the Abstract Inference Ability with Hierarchical Conceptual Entailment Graphs JuncaiLi RuLiShanxi University - XiaoliLi + XiaoliLi QinghuaChaiShanxi University - Jeff Z.PanUniversity of Edinburgh, University of Edinburgh + Jeff Z.PanUniversity of Edinburgh, University of Edinburgh 22088-22104 The abstract inference capability of the Language Model plays a pivotal role in boosting its generalization and reasoning prowess in Natural Language Inference (NLI). Entailment graphs are crafted precisely for this purpose, focusing on learning entailment relations among predicates. Yet, prevailing approaches overlook the *polysemy* and *hierarchical nature of concepts* during entity conceptualization. This oversight disregards how arguments might entail differently across various concept levels, thereby missing potential entailment connections. To tackle this hurdle, we introduce the *concept pyramid* and propose the HiCon-EG (Hierarchical Conceptual Entailment Graph) framework, which organizes arguments hierarchically, delving into entailment relations at diverse concept levels. By learning entailment relationships at different concept levels, the model is guided to better understand concepts so as to improve its abstract inference capabilities. 
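The reversible multiplexer in the RevMUX abstract above (2024.emnlp-main.1232) follows the standard additive-coupling pattern: mixing two samples is exactly invertible, so the demultiplexer can restore the individual samples after the shared forward pass. A toy sketch of just that coupling, with tiny MLPs standing in for the real adapters around an LLM backbone:

import torch
import torch.nn as nn

# Additive coupling: mux() combines two samples into a composite pair, and
# demux() is its exact algebraic inverse (illustrative RevMUX-style design).
class RevCoupling(nn.Module):
    def __init__(self, dim=64):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(dim, dim), nn.Tanh())
        self.g = nn.Sequential(nn.Linear(dim, dim), nn.Tanh())

    def mux(self, x1, x2):
        y1 = x1 + self.f(x2)
        y2 = x2 + self.g(y1)
        return y1, y2

    def demux(self, y1, y2):
        x2 = y2 - self.g(y1)
        x1 = y1 - self.f(x2)
        return x1, x2

m = RevCoupling()
x1, x2 = torch.randn(4, 64), torch.randn(4, 64)
r1, r2 = m.demux(*m.mux(x1, x2))
print(torch.allclose(r1, x1), torch.allclose(r2, x2))  # True True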
Our method enhances scalability and efficiency in acquiring common-sense knowledge by leveraging statistical language distribution instead of manual labeling. Experimental results show that entailment relations derived from HiCon-EG significantly bolster abstract detection tasks. Our code is available at https://github.com/SXUCFN/HiCon-EG
 2024.emnlp-main.1233
@@ -17035,9 +17035,9 @@
 <fixed-case>M</fixed-case>3<fixed-case>H</fixed-case>op-<fixed-case>C</fixed-case>o<fixed-case>T</fixed-case>: Misogynous Meme Identification with Multimodal Multi-hop Chain-of-Thought
- GitanjaliKumari
+ GitanjaliKumari
 KirtanJain
- AsifEkbalIndian Institute of Technology, Jodhpur
+ AsifEkbalIndian Institute of Technology, Jodhpur
 22105-22138
 In recent years, there has been a significant rise in the phenomenon of hate against women on social media platforms, particularly through the use of misogynous memes. These memes often target women with subtle and obscure cues, making their detection a challenging task for automated systems. Recently, Large Language Models (LLMs) have shown promising results in reasoning using Chain-of-Thought (CoT) prompting to generate the intermediate reasoning chains as the rationale to facilitate multimodal tasks, but often neglect cultural diversity and key aspects like emotion and contextual knowledge hidden in the visual modalities. To address this gap, we introduce a **M**ultimodal **M**ulti-hop CoT (M3Hop-CoT) framework for **M**isogynous meme identification, combining a CLIP-based classifier and a multimodal CoT module with entity-object-relationship integration. M3Hop-CoT employs a three-step multimodal prompting principle to induce emotions, target awareness, and contextual knowledge for meme analysis. Our empirical evaluation, including both qualitative and quantitative analysis, validates the efficacy of the M3Hop-CoT framework on the SemEval-2022 Task 5 (**MAMI task**) dataset, highlighting its strong performance in the macro-F1 score. Furthermore, we assess the model’s generalizability by evaluating it on various benchmark meme datasets, offering a thorough insight into the effectiveness of our approach across different datasets. Codes are available at this link: https://github.com/Gitanjali1801/LLM_CoT
 2024.emnlp-main.1234
@@ -17083,8 +17083,8 @@
 Simul-<fixed-case>M</fixed-case>u<fixed-case>ST</fixed-case>-<fixed-case>C</fixed-case>: Simultaneous Multilingual Speech Translation Corpus Using Large Language Model
 ManaMakinaeNara Institute of Science and Technology, Japan
 YusukeSakaiNara Institute of Science and Technology, Japan
- HidetakaKamigaitoNara Institute of Science and Technology
- TaroWatanabeNara Institute of Science and Technology, Japan
+ HidetakaKamigaitoNara Institute of Science and Technology
+ TaroWatanabeNara Institute of Science and Technology, Japan
 22185-22205
 Simultaneous Speech Translation (SiST) begins translating before the entire source input is received, making it crucial to balance quality and latency. In real interpreting situations, interpreters manage this simultaneity by breaking sentences into smaller segments and translating them while maintaining the source order as much as possible. SiST could benefit from this approach to balance quality and latency. However, current corpora used for simultaneous tasks often involve significant word reordering in translation, which is not ideal given that interpreters faithfully follow source syntax as much as possible.
Inspired by conference interpreting by humans utilizing the salami technique, we introduce the Simul-MuST-C, a dataset created by leveraging the Large Language Model (LLM), specifically GPT-4o, which aligns the target text as closely as possible to the source text by using minimal chunks that contain enough information to be interpreted. Experiments on three language pairs show that the effectiveness of segmented-base monotonicity in training data varies with the grammatical distance between the source and the target, with grammatically distant language pairs benefiting the most in achieving quality while minimizing latency. 2024.emnlp-main.1238 @@ -17117,14 +17117,14 @@ <fixed-case>BMR</fixed-case>etriever: Tuning Large Language Models as Better Biomedical Text Retrievers RanXuEmory University - WenqiShiUniversity of Texas Southwestern Medical Center - YueYuGeorgia Institute of Technology + WenqiShiUniversity of Texas Southwestern Medical Center + YueYuGeorgia Institute of Technology YuchenZhuangGeorgia Institute of Technology - YanqiaoZhuUniversity of California, Los Angeles + YanqiaoZhuUniversity of California, Los Angeles May DongmeiWang Joyce C.HoEmory University - ChaoZhangGeorgia Institute of Technology - CarlYangEmory University + ChaoZhangGeorgia Institute of Technology + CarlYangEmory University 22234-22254 Developing effective biomedical retrieval models is important for excelling at knowledge-intensive biomedical tasks but still challenging due to the lack of sufficient publicly annotated biomedical data and computational resources. We present BMRetriever, a series of dense retrievers for enhancing biomedical retrieval via unsupervised pre-training on large biomedical corpora, followed by instruction fine-tuning on a combination of labeled datasets and synthetic pairs. Experiments on 5 biomedical tasks across 11 datasets verify BMRetriever’s efficacy on various biomedical applications. BMRetriever also exhibits strong parameter efficiency, with the 410M variant outperforming baselines up to 11.7 times larger, and the 2B variant matching the performance of models with over 5B parameters. The training data and model checkpoints are released at https://huggingface.co/BMRetriever to ensure transparency, reproducibility, and application to new domains. 2024.emnlp-main.1241 @@ -17137,7 +17137,7 @@ JonghyunSongSeoul National University CheyonJin WenlongZhaoUniversity of Massachusetts at Amherst - AndrewMcCallumUniversity of Massachusetts Amherst + AndrewMcCallumUniversity of Massachusetts Amherst Jay-YoonLeeSeoul National University 22255-22269 A common retrieve-and-rerank paradigm involves retrieving relevant candidates from a broad set using a fast bi-encoder (BE), followed by applying expensive but accurate cross-encoders (CE) to a limited candidate set. However, relying on this small subset is often susceptible to error propagation from the bi-encoders, which limits the overall performance. To address these issues, we propose the Comparing Multiple Candidates (CMC) framework. CMC compares a query and multiple embeddings of similar candidates (i.e., neighbors) through shallow self-attention layers, delivering rich representations contextualized to each other. Furthermore, CMC is scalable enough to handle multiple comparisons simultaneously. For example, comparing ~10K candidates with CMC takes a similar amount of time as comparing 16 candidates with CE. 
Experimental results on the ZeSHEL dataset demonstrate that CMC, when plugged in between bi-encoders and cross-encoders as a seamless intermediate reranker (BE-CMC-CE), can effectively improve recall@k (+6.7%-p, +3.5%-p for R@16, R@64) compared to using only bi-encoders (BE-CE), with negligible slowdown (<7%). Additionally, to verify CMC’s effectiveness as the final-stage reranker in improving top-1 accuracy, we conduct experiments on downstream tasks such as entity, passage, and dialogue ranking. The results indicate that CMC is not only faster (11x) but also often more effective than CE, with improved prediction accuracy in Wikipedia entity linking (+0.7%-p) and DSTC7 dialogue ranking (+3.3%-p). @@ -17161,13 +17161,13 @@ <fixed-case>M</fixed-case>ed<fixed-case>A</fixed-case>dapter: Efficient Test-Time Adaptation of Large Language Models Towards Medical Reasoning - WenqiShiUniversity of Texas Southwestern Medical Center + WenqiShiUniversity of Texas Southwestern Medical Center RanXuEmory University YuchenZhuangGeorgia Institute of Technology - YueYuGeorgia Institute of Technology - HaotianSunGeorgia Institute of Technology + YueYuGeorgia Institute of Technology + HaotianSunGeorgia Institute of Technology HangWuByteDance Inc and Georgia Institute of Technology - CarlYangEmory University + CarlYangEmory University May DongmeiWang 22294-22314 Despite their improved capabilities in generation and reasoning, adapting large language models (LLMs) to the biomedical domain remains challenging due to their immense size and privacy concerns. In this study, we propose MedAdapter, a unified post-hoc adapter for test-time adaptation of LLMs towards biomedical applications. Instead of fine-tuning the entire LLM, MedAdapter effectively adapts the original model by fine-tuning only a small BERT-sized adapter to rank candidate solutions generated by LLMs. Experiments on four biomedical tasks across eight datasets demonstrate that MedAdapter effectively adapts both white-box and black-box LLMs in biomedical reasoning, achieving average performance improvements of 18.24% and 10.96%, respectively, without requiring extensive computational resources or sharing data with third parties. MedAdapter also yields enhanced performance when combined with train-time adaptation, highlighting a flexible and complementary solution to existing adaptation methods. Faced with the challenges of balancing model performance, computational resources, and data privacy, MedAdapter provides an efficient, privacy-preserving, cost-effective, and transparent solution for adapting LLMs to the biomedical domain. @@ -17178,15 +17178,15 @@ <fixed-case>EHRA</fixed-case>gent: Code Empowers Large Language Models for Few-shot Complex Tabular Reasoning on Electronic Health Records - WenqiShiUniversity of Texas Southwestern Medical Center + WenqiShiUniversity of Texas Southwestern Medical Center RanXuEmory University YuchenZhuangGeorgia Institute of Technology - YueYuGeorgia Institute of Technology - JieyuZhangUniversity of Washington + YueYuGeorgia Institute of Technology + JieyuZhangUniversity of Washington HangWuByteDance Inc and Georgia Institute of Technology YuandaZhu Joyce C.HoEmory University - CarlYangEmory University + CarlYangEmory University May DongmeiWang 22315-22339 Clinicians often rely on data engineers to retrieve complex patient information from electronic health record (EHR) systems, a process that is both inefficient and time-consuming. 
We propose EHRAgent, a large language model (LLM) agent empowered with accumulative domain knowledge and robust coding capability. EHRAgent enables autonomous code generation and execution to facilitate clinicians in directly interacting with EHRs using natural language. Specifically, we formulate a multi-tabular reasoning task based on EHRs as a tool-use planning process, efficiently decomposing a complex task into a sequence of manageable actions with external toolsets. We first inject relevant medical information to enable EHRAgent to effectively reason about the given query, identifying and extracting the required records from the appropriate tables. By integrating interactive coding and execution feedback, EHRAgent then effectively learns from error messages and iteratively improves its originally generated code. Experiments on three real-world EHR datasets show that EHRAgent outperforms the strongest baseline by up to 29.6% in success rate, verifying its strong capacity to tackle complex clinical tasks with minimal demonstrations.
 2024.emnlp-main.1245
@@ -17198,8 +17198,8 @@
 <fixed-case>S</fixed-case>im<fixed-case>LLM</fixed-case>: Detecting Sentences Generated by Large Language Models Using Similarity between the Generation and its Re-generation
- Hoang-QuocNguyen-SonNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology
- Minh-SonDaoNational Institute of Information and Communications Technology (NICT)
+ Hoang-QuocNguyen-SonNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology
+ Minh-SonDaoNational Institute of Information and Communications Technology (NICT)
 KojiZettsuNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology
 22340-22352
 Large language models have emerged as a significant phenomenon due to their ability to produce natural text across various applications. However, the proliferation of generated text raises concerns regarding its potential misuse in fraudulent activities such as academic dishonesty, spam dissemination, and misinformation propagation. Prior studies have detected the generation of non-analogous text, which manifests numerous differences between original and generated text. We have observed that the similarity between the original text and its generation is notably higher than that between the generated text and its subsequent regeneration. To address this, we propose a novel approach named SimLLM, aimed at estimating the similarity between an input sentence and its generated counterpart to detect analogous machine-generated sentences that closely mimic human-written ones. Our empirical analysis demonstrates SimLLM’s superior performance compared to existing methods.
 2024.emnlp-main.1246
@@ -17224,8 +17224,8 @@
 Simultaneous Interpretation Corpus Construction by Large Language Models in Distant Language Pair
 YusukeSakaiNara Institute of Science and Technology, Japan
 ManaMakinaeNara Institute of Science and Technology, Japan
- HidetakaKamigaitoNara Institute of Science and Technology
- TaroWatanabeNara Institute of Science and Technology, Japan
+ HidetakaKamigaitoNara Institute of Science and Technology
+ TaroWatanabeNara Institute of Science and Technology, Japan
 22375-22398
 In Simultaneous Machine Translation (SiMT), training with a simultaneous interpretation (SI) corpus is an effective method for achieving high quality with low latency.
However, constructing such a corpus is challenging due to high costs, and limitations in annotator capabilities, and as a result, existing SI corpora are limited. Therefore, we propose a method to convert existing speech translation (ST) corpora into interpretation-style corpora, maintaining the original word order and preserving the entire source content using Large Language Models (LLM-SI-Corpus). We demonstrate that fine-tuning SiMT models using the LLM-SI-Corpus reduces latency while achieving better quality compared to models fine-tuned with other corpora in both speech-to-text and text-to-text settings. The LLM-SI-Corpus is available at https://github.com/yusuke1997/LLM-SI-Corpus. 2024.emnlp-main.1248 @@ -17249,12 +17249,12 @@ <fixed-case>MIB</fixed-case>ench: Evaluating Multimodal Large Language Models over Multiple Images - HaoweiLiuInstitute of Automation, Chinese Academy of Sciences + HaoweiLiuInstitute of Automation, Chinese Academy of Sciences XiZhang HaiyangXu YayaShi - ChaoyaJiangShanghai Jiaotong University, Wuhan University, Tsinghua University, Tsinghua University, Microsoft, University of the Chinese Academy of Sciences, Chinese Academy of Sciences, Beijing University of Aeronautics and Astronautics, South China University of Technology, SUN YAT-SEN UNIVERSITY, University of Electronic Science and Technology of China, Huazhong University of Science and Technology, Harbin Institute of Technology, Shandong University, nanjing university, Beijing University of Posts and Telecommunications, Shanghai Artificial Intelligence Laboratory, Shanghai University of Science and Technology, Tianjin University, Northeastern University, Southeast University, Xi’an Jiaotong University, Xiamen University, Fudan University, Renmin University of China, Nankai University, Meituan, Kuaishou- 快手科技, East China Normal University, Xi’an University of Electronic Science and Technology, Nanjing University of Aeronautics and Astronautics, Nanjing University of Science and Technology, Southern University of Science and Technology, Northwest Polytechnical University Xi’an, Chongqing University, Jilin University, Beijing Normal University, University of Science and Technology Beijing and Zhejiang University - MingYan + ChaoyaJiangShanghai Jiaotong University, Wuhan University, Tsinghua University, Tsinghua University, Microsoft, University of the Chinese Academy of Sciences, Chinese Academy of Sciences, Beijing University of Aeronautics and Astronautics, South China University of Technology, SUN YAT-SEN UNIVERSITY, University of Electronic Science and Technology of China, Huazhong University of Science and Technology, Harbin Institute of Technology, Shandong University, nanjing university, Beijing University of Posts and Telecommunications, Shanghai Artificial Intelligence Laboratory, Shanghai University of Science and Technology, Tianjin University, Northeastern University, Southeast University, Xi’an Jiaotong University, Xiamen University, Fudan University, Renmin University of China, Nankai University, Meituan, Kuaishou- 快手科技, East China Normal University, Xi’an University of Electronic Science and Technology, Nanjing University of Aeronautics and Astronautics, Nanjing University of Science and Technology, Southern University of Science and Technology, Northwest Polytechnical University Xi’an, Chongqing University, Jilin University, Beijing Normal University, University of Science and Technology Beijing and Zhejiang University + MingYan JiZhangAlibaba Group FeiHuangAlibaba Group ChunfengYuan, 
Institute of automation, Chinese academy of science @@ -17268,10 +17268,10 @@ <fixed-case>ZEBRA</fixed-case>: Zero-Shot Example-Based Retrieval Augmentation for Commonsense Question Answering - Francesco MariaMolfeseUniversity of Roma “La Sapienza” + Francesco MariaMolfeseUniversity of Roma “La Sapienza” SimoneConiaSapienza University of Rome RiccardoOrlando - RobertoNavigliSapienza University of Rome + RobertoNavigliSapienza University of Rome 22429-22444 Current Large Language Models (LLMs) have shown strong reasoning capabilities in commonsense question answering benchmarks, but the process underlying their success remains largely opaque. As a consequence, recent approaches have equipped LLMs with mechanisms for knowledge retrieval, reasoning and introspection, not only to improve their capabilities but also to enhance the interpretability of their outputs. However, these methods require additional training, hand-crafted templates or human-written explanations. To address these issues, we introduce ZEBRA, a zero-shot question answering framework that combines retrieval, case-based reasoning and introspection and dispenses with the need for additional training of the LLM. Given an input question, ZEBRA retrieves relevant question-knowledge pairs from a knowledge base and generates new knowledge by reasoning over the relationships in these pairs. This generated knowledge is then used to answer the input question, improving the model’s performance and interpretability. We evaluate our approach across 8 well-established commonsense reasoning benchmarks, demonstrating that ZEBRA consistently outperforms strong LLMs and previous knowledge integration approaches, achieving an average accuracy improvement of up to 4.5 points. 2024.emnlp-main.1251 @@ -17280,9 +17280,9 @@ <fixed-case>ABLE</fixed-case>: Personalized Disability Support with Politeness and Empathy Integration - KshitijMishraIndian Institute of Technology, Patna + KshitijMishraIndian Institute of Technology, Patna ManishaBurja - AsifEkbalIndian Institute of Technology, Jodhpur + AsifEkbalIndian Institute of Technology, Jodhpur 22445-22470 In today’s dynamic world, providing inclusive and personalized support for individuals with physical disabilities is imperative. With diverse needs and preferences, tailored assistance according to user personas is crucial. In this paper, we introduce ABLE (Adaptive, Bespoke, Listen and Empathetic), a Conversational Support System for Physical Disabilities. By tracking user personas, including gender, age, and personality traits based on the OCEAN model, ABLE ensures that support interactions are uniquely tailored to each user’s characteristics and preferences. Moreover, integrating politeness and empathy levels in responses enhances user satisfaction and engagement, fostering a supportive and respectful environment. The development of ABLE involves compiling a comprehensive conversational dataset enriched with user profile annotations. Leveraging reinforcement learning techniques and diverse reward mechanisms, ABLE trains a model to generate responses aligned with individual user profiles while maintaining appropriate levels of politeness and empathy. Based on rigorous empirical analysis encompassing automatic and human evaluation metrics based on persona-consistency, politeness accuracy, empathy accuracy, perplexity, and conversation coherence, the efficacy of ABLE is assessed. 
Our findings underscore ABLE’s success in delivering tailored support to individuals grappling with physical disabilities. To the best of our knowledge, this is the very first attempt towards building a user’s persona-oriented physical disability support system. 2024.emnlp-main.1252 @@ -17301,7 +17301,7 @@ TaeyoonKwonYonsei University JiwanChung YoungjaeYuYonsei University - JinyoungYeoYonsei University + JinyoungYeoYonsei University 22471-22502 Algorithmic reasoning tasks that involve complex logical patterns, such as completing Dyck language, pose challenges for large language models (LLMs), despite their recent success. Prior work has used LLMs to generate programming language and applied external compilers for such tasks. Yet, when on the fly, it is hard to generate an executable code with the correct logic for the solution. Even so, code for one instance cannot be reused for others, although they might require the same logic to solve. We present Think-and-Execute, a novel framework that improves LLMs’ algorithmic reasoning: (1) In Think, we discover task-level logic shared across all instances, and express such logic with pseudocode; (2) In Execute, we tailor the task-level pseudocode to each instance and simulate the execution of it. Think-and-Execute outperforms several strong baselines (including CoT and PoT) in diverse algorithmic reasoning tasks. We manifest the advantage of using task-level pseudocode over generating instance-specific solutions one by one. Also, we show that pseudocode can better improve LMs’ reasoning than natural language (NL) guidance, even though they are trained with NL instructions. 2024.emnlp-main.1253 @@ -17314,13 +17314,13 @@ HyungjooChae TaeyoonKwonYonsei University SeungjunMoonYonsei University - YonghoSongYonsei University + YonghoSongYonsei University DongjinKangYonsei University Kai Tzu-iunnOng Beong-wooKwakYonsei University SeonghyeonBaeYonsei University Seung-wonHwangSeoul National University - JinyoungYeoYonsei University + JinyoungYeoYonsei University 22503-22524 This paper presents Coffee-Gym, a comprehensive RL environment for training models that provide feedback on code editing. Coffee-Gym includes two major components: (1) Coffee, a dataset containing humans’ code edit traces for coding questions and human-written feedback for editing erroneous code; (2) CoffeeEval, a reward function that faithfully reflects the helpfulness of feedback by assessing the performance of the revised code in unit tests. With them, Coffee-Gym addresses the unavailability of high-quality datasets for training feedback models with RL, and provides more accurate rewards than the SOTA reward model (i.e., GPT-4). By applying Coffee-Gym, we elicit feedback models that outperform baselines in enhancing open-source code LLMs’ code editing, making them comparable with closed-source LLMs. We make the dataset and the model checkpoint publicly available in https://huggingface.co/spaces/Coffee-Gym/Project-Coffee-Gym. 
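To make the unit-test-based reward described in the Coffee-Gym abstract above concrete, here is a minimal sketch in the spirit of CoffeeEval; the function name, scoring rule, and toy test cases are our illustrative assumptions, not the project's actual code:

    def unit_test_reward(revised_code, test_cases):
        # Score revised code by the fraction of unit tests it passes.
        namespace = {}
        try:
            exec(revised_code, namespace)  # define the candidate function(s)
        except Exception:
            return 0.0  # code that does not even run earns no reward
        passed = 0
        for call, expected in test_cases:
            try:
                if eval(call, namespace) == expected:
                    passed += 1
            except Exception:
                pass  # a crashing test counts as a failure
        return passed / len(test_cases)

    # Example: feedback that produced a correct edit of a buggy `add` scores 1.0.
    print(unit_test_reward("def add(a, b):\n    return a + b",
                           [("add(1, 2)", 3), ("add(-1, 1)", 0)]))

A reward of this shape is what allows the feedback model to be trained with RL: the more helpful the feedback, the more tests the revised code passes.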
2024.emnlp-main.1254 @@ -17344,8 +17344,8 @@ Deciphering Cognitive Distortions in Patient-Doctor Mental Health Conversations: A Multimodal <fixed-case>LLM</fixed-case>-Based Detection and Reasoning Framework Gopendra VikramSingh Sai VardhanVemulapalli - MauajamaFirdaus - AsifEkbalIndian Institute of Technology, Jodhpur + MauajamaFirdaus + AsifEkbalIndian Institute of Technology, Jodhpur 22546-22570 Cognitive distortion research holds increasing significance as it sheds light on pervasive errors in thinking patterns, providing crucial insights into mental health challenges and fostering the development of targeted interventions and therapies. This paper delves into the complex domain of cognitive distortions which are prevalent distortions in cognitive processes often associated with mental health issues. Focusing on patient-doctor dialogues, we introduce a pioneering method for detecting and reasoning about cognitive distortions utilizing Large Language Models (LLMs). Operating within a multimodal context encompassing audio, video, and textual data, our approach underscores the critical importance of integrating diverse modalities for a comprehensive understanding of cognitive distortions. By leveraging multimodal information, including audio, video, and textual data, our method offers a nuanced perspective that enhances the accuracy and depth of cognitive distortion detection and reasoning in a zero-shot manner. Our proposed hierarchical framework adeptly tackles both detection and reasoning tasks, showcasing significant performance enhancements compared to current methodologies. Through comprehensive analysis, we elucidate the efficacy of our approach, offering promising insights into the diagnosis and understanding of cognitive distortions in multimodal settings.The code and dataset can be found here: https://github.com/clang1234/ZS-CoDR.git 2024.emnlp-main.1256 @@ -17356,11 +17356,11 @@ Nearest Neighbor Normalization Improves Multimodal Retrieval - NeilChowdhuryMassachusetts Institute of Technology + NeilChowdhuryMassachusetts Institute of Technology FranklinWangMassachusetts Institute of Technology SumedhShenoyMassachusetts Institute of Technology DouweKielaStanford University - SarahSchwettmannMassachusetts Institute of Technology + SarahSchwettmannMassachusetts Institute of Technology TristanThrushStanford University 22571-22582 Multimodal models leverage large-scale pretraining to achieve strong but still imperfect performance on tasks such as image captioning, visual question answering, and cross-modal retrieval. In this paper, we present a simple and efficient method for correcting errors in trained contrastive image-text retrieval models with no additional training, called Nearest Neighbor Normalization (NNN). We show an improvement on retrieval metrics in both text retrieval and image retrieval for all of the contrastive models that we tested (CLIP, BLIP, ALBEF, SigLIP, BEiT) and for both of the datasets that we used (MS-COCO and Flickr30k). NNN requires a reference database, but does not require any training on this database, and can even increase the retrieval accuracy of a model after finetuning. 
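The Nearest Neighbor Normalization correction described in the abstract above reduces to a per-candidate bias subtraction, which fits in a few lines. A compact sketch, where random unit vectors stand in for real image/text embeddings, and k and alpha are illustrative hyperparameters rather than the paper's settings:

    import numpy as np

    def nnn_scores(query_emb, gallery_embs, reference_embs, k=16, alpha=0.75):
        raw = gallery_embs @ query_emb              # raw query-candidate similarities
        ref_sims = gallery_embs @ reference_embs.T  # similarities to reference queries
        bias = np.sort(ref_sims, axis=1)[:, -k:].mean(axis=1)  # top-k mean per candidate
        return raw - alpha * bias                   # debiased retrieval scores

    rng = np.random.default_rng(0)
    gallery = rng.normal(size=(100, 64))
    gallery /= np.linalg.norm(gallery, axis=1, keepdims=True)
    reference = rng.normal(size=(32, 64))
    reference /= np.linalg.norm(reference, axis=1, keepdims=True)
    query = gallery[0] + 0.1 * rng.normal(size=64)
    query /= np.linalg.norm(query)
    print(np.argsort(-nnn_scores(query, gallery, reference, k=8))[:5])

Because the bias term depends only on the candidate and the reference set, it can be precomputed once, which is what makes the correction training-free.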
@@ -17384,10 +17384,10 @@ <fixed-case>L</fixed-case>ong<fixed-case>RAG</fixed-case>: A Dual-Perspective Retrieval-Augmented Generation Paradigm for Long-Context Question Answering QingfeiZhaoUniversity of the Chinese Academy of Sciences and Institute of Information Engineering, Chinese Academy of Sciences RuobingWangUniversity of the Chinese Academy of Sciences and Institute of Information Engineering, Chinese Academy of Sciences - YukuoCenZhipu AI + YukuoCenZhipu AI DarenZhaInstitute of Information Engineering, Chinese Academy of Sciences ShichengTan - YuxiaoDongTsinghua University + YuxiaoDongTsinghua University JieTangTsinghua University, Tsinghua University 22600-22632 Long-Context Question Answering (LCQA), a challenging task, aims to reason over long-context documents to yield accurate answers to questions. Existing long-context Large Language Models (LLMs) for LCQA often struggle with the “lost in the middle” issue. Retrieval-Augmented Generation (RAG) mitigates this issue by providing external factual evidence. However, its chunking strategy disrupts the global long-context information, and its low-quality retrieval in long contexts hinders LLMs from identifying effective factual details due to substantial noise. To this end, we propose LongRAG, a general, dual-perspective, and robust LLM-based RAG system paradigm for LCQA to enhance RAG’s understanding of complex long-context knowledge (i.e., global information and factual details). We design LongRAG as a plug-and-play paradigm, facilitating adaptation to various domains and LLMs. Extensive experiments on three multi-hop datasets demonstrate that LongRAG significantly outperforms long-context LLMs (up by 6.94%), advanced RAG (up by 6.16%), and Vanilla RAG (up by 17.25%). Furthermore, we conduct quantitative ablation studies and multi-dimensional analyses, highlighting the effectiveness of the system’s components and fine-tuning strategies.Data and code are available at [https://github.com/QingFei1/LongRAG](https://github.com/QingFei1/LongRAG). @@ -17402,7 +17402,7 @@ YipingSong TianlunLiu LiangDing - DongshengLi + DongshengLi 22633-22646 Watermarking enables people to determine whether the text is generated by a specific model. It injects a unique signature based on the “green-red” list that can be tracked during detection, where the words in green lists are encouraged to be generated. Recent researchers propose to fix the green/red lists or increase the proportion of green tokens to defend against paraphrasing attacks. However, these methods cause degradation of text quality due to semantic disparities between the watermarked text and the unwatermarked text. In this paper, we propose a semantic-aware watermark method that considers contexts to generate a semantic-aware key to split a semantically balanced green/red list for watermark injection. The semantic balanced list reduces the performance drop due to adding bias on green lists. To defend against paraphrasing attacks, we generate the watermark key considering the semantics of contexts via locally sensitive hashing. To improve the text quality, we propose to split green/red lists considering semantics to enable the green list to cover almost all semantics. We also dynamically adapt the bias to balance text quality and robustness. The experiments show our advantages in both robustness and text quality comparable to existing baselines. 
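The green/red-list mechanism that the watermarking abstract above builds on can be sketched compactly. The exact-token context hash below is a simplification of the paper's locality-sensitive semantic key, and gamma (green-list fraction) and delta (logit bias) are illustrative values:

    import hashlib
    import numpy as np

    def greenlist_bias(context_tokens, vocab_size, gamma=0.5, delta=2.0):
        # Seed a vocabulary split from the generation context.
        key = hashlib.sha256(" ".join(context_tokens).encode()).digest()
        rng = np.random.default_rng(int.from_bytes(key[:8], "little"))
        green = rng.permutation(vocab_size)[: int(gamma * vocab_size)]
        bias = np.zeros(vocab_size)
        bias[green] = delta  # added to the logits before sampling
        return bias, set(green.tolist())

    bias, green = greenlist_bias(["the", "cat"], vocab_size=50)
    print(len(green), bias.sum())  # 25 green tokens, total bias 50.0

Detection then counts how many generated tokens fall in the green list; replacing the hash input with a locality-sensitive hash of the context's semantics, as the paper does, keeps the key stable under paraphrasing.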
2024.emnlp-main.1260 @@ -17411,9 +17411,9 @@ Knowledge Graph Enhanced Large Language Model Editing - MengqiZhangShandong University + MengqiZhangShandong University XiaotianYe - QiangLiuInstitute of Automation, Chinese Academy of Sciences + QiangLiuInstitute of Automation, Chinese Academy of Sciences PengjieRenShandong University ShuWuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences ZhuminChenShandong University @@ -17431,7 +17431,7 @@ MohitSahu VardhanGacche TirthankarGhosalOak Ridge National Laboratory - AsifEkbalIndian Institute of Technology, Jodhpur + AsifEkbalIndian Institute of Technology, Jodhpur 22663-22679 The integrity of the peer-review process is vital for maintaining scientific rigor and trust within the academic community. With the steady increase in the usage of large language models (LLMs) like ChatGPT in academic writing, there is a growing concern that AI-generated texts could compromise the scientific publishing including peer-reviews. Previous works have focused on generic AI-generated text detection or have presented an approach for estimating the fraction of peer-reviews that can be AI-generated. Our focus here is to solve a real-world problem by assisting the editor or chair in determining whether a review is written by ChatGPT or not. To address this, we introduce the Term Frequency (TF) model, which posits that AI often repeats tokens, and the Review Regeneration (RR) model which is based on the idea that ChatGPT generates similar outputs upon re-prompting. We stress test these detectors against token attack and paraphrasing. Finally we propose an effective defensive strategy to reduce the effect of paraphrasing on our models. Our findings suggest both our proposed methods perform better than other AI text detectors. Our RR model is more robust, although our TF model performs better than the RR model without any attacks. We make our code, dataset, model public. 2024.emnlp-main.1262 @@ -17442,8 +17442,8 @@ Mitigating Open-Vocabulary Caption Hallucinations AssafBen-KishTel Aviv University MoranYanukaTel Aviv University - MorrisAlperTel Aviv University - RajaGiryesTel Aviv University + MorrisAlperTel Aviv University + RajaGiryesTel Aviv University HadarAverbuch-ElorTel Aviv University and Cornell University 22680-22698 While recent years have seen rapid progress in image-conditioned text generation, image captioning still suffers from the fundamental issue of hallucinations, namely, the generation of spurious details that cannot be inferred from the given image. Existing methods largely use closed-vocabulary object lists to mitigate or evaluate hallucinations in image captioning, ignoring the long-tailed nature of hallucinations that occur in practice. To this end, we propose a framework for addressing hallucinations in image captioning in the open-vocabulary setting. Our framework includes a new benchmark, OpenCHAIR, that leverages generative foundation models to evaluate open-vocabulary object hallucinations for image captioning, surpassing the popular and similarly-sized CHAIR benchmark in both diversity and accuracy. Furthermore, to mitigate open-vocabulary hallucinations without using a closed object list, we propose MOCHa, an approach harnessing advancements in reinforcement learning. Our multi-objective reward function explicitly targets the trade-off between fidelity and adequacy in generations without requiring any strong supervision. 
MOCHa improves a large variety of image captioning models, as captured by our OpenCHAIR benchmark and other existing metrics. We will release our code and models.
 2024.emnlp-main.1263
@@ -17455,7 +17455,7 @@
 Initialization of Large Language Models via Reparameterization to Mitigate Loss Spikes
 KosukeNishidaNTT
- KyosukeNishidaNTT corporation
+ KyosukeNishidaNTT corporation
 KunikoSaitoNTT
 22699-22714
 Loss spikes, a phenomenon in which the loss value diverges suddenly, are a fundamental issue in the pre-training of large language models. This paper posits that the non-uniformity of the norm of the parameters is one of the causes of loss spikes. Here, in the training of neural networks, the scale of the gradients is required to be kept constant throughout the layers to avoid the vanishing and exploding gradients problem. However, to meet these requirements in the Transformer model, the norm of the model parameters must be non-uniform, and thus, parameters whose norm is smaller are more sensitive to the parameter update. To address this issue, we propose a novel technique, weight scaling as reparameterization (WeSaR). WeSaR introduces a gate parameter per parameter matrix and adjusts it to the value satisfying the requirements. Because of the gate parameter, WeSaR sets the norm of the original parameters uniformly, which results in stable training. Experimental results with the Transformer decoders consisting of 130 million, 1.3 billion, and 13 billion parameters showed that WeSaR stabilizes and accelerates training and that it outperformed the compared methods, including popular initialization methods.
 2024.emnlp-main.1264
@@ -17466,7 +17466,7 @@
 <fixed-case>ALVIN</fixed-case>: Active Learning Via <fixed-case>IN</fixed-case>terpolation
 MichalisKorakakis
- AndreasVlachosUniversity of Cambridge
+ AndreasVlachosUniversity of Cambridge
 AdrianWellerAlan Turing Institute and University of Cambridge
 22715-22728
 Active Learning aims to minimize annotation effort by selecting the most useful instances from a pool of unlabeled data. However, typical active learning methods overlook the presence of distinct example groups within a class, whose prevalence may vary, e.g., in occupation classification datasets certain demographics are disproportionately represented in specific classes. This oversight causes models to rely on shortcuts for predictions, i.e., spurious correlations between input attributes and labels occurring in well-represented groups. To address this issue, we propose Active Learning Via INterpolation (ALVIN), which conducts intra-class interpolations between examples from under-represented and well-represented groups to create anchors, i.e., artificial points situated between the example groups in the representation space. By selecting instances close to the anchors for annotation, ALVIN identifies informative examples exposing the model to regions of the representation space that counteract the influence of shortcuts. Crucially, since the model considers these examples to be of high certainty, they are likely to be ignored by typical active learning methods. Experimental results on six datasets encompassing sentiment analysis, natural language inference, and paraphrase detection demonstrate that ALVIN outperforms state-of-the-art active learning methods in both in-distribution and out-of-distribution generalization.
 2024.emnlp-main.1265
@@ -17489,7 +17489,7 @@
 Instruction Fine-Tuning: Does Prompt Loss Matter?
- MathewHuerta-Enochian + MathewHuerta-Enochian Seung YongKo 22771-22795 We present a novel study analyzing the effects of various prompt loss token weights (PLW) for supervised instruction fine-tuning (SIFT). While prompt-masking (PLW = 0) is common for SIFT, some fine-tuning APIs support fractional PLWs and suggest that using a small non-zero PLW can help stabilize learning when fine-tuning on short-completion data. However, there has never been a study confirming this claim, and OpenAI, a major cloud-based SIFT provider, recently removed this parameter from their fine-tuning API. We found that performance of models fine-tuned on short-completion data had a statistically-significant negative quadratic relationship with PLW. Using small values (0.01 − 0.5) of PLW produced better results on multiple-choice and short-generation benchmarks (outperforming models fine-tuned on long-completion data) while large values (≈ 1.0) of PLW produced better results on long-generation benchmarks. We explained this effect and verified its importance through additional experiments. This research serves as a warning to API providers about the importance of providing a PLW parameter for SIFT. @@ -17500,9 +17500,9 @@ Entity Insertion in Multilingual Linked Corpora: The Case of <fixed-case>W</fixed-case>ikipedia TomásFeith - AkhilAroraSwiss Federal Institute of Technology Lausanne - MartinGerlachWikimedia Foundation - DebjitPaulEPFL - EPF Lausanne + AkhilAroraSwiss Federal Institute of Technology Lausanne + MartinGerlachWikimedia Foundation + DebjitPaulEPFL - EPF Lausanne RobertWestEPFL - EPF Lausanne 22796-22819 Links are a fundamental part of information networks, turning isolated pieces of knowledge into a network of information that is much richer than the sum of its parts. However, adding a new link to the network is not trivial: it requires not only the identification of a suitable pair of source and target entities but also the understanding of the content of the source to locate a suitable position for the link in the text. The latter problem has not been addressed effectively, particularly in the absence of text spans in the source that could serve as anchors to insert a link to the target entity. To bridge this gap, we introduce and operationalize the task of entity insertion in information networks. Focusing on the case of Wikipedia, we empirically show that this problem is, both, relevant and challenging for editors. We compile a benchmark dataset in 105 languages and develop a framework for entity insertion called LocEI (Localized Entity Insertion) and its multilingual variant XLocEI. We show that XLocEI outperforms all baseline models (including state-of-the-art prompt-based ranking with LLMs such as GPT-4) and that it can be applied in a zero-shot manner on languages not seen during training with minimal performance drop. These findings are important for applying entity insertion models in practice, e.g., to support editors in adding links across the more than 300 language versions of Wikipedia. @@ -17540,7 +17540,7 @@ ZhengranZengPeking University WeiYePeking University JindongWangMicrosoft Research - YueZhangWestlake University + YueZhangWestlake University ShikunZhangPeking University 1-13 The rapid growth of evaluation methodologies and datasets for large language models (LLMs) has created a pressing need for their unified integration. 
Meanwhile, concerns about data contamination and bias compromise the trustworthiness of evaluation findings, while the efficiency of evaluation processes remains a bottleneck due to the significant computational costs associated with LLM inference. In response to these challenges, we introduce FreeEval, a modular framework not only for conducting trustworthy and efficient automatic evaluations of LLMs but also serving as a platform to develop and validate new evaluation methodologies. FreeEval addresses key challenges through: (1) unified abstractions that simplify the integration of diverse evaluation methods, including dynamic evaluations requiring complex LLM interactions; (2) built-in meta-evaluation techniques such as data contamination detection and human evaluation to enhance result fairness; (3) a high-performance infrastructure with distributed computation and caching strategies for efficient large-scale evaluations; and (4) an interactive Visualizer for result analysis and interpretation to support innovation of evaluation techniques. We open-source all our code at https://github.com/WisdomShell/FreeEval and our demonstration video, live demo, and installation guides are available at: https://freeeval.zhuohao.me/.
 2024.emnlp-demo.1
@@ -17574,7 +17574,7 @@

 DahyunKimUpstage AI Research
 YunsuKimUpstage
 YungiKimUpstage
- ChanjunParkUpstage
+ ChanjunParkUpstage
 25-33
 This paper introduces Evalverse, a novel library that streamlines the evaluation of Large Language Models (LLMs) by unifying disparate evaluation tools into a single, user-friendly framework. Evalverse enables individuals with limited knowledge of artificial intelligence to easily request LLM evaluations and receive detailed reports, facilitated by an integration with communication platforms like Slack. Thus, Evalverse serves as a powerful tool for the comprehensive assessment of LLMs, offering both researchers and practitioners a centralized and easily accessible evaluation framework. Finally, we also provide a demo video for Evalverse, showcasing its capabilities and implementation in a two-minute format.
 2024.emnlp-demo.3
@@ -17588,9 +17588,9 @@

 ZhenyuLiu
 JifangWang
 DongfangLiHarbin Institute of Technology
- YibinChen
- BaotianHuHarbin Institute of Technology, Shenzhen
- MinZhangHarbin Institute of Technology
+ YibinChen
+ BaotianHuHarbin Institute of Technology, Shenzhen
+ MinZhangHarbin Institute of Technology
 34-45
 As we all know, hallucinations prevail in Large Language Models (LLMs), where the generated content is coherent but factually incorrect, which inflicts a heavy blow on the widespread application of LLMs. Previous studies have shown that LLMs could confidently state non-existent facts rather than answering “I don’t know”. Therefore, it is necessary to resort to external knowledge to detect and correct the hallucinated content. Since manual detection and correction of factual errors is labor-intensive, developing an automatic end-to-end hallucination-checking approach is much needed. To this end, we present Medico, a Multi-source evidence fusion enhanced hallucination detection and correction framework. It fuses diverse evidence from multiple sources, detects whether the generated content contains factual errors, provides the rationale behind the judgment, and iteratively revises the hallucinated content. Experimental results on evidence retrieval (0.964 HR@5, 0.908 MRR@5), hallucination detection (0.927-0.951 F1), and hallucination correction (0.973-0.979 approval rate) manifest the great potential of Medico.
A video demo of Medico can be found at https://youtu.be/RtsO6CSesBI.
 2024.emnlp-demo.4
@@ -17599,11 +17599,11 @@
 <fixed-case>O</fixed-case>pen<fixed-case>O</fixed-case>mni: A Collaborative Open Source Tool for Building Future-Ready Multimodal Conversational Agents
- QiangSunUniversity of Western Australia
- YuanyiLuo
+ QiangSunUniversity of Western Australia
+ YuanyiLuo
 SiruiLi
- WenxiaoZhang
- WeiLiuUniversity of Western Australia
+ WenxiaoZhang
+ WeiLiuUniversity of Western Australia
 46-52
 Multimodal conversational agents are highly desirable because they offer natural and human-like interaction. However, there is a lack of comprehensive end-to-end solutions to support collaborative development and benchmarking. While proprietary systems like GPT-4o and Gemini demonstrate impressive integration of audio, video, and text with response times of 200-250ms, challenges remain in balancing latency, accuracy, cost, and data privacy. To better understand and quantify these issues, we developed OpenOmni, an open-source, end-to-end pipeline benchmarking tool that integrates advanced technologies such as Speech-to-Text, Emotion Detection, Retrieval Augmented Generation, and Large Language Models, along with the ability to integrate customized models. OpenOmni supports local and cloud deployment, ensuring data privacy and supporting latency and accuracy benchmarking. This flexible framework allows researchers to customize the pipeline, focusing on real bottlenecks and facilitating rapid proof-of-concept development. OpenOmni can significantly enhance applications like indoor assistance for visually impaired individuals, advancing human-computer interaction. Our demonstration video is available at https://www.youtube.com/watch?v=zaSiT3clWqY, the demo is available via https://openomni.ai4wa.com, and the code is available via https://github.com/AI4WA/OpenOmniFramework.
 2024.emnlp-demo.5
@@ -17627,16 +17627,16 @@
 <fixed-case>M</fixed-case>ark<fixed-case>LLM</fixed-case>: An Open-Source Toolkit for <fixed-case>LLM</fixed-case> Watermarking
 LeyiPan
 AiweiLiuTsinghua University, Tsinghua University
- ZhiweiHeShanghai Jiao Tong University
+ ZhiweiHeShanghai Jiao Tong University
 ZitianGao
 XuandongZhaoUniversity of California, Berkeley
 YijianLu
 BinglinZhouShanghai Jiaotong University
 ShuliangLiu
- XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology
+ XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology
 LijieWenTsinghua University
- IrwinKingThe Chinese University of Hong Kong
- Philip S.YuUniversity of Illinois, Chicago
+ IrwinKingThe Chinese University of Hong Kong
+ Philip S.YuUniversity of Illinois, Chicago
 61-71
 Watermarking for Large Language Models (LLMs), which embeds imperceptible yet algorithmically detectable signals in model outputs to identify LLM-generated text, has become crucial in mitigating the potential misuse of LLMs. However, the abundance of LLM watermarking algorithms, their intricate mechanisms, and the complex evaluation procedures and perspectives pose challenges for researchers and the community to easily understand, implement and evaluate the latest advancements. To address these issues, we introduce MarkLLM, an open-source toolkit for LLM watermarking. MarkLLM offers a unified and extensible framework for implementing LLM watermarking algorithms, while providing user-friendly interfaces to ensure ease of access.
Furthermore, it enhances understanding by supporting automatic visualization of the underlying mechanisms of these algorithms. For evaluation, MarkLLM offers a comprehensive suite of 12 tools spanning three perspectives, along with two types of automated evaluation pipelines. Through MarkLLM, we aim to support researchers while improving the comprehension and involvement of the general public in LLM watermarking technology, fostering consensus and driving further advancements in research and application. Our code is available at https://github.com/THU-BPM/MarkLLM. 2024.emnlp-demo.7 @@ -17649,8 +17649,8 @@ JingyaChenMicrosoft GaganBansalMicrosoft SuffSyed - AdamFourney - ErkangZhuMicrosoft + AdamFourney + ErkangZhuMicrosoft ChiWangGoogle DeepMind SaleemaAmershi 72-79 @@ -17663,13 +17663,13 @@ <fixed-case>T</fixed-case>iny<fixed-case>A</fixed-case>gent: Function Calling at the Edge Lutfi ErenErdogan NicholasLeeUniversity of California, Berkeley - SiddharthJha + SiddharthJha SehoonKimUniversity of California Berkeley RyanTabrizi SuhongMoon Coleman Richard CharlesHooper GopalaAnumanchipalliUniversity of California, Berkeley - KurtKeutzerUniversity of California Berkeley + KurtKeutzerUniversity of California Berkeley AmirGholamiUniversity of California Berkeley 80-88 Recent large language models (LLMs) have enabled the development of advanced agentic systems that can integrate various tools and APIs to fulfill user queries through function calling. However, the deployment of these LLMs on the edge has not been explored since they typically require cloud-based infrastructure due to their substantial model size and computational demands. To this end, we present TinyAgent, an end-to-end framework for training and deploying task-specific small language model agents capable of function calling for driving agentic systems at the edge. We first show how to enable accurate function calling for open-source models via the LLMCompiler framework. We then systematically curate a high-quality dataset for function calling, which we use to fine-tune two small language models, TinyAgent-1.1B and 7B. For efficient inference, we introduce a novel tool retrieval method to reduce the input prompt length and utilize quantization to further accelerate the inference speed. As a driving application, we demonstrate a local Siri-like system for Apple’s MacBook that can execute user commands through text or voice input. Our results show that our models can achieve, and even surpass, the function-calling capabilities of larger models like GPT-4-Turbo, while being fully deployed at the edge. We open-source our [dataset, models, and installable package](https://github.com/SqueezeAILab/TinyAgent) and provide a [demo video](https://www.youtube.com/watch?v=0GvaGL9IDpQ) for our MacBook assistant agent. @@ -17684,7 +17684,7 @@ DongfangLiHarbin Institute of Technology XinshuoHu ZetianSun - BaotianHuHarbin Institute of Technology, Shenzhen + BaotianHuHarbin Institute of Technology, Shenzhen ShaolinYeTencent ZifeiShanWeChat, Tencent QianChen @@ -17700,8 +17700,8 @@ RajveeSheth ShubhNisar HeenabenPrajapati - HimanshuBeniwalIndian Institute of Technology Gandhinagar - MayankSinghIndian Institute of Technology Gandhinagar + HimanshuBeniwalIndian Institute of Technology Gandhinagar + MayankSinghIndian Institute of Technology Gandhinagar 101-109 As the NLP community increasingly addresses challenges associated with multilingualism, robust annotation tools are essential to handle multilingual datasets efficiently. 
In this paper, we introduce a code-mixed multilingual text annotation framework, COMMENTATOR, specifically designed for annotating code-mixed text. The tool demonstrates its effectiveness in token-level and sentence-level language annotation tasks for Hinglish text. We perform robust qualitative human-based evaluations to showcase that COMMENTATOR led to 5x faster annotations than the best baseline.
 2024.emnlp-demo.11
@@ -17724,7 +17724,7 @@
 GuanyuLin
 TaoFeng
 PengruiHanCalifornia Institute of Technology and University of Illinois at Urbana-Champaign
- GeLiuUniversity of Illinois at Urbana-Champaign, University of Washington and Amazon AWS AI
+ GeLiuUniversity of Illinois at Urbana-Champaign, University of Washington and Amazon AWS AI
 JiaxuanYouUniversity of Illinois at Urbana-Champaign
 122-130
 As scientific research proliferates, researchers face the daunting task of navigating and reading vast amounts of literature. Existing solutions, such as document QA, fail to provide personalized and up-to-date information efficiently. We present Arxiv Copilot, a self-evolving, efficient LLM system designed to assist researchers, based on thought-retrieval, user profiles, and high-performance optimization. Specifically, Arxiv Copilot can offer personalized research services, maintaining a real-time updated database. Quantitative evaluation demonstrates that Arxiv Copilot saves 69.92% of time after efficient deployment. This paper details the design and implementation of Arxiv Copilot, highlighting its contributions to personalized academic support and its potential to streamline the research process. We have deployed Arxiv Copilot at: https://huggingface.co/spaces/ulab-ai/ArxivCopilot.
 2024.emnlp-demo.13
@@ -17735,8 +17735,8 @@
 <fixed-case>T</fixed-case>rans<fixed-case>A</fixed-case>gents: Build Your Translation Company with Language Agents
 MinghaoWu
- JiahaoXu
- LongyueWang
+ JiahaoXu
+ LongyueWang
 131-141
 Multi-agent systems empowered by large language models (LLMs) have demonstrated remarkable capabilities in a wide range of downstream applications. In this work, we introduce TransAgents, a novel multi-agent translation system inspired by human translation companies. TransAgents employs specialized agents—Senior Editor, Junior Editor, Translator, Localization Specialist, and Proofreader—to collaboratively produce translations that are accurate, culturally sensitive, and of high quality. Our system is flexible, allowing users to configure their translation company based on specific needs, and universal, with empirical evidence showing superior performance across various domains compared to state-of-the-art methods. Additionally, TransAgents features a user-friendly interface and offers translations at a cost approximately 80\times cheaper than professional human translation services. Evaluations on literary, legal, and financial test sets demonstrate that TransAgents produces translations preferred by human evaluators, even surpassing human-written references in literary contexts. Our live demo website is available at https://www.transagents.ai/. Our demonstration video is available at https://www.youtube.com/watch?v=p7jIAtF-WKc.
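The staged, role-based pipeline in the TransAgents abstract above can be outlined as a loop over role prompts. The `chat` helper is a hypothetical stand-in for any LLM client (here it simply echoes the draft so the sketch runs), and the role prompts are ours, not the system's:

    ROLES = ["Translator", "Localization Specialist", "Proofreader", "Senior Editor"]

    def chat(role, prompt):
        # Stand-in for an LLM call; a real system would query a model here.
        return prompt.rsplit(":\n", 1)[-1]

    def translate(source_text, tgt_lang="Chinese"):
        draft = source_text
        for role in ROLES:
            draft = chat(role, f"As the {role}, revise this {tgt_lang} draft for "
                               f"accuracy, cultural fit, and style:\n{draft}")
        return draft

    print(translate("Hello, world."))

Each agent sees only the running draft plus its role instructions, which is what makes the company-style configuration easy to swap per user need.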
2024.emnlp-demo.14 @@ -17748,9 +17748,9 @@ Musa IzzanardiWijanarkoMonash University LuckySusanto Prasetia AnugrahPratama - Ika KarlinaIdrisMonash University - TraciHongBoston University, Boston University - Derry TantiWijayaMonash University and Boston University + Ika KarlinaIdrisMonash University + TraciHongBoston University, Boston University + Derry TantiWijayaMonash University and Boston University 142-152 Online hate speech propagation is a complex issue, deeply influenced by both the perpetrator and the target’s cultural, historical, and societal contexts. Consequently, developing a universally robust hate speech classifier for diverse social media texts remains a challenging and unsolved task. The lack of mechanisms to track the spread and severity of hate speech further complicates the formulation of effective solutions. In response to this, to monitor hate speech in Indonesia during the recent 2024 presidential election, we have employed advanced Natural Language Processing (NLP) technologies to create an improved hate speech classifier tailored for a narrower subset of texts; specifically, texts that target vulnerable groups that have historically been the targets of hate speech in Indonesia. Our focus is on texts that mention these six vulnerable minority groups in Indonesia: Shia, Ahmadiyyah, Christians, LGBTQ+, Indonesian Chinese, and people with disabilities, as well as one additional group of interest: Jews. The insights gained from our dashboard have assisted stakeholders in devising more effective strategies to counteract hate speech. Notably, our dashboard has persuaded the General Election Supervisory Body in Indonesia (BAWASLU) to collaborate with our institution and the Alliance of Independent Journalists (AJI) to monitor social media hate speech in vulnerable areas in the country known for hate speech dissemination or hate-related violence in the upcoming Indonesian regional elections. This dashboard is available online at https://aji.or.id/hate-speech-monitoring. 2024.emnlp-demo.15 @@ -17785,13 +17785,13 @@ ShuhangLin, Rutgers University WenyueHuaRutgers University, New Brunswick LingyaoLi - Che-JuiChangAmazon - LizhouFanBrigham and Women’s Hospital, Harvard University + Che-JuiChangAmazon + LizhouFanBrigham and Women’s Hospital, Harvard University JianchaoJi HangHua - MingyuJinRutgers University - JieboLuoUniversity of Rochester and University of Rochester - YongfengZhangRutgers University + MingyuJinRutgers University + JieboLuoUniversity of Rochester and University of Rochester + YongfengZhangRutgers University 172-181 This paper presents BattleAgent, a detailed emulation demonstration system that combines the Large Vision-Language Model (VLM) and Multi-Agent System (MAS). This novel system aims to emulate complex dynamic interactions among multiple agents, as well as between agents and their environments, over a period of time. The emulation showcases the current capabilities of agents, featuring fine-grained multi-modal interactions between agents and landscapes. It develops customizable agent structures to meet specific situational requirements, for example, a variety of battle-related activities like scouting and trench digging. These components collaborate to recreate historical events in a lively and comprehensive manner. This methodology holds the potential to substantially improve visualization of historical events and deepen our understanding of historical events especially from the perspective of decision making. 
The data and code for this project are accessible at https://github.com/agiresearch/battleagent and the demo is accessible at https://drive.google.com/file/d/1I5B3KWiYCSSP1uMiPGNmXlTmild-MzRJ/view?usp=sharing.
2024.emnlp-demo.18

@@ -17846,12 +17846,12 @@
Jifan Lin
Binjie Wang (Fudan University)
Yun Luo (westlake university)
- Renjie Pan
Yang Xu
Qingkai Min
Zizhao Zhang
Yiwen Wang
- Wenjie Li (The Hong Kong Polytechnic University, The Hong Kong Polytechnic University)
Pengfei Liu
209-218
The rapid growth of scientific literature imposes significant challenges for researchers endeavoring to stay updated with the latest advancements in their fields and delve into new areas. We introduce OpenResearcher, an innovative platform that leverages Artificial Intelligence (AI) techniques to accelerate the research process by answering diverse questions from researchers. OpenResearcher is built based on Retrieval-Augmented Generation (RAG) to integrate Large Language Models (LLMs) with up-to-date, domain-specific knowledge. Moreover, we develop various tools for OpenResearcher to understand researchers’ queries, search from the scientific literature, filter retrieved information, provide accurate and comprehensive answers, and self-refine these answers. OpenResearcher can flexibly use these tools to balance efficiency and effectiveness. As a result, OpenResearcher enables researchers to save time and increase their potential to discover new insights and drive scientific breakthroughs. Demo, video, and code are available at: https://github.com/GAIR-NLP/OpenResearcher.

@@ -17861,13 +17861,13 @@
<fixed-case>O</fixed-case>pen<fixed-case>F</fixed-case>act<fixed-case>C</fixed-case>heck: A Unified Framework for Factuality Evaluation of <fixed-case>LLM</fixed-case>s
- Hasan Iqbal (Mohamed bin Zayed University of Artificial Intelligence)
Yuxia Wang
Minghan Wang (Monash University)
Georgi Nenkov Georgiev
- Jiahui Geng
Iryna Gurevych (Institute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt)
- Preslav Nakov (Mohamed bin Zayed University of Artificial Intelligence)
219-229
The increased use of large language models (LLMs) across a variety of real-world applications calls for automatic tools to check the factual accuracy of their outputs, as LLMs often hallucinate. This is difficult as it requires assessing the factuality of free-form open-domain responses. While there has been a lot of research on this topic, different papers use different evaluation benchmarks and measures, which makes them hard to compare and hampers future progress. To mitigate these issues, we developed OpenFactCheck, a unified framework, with three modules: (i) RESPONSEEVAL, which allows users to easily customize an automatic fact-checking system and to assess the factuality of all claims in an input document using that system, (ii) LLMEVAL, which assesses the overall factuality of an LLM, and (iii) CHECKEREVAL, a module to evaluate automatic fact-checking systems. OpenFactCheck is open-sourced (https://github.com/mbzuai-nlp/openfactcheck) and publicly released as a Python library (https://pypi.org/project/openfactcheck/) and also as a web service (http://app.openfactcheck.com).
A video describing the system is available at https://youtu.be/-i9VKL0HleI.
2024.emnlp-demo.23

@@ -17878,7 +17878,7 @@
<fixed-case>ULLME</fixed-case>: A Unified Framework for Large Language Model Embeddings with Generation-Augmented Learning
Hieu Man (University of Oregon)
Nghia Trung Ngo (University of Oregon)
- Franck Dernoncourt (Adobe Systems)
Thien Huu Nguyen (University of Oregon)
230-239
Large Language Models (LLMs) excel in various natural language processing tasks, but leveraging them for dense passage embedding remains challenging. This is due to their causal attention mechanism and the misalignment between their pre-training objectives and the text ranking tasks. Despite some recent efforts to address these issues, existing frameworks for LLM-based text embeddings have been limited by their support for only a limited range of LLM architectures and fine-tuning strategies, limiting their practical application and versatility. In this work, we introduce the Unified framework for Large Language Model Embedding (ULLME), a flexible, plug-and-play implementation that enables bidirectional attention across various LLMs and supports a range of fine-tuning strategies. We also propose Generation-augmented Representation Learning (GRL), a novel fine-tuning method to boost LLMs for text embedding tasks. GRL enforces consistency between representation-based and generation-based relevance scores, leveraging LLMs’ powerful generative abilities for learning passage embeddings. To showcase our framework’s flexibility and effectiveness, we release three pre-trained models from ULLME with different backbone architectures, ranging from 1.5B to 8B parameters, all of which demonstrate strong performance on the Massive Text Embedding Benchmark. Our framework is publicly available at: https://github.com/nlp-uoregon/ullme. A demo video for ULLME can also be found at https://rb.gy/ws1ile.

@@ -17898,7 +17898,7 @@
Xian Li (Facebook AI)
Justine T Kao
Maryam Fazel-Zarandi (FAIR - Meta)
- Yuandong Tian (Meta AI (FAIR))
240-249
Travel planning is a challenging and time-consuming task that aims to find an itinerary which satisfies multiple, interdependent constraints regarding flights, accommodations, attractions, and other travel arrangements. In this paper, we propose To the Globe (TTG), a real-time demo system that takes natural language requests from users, translates it to symbolic form via a fine-tuned Large Language Model, and produces optimal travel itineraries with Mixed Integer Linear Programming solvers. The overall system takes ~5 seconds to reply to the user request with guaranteed itineraries. To train TTG, we develop a synthetic data pipeline that generates user requests, flight and hotel information in symbolic form without human annotations, based on the statistics of real-world datasets, and fine-tune an LLM to translate NL user requests to their symbolic form, which is sent to the symbolic solver to compute optimal itineraries. Our NL-symbolic translation achieves ~91% exact match in a backtranslation metric (i.e., whether the estimated symbolic form of generated natural language matches the groundtruth), and its returned itineraries have a ratio of 0.979 compared to the optimal cost of the ground truth user request. When evaluated by users, TTG achieves consistently high Net Promoter Scores (NPS) of 35-40% on generated itinerary.
2024.emnlp-demo.25

@@ -17909,7 +17909,7 @@
<fixed-case>MATSA</fixed-case>: Multi-Agent Table Structure Attribution
Puneet Mathur (Adobe Systems)
Alexa Siu (Adobe)
- Nedim Lipka (Adobe Systems)
Tong Sun (Adobe Systems)
250-258
Large Language Models (LLMs) have significantly advanced QA tasks through in-context learning but often suffer from hallucinations. Attributing supporting evidence grounded in source documents has been explored for unstructured text in the past. However, tabular data present unique challenges for attribution due to ambiguities (e.g., abbreviations, domain-specific terms), complex header hierarchies, and the difficulty in interpreting individual table cells without row and column context. We introduce a new task, Fine-grained Structured Table Attribution (FAST-Tab), to generate row and column-level attributions supporting LLM-generated answers. We present MATSA, a novel LLM-based Multi-Agent system capable of post-hoc Table Structure Attribution to help users visually interpret factual claims derived from tables. MATSA augments tabular entities with descriptive context about structure, metadata, and numerical trends to semantically retrieve relevant rows and columns corresponding to facts in an answer. Additionally, we propose TabCite, a diverse benchmark designed to evaluate the FAST-Tab task on tables with complex layouts sourced from Wikipedia and business PDF documents. Extensive experiments demonstrate that MATSA significantly outperforms SOTA baselines on TabCite, achieving an 8-13% improvement in F1 score. Qualitative user studies show that MATSA helps increase user trust in Generative AI by providing enhanced explainability for LLM-assisted table QA and enables professionals to be more productive by saving time on fact-checking LLM-generated answers.
@@ -17938,7 +17938,7 @@
<fixed-case>C</fixed-case>hat<fixed-case>HF</fixed-case>: Collecting Rich Human Feedback from Real-time Conversations
Andrew Li
- Zhenduo Wang
Ethan Mendes (Georgia Institute of Technology)
Duong Minh Le
Wei Xu (Georgia Institute of Technology)

@@ -17951,8 +17951,8 @@
<fixed-case>KM</fixed-case>atrix: A Flexible Heterogeneous Knowledge Enhancement Toolkit for Large Language Model
- Shun Wu (Institute of automation, Chinese academy of science, Chinese Academy of Sciences)
- Di Wu (Institute of automation, Chinese academy of science, Chinese Academy of Sciences)
Kun Luo
XueYou Zhang
Jun Zhao (Institute of automation, Chinese academy of science)

@@ -18020,7 +18020,7 @@
Thamme Gowda (Microsoft Translator)
Roman Grundkiewicz (Microsoft)
Elijah Rippeth
- Matt Post (Microsoft and Johns Hopkins University)
Marcin Junczys-Dowmunt (Microsoft)
328-335
2024.emnlp-demo.34

@@ -18042,17 +18042,17 @@
Jonibek Mansurov
Ekaterina Artemova (Toloka AI)
Vladislav Mikhailov (University of Oslo)
- Rui Xing (Mohamed bin Zayed University of Artificial Intelligence and University of Melbourne)
- Jiahui Geng
- Hasan Iqbal (Mohamed bin Zayed University of Artificial Intelligence)
- Zain Muhammad Mujahid
- Tarek Mahmoud
Akim Tsvigun (Semrush)
Alham Fikri Aji (Mohamed bin Zayed University of Artificial Intelligence and Amazon)
- Artem Shelmanov (Mohamed bin Zayed University of Artificial Intelligence)
- Nizar Habash (New York University Abu Dhabi)
Iryna Gurevych (Institute for Computer Science, Artificial Intelligence and Technology, Mohamed bin Zayed University of Artificial Intelligence and Technische Universität Darmstadt)
- Preslav Nakov (Mohamed bin Zayed University of Artificial Intelligence)
336-343
The ease of access to large language models (LLMs) has enabled a widespread of machine-generated texts, and now it is often hard to tell whether a piece of text was human-written or machine-generated. This raises concerns about potential misuse, particularly within educational and academic domains. Thus, it is important to develop practical systems that can automate the process. Here, we present one such system, LLM-DetectAIve, designed for fine-grained detection. Unlike most previous work on machine-generated text detection, which focused on binary classification, LLM-DetectAIve supports four categories: (i) human-written, (ii) machine-generated, (iii) machine-written, then machine-humanized, and (iv) human-written, then machine-polished. Category (iii) aims to detect attempts to obfuscate the fact that a text was machine-generated, while category (iv) looks for cases where the LLM was used to polish a human-written text, which is typically acceptable in academic writing, but not in education. Our experiments show that LLM-DetectAIve can effectively identify the above four categories, which makes it a potentially useful tool in education, academia, and other domains. LLM-DetectAIve is publicly accessible at https://github.com/mbzuai-nlp/LLM-DetectAIve.
The video describing our system is available at https://youtu.be/E8eT_bE7k8c.
2024.emnlp-demo.35

@@ -18067,7 +18067,7 @@
Wenda Xu
Xi Xu
Siqi Ouyang (CMU, Carnegie Mellon University)
- Lei Li (School of Computer Science, Carnegie Mellon University)
344-350
With the rapid advancement of machine translation research, evaluation toolkits have become essential for benchmarking system progress. Tools like COMET and SacreBLEU offer single quality score assessments that are effective for pairwise system comparisons. However, these tools provide limited insights for fine-grained system-level comparisons and the analysis of instance-level defects. To address these limitations, we introduce Translation Canvas, an explainable interface designed to pinpoint and analyze translation systems’ performance: 1) Translation Canvas assists machine translation researchers in comprehending system-level model performance by identifying common errors (their frequency and severity) and analyzing relationships between different systems based on various evaluation metrics. 2) It supports fine-grained analysis by highlighting error spans with explanations and selectively displaying systems’ predictions. According to human evaluation, Translation Canvas demonstrates superior performance over COMET and SacreBLEU packages under enjoyability and understandability criteria.
2024.emnlp-demo.36

@@ -18076,10 +18076,10 @@
mbrs: A Library for Minimum <fixed-case>B</fixed-case>ayes Risk Decoding
Hiroyuki Deguchi (NTT Communications)
Yusuke Sakai (Nara Institute of Science and Technology, Japan)
- Hidetaka Kamigaito (Nara Institute of Science and Technology)
- Taro Watanabe (Nara Institute of Science and Technology, Japan)
351-362
2024.emnlp-demo.37
deguchi-etal-2024-mbrs

@@ -18102,13 +18102,13 @@
Schema-Guided Culture-Aware Complex Event Simulation with Multi-Agent Role-Play
Sha Li (University of Illinois, Urbana Champaign)
Revanth Gangi Reddy
- Khanh Duy Nguyen
- Qingyun Wang (University of Illinois, Urbana Champaign)
Yi Fung (University of Illinois at Urbana-Champaign)
- Chi Han
Jiawei Han
Kartik Natarajan (Private Sector Humanitarian Alliance)
- Clare R. Voss (ARL)
Heng Ji (University of Illinois, Urbana-Champaign)
372-381
Complex news events, such as natural disasters and socio-political conflicts, require swift responses from the government and society. Relying on historical events to project the future is insufficient as such events are sparse and do not cover all possible conditions and nuanced situations. Simulation of these complex events can help better prepare and reduce the negative impact. We develop a controllable complex news event simulator guided by both the event schema representing domain knowledge about the scenario and user-provided assumptions representing case-specific conditions. As event dynamics depend on the fine-grained social and cultural context, we further introduce a geo-diverse commonsense and cultural norm-aware knowledge enhancement component. To enhance the coherence of the simulation, apart from the global timeline of events, we take an agent-based approach to simulate the individual character states, plans, and actions.
By incorporating the schema and cultural norms, our generated simulations achieve much higher coherence and appropriateness and are received favorably by participants from a humanitarian assistance organization.

@@ -18150,16 +18150,16 @@
<fixed-case>W</fixed-case>alled<fixed-case>E</fixed-case>val: A Comprehensive Safety Evaluation Toolkit for Large Language Models
- Prannaya Gupta
Le Qi Yau
Hao Han Low
- I-Shiang Lee
Hugo Maximus Lim (NUS High School of Math and Science)
Yu Xin Teoh (University of Illinois at Urbana-Champaign)
Koh Jia Hng
Dar Win Liew
- Rishabh Bhardwaj
- Rajat Bhardwaj (Indian Institute of Science Education and Research, Thiruvananthapuram)
Soujanya Poria (Singapore University of Technology and Design)
397-407
WalledEval is a comprehensive AI safety testing toolkit designed to evaluate large language models (LLMs). It accommodates a diverse range of models, including both open-weight and API-based ones, and features over 35 safety benchmarks covering areas such as multilingual safety, exaggerated safety, and prompt injections. The framework supports both LLM and judge benchmarking, and incorporates custom mutators to test safety against various text-style mutations such as future tense and paraphrasing. Additionally, WalledEval introduces WalledGuard, a new, small and performant content moderation tool, and SGXSTest, a benchmark for assessing exaggerated safety in cultural contexts. We make WalledEval publicly available at https://github.com/walledai/walledeval with a demonstration video at https://youtu.be/50Zy97kj1MA.

@@ -18178,10 +18178,10 @@
Zhen Wu (Nanjing University)
Wei Ye (Peking University)
Wenyuan Xu
- Yue Zhang (Westlake University)
Xinyu Dai (Nanjing University)
Shikun Zhang (Peking University)
- Qingsong Wen (Squirrel Ai Learning)
408-418
Large Language Models (LLMs) demonstrate human-level capabilities in dialogue, reasoning, and knowledge retention. However, even the most advanced LLMs face challenges such as hallucinations and real-time updating of their knowledge. Current research addresses this bottleneck by equipping LLMs with external knowledge, a technique known as Retrieval Augmented Generation (RAG). However, two key issues constrained the development of RAG. First, there is a growing lack of comprehensive and fair comparisons between novel RAG algorithms. Second, open-source tools such as LlamaIndex and LangChain employ high-level abstractions, which results in a lack of transparency and limits the ability to develop novel algorithms and evaluation metrics. To close this gap, we introduce RAGLAB, a modular and research-oriented open-source library. RAGLAB reproduces 6 existing algorithms and provides a comprehensive ecosystem for investigating RAG algorithms. Leveraging RAGLAB, we conduct a fair comparison of 6 RAG algorithms across 10 benchmarks. With RAGLAB, researchers can efficiently compare the performance of various algorithms and develop novel algorithms.
2024.emnlp-demo.43

@@ -18206,7 +18206,7 @@
Jiahui Zhou
Xin Mao
Ziqi Jin
- Wei Lu (Singapore University of Technology and Design)
Min Lin (Sea AI Lab)
424-435
We present Sailor, a family of open language models ranging from 0.5B to 14B parameters, tailored for South-East Asian (SEA) languages.
From Qwen1.5, Sailor models accept 200B to 400B tokens during continual pre-training, primarily covering the languages of English, Chinese, Vietnamese, Thai, Indonesian, Malay, and Lao. The training leverages several techniques, including BPE dropout for improving the model robustness, aggressive data cleaning and deduplication, and small proxy models to optimize the data mixture. Experimental results on four typical tasks indicate that Sailor models demonstrate strong performance across different benchmarks, including commonsense reasoning, question answering, reading comprehension and examination. We share our insights to spark a wider interest in developing large language models for multilingual use cases.

@@ -18219,15 +18219,15 @@
Qinyu Luo (Johns Hopkins University and Tsinghua University, Tsinghua University)
Yining Ye
Shihao Liang
- Zhong Zhang (Tsinghua University)
Yujia Qin
Yaxi Lu (Department of Computer Science and Technology, Tsinghua University)
Yesai Wu
Xin Cong
- Yankai Lin (Renmin University of China)
Yingli Zhang
Xiaoyin Che (Siemens Corporate Research)
- Zhiyuan Liu (Tsinghua University)
Maosong Sun
436-464
Generative models have demonstrated considerable potential in software engineering, particularly in tasks such as code generation and debugging. However, their utilization in the domain of code documentation generation remains underexplored. To this end, we introduce RepoAgent, a large language model powered open-source framework aimed at proactively generating, maintaining, and updating code documentation. Through both qualitative and quantitative evaluations, we have validated the effectiveness of our approach, showing that RepoAgent excels in generating high-quality repository-level documentation. The code and results are publicly accessible at https://github.com/OpenBMB/RepoAgent.

@@ -18262,8 +18262,8 @@
Evelina Mironova
Bukashkin Anton
Konstantin Kulikov
- Andrey Kuznetsov (AIRI, Sber and Samara National Research University)
- Denis Dimitrov (AIRI and Sber)
475-485
Text-to-image (T2I) diffusion models are popular for introducing image manipulation methods, such as editing, image fusion, inpainting, etc. At the same time, image-to-video (I2V) and text-to-video (T2V) models are also built on top of T2I models. We present Kandinsky 3, a novel T2I model based on latent diffusion, achieving a high level of quality and photorealism. The key feature of the new architecture is the simplicity and efficiency of its adaptation for many types of generation tasks. We extend the base T2I model for various applications and create a multifunctional generation system that includes text-guided inpainting/outpainting, image fusion, text-image fusion, image variations generation, I2V and T2V generation. We also present a distilled version of the T2I model, evaluating inference in 4 steps of the reverse process without reducing image quality and 3 times faster than the base model. We deployed a user-friendly demo system in which all the features can be tested in the public domain. Additionally, we released the source code and checkpoints for the Kandinsky 3 and extended models. Human evaluations show that Kandinsky 3 demonstrates one of the highest quality scores among open source generation systems.
2024.emnlp-demo.48

@@ -18277,13 +18277,13 @@
Hanminwang Hanminwang
Haoran Wang
Yilun Zhao (Yale University)
- Wenqi Shi (University of Texas Southwestern Medical Center)
Yi Fung (University of Illinois at Urbana-Champaign)
Wangchunshu Zhou (Guangdong OPPO Mobile Telecommunications Corp.,Ltd.)
Jiannan Cao
Heng Ji (University of Illinois, Urbana-Champaign)
Arman Cohan (Yale University and Allen Institute for Artificial Intelligence)
- Mark Gerstein (Yale University)
486-496
Recently, large language models (LLMs) have evolved into interactive agents, proficient in planning, tool use, and task execution across various tasks. However, without agent-tuning, open-source models like LLaMA2 currently struggle to match the efficiency of larger models such as GPT-4 in scientific applications due to a lack of agent tuning datasets. In response, we introduce MIMIR, a streamlined platform that leverages large LLMs to generate agent-tuning data for fine-tuning smaller, specialized models. By employing a role-playing methodology, MIMIR enables larger models to simulate various roles and create interaction data, which can then be used to fine-tune open-source models like LLaMA2. This approach ensures that even smaller models can effectively serve as agents in scientific tasks. Integrating these features into an end-to-end platform, MIMIR facilitates everything from the uploading of scientific data to one-click agent fine-tuning. MIMIR is publicly released and actively maintained at https://github.com/gersteinlab/MIMIR, along with a demo video for quick-start, calling for broader development.
2024.emnlp-demo.49

@@ -18294,7 +18294,7 @@
<fixed-case>W</fixed-case>ild<fixed-case>V</fixed-case>is: Open Source Visualizer for Million-Scale Chat Logs in the Wild
Yuntian Deng (University of Waterloo)
Wenting Zhao (Cornell University)
- Jack Hessel (Samaya AI)
Xiang Ren
Claire Cardie (Cornell University)
Yejin Choi (Department of Computer Science, University of Washington)

@@ -18318,15 +18318,15 @@
<fixed-case>LM</fixed-case>-Interview: An Easy-to-use Smart Interviewer System via Knowledge-guided Language Model Exploitation
- Hanming Li
Jifan Yu
Ruimiao Li (Shanghai Normal University)
- Zhanxin Hao
- Yan Xuan
Jiaxi Yuan
- Bin Xu
- Juanzi Li
- Zhiyuan Liu (Tsinghua University)
520-528
Semi-structured interviews are a crucial method of data acquisition in qualitative research. Typically controlled by the interviewer, the process progresses through a question-and-answer format, aimed at eliciting information from the interviewee. However, interviews are highly time-consuming and demand considerable experience of the interviewers, which greatly limits the efficiency and feasibility of data collection. Therefore, we introduce LM-Interview, a novel system designed to automate the process of preparing, conducting and analyzing semi-structured interviews. Experimental results demonstrate that LM-interview achieves performance comparable to that of skilled human interviewers.
2024.emnlp-demo.52

@@ -18430,7 +18430,7 @@
Xuan Guo
Rohit Patki (Amazon)
Dante Everaert (Amazon)
- Christopher Potts (Stanford University)
73-79
The rapid introduction of new brand names into everyday language poses a unique challenge for e-commerce spelling correction services, which must distinguish genuine misspellings from novel brand names that use unconventional spelling.
We seek to address this challenge via Retrieval Augmented Generation (RAG). On this approach, product names are retrieved from a catalog and incorporated into the context used by a large language model (LLM) that has been fine-tuned to do contextual spelling correction. Through quantitative evaluation and qualitative error analyses, we find improvements in spelling correction utilizing the RAG framework beyond a stand-alone LLM. We also demonstrate the value of additional finetuning of the LLM to incorporate retrieved context.
2024.emnlp-industry.7

@@ -18440,10 +18440,10 @@
Scaling Parameter-Constrained Language Models with Quality Data
Ernie Chang (Meta AI)
- Matteo Paltenghi (Universität Stuttgart)
Yang Li (Iowa State University)
Pin-Jie Lin
- Changsheng Zhao (Meta Inc.)
Patrick Huber (Facebook)
Zechun Liu (Meta Inc.)
Rastislav Rabatin (Facebook)

@@ -18462,34 +18462,34 @@
Masayasu Muraoka (Tokyo Institute of Technology, Tokyo Institute of Technology and IBM Research - Tokyo, International Business Machines)
Muthukumaran Ramasubramanian (University of Alabama at Huntsville)
Takuma Udagawa (International Business Machines)
- Iksha Gurung (University of Alabama at Huntsville)
Nishan Pantha (University of Alabama at Huntsville)
Rong Zhang
Bharath Dandala (IBM, International Business Machines)
- Rahul Ramachandran (NASA/MSFC)
Manil Maskey
Kaylin Bugbee (NA)
Michael M. Little
Elizabeth Fancher (NASA)
- Irina Gerasimov (NASA GES DISC)
- Armin Mehrabian (George Washington University and NASA)
Lauren Sanders (NASA GeneLab, BMSIS)
- Sylvain V. Costes (NASA Ames)
- Sergi Blanco-Cuaresma (Université de Lausanne)
Kelly Lockhart (Harvard & Smithsonian Center for Astrophysics)
Thomas Allen (NA)
Felix Grezes (NA)
- Megan Ansdell (NASA HQ)
- Alberto Accomazzi
Yousef El-Kurdi (IBM, International Business Machines)
Davis Wertheimer (International Business Machines)
Birgit Pfitzmann (City of Zurich)
- Cesar Berrospi Ramis (International Business Machines)
- Michele Dolfi (International Business Machines)
Rafael Teixeira De Lima (International Business Machines)
Panagiotis Vagenas (International Business Machines)
S. Karthik Mukkavilli (International Business Machines)
- Peter W. J. Staar
Sanaz Vahidinia (NA)
Ryan McGranaghan (NA)
Tsengdar J. Lee

@@ -18505,7 +18505,7 @@
Zhe Li (AMD)
Dong Li (AMD Inc.)
Lu Tian (AMD)
- Emad Barsoum (AMD)
113-119
Improving the efficiency of inference in Large Language Models (LLMs) is a critical area of research. Post-training Quantization (PTQ) is a popular technique, but it often faces challenges at low-bit levels, particularly in downstream tasks. Quantization-aware Training (QAT) can alleviate this problem, but it requires significantly more computational resources. To tackle this, we introduced Weight-Decomposed Low-Rank Quantization-Aware Training (DL-QAT), which merges the advantages of QAT while training only less than 1% of the total parameters. Specifically, we introduce a group-specific quantization magnitude to adjust the overall scale of each quantization group. Within each quantization group, we use LoRA matrices to update the weight size and direction in the quantization space.
We validated the effectiveness of our method on the LLaMA and LLaMA2 model families. The results show significant improvements over our baseline method across different quantization granularities. For instance, for LLaMA-7B, our approach outperforms the previous state-of-the-art method by 4.2% in MMLU on 3-bit LLaMA-7B. Additionally, our quantization results on pre-trained models also surpass previous QAT methods, demonstrating the superior performance and efficiency of our approach.
2024.emnlp-industry.10

@@ -18517,10 +18517,10 @@
Hybrid-<fixed-case>RACA</fixed-case>: Hybrid Retrieval-Augmented Composition Assistance for Real-time Text Prediction
Menglin Xia (Microsoft)
Xuchao Zhang (Microsoft)
- Camille Couturier (Microsoft)
Guoqing Zheng (Microsoft Research)
Saravan Rajmohan (Microsoft)
- Victor Rühle (Microsoft)
120-131
Large language models (LLMs) enhanced with retrieval augmentation has shown great performance in many applications. However, the computational demands for these models pose a challenge when applying them to real-time tasks, such as composition assistance. To address this, we propose Hybrid Retrieval-Augmented Composition Assistance (Hybrid-RACA), a novel system for real-time text prediction that efficiently combines a cloud-based LLM with a smaller client-side model through retrieval augmented memory. This integration enables the client model to generate better responses, benefiting from the LLM’s capabilities and cloud-based data. Meanwhile, via a novel asynchronous memory update mechanism, the client model can deliver real-time completions to user inputs without the need to wait for responses from the cloud. Our experiments on five datasets demonstrate that Hybrid-RACA offers strong performance while maintaining low latency.
2024.emnlp-industry.11

@@ -18530,9 +18530,9 @@
<fixed-case>LLMC</fixed-case>: Benchmarking Large Language Model Quantization with a Versatile Compression Toolkit
- Ruihao Gong
Yang Yong
- Shiqiao Gu (Sensetime)
Yushi Huang (SenseTime)
Chengtao Lv
Yunchen Zhang

@@ -18551,9 +18551,9 @@
Joe Barrow (Pattern Data)
Alexa Siu (Adobe)
Ani Nenkova (Adobe Research)
- Seunghyun Yoon (Adobe Research)
Ryan A. Rossi (Adobe Research)
- Franck Dernoncourt (Adobe Systems)
153-169
Large Language Models (LLMs) have issues with document question answering (QA) in situations where the document is unable to fit in the small context length of an LLM. To overcome this issue, most existing works focus on retrieving the relevant context from the document, representing them as plain text. However, documents such as PDFs, web pages, and presentations are naturally structured with different pages, tables, sections, and so on. Representing such structured documents as plain text is incongruous with the user’s mental model of these documents with rich structure. When a system has to query the document for context, this incongruity is brought to the fore, and seemingly trivial questions can trip up the QA system. To bridge this fundamental gap in handling structured documents, we propose an approach called PDFTriage that enables models to retrieve the context based on either structure or content. Our experiments demonstrate the effectiveness of the proposed PDFTriage-augmented models across several classes of questions where existing retrieval-augmented LLMs fail.
To facilitate further research on this fundamental problem, we release our benchmark dataset consisting of 900+ human-generated questions over 80 structured documents from 10 different categories of question types for document QA. Our code and datasets will be released soon on Github.
2024.emnlp-industry.13

@@ -18582,7 +18582,7 @@
Jihoo Kim
Byungju Kim (Mathpresso)
Wonseok Lee (Mathpresso, Inc.)
- Chanjun Park (Upstage)
186-198
This study presents a novel learning approach designed to enhance both mathematical reasoning and problem-solving abilities of Large Language Models (LLMs). We focus on integrating the Chain-of-Thought (CoT) and the Program-of-Thought (PoT) learning, hypothesizing that prioritizing the learning of mathematical reasoning ability is helpful for the amplification of problem-solving ability. Thus, the initial learning with CoT is essential for solving challenging mathematical problems. To this end, we propose a sequential learning approach, named SAAS (Solving Ability Amplification Strategy), which strategically transitions from CoT learning to PoT learning. Our empirical study, involving an extensive performance comparison using several benchmarks, demonstrates that our SAAS achieves state-of-the-art (SOTA) performance. The results underscore the effectiveness of our sequential learning approach, marking a significant advancement in the field of mathematical reasoning in LLMs.
2024.emnlp-industry.15

@@ -18597,7 +18597,7 @@
Olivia Sturman (Google)
Aparna R Joshi
Bhaktipriya Radharapu (Facebook)
- Piyush Kumar
Renee Shelby (Google)
199-214
Increasing use of large language models (LLMs) demand performant guardrails to ensure the safety of inputs and outputs of LLMs. When these safeguards are trained on imbalanced data, they can learn the societal biases. We present a light-weight, post-processing method for mitigating counterfactual fairness in closed-source text safety classifiers. Our approach involves building an ensemble that not only outperforms the input classifiers and policy-aligns them, but also acts as a debiasing regularizer. We introduce two threshold-agnostic metrics to assess the counterfactual fairness of a model, and demonstrate how combining these metrics with Fair Data Reweighting (FDW) helps mitigate biases. We create an expanded Open AI dataset, and a new templated LLM-generated dataset based on user-prompts, both of which are counterfactually balanced across identity groups and cover four key areas of safety; we will work towards publicly releasing these datasets. Our results show that our approach improves counterfactual fairness with minimal impact on model performance.

@@ -18608,10 +18608,10 @@
Centrality-aware Product Retrieval and Ranking
- Hadeel Saadany
Swapnil Bhosale
Samarth Agrawal (eBay Inc.)
- Diptesh Kanojia (University of Surrey)
Constantin Orasan (University of Surrey)
Zhe Wu
215-224

@@ -18640,7 +18640,7 @@
Investigating the Personality Consistency in Quantized Role-Playing Dialogue Agents
- Yixiao Wang (LG Electronics)
Homa Fashandi
Kevin Ferreira (LG Corporation)
239-255

@@ -18664,10 +18664,10 @@
Code Representation Pre-training with Complements from Program Executions
Jiabo Huang
- Jianyu Zhao (Tencent)
- Yuyang Rong
Yiwen Guo
- Yifeng He
Hao Chen (University of California, Davis)
267-278
Language models for natural language processing have been grafted onto programming language modeling for advancing code intelligence.
Although it can be represented in the text format, code is syntactically more rigorous, as it is designed to be properly compiled or interpreted to perform a set of behaviors given any inputs. In this case, existing works benefit from syntactic representations to learn from code less ambiguously in forms of abstract syntax tree, control-flow graph, etc. However, programs with the same purpose can be implemented in various ways showing different syntactic representations, while the ones with similar implementations can have distinct behaviors. Though trivially demonstrated during executions, such semantics about functionality are challenging to be learned directly from code, especially in an unsupervised manner. Hence, in this paper, we propose FuzzPretrain to explore the dynamic information of programs revealed by their test cases and embed it into the feature representations of code as complements. The test cases are obtained with the assistance of a customized fuzzer and are only required during pre-training. FuzzPretrain yielded more than 6%/19% mAP improvements on code search over its masked language modeling counterparts trained with only source code and source code coupled with abstract syntax trees (ASTs), respectively. Our experiments show the benefits of learning discriminative code representations from FuzzPretrain.

@@ -18677,11 +18677,11 @@
<fixed-case>S</fixed-case>cale<fixed-case>LLM</fixed-case>: A Resource-Frugal <fixed-case>LLM</fixed-case> Serving Framework by Optimizing End-to-End Efficiency
- Yuhang Yao
Han Jin
- Alay Dilipbhai Shah
Shanshan Han (University of California, Irvine)
- Zijian Hu (Scale AI)
Dimitris Stripelis (TensorOpera, Inc.)
Yide Ran
Zhaozhuo Xu (Stevens Institute of Technology)

@@ -18699,7 +18699,7 @@
Somnath Banerjee (IIT Kharagpur)
Amruit Sahoo
Sayan Layek
- Avik Dutta (Microsoft)
Rima Hazra (Singapore University of Technology and Design)
Animesh Mukherjee (Indian Institute of Technology Kharagpur)
290-302

@@ -18710,8 +18710,8 @@
<fixed-case>SHIELD</fixed-case>: <fixed-case>LLM</fixed-case>-Driven Schema Induction for Predictive Analytics in <fixed-case>EV</fixed-case> Battery Supply Chain Disruptions
- Zhi-Qi Cheng (Carnegie Mellon University)
- Yifei Dong
Aike Shi (Georgia Institute of Technology)
Wei Liu
Yuzhi Hu

@@ -18755,7 +18755,7 @@
<fixed-case>TPTU</fixed-case>-v2: Boosting Task Planning and Tool Usage of Large Language Model-based Agents in Real-world Industry Systems
Yilun Kong
- Jingqing Ruan
YiHong Chen
Bin Zhang
Tianpeng Bao (SenseTime Research)
du GuoQing
Xiaoru Hu
Hangyu Mao (Sensetime Research)
- Ziyue Li (EWI gGmbH and University of Cologne)
Xingyu Zeng (SenseTime Group Limited)
Rui Zhao (Qing Yuan Research Institute, Shanghai Jiao Tong University and SenseTime Research)
- Xueqian Wang
371-385
Large Language Models (LLMs) have demonstrated proficiency in addressing tasks that necessitate a combination of task planning and the usage of external tools, such as weather and calculator APIs. However, real-world industrial systems present prevalent challenges in task planning and tool usage: numerous APIs in the real system make it intricate to invoke the appropriate one, while the inherent limitations of LLMs pose challenges in orchestrating an accurate sub-task sequence and API-calling order.
This paper introduces a comprehensive framework aimed at enhancing the Task Planning and Tool Usage (TPTU) abilities of LLM-based agents in industry. Our framework comprises three key components designed to address these challenges: (1) the API Retriever selects the most pertinent APIs among the extensive API set; (2) the Demo Selector retrieves task-level demonstrations, which is further used for in-context learning to aid LLMs in accurately decomposing subtasks and effectively invoking hard-to-distinguish APIs; (3) LLM Finetuner tunes a base LLM to enhance its capability for task planning and API calling. We validate our methods using a real-world industry system and an open-sourced academic dataset, demonstrating the efficacy of each individual component as well as the integrated framework. The code is available at here.
2024.emnlp-industry.27

@@ -18799,11 +18799,11 @@
Moleco: Molecular Contrastive Learning with Chemical Language Models for Molecular Property Prediction
- Jun-Hyung Park (Hankuk University of Foreign Studies)
Hyuntae Park
Yeachan Kim (Korea University)
Woosang Lim (POSCO Holdings)
- SangKeun Lee (Korea University)
408-420
Pre-trained chemical language models (CLMs) excel in the field of molecular property prediction, utilizing string-based molecular descriptors such as SMILES for learning universal representations. However, such string-based descriptors implicitly contain limited structural information, which is closely associated with molecular property prediction. In this work, we introduce Moleco, a novel contrastive learning framework to enhance the understanding of molecular structures within CLMs. Based on the similarity of fingerprint vectors among different molecules, we train CLMs to distinguish structurally similar and dissimilar molecules in a contrastive manner. Experimental results demonstrate that Moleco significantly improves the molecular property prediction performance of CLMs, outperforming state-of-the-art models. Moreover, our in-depth analysis with diverse Moleco variants verifies that fingerprint vectors are highly effective features in improving CLMs’ understanding of the structural information of molecules.
2024.emnlp-industry.30

@@ -18815,11 +18815,11 @@
<fixed-case>SEED</fixed-case>: Semantic Knowledge Transfer for Language Model Adaptation to Materials Science
Yeachan Kim (Korea University)
- Jun-Hyung Park (Hankuk University of Foreign Studies)
SungHo Kim (Korea University)
Juhyeong Park (Korea University)
Sangyun Kim (Korea University)
- SangKeun Lee (Korea University)
421-428
Materials science is an interdisciplinary field focused on studying and discovering materials around us. However, due to the vast space of materials, datasets in this field are typically scarce and have limited coverage. This inherent limitation makes current adaptation methods less effective when adapting pre-trained language models (PLMs) to materials science, as these methods rely heavily on the frequency information from limited downstream datasets. In this paper, we propose Semantic Knowledge Transfer (SEED), a novel vocabulary expansion method to adapt the pre-trained language models for materials science. The core strategy of SEED is to transfer the materials knowledge of lightweight embeddings into the PLMs.
To this end, we introduce knowledge bridge networks, which learn to transfer the latent knowledge of the materials embeddings into ones compatible with PLMs. By expanding the embedding layer of PLMs with these transformed embeddings, PLMs can comprehensively understand the complex terminology associated with materials science. We conduct extensive experiments across a broad range of materials-related benchmarks. Comprehensive evaluation results convincingly demonstrate that SEED mitigates the mentioned limitations of previous adaptation methods, showcasing the efficacy of transferring embedding knowledge into PLMs.
2024.emnlp-industry.31

@@ -18832,7 +18832,7 @@
Adil Nygaard (J.P. Morgan Chase)
Ashish Upadhyay (J.P. Morgan Chase)
Lauren Hinkle (NA)
- Xenia Skotti (J.P. Morgan Chase)
Joe Halliwell (NA)
Ian C Brown
Glen Noronha (J.P. Morgan Chase)

@@ -18846,8 +18846,8 @@
<fixed-case>F</fixed-case>ast<fixed-case>A</fixed-case>da<fixed-case>SP</fixed-case>: Multitask-Adapted Efficient Inference for Large Speech Language Model
Yichen Lu
Jiaqi Song
- Chao-Han Huck Yang (NVIDIA Research)
- Shinji Watanabe (Carnegie Mellon University)
440-451
In this study, we aim to explore Multitask Speech Language Model (SpeechLM) efficient inference via token reduction. Unlike other modalities such as vision or text, speech has unique temporal dependencies, making previous efficient inference works on other modalities not directly applicable. Furthermore, methods for efficient SpeechLM inference on long sequence and sparse signals remain largely unexplored. In this work, we propose FastAdaSP, a weighted token merging framework specifically designed for various speech-related tasks to improve the trade-off between efficiency and performance. Experimental results on WavLLM and Qwen-Audio show that our method achieves the state-of-the-art (SOTA) efficiency-performance trade-off compared with other baseline methods. Specifically, FastAdaSP achieved 7x memory efficiency and 1.83x decoding throughput without any degradation on tasks like Emotion Recognition (ER) and Spoken Question Answering (SQA).
2024.emnlp-industry.33

@@ -18860,12 +18860,12 @@
<fixed-case>T</fixed-case>ensor<fixed-case>O</fixed-case>pera Router: A Multi-Model Router for Efficient <fixed-case>LLM</fixed-case> Inference
Dimitris Stripelis (TensorOpera, Inc.)
Zhaozhuo Xu (Stevens Institute of Technology)
- Zijian Hu (Scale AI)
- Alay Dilipbhai Shah
Han Jin
- Yuhang Yao
Jipeng Zhang
- Tong Zhang (UIUC)
Salman Avestimehr (University of Southern California)
Chaoyang He
452-462

@@ -18879,7 +18879,7 @@
Prompt-Tuned Muti-Task Taxonomic Transformer (<fixed-case>PTMTT</fixed-case>axo<fixed-case>F</fixed-case>ormer)
Rajashekar Vasantha (Amazon)
Nhan Nguyen (Amazon)
- Yue Zhang (Amazon)
463-476
Hierarchical Text Classification (HTC) is a subclass of multi-label classification, it is challenging because the hierarchy typically has a large number of diverse topics. Existing methods for HTC fall within two categories, local methods (a classifier for each level, node, or parent) or global methods (a single classifier for everything). Local methods are computationally expensive, whereas global methods often require complex explicit injection of the hierarchy, verbalizers, and/or prompt engineering.
In this work, we propose Prompt Tuned Multi Task Taxonomic Transformer, a single classifier that uses a multi-task objective to predict one or more topics. The approach is capable of understanding the hierarchy during training without explicit injection, complex heads, verbalizers, or prompt engineering. PTMTTaxoFormer is a novel model architecture and training paradigm using differentiable prompts and labels that are learnt through backpropagation. PTMTTaxoFormer achieves state of the art results on several HTC benchmarks that span a range of topics consistently. Compared to most other HTC models, it has a simpler yet effective architecture, making it more production-friendly in terms of latency requirements (a factor of 2-5 lower latency). It is also robust and label-efficient, outperforming other models with 15%-50% less training data.
2024.emnlp-industry.35

@@ -18906,8 +18906,8 @@
Personal Large Language Model Agents: A Case Study on Tailored Travel Planning
Harmanpreet Singh (LG Electronics)
Nikhil Verma (LG Toronto AI Lab)
- Yixiao Wang (LG Electronics)
- Manasa Bharadwaj (LG Corporation)
Homa Fashandi
Kevin Ferreira (LG Corporation)
Chul Lee (LG Electronics)

@@ -18943,9 +18943,9 @@
<fixed-case>BPID</fixed-case>: A Benchmark for Personal Identity Deduplication
- Runhui Wang
- Yefan Tao (Amazon)
- Adit Krishnan (Amazon)
Luyang Kong (Amazon)
Xuanqing Liu (Amazon)
Yuqian Deng (Amazon)

@@ -18964,10 +18964,10 @@
<fixed-case>MERLIN</fixed-case>: Multimodal Embedding Refinement via <fixed-case>LLM</fixed-case>-based Iterative Navigation for Text-Video Retrieval-Rerank Pipeline
Donghoon Han (Seoul National University)
- Eunhwan Park (Buzzni AI Lab)
Gisang Lee
Adam Lee (University of California, Berkeley)
- Nojun Kwak (Seoul National University)
547-562
The rapid expansion of multimedia content has made accurately retrieving relevant videos from large collections increasingly challenging. Recent advancements in text-video retrieval have focused on cross-modal interactions, large-scale foundation model training, and probabilistic modeling, yet often neglect the crucial user perspective, leading to discrepancies between user queries and the content retrieved. To address this, we introduce MERLIN (Multimodal Embedding Refinement via LLM-based Iterative Navigation), a novel, training-free pipeline that leverages Large Language Models (LLMs) for iterative feedback learning. MERLIN refines query embeddings from a user perspective, enhancing alignment between queries and video content through a dynamic question answering process. Experimental results on datasets like MSR-VTT, MSVD, and ActivityNet demonstrate that MERLIN substantially improves Recall@1, outperforming existing systems and confirming the benefits of integrating LLMs into multimodal retrieval systems for more responsive and context-aware multimedia retrieval.

@@ -18995,7 +18995,7 @@
Hong Chen (Alipay)
ZhuXin Lee
Songqiao Han (Shanghai University of Finance and Economics)
- Hailiang Huang (Shanghai University of Finance and Economics)
573-594
Large language models (LLMs) have achieved significant leadership in many NLP tasks, but aligning structured output with generative models in information extraction (IE) tasks remains a challenge. Prompt Engineering (PE) is renowned for improving IE performance through prompt modifications.
However, the realm of the sample design for downstream fine-tuning, crucial for task-specific LLM adaptation, is largely unexplored. This paper introduces **Sample Design Engineering** (SDE), a methodical approach to enhancing LLMs’ post-tuning performance on IE tasks by refining input, output, and reasoning designs. Through extensive ID and OOD experiments across six LLMs, we first assess the impact of various design options on IE performance, revealing several intriguing patterns. Based on these insights, we then propose an integrated SDE strategy and validate its consistent superiority over heuristic sample designs on three complex IE tasks with four additional LLMs, demonstrating the generality of our method. Additionally, analyses of LLMs’ inherent prompt/output perplexity, zero-shot, and ICL abilities illustrate that good PE strategies may not always translate to good SDE strategies.
2024.emnlp-industry.43

@@ -19060,7 +19060,7 @@
<fixed-case>K</fixed-case>or<fixed-case>S</fixed-case>mishing Explainer: A <fixed-case>K</fixed-case>orean-centric <fixed-case>LLM</fixed-case>-based Framework for Smishing Detection and Explanation Generation
Yunseung Lee (KakaoBank Corp.)
- Daehee Han (Kakaobank)
642-656
To mitigate the annual financial losses caused by SMS phishing (smishing) in South Korea, we propose an explainable smishing detection framework that adapts to a Korean-centric large language model (LLM). Our framework not only classifies smishing attempts but also provides clear explanations, enabling users to identify and understand these threats. This end-to-end solution encompasses data collection, pseudo-label generation, and parameter-efficient task adaptation for models with fewer than five billion parameters. Our approach achieves a 15% improvement in accuracy over GPT-4 and generates high-quality explanatory text, as validated by seven automatic metrics and qualitative evaluation, including human assessments.
2024.emnlp-industry.47

@@ -19069,9 +19069,9 @@
Time Matters: An End-to-End Solution for Temporal Claim Verification
- Anab Maulana Barik
- Wynne Hsu (National University of Singapore)
- Mong-Li Lee (National University of Singapore)
657-664
Automated claim verification plays an essential role in fostering trust in the digital space. Despite the growing interest, the verification of temporal claims has not received much attention in the community. Temporal claim verification brings new challenges where cues of the temporal information need to be extracted, and temporal reasoning involving various temporal aspects of the text must be applied. In this work, we describe an end-to-end solution for temporal claim verification that considers the temporal information in claims to obtain relevant evidence sentences and harnesses the power of a large language model for temporal reasoning. We curate two datasets comprising a diverse range of temporal claims to learn time-sensitive representations that encapsulate not only the semantic relationships among the events, but also their chronological proximity. Experiment results demonstrate that the proposed approach significantly enhances the accuracy of temporal claim verification, thereby advancing current state-of-the-art in automated claim verification.
2024.emnlp-industry.48

@@ -19082,12 +19082,12 @@
<fixed-case>MILD</fixed-case> Bot: Multidisciplinary Childhood Cancer Survivor Question-Answering Bot
Mirae Kim
- Kyubum Hwang (Sung Kyun Kwan University)
Hayoung Oh (Sung Kyun Kwan University)
- Min Ah Kim (Sung Kyun Kwan University)
Chaerim Park
- Yehwi Park (Sung Kyun Kwan University)
- Chungyeon Lee (Sung Kyun Kwan University)
665-676
This study introduces a Multidisciplinary chILDhood cancer survivor question-answering (MILD) bot designed to support childhood cancer survivors facing diverse challenges in their survivorship journey. In South Korea, a shortage of experts equipped to address these unique concerns comprehensively leaves survivors with limited access to reliable information. To bridge this gap, our MILD bot employs a dual-component model featuring an intent classifier and a semantic textual similarity model. The intent classifier first analyzes the user’s query to identify the underlying intent and match it with the most suitable expert who can provide advice. Then, the semantic textual similarity model identifies questions in a predefined dataset that closely align with the user’s query, ensuring the delivery of relevant responses. This proposed framework shows significant promise in offering timely, accurate, and high-quality information, effectively addressing a critical need for support among childhood cancer survivors.
2024.emnlp-industry.49

@@ -19101,12 +19101,12 @@
Huimu Wang
Mingming Li
Dadong Miao
- Wang Binbin (JD.com)
Xusong Chen
Li Kuang (Central South University)
Yuxing Han (Tsinghua University, Tsinghua University)
Jiaxing Wang (Institute of automation, Chinese academy of science)
- Guoyu Tang
Lin Liu (JD)
Songlin Wang
Jingwei Zhuo

@@ -19138,7 +19138,7 @@
Shuguang Han
Guoqiang Wu
Fei Huang
- Jufeng Chen
697-711
Unlike professional Business-to-Consumer (B2C) e-commerce platforms (e.g., Amazon), Consumer-to-Consumer (C2C) platforms (e.g., Facebook marketplace) are mainly targeting individual sellers who usually lack sufficient experience in e-commerce. Individual sellers often struggle to compose proper descriptions for selling products. With the recent advancement of Multimodal Large Language Models (MLLMs), we attempt to integrate such state-of-the-art generative AI technologies into the product listing process. To this end, we develop IPL, an Intelligent Product Listing tool tailored to generate descriptions using various product attributes such as category, brand, color, condition, etc. IPL enables users to compose product descriptions by merely uploading photos of the selling product. More importantly, it can imitate the content style of our C2C platform Xianyu. This is achieved by employing domain-specific instruction tuning on MLLMs, and by adopting the multi-modal Retrieval-Augmented Generation (RAG) process. A comprehensive empirical evaluation demonstrates that the underlying model of IPL significantly outperforms the base model in domain-specific tasks while producing less hallucination. IPL has been successfully deployed in our production system, where 72% of users have their published product listings based on the generated content, and those product listings are shown to have a quality score 5.6% higher than those without AI assistance.
2024.emnlp-industry.52 @@ -19148,12 +19148,12 @@ <fixed-case>QD</fixed-case>y<fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case>: Quantized Dynamic Low-Rank Adaptation for Efficient Large Language Model Tuning HosseinRajabzadeh - MojtabaValipourCoastal Carbon + MojtabaValipourCoastal Carbon TianshuZhu Marzieh S.Tahaei Hyock JuKwonUniversity of Waterloo AliGhodsi - BoxingChenHuawei Technologies Ltd. + BoxingChenHuawei Technologies Ltd. MehdiRezagholizadeh 712-718 Finetuning large language models requires huge GPU memory, restricting the choice to acquire Larger models. While the quantized version of the Low-Rank Adaptation technique, named QLoRA, significantly alleviates this issue, finding the efficient LoRA rank is still challenging. Moreover, QLoRA is trained on a pre-defined rank and, therefore, cannot be reconfigured for its lower ranks without requiring further fine-tuning steps. This paper proposes QDyLoRA -Quantized Dynamic Low-Rank Adaptation-, as an efficient quantization approach for dynamic low-rank adaptation. Motivated by Dynamic LoRA, QDyLoRA is able to efficiently finetune LLMs on a set of pre-defined LoRA ranks. QDyLoRA enables fine-tuning Falcon-40b for ranks 1 to 64 on a single 32 GB V100-GPU through one round of fine-tuning. Experimental results show that QDyLoRA is competitive to QLoRA and outperforms when employing its optimal rank. @@ -19191,12 +19191,12 @@ <fixed-case>RAG</fixed-case>4<fixed-case>ITO</fixed-case>ps: A Supervised Fine-Tunable and Comprehensive <fixed-case>RAG</fixed-case> Framework for <fixed-case>IT</fixed-case> Operations and Maintenance - TianyangZhangLearnable.ai + TianyangZhangLearnable.ai ZhuoxuanJiangShanghai Business School ShengguangBai TianruiZhang LinLinucloud - YangLiuNorth Carolina Central University + YangLiuNorth Carolina Central University JiaweiRen 738-754 With the ever-increasing demands on Question Answering (QA) systems for IT operations and maintenance, an efficient and supervised fine-tunable framework is necessary to ensure the data security, private deployment and continuous upgrading. Although Large Language Models (LLMs) have notably improved the open-domain QA’s performance, how to efficiently handle enterprise-exclusive corpora and build domain-specific QA systems are still less-studied for industrial applications. In this paper, we propose a general and comprehensive framework based on Retrieval Augmented Generation (RAG) and facilitate the whole business process of establishing QA systems for IT operations and maintenance. In accordance with the prevailing RAG method, our proposed framework, named with RAG4ITOps, composes of two major stages: (1) Models Fine-tuning & Data Vectorization, and (2) Online QA System Process. At the Stage 1, we leverage a contrastive learning method with two negative sampling strategies to fine-tune the embedding model, and design the instruction templates to fine-tune the LLM with a Retrieval Augmented Fine-Tuning method. At the Stage 2, an efficient process of QA system is built for serving. We collect enterprise-exclusive corpora from the domain of cloud computing, and the extensive experiments show that our method achieves superior results than counterparts on two kinds of QA tasks. Our experiment also provide a case for applying the RAG4ITOps to real-world enterprise-level applications. @@ -19207,10 +19207,10 @@ <fixed-case>ULMR</fixed-case>: Unlearning Large Language Models via Negative Response and Model Parameter Average ShaojieShiINF Technology (Shanghai) Co., Ltd. 
- XiaoyuTanINF Technology (Shanghai) Co., Ltd. - XiheQiuShanghai University of Engineering Science + XiaoyuTanINF Technology (Shanghai) Co., Ltd. + XiheQiuShanghai University of Engineering Science ChaoQu - KexinNie + KexinNie YuanChengFudan University WeiChuInf Tech XuYinghuiFudan University @@ -19224,9 +19224,9 @@ Pretraining and Finetuning Language Models on Geospatial Networks for Accurate Address Matching - SaketMaheshwaryAmazon and International Institute of Information Technology, Hyderabad + SaketMaheshwaryAmazon and International Institute of Information Technology, Hyderabad ArpanPaul - SaurabhSohoney + SaurabhSohoney 763-773 We propose a novel framework for pretraining and fine-tuning language models with the goal of determining whether two addresses represent the same physical building. Address matching and building authoritative address catalogues are important to many applications and businesses, such as delivery services, online retail, emergency services, logistics, etc. We propose to view a collection of addresses as an address graph and curate inputs for language models by placing geospatially linked addresses in the same context. Our approach jointly integrates concepts from graph theory and weak supervision with address text and geospatial semantics. This integration enables us to generate informative and diverse address pairs, facilitating pretraining and fine-tuning in a self-supervised manner. Experiments and ablation studies on manually curated datasets and comparisons with state-of-the-art techniques demonstrate the efficacy of our approach. We achieve a 24.49% improvement in recall while maintaining 95% precision on average, in comparison to the current baseline across multiple geographies. Further, we deploy our proposed approach and show the positive impact of improving address matching on geocode learning. 2024.emnlp-industry.58 @@ -19236,7 +19236,7 @@ <fixed-case>SMARTCAL</fixed-case>: An Approach to Self-Aware Tool-Use Evaluation and Calibration YuanhaoShen - XiaodanZhuQueen’s University + XiaodanZhuQueen’s University LeiChenRakuten Institute of Technology, The University of Tokyo 774-789 The tool-use ability of Large Language Models (LLMs) has a profound impact on a wide range of applications. However, LLMs’ self-awareness and self-control capability in appropriately using tools remains understudied. The problem is consequential as it alarms a potential risk of degraded performance and poses a threat to trustworthiness on the models. In this paper, we conduct a study on a family of state-of-the-art LLMs on three datasets with two mainstream tool-use frameworks. Our study reveals the tool-abuse behavior of LLMs, a tendency for models to misuse tools along with models’ frequent overconfidence in tool choice. We also find that this is a common issue regardless of model capability. Accordingly, we propose a novel framework, SMARTCAL, to mitigate the observed issues, and our results show an average 8.6 percent increase in the QA performance in three testing datasets and 21.6 percent lower Expected Calibration Error (ECE) than existing methods. 
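SMARTCAL above reports a 21.6 percent reduction in Expected Calibration Error (ECE). ECE bins predictions by confidence and averages the gap between each bin's accuracy and its mean confidence, weighted by bin size; a minimal equal-width-bin reference sketch, not the paper's code:

```python
# Expected Calibration Error over n_bins equal-width confidence bins:
# ECE = sum_m (|B_m| / n) * |accuracy(B_m) - mean_confidence(B_m)|
def ece(confidences: list[float], correct: list[bool], n_bins: int = 10) -> float:
    n = len(confidences)
    total = 0.0
    for m in range(n_bins):
        lo, hi = m / n_bins, (m + 1) / n_bins
        idx = [i for i, c in enumerate(confidences)
               if lo < c <= hi or (m == 0 and c == 0.0)]
        if not idx:
            continue
        avg_conf = sum(confidences[i] for i in idx) / len(idx)
        accuracy = sum(correct[i] for i in idx) / len(idx)
        total += (len(idx) / n) * abs(accuracy - avg_conf)
    return total

print(ece([0.9, 0.8, 0.6, 0.95], [True, False, True, True]))  # toy inputs
```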
@@ -19248,7 +19248,7 @@ Probing the Depths of Language Models’ Contact-Center Knowledge for Quality Assurance Digvijay AnilInglePersonal AashrayaSachdevaIndependent - Surya PrakashSahuObserve.AI + Surya PrakashSahuObserve.AI MayankSati CijoGeorgeObserve.AI JithendraVepa @@ -19261,9 +19261,9 @@ Intelligent Predictive Maintenance <fixed-case>RAG</fixed-case> framework for Power Plants: Enhancing <fixed-case>QA</fixed-case> with <fixed-case>S</fixed-case>tyle<fixed-case>DFS</fixed-case> and Domain Specific Instruction Tuning - SeongtaeHongKorea University + SeongtaeHongKorea University Joong MinShinKorea University - JaehyungSeo + JaehyungSeo TaeminLeeKorea University JeongbaeParkKorea University Cho ManYounggaonplatform @@ -19282,7 +19282,7 @@ ShaobaiJiangAmazon and Iowa State University QiLiAmazon JulienHanAmazon - KarimBouyarmaneAmazon + KarimBouyarmaneAmazon 821-828 In this paper, we study the problem of generating structured objects that conform to a complex schema, with intricate dependencies between the different components (facets) of the object. The facets of the object (attributes, fields, columns, properties) can be a mix of short, structured facts, or long natural-language descriptions. The object has to be self-consistent between the different facets in the redundant information it carries (relative consistency), while being grounded with respect to world knowledge (absolute consistency). We frame the problem as a Language Modeling problem (Structured Object Language Modeling) and train an LLM to perform the task natively, without requiring instructions or prompt-engineering. We propose a self-supervised denoising method to train the model from an existing dataset of such objects. The input query can be the existing object itself, in which case the system acts as a regenerator, completing, correcting, normalizing the input, or any unstructured blurb to be structured. We show that the self-supervised denoising training provides a strong baseline, and that additional supervised fine-tuning with small amount of human demonstrations leads to further improvement. Experimental results show that the proposed method matches or outperforms prompt-engineered general-purpose state-of-the-art LLMs (Claude 3, Mixtral-8x7B), while being order-of-magnitude more cost-efficient. 2024.emnlp-industry.62 @@ -19321,7 +19321,7 @@ SergeySavin OlegBaryshnikov AlenaLisevych - SergeyNikolenkoSteklov Institute of Mathematics at St. Petersburg + SergeyNikolenkoSteklov Institute of Mathematics at St. Petersburg 866-880 We propose Project Context for Code Summarization with LLMs (ProConSuL), a new framework to provide a large language model (LLM) with precise information about the code structure from program analysis methods such as a compiler or IDE language services and use task decomposition derived from the code structure. ProConSuL builds a call graph to provide the context from callees and uses a two-phase training method (SFT + preference alignment) to train the model to use the project context. We also provide a new evaluation benchmark for C/C++ functions and a set of proxy metrics. Experimental results demonstrate that ProConSuL allows to significantly improve code summaries and reduce the number of hallucinations compared to the base model (CodeLlama-7B-instruct). We make our code and dataset available at https://github.com/TypingCat13/ProConSuL. 
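ProConSuL above builds its summarization context from a call graph produced by compiler or IDE analysis of C/C++ code. Purely to illustrate the data structure, here is a hypothetical miniature that extracts a call graph from Python source with the stdlib ast module:

```python
import ast

# Map each function definition to the set of function names it calls.
def call_graph(source: str) -> dict[str, set[str]]:
    tree = ast.parse(source)
    graph: dict[str, set[str]] = {}
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            callees = {c.func.id for c in ast.walk(node)
                       if isinstance(c, ast.Call) and isinstance(c.func, ast.Name)}
            graph[node.name] = callees
    return graph

src = """
def helper(x): return x * 2
def main(): return helper(3) + helper(4)
"""
print(call_graph(src))  # {'helper': set(), 'main': {'helper'}}
```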
2024.emnlp-industry.65 @@ -19333,8 +19333,8 @@ ZhuowanLiGoogle ChengLiGoogle MingyangZhangGoogle - QiaozhuMeiUniversity of Michigan - MichaelBenderskyGoogle + QiaozhuMeiUniversity of Michigan + MichaelBenderskyGoogle 881-893 Retrieval Augmented Generation (RAG) has been a powerful tool for Large Language Models (LLMs) to efficiently process overly lengthy contexts. However, recent LLMs like Gemini-1.5 and GPT-4 show exceptional capabilities to understand long contexts directly. We conduct a comprehensive comparison between RAG and long-context (LC) LLMs, aiming to leverage the strengths of both. We benchmark RAG and LC across various public datasets using three latest LLMs. Results reveal that when resourced sufficiently, LC consistently outperforms RAG in terms of average performance. However, RAG’s significantly lower cost remains a distinct advantage. Based on this observation, we propose Self-Route, a simple yet effective method that routes queries to RAG or LC based on model self-reflection. Self-Route significantly reduces the computation cost while maintaining a comparable performance to LC. Our findings provide a guideline for long-context applications of LLMs using RAG and LC. 2024.emnlp-industry.66 @@ -19345,8 +19345,8 @@ <fixed-case>MARS</fixed-case>: Multilingual Aspect-centric Review Summarisation - Sandeep SricharanMukkuAmazon - AbineshKanagarajanAmazon + Sandeep SricharanMukkuAmazon + AbineshKanagarajanAmazon ChetanAggarwalAmazon PromodYenigallaAmazon 894-909 @@ -19356,7 +19356,7 @@ A new approach for fine-tuning sentence transformers for intent classification and out-of-scope detection tasks - TianyiZhang + TianyiZhang AttaNorouzianNuance Communications AanchanMohanUniversity of Victoria and Northeastern University FrederickDucatelleCerence @@ -19370,9 +19370,9 @@ Tell me what <fixed-case>I</fixed-case> need to know: Exploring <fixed-case>LLM</fixed-case>-based (Personalized) Abstractive Multi-Source Meeting Summarization FredericKirsteinGeorg-August Universität Göttingen - TerryRuasGeorg-August Universität Göttingen + TerryRuasGeorg-August Universität Göttingen RobertKratelGeorg-August Universität Göttingen - BelaGippGeorg-August Universität Göttingen + BelaGippGeorg-August Universität Göttingen 920-939 Meeting summarization is crucial in digital communication, but existing solutions struggle with salience identification to generate personalized, workable summaries, and context understanding to fully comprehend the meetings’ content.Previous attempts to address these issues by considering related supplementary resources (e.g., presentation slides) alongside transcripts are hindered by models’ limited context sizes and handling the additional complexities of the multi-source tasks, such as identifying relevant information in additional files and seamlessly aligning it with the meeting content.This work explores multi-source meeting summarization considering supplementary materials through a three-stage large language model approach: identifying transcript passages needing additional context, inferring relevant details from supplementary materials and inserting them into the transcript, and generating a summary from this enriched transcript.Our multi-source approach enhances model understanding, increasing summary relevance by ~9% and producing more content-rich outputs.We introduce a personalization protocol that extracts participant characteristics and tailors summaries accordingly, improving informativeness by ~10%.This work further provides insights on performance-cost 
trade-offs across four leading model families, including edge-device capable options.Our approach can be extended to similar complex generative tasks benefitting from additional resources and personalization, such as dialogue systems and action planning. 2024.emnlp-industry.69 @@ -19405,7 +19405,7 @@ Tsz FungYauDepartment of Computer Science QixuanZhang MohammadBolandraftar - XiaodanZhuQueen’s University + XiaodanZhuQueen’s University Faiza KhanKhattak 954-969 Mitigating bias in language models (LMs) has become a critical problem due to the widespread deployment of LMs in the industry and customer-facing applications. Numerous approaches revolve around data pre-processing and subsequent fine-tuning of language models, tasks that can be both time-consuming and computationally demanding. As alternatives, machine unlearning techniques are being explored, yet there is a notable lack of comparative studies evaluating the effectiveness of these methods. In this work, we explore the effectiveness of two machine unlearning methods: Partitioned Contrastive Gradient Unlearning (PCGU) applied on decoder models, and Negation via Task Vector, and compare them with Direct Preference Optimization (DPO) to reduce social biases in open-source LMs such as LLaMA-2 and OPT. We also implement distributed PCGU for large models. It is empirically shown, through quantitative and qualitative analyses, that negation via Task Vector method outperforms PCGU and is comparable to DPO in debiasing models with minimum deterioration in model performance and perplexity. Negation via Task Vector reduces the bias score by 25.5% for LLaMA-2 and achieves bias reduction of up to 40% for OPT models. Moreover, it can be easily tuned to balance the trade-off between bias reduction and generation quality, unlike DPO. @@ -19442,7 +19442,7 @@ Adapting <fixed-case>LLM</fixed-case>s for Structured Natural Language <fixed-case>API</fixed-case> Integration - RobinChan + RobinChan KatsiarynaMirylenkaInternational Business Machines ThomasGschwindIBM Research ChristophMiksovicInternational Business Machines @@ -19498,7 +19498,7 @@ DanteEveraertAmazon RohitPatkiAmazon TianqiZhengAmazon - ChristopherPottsStanford University + ChristopherPottsStanford University 1046-1055 Query Autocomplete (QAC) is a critical feature in modern search engines, facilitating user interaction by predicting search queries based on input prefixes. Despite its widespread adoption, the absence of large-scale, realistic datasets has hindered advancements in QAC system development. This paper addresses this gap by introducing AmazonQAC, a new QAC dataset sourced from Amazon Search logs, comprising 395M samples. The dataset includes actual sequences of user-typed prefixes leading to final search terms, as well as session IDs and timestamps that support modeling the context-dependent aspects of QAC. We assess Prefix Trees, semantic retrieval, and Large Language Models (LLMs) with and without finetuning. We find that finetuned LLMs perform best, particularly when incorporating contextual information. However, even our best system achieves only half of what we calculate is theoretically possible on our test data, which implies QAC is a challenging problem that is far from solved with existing systems. This contribution aims to stimulate further research on QAC systems to better serve user needs in diverse environments. We open-source this data on Hugging Face at https://huggingface.co/datasets/amazon/AmazonQAC. 
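The AmazonQAC entry above evaluates Prefix Trees as a QAC baseline. A minimal frequency-weighted trie of the kind such a baseline might use (the class and method names are invented for illustration):

```python
from collections import defaultdict

class TrieNode:
    def __init__(self):
        self.children: dict[str, "TrieNode"] = defaultdict(TrieNode)
        self.counts: dict[str, int] = {}   # completed query -> frequency

class QACTrie:
    def __init__(self):
        self.root = TrieNode()

    def add(self, query: str, freq: int = 1):
        node = self.root
        for ch in query:
            node = node.children[ch]
            node.counts[query] = node.counts.get(query, 0) + freq

    def complete(self, prefix: str, k: int = 3) -> list[str]:
        node = self.root
        for ch in prefix:
            if ch not in node.children:
                return []
            node = node.children[ch]
        return sorted(node.counts, key=node.counts.get, reverse=True)[:k]

trie = QACTrie()
for q, f in [("iphone case", 50), ("iphone charger", 80), ("ipad", 30)]:
    trie.add(q, f)
print(trie.complete("iph"))  # ['iphone charger', 'iphone case']
```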
2024.emnlp-industry.78 @@ -19529,8 +19529,8 @@ MathieuSibueJ.P. Morgan Chase AntonyPapadimitriouJ.P. Morgan Chase ZhiqiangMaJ.P. Morgan Chase - XiaomoLiuJP Morgan AI Research - XiaodanZhuQueen’s University + XiaomoLiuJP Morgan AI Research + XiaodanZhuQueen’s University 1068-1082 The Chartered Financial Analyst (CFA) program is one of the most widely recognized financial certifications globally. In this work, we test a variety of state-of-the-art large language models (LLMs) on mock CFA exams to provide an overview of their financial analysis capabilities using the same evaluation standards applied for human professionals. We benchmark five leading proprietary models and eight open-source models on all three levels of the CFA through challenging multiple-choice and essay questions. We find that flagship proprietary models perform relatively well and can solidly pass levels I and II exams, but fail at level III due to essay questions. Open-source models generally fall short of estimated passing scores, but still show strong performance considering their size, cost, and availability advantages. We also find that using textbook data helps bridge the gap between open-source and proprietary models to a certain extent, despite reduced gains in CFA levels II and III. By understanding the current financial analysis abilities of LLMs, we aim to guide practitioners on which models are best suited for enhancing automation in the financial industry. 2024.emnlp-industry.80 @@ -19542,9 +19542,9 @@ Value Alignment from Unstructured Text InkitPadhi KarthikeyanNatesan RamamurthyInternational Business Machines - PrasannaSattigeriIBM Research + PrasannaSattigeriIBM Research ManishNagireddyIBM Research - PierreDogninInternational Business Machines + PierreDogninInternational Business Machines Kush R.VarshneyInternational Business Machines 1083-1095 Aligning large language models (LLMs) to value systems has emerged as a significant area of research within the fields of AI and NLP. Currently, this alignment process relies on the availability of high-quality supervised and preference data, which can be both time-consuming and expensive to curate or annotate. In this paper, we introduce a systematic end-to-end methodology for aligning LLMs to the implicit and explicit values represented in unstructured text data. Our proposed approach leverages the use of scalable synthetic data generation techniques to effectively align the model to the values present in the unstructured data. Through two distinct use-cases, we demonstrate the efficiency of our methodology on the Mistral-7B-Instruct model. Our approach credibly aligns LLMs to the values embedded within documents, and shows improved performance against other approaches, as quantified through the use of automatic metrics and win rates. @@ -19554,10 +19554,10 @@ <fixed-case>LARA</fixed-case>: Linguistic-Adaptive Retrieval-Augmentation for Multi-Turn Intent Classification - JunhuaLiuSingapore University of Technology and Design and Forth AI + JunhuaLiuSingapore University of Technology and Design and Forth AI Tan YongKeat BinFuUniversity of Science and Technology Beijing - Kwan HuiLimSingapore University of Technology and Design + Kwan HuiLimSingapore University of Technology and Design 1096-1106 Multi-turn intent classification is notably challenging due to the complexity and evolving nature of conversational contexts. 
This paper introduces LARA, a Linguistic-Adaptive Retrieval-Augmentation framework to enhance accuracy in multi-turn classification tasks across six languages, accommodating numerous intents in chatbot interactions. LARA combines a fine-tuned smaller model with a retrieval-augmented mechanism, integrated within the architecture of LLMs. The integration allows LARA to dynamically utilize past dialogues and relevant intents, thereby improving the understanding of the context. Furthermore, our adaptive retrieval techniques bolster the cross-lingual capabilities of LLMs without extensive retraining and fine-tuning. Comprehensive experiments demonstrate that LARA achieves state-of-the-art performance on multi-turn intent classification tasks, enhancing the average accuracy by 3.67% from state-of-the-art single-turn intent classifiers. 2024.emnlp-industry.82 @@ -19567,7 +19567,7 @@ Generating Vehicular Icon Descriptions and Indications Using Large Vision-Language Models JamesFletcher - NicholasDehnen + NicholasDehnen Seyed NimaTayarani BathaieYork University AijunAnYork University HeidarDavoudiOntario Tech University @@ -19583,7 +19583,7 @@ Athena: Safe Autonomous Agents with Verbal Contrastive Learning TanmanaSadhuLG Corporation - AliPesaranghaderLG Electronics + AliPesaranghaderLG Electronics YananChenLG Corporation Dong HoonYiLG Corporation 1121-1130 @@ -19615,7 +19615,7 @@ SumitNeelamInternational Business Machines DineshRaghuIBM Research - New Delhi UditSharmaInternational Business Machines - Adriana MezaSoriaMIT-IBM Watson AI Lab + Adriana MezaSoriaMIT-IBM Watson AI Lab DheerajSreedharInternational Business Machines PraveenVenkateswaranInternational Business Machines MerveUnuvarIBM TJ Watson Research Center @@ -19644,7 +19644,7 @@ <fixed-case>D</fixed-case>i<fixed-case>AL</fixed-case> : Diversity Aware Listwise Ranking for Query Auto-Complete - SonaliSingh + SonaliSingh Sachin SudhakarFarfade Prakash MandayamComarAmazon 1152-1162 @@ -19685,12 +19685,12 @@ Knowledge-augmented Financial Market Analysis and Report Generation YueminChen - FeifanWu + FeifanWu JingweiWangAnt Group HaoQianAnt Group ZiqiLiuAnt Group ZhiqiangZhangAnt Group - JunZhouAnt Group + JunZhouAnt Group MengWangTongji University 1207-1217 Crafting a convincing financial market analysis report necessitates a wealth of market information and the expertise of financial analysts, posing a highly challenging task. While large language models (LLMs) have enabled the automated generation of financial market analysis text, they still face issues such as hallucinations, errors in financial knowledge, and insufficient capability to reason about complex financial problems, which limits the quality of the generation. To tackle these shortcomings, we propose a novel task and a retrieval-augmented framework grounded in a financial knowledge graph (FKG). The proposed framework is compatible with commonly used instruction-tuning methods. Experiments demonstrate that our framework, coupled with a small-scale language model fine-tuned with instructions, can significantly enhance the logical consistency and quality of the generated analysis texts, outperforming both large-scale language models and other retrieval-augmented baselines. @@ -19701,7 +19701,7 @@ Let Me Speak Freely? A Study On The Impact Of Format Restrictions On Large Language Model Performance. - Zhi RuiTamAppier + Zhi RuiTamAppier Cheng-KuangWuAppier Yi-LinTsaiAppier Chieh-YenLinAppier Inc. 
@@ -19715,7 +19715,7 @@ <fixed-case>ASTRA</fixed-case>: Automatic Schema Matching using Machine Translation - TarangChughAmazon + TarangChughAmazon DeepakZambreAmazon 1237-1244 Many eCommerce platforms source product information from millions of sellers and manufactures, each having their own proprietary schemas, and employ schema matching solutions to structure it to enable informative shopping experiences. Meanwhile, state-of-the-art machine translation techniques have demonstrated great success in building context-aware representations that generalize well to new languages with minimal training data. In this work, we propose modeling the schema matching problem as a neural machine translation task: given product context and an attribute-value pair from a source schema, the model predicts the corresponding attribute, if available, in the target schema. We utilize open-source seq2seq models, such as mT5 and mBART, fine-tuned on product attribute mappings to build a scalable schema matching framework. We demonstrate that our proposed approach achieves a significant performance boost (15% precision and 7% recall uplift) compared to the baseline system and can support new attributes with precision \ge 95\% using only five labeled samples per attribute. @@ -19777,7 +19777,7 @@ AlexanderKhasin GuyShiran AsnatGreenstein-MessicaLightricks and Reichman University - DafnaShahafHebrew University of Jerusalem + DafnaShahafHebrew University of Jerusalem 1286-1304 We present a practical distillation approach to fine-tune LLMs for invoking tools in real-time applications. We focus on visual editing tasks; specifically, we modify images and videos by interpreting user stylistic requests, specified in natural language (“golden hour”), using an LLM to select the appropriate tools and their parameters to achieve the desired visual effect.We found that proprietary LLMs such as GPT-3.5-Turbo show potential in this task, but their high cost and latency make them unsuitable for real-time applications.In our approach, we fine-tune a (smaller) student LLM with guidance from a (larger) teacher LLM and behavioral signals.We introduce offline metrics to evaluate student LLMs. Both online and offline experiments show that our student models manage to match the performance of our teacher model (GPT-3.5-Turbo), significantly reducing costs and latency.Lastly, we show that fine-tuning was improved by 25% in low-data regimes using augmentation. 2024.emnlp-industry.96 @@ -19791,9 +19791,9 @@ Provenance: A Light-weight Fact-checker for Retrieval Augmented <fixed-case>LLM</fixed-case> Generation Output HitheshSankararamanIndian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology Mohammed NasheedYasinUniphore - TannerSorensenUniphore + TannerSorensenUniphore Alessandro DiBariUNIPHORE - AndreasStolckeUniphore Technologies + AndreasStolckeUniphore Technologies 1305-1313 We present a light-weight approach for detecting nonfactual outputs from retrieval-augemented generation (RAG). Given a context and putative output, we compute a factuality score that can be thresholded to yield a binary decision to check the results of LLM-based question-answering, summarization, or other systems. Unlike factuality checkers that themselves rely on LLMs, we use compact, open-source natural language inference (NLI) models that yield a freely accessible solution with low latency and low cost at run-time, and no need for LLM fine-tuning. 
The approach also enables downstream mitigation and correction of hallucinations, by tracing them back to specific context chunks. Our experiments show high ROC-AUC across a wide range of relevant open source datasets, indicating the effectiveness of our method for fact-checking RAG output. 2024.emnlp-industry.97 @@ -19806,7 +19806,7 @@ AndreaMadottoFAIR ZhaojiangLinFacebook TusharNagarajanFAIR - MattSmith + MattSmith ShashankJain Chun-FuYehTaiwan AILabs PrakashMurugesanMeta @@ -19842,8 +19842,8 @@ Hyper-<fixed-case>QKSG</fixed-case>: Framework for Automating Query Generation and Knowledge-Snippet Extraction from Tables and Lists - DooyoungKimSung Kyun Kwan University - YoonjinJangSung Kyun Kwan University + DooyoungKimSung Kyun Kwan University + YoonjinJangSung Kyun Kwan University DongwookShin ChanhoonParkNAVER YoungjoongKoSungkyunkwan University @@ -19881,18 +19881,18 @@ <fixed-case>mGTE</fixed-case>: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval - XinZhangHong Kong Polytechnic University and Harbin Institute of Technology, Shenzhen + XinZhangHong Kong Polytechnic University and Harbin Institute of Technology, Shenzhen YanzhaoZhang DingkunLong - WenXie - ZiqiDai + WenXie + ZiqiDai JialongTang HuanLinAlibaba Group BaosongYang PengjunXie FeiHuangAlibaba Group MeishanZhangHarbin Institute of Technology (Shenzhen), China and Tianjin University, China - WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University + WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University MinZhangHarbin Institute of Technology, Shenzhen 1393-1412 We present systematic efforts in building long-context multilingual text representation model (TRM) and reranker from scratch for text retrieval. We first introduce a text encoder (base size) enhanced with RoPE and unpadding, pre-trained in a native 8192-token context (longer than 512 of previous multilingual encoders). Then we construct a hybrid TRM and a cross-encoder reranker by contrastive learning. Evaluations show that our text encoder outperforms the same-sized previous state-of-the-art XLM-R. Meanwhile, our TRM and reranker match the performance of large-sized state-of-the-art BGE-M3 models and achieve better results on long-context retrieval benchmarks. Further analysis demonstrate that our proposed models exhibit higher efficiency during both training and inference. We believe their efficiency and effectiveness could benefit various researches and industrial applications. @@ -19907,13 +19907,13 @@ AoQuMassachusetts Institute of Technology YihaoYan ZhaofengWuMassachusetts Institute of Technology - DingyiZhuang + DingyiZhuang JushiKaiShanghai Jiaotong University KebingHou XiaotongGuo - JinhuaZhaoMassachusetts Institute of Technology - ZhanZhaoUniversity of Hong Kong - WeiMa + JinhuaZhaoMassachusetts Institute of Technology + ZhanZhaoUniversity of Hong Kong + WeiMa 1413-1432 Citywalk, a recently popular form of urban travel, requires genuine personalization and understanding of fine-grained requests compared to traditional itinerary planning. In this paper, we introduce the novel task of Open-domain Urban Itinerary Planning (OUIP), which generates personalized urban itineraries from user requests in natural language. We then present ItiNera, an OUIP system that integrates spatial optimization with large language models to provide customized urban itineraries based on user needs. 
This involves decomposing user requests, selecting candidate points of interest (POIs), ordering the POIs based on cluster-aware spatial optimization, and generating the itinerary. Experiments on real-world datasets and the performance of the deployed system demonstrate our system’s capacity to deliver personalized and spatially coherent itineraries compared to current solutions. Source codes of ItiNera are available at https://github.com/YihongT/ITINERA. 2024.emnlp-industry.104 @@ -19923,7 +19923,7 @@ <fixed-case>REST</fixed-case>ful-Llama: Connecting User Queries to <fixed-case>REST</fixed-case>ful <fixed-case>API</fixed-case>s - HanXu + HanXu RuiningZhao JindongWangMicrosoft Research HaipengChenCollege of William and Mary @@ -19938,7 +19938,7 @@ AnnaHättyBosch DraganMilchevskiRobert Bosch GmbH, Bosch KerstenDöring - MarkoPutnikovicUniversity of Belgrade + MarkoPutnikovicUniversity of Belgrade MohsenMesgarBosch FilipNovovićUniversity of Belgrade MaximilianBraunRobert Bosch GmbH, Bosch @@ -19953,7 +19953,7 @@ <fixed-case>C</fixed-case>haracter<fixed-case>GLM</fixed-case>: Customizing Social Characters with Large Language Models JinfengZhou - ZhuangChen + ZhuangChen DazhenWan BosiWen YiSongComputer Science, Tsinghua University, Tsinghua University @@ -19963,13 +19963,13 @@ GuanqunBiTsinghua University LibiaoPeng JiaMingYangJilin University - XiyaoXiao + XiyaoXiao SahandSabour - XiaohanZhangBeijing Knowledge Atlas Technology Co., Ltd. - WenjingHou + XiaohanZhangBeijing Knowledge Atlas Technology Co., Ltd. + WenjingHou YijiaZhang - YuxiaoDongTsinghua University - HongningWangTsinghua University + YuxiaoDongTsinghua University + HongningWangTsinghua University JieTangTsinghua University, Tsinghua University MinlieHuang 1457-1476 @@ -19995,10 +19995,10 @@ Improving Retrieval in Sponsored Search by Leveraging Query Context Signals - Akash KumarMohankumarMicrosoft - GururajKMicrosoft + Akash KumarMohankumarMicrosoft + GururajKMicrosoft GaganMadan - AmitSinghMicrosoft + AmitSinghMicrosoft 1489-1498 Accurately retrieving relevant bid keywords for user queries is critical in Sponsored Search but remains challenging, particularly for short, ambiguous queries. Existing dense and generative retrieval models often fail to capture the nuanced user intent in these cases. To address this, we propose an approach to enhance query understanding by augmenting queries with rich contextual signals derived from web search results and large language models, stored in an online cache. Specifically, we use web search titles and snippets to ground queries in real-world information, and utilize GPT-4 to generate query rewrites and explanations that clarify user intent. These signals are efficiently integrated through a Fusion-in-Decoder based Unity architecture, enabling both dense and generative retrieval with serving costs on par with traditional context-free models. To address scenarios where context is unavailable in the cache, we introduce context glancing, a curriculum learning strategy that improves model robustness and performance even without contextual signals during inference. Extensive offline experiments demonstrate that our context-aware approach substantially outperforms context-free models. Furthermore, online A/B testing on a prominent search engine across 160+ countries shows significant improvements in user engagement and revenue. 
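The sponsored-search entry above augments short queries with cached web and LLM-generated context, and trains the model to stay robust when the cache misses ("context glancing"). A schematic of the serving-time lookup; the cache contents and formatting scheme here are invented:

```python
# Online cache of precomputed context signals keyed by query.
context_cache = {
    "jaguar": {"snippet": "Jaguar is a British luxury car maker...",
               "rewrite": "jaguar car brand"},
}

def build_retrieval_input(query: str) -> str:
    ctx = context_cache.get(query)
    if ctx is None:
        # Cache miss: fall back to the context-free query, which the model
        # was trained to handle robustly.
        return query
    return f"{query} | rewrite: {ctx['rewrite']} | web: {ctx['snippet']}"

print(build_retrieval_input("jaguar"))
print(build_retrieval_input("red shoes"))  # no cached context available
```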
2024.emnlp-industry.109 @@ -20010,16 +20010,16 @@ HaoranSun RenrenJin ShaoyangXu - LeiyuPanTianjin University - SupryadiTianjin University + LeiyuPanTianjin University + SupryadiTianjin University MenglongCui JiangcunDu YikunLei LeiYang - LingShi + LingShi JuesiXiao ShaolinZhuTianjin University - DeyiXiongTianjin University + DeyiXiongTianjin University 1499-1522 Large language models (LLMs) have demonstrated prowess in a wide range of tasks. However, many LLMs exhibit significant performance discrepancies between high- and low-resource languages. To mitigate this challenge, we present FuxiTranyu, an open-source multilingual LLM, which is designed to satisfy the need of the research community for balanced and high-performing multilingual capabilities. The base model, FuxiTranyu-8B, features 8 billion parameters and is trained from scratch on meticulously balanced multilingual data that contains 600 billion tokens covering 43 natural languages and 16 programming languages. We also develop two instruction-tuned models: FuxiTranyu-8B-SFT which is fine-tuned on a diverse multilingual instruction dataset, and FuxiTranyu-8B-DPO which is further refined with DPO on a preference dataset for enhanced alignment ability. Extensive experiments on a wide range of multilingual benchmarks demonstrate the competitive performance of FuxiTranyu against existing multilingual LLMs, e.g., BLOOM-7B, PolyLM-13B, and Mistral-7B-Instruct. Both neuron and representation interpretability analyses reveal that FuxiTranyu achieves consistent multilingual representations across languages. To promote further research into multilingual LLMs, we release both the base and instruction-tuned FuxiTranyu models together with 58 pre-training checkpoints at HuggingFace and Github. 2024.emnlp-industry.110 @@ -20029,7 +20029,7 @@ <fixed-case>QUIS</fixed-case>: Question-guided Insights Generation for Automated Exploratory Data Analysis AbhijitManatkarInternational Business Machines - AshleshaAkella + AshleshaAkella ParthiviGupta KrishnasuriNarayanam 1523-1535 @@ -20098,8 +20098,8 @@ Efficient Answer Retrieval System (<fixed-case>EARS</fixed-case>): Combining Local <fixed-case>DB</fixed-case> Search and Web Search for Generative <fixed-case>QA</fixed-case> NikitaKraykoMTS AI - IvanSidorov - FedorLaputin + IvanSidorov + FedorLaputin DariaGalimzianovaMTS AI VasilyKonovalovAIRI 1584-1594 @@ -20144,11 +20144,11 @@ Course-Correction: Safety Alignment Using Synthetic Preferences RongwuXu - YishuoCai + YishuoCai ZhenhongZhou RenjieGu HaiqinWeng - LiuYan + LiuYan TianweiZhangNanyang Technological University WeiXuTsinghua University, Tsinghua University HanQiuTsinghua University @@ -20167,7 +20167,7 @@ ZhenxinDingNA XiaodongZhangNA HaiboShi - JunfengWang + JunfengWang DaweiYinBaidu 1650-1658 Pre-trained language models have become an integral component of question-answering systems, achieving remarkable performance. However, for practical deployment, it is crucial to perform knowledge distillation to maintain high performance while operating under computational constraints. In this paper, we address a key question: given the importance of unsupervised distillation for student model performance, how can knowledge from multiple teacher models be effectively ensemble during this stage without the guidance of labels? We propose a novel algorithm, GOVERN, to tackle this issue. 
GOVERN has demonstrated significant improvements in both offline and online experiments, enabling the student model to achieve results comparable to those of teacher ensembles. Our experiments show that, remarkably, GOVERN requires a mere 1% of the ensemble method’s inference budget to achieve 99.5% of its performance. The proposed algorithm has been successfully deployed in a real-world commercial question-answering system, demonstrating its real-world applicability.

diff --git a/data/xml/2024.fever.xml b/data/xml/2024.fever.xml
index 6676c09963..a00735d92d 100644
--- a/data/xml/2024.fever.xml
+++ b/data/xml/2024.fever.xml
@@ -62,7 +62,7 @@
 Retrieving Semantics for Fact-Checking: A Comparative Approach using <fixed-case>CQ</fixed-case> (Claim to Question) & <fixed-case>AQ</fixed-case> (Answer to Question)
 NicolòUrbaniUniversity of Milan - Bicocca
 SandipModhaUniversity of Milan - Bicocca
- GabriellaPasiUniversity of Milan - Bicocca
+ GabriellaPasiUniversity of Milan - Bicocca
 37-46
 Fact-checking using evidence is the preferred way to tackle the issue of misinformation in society. The democratization of information through social media has accelerated the spread of information, allowing misinformation to reach and influence a vast audience. The significant impact of these falsehoods on society and public opinion underscores the need for automated approaches to identify and combat this phenomenon. This paper describes the participation of team IKR3-UNIMIB in the AVeriTeC (Automated Verification of Textual Claims) 2024 shared task. We propose a method to retrieve evidence in the question-and-answer format and predict the veracity of a claim. As part of the AVeriTeC shared task, our method combines a similarity-based ColBERT re-ranker with traditional keyword search using BM25. Additionally, a recent promising approach, Chain of RAG (CoRAG), is introduced to generate question-and-answer pairs (QAs) to evaluate performance on this specific dataset. We explore whether generating questions from claims or answers produces more effective QA pairs for veracity prediction, and we also try generating questions from the claim rather than from evidence (the opposite of the AVeriTeC dataset paper). Our method achieved an AVeriTeC score of 0.18 (above the baseline) on the test dataset, demonstrating its potential in automated fact-checking.
 2024.fever-1.3
@@ -75,7 +75,7 @@
 <fixed-case>RAG</fixed-case>-Fusion Based Information Retrieval for Fact-Checking
 YukiMomii
 TetsuyaTakiguchiKobe University
- YasuoArikiKobe University
+ YasuoArikiKobe University
 47-54
 Fact-checking involves searching for relevant evidence and determining whether the given claim contains any misinformation. In this paper, we propose a fact verification system based on RAG-Fusion. We use GPT-4o to generate questions from the claim, which helps improve the accuracy of evidence retrieval. Additionally, we adopt GPT-4o for the final judgment module and refine the prompts to enhance the detection accuracy, particularly when the claim contains misinformation. Experiments showed that our system achieved an AVeriTeC score of 0.3865 on the AVeriTeC test data, significantly surpassing the baseline score of 0.11.
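RAG-Fusion, as used in the fever-1.4 entry above, retrieves with several generated questions and merges the per-question rankings; the merge rule commonly associated with it is Reciprocal Rank Fusion, score(d) = sum over rankings of 1/(k + rank(d)). A compact sketch with the customary k = 60:

```python
# Merge several ranked result lists into one via Reciprocal Rank Fusion.
def reciprocal_rank_fusion(rankings: list[list[str]], k: int = 60) -> list[str]:
    scores: dict[str, float] = {}
    for ranking in rankings:
        for rank, doc in enumerate(ranking, start=1):
            scores[doc] = scores.get(doc, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Three per-question result lists; d2 ranks high in all of them.
print(reciprocal_rank_fusion([["d1", "d2"], ["d2", "d3"], ["d2", "d1"]]))
# ['d2', 'd1', 'd3']
```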
2024.fever-1.4
@@ -85,8 +85,8 @@
 <fixed-case>UHH</fixed-case> at <fixed-case>AV</fixed-case>eri<fixed-case>T</fixed-case>e<fixed-case>C</fixed-case>: <fixed-case>RAG</fixed-case> for Fact-Checking with Real-World Claims
 ÖzgeSevgili
- IrinaNikishina
- Seid MuhieYimamUniversität Hamburg
+ IrinaNikishina
+ Seid MuhieYimamUniversität Hamburg
 MartinSemmannUniversität Hamburg
 ChrisBiemannU Hamburg
 55-63
@@ -98,7 +98,7 @@
 Improving Evidence Retrieval on Claim Verification Pipeline through Question Enrichment
 SvetlanaChurina
- Anab MaulanaBarik
+ Anab MaulanaBarik
 Saisamarth RajeshPhaye
 64-70
 The AVeriTeC shared task introduces a new real-world claim verification dataset, where a system is tasked with verifying a real-world claim based on evidence found on the internet. In this paper, we propose a claim verification pipeline called QueenVer, which consists of two modules, Evidence Retrieval and Claim Verification. Our pipeline collects pairs of <Question, Answer> as the evidence. Recognizing the pivotal role of question quality in the efficacy of the evidence, we propose question enrichment to enhance the retrieved evidence. Specifically, we adopt three different Question Generation (QG) techniques: multi-hop, single-hop, and fact-checker style. For the claim verification module, we integrate an ensemble of multiple state-of-the-art LLMs to enhance its robustness. Experiments show that QueenVer achieves 0.41, 0.29, and 0.42 on the Q, Q+A, and AVeriTeC scores.
@@ -121,9 +121,9 @@
 <fixed-case>FZI</fixed-case>-<fixed-case>WIM</fixed-case> at <fixed-case>AV</fixed-case>eri<fixed-case>T</fixed-case>e<fixed-case>C</fixed-case> Shared Task: Real-World Fact-Checking with Question Answering
- JinLiu
+ JinLiu
 SteffenThoma
- AchimRettingerFZI Forschungszentrum Informatik and Trier University
+ AchimRettingerFZI Forschungszentrum Informatik and Trier University
 77-85
 This paper describes the FZI-WIM system at the AVeriTeC shared task, which aims to assess evidence-based automated fact-checking systems for real-world claims with evidence retrieved from the web. The FZI-WIM system utilizes open-source models to build a reliable fact-checking pipeline via question answering. With different experimental setups, we show that more questions lead to higher scores in the shared task. In both the question generation and question answering stages, sampling can be a way to improve the performance of our system. We further analyze the limitations of current open-source models for real-world claim verification. Our code is publicly available at https://github.com/jens5588/FZI-WIM-AVERITEC.
 2024.fever-1.8
 RonitSingalIIT Kharagpur, India
 PranshPatwaAditya English Medium School, India
 ParthPatwaAmazon and University of California, Los Angeles
- AmanChadhaAmazon
+ AmanChadhaAmazon
 AmitavaDasUniversity of South Carolina
 91-98
 Given the widespread dissemination of misinformation on social media, implementing fact-checking mechanisms for online claims is essential. Manually verifying every claim is highly challenging, underscoring the need for an automated fact-checking system. This paper presents our system designed to address this issue. We utilize the Averitec dataset (Schlichtkrull et al., 2023) to assess the performance of our fact-checking system. In addition to veracity prediction, our system provides supporting evidence, which is extracted from the dataset.
We develop a Retrieve and Generate (RAG) pipeline to extract relevant evidence sentences from a knowledge base, which are then inputted along with the claim into a large language model (LLM) for classification. We also evaluate the few-shot In-Context Learning (ICL) capabilities of multiple LLMs. Our system achieves an ‘Averitec’ score of 0.33, which is a 22% absolute improvement over the baseline. Our Code is publicly available on https://github.com/ronit-singhal/evidence-backed-fact-checking-using-rag-and-few-shot-in-context-learning-with-llms. @@ -157,7 +157,7 @@ <fixed-case>SK</fixed-case>_<fixed-case>DU</fixed-case> Team: Cross-Encoder based Evidence Retrieval and Question Generation with Improved Prompt for the <fixed-case>AV</fixed-case>eri<fixed-case>T</fixed-case>e<fixed-case>C</fixed-case> Shared Task ShrikantMalviya - StamosKatsigiannisDurham University + StamosKatsigiannisDurham University 99-107 As part of the AVeriTeC shared task, we developed a pipelined system comprising robust and finely tuned models. Our system integrates advanced techniques for evidence retrieval and question generation, leveraging cross-encoders and large language models (LLMs) for optimal performance. With multi-stage processing, the pipeline demonstrates improvements over baseline models, particularly in handling complex claims that require nuanced reasoning by improved evidence extraction, question generation and veracity prediction. Through detailed experiments and ablation studies, we provide insights into the strengths and weaknesses of our approach, highlighting the critical role of evidence sufficiency and context dependency in automated fact-checking systems. Our system secured a competitive rank, 7th on the development and 12th on the test data, in the shared task, underscoring the effectiveness of our methods in addressing the challenges of real-world claim verification. 2024.fever-1.11 @@ -169,7 +169,7 @@ MarkRothermelTechnische Universität Darmstadt TobiasBraun MarcusRohrbachTechnische Universität Darmstadt - AnnaRohrbachTechnische Universität Darmstadt + AnnaRohrbachTechnische Universität Darmstadt 108-112 The spread of disinformation poses a global threat to democratic societies, necessitating robust and scalable Automated Fact-Checking (AFC) systems. The AVeriTeC Shared Task Challenge 2024 offers a realistic benchmark for text-based fact-checking methods. This paper presents Information-Retrieving Fact-Checker (InFact), an LLM-based approach that breaks down the task of claim verification into a 6-stage process, including evidence retrieval. When using GPT-4o as the backbone, InFact achieves an AVeriTeC score of 63% on the test set, outperforming all other 20 teams competing in the challenge, and establishing a new strong baseline for future text-only AFC systems. Qualitative analysis of mislabeled instances reveals that InFact often yields a more accurate conclusion than AVeriTeC’s human-annotated ground truth. 
2024.fever-1.12 @@ -190,9 +190,9 @@ JiayuLiu JunhaoTang HanwenWangThe Hong Kong University of Science and Technology - BaixuanXuHong Kong University of Science and Technology + BaixuanXuHong Kong University of Science and Technology HaochenShi - WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology + WeiqiWangJohns Hopkins University and The Hong Kong University of Science and Technology YangqiuSongThe Hong Kong University of Science and Technology 118-129 In the information era, the vast proliferation of online content poses significant challenges, particularly concerning the trustworthiness of these digital statements, which can have profound societal implications. Although it is possible to manually annotate and verify the authenticity of such content, the sheer volume and rapid pace of information generation render this approach impractical, both in terms of time and cost. Therefore, it is imperative to develop automated systems capable of validating online claims, ensuring that users can use the wealth of information available on the Internet effectively and reliably. Using primarily ChatGPT and the Google search API, GProofT fact checking framework generates question-answer pairs to systematically extract and verify the facts within claims. Based on the outcomes of these QA pairs, claims are subsequently labeled as Supported, Conflicted Evidence/Cherry-Picking, or Refuted. Shown by extensive experiments, GProofT Retrieval generally performs effectively in fact-checking and makes a substantial contribution to the task. Our code is released on https://github.com/HKUST-KnowComp/GProofT. @@ -204,8 +204,8 @@ <fixed-case>H</fixed-case>er<fixed-case>O</fixed-case> at <fixed-case>AV</fixed-case>eri<fixed-case>T</fixed-case>e<fixed-case>C</fixed-case>: The Herd of Open Large Language Models for Verifying Real-World Claims YejunYoonSoongsil University JaeyoonJung - SeunghyunYoonAdobe Research - KunwooParkSoongsil University + SeunghyunYoonAdobe Research + KunwooParkSoongsil University 130-136 To tackle the AVeriTeC shared task hosted by the FEVER-24, we introduce a system that only employs publicly available large language models (LLMs) for each step of automated fact-checking, dubbed the Herd of Open LLMs for verifying real-world claims (HerO). HerO employs multiple LLMs for each step of automated fact-checking. For evidence retrieval, a language model is used to enhance a query by generating hypothetical documents that check the veracity of a claim. We fine-tune LLMs for question generation and veracity prediction by crafting prompts with retrieved in-context samples. HerO achieved 2nd place on the leaderboard with the AVeriTeC score of 0.57, suggesting the potential of open LLMs for verifying real-world claims. For future research, we make our code publicly available at https://github.com/ssu-humane/HerO. 
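HerO above retrieves evidence by first generating hypothetical fact-checking documents and searching with those instead of the bare claim. A schematic of that step with stubbed-out model calls; neither helper reflects HerO's actual interfaces:

```python
# Embed an LLM-written pseudo fact-check instead of the raw claim, then
# query the evidence index with the resulting vector.
def llm(prompt: str) -> str:          # stand-in for any instruction-tuned LLM
    return f"A fact-check article examining the claim: {prompt}"

def embed(text: str) -> list[float]:  # stand-in for a dense sentence encoder
    return [float(len(w)) for w in text.split()[:4]]

def hyde_query_vector(claim: str) -> list[float]:
    hypothetical_doc = llm(f"Write a passage that verifies or refutes: {claim}")
    return embed(hypothetical_doc)    # use this vector to search the index

print(hyde_query_vector("The Eiffel Tower was completed in 1889."))
```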
2024.fever-1.15 @@ -215,8 +215,8 @@ <fixed-case>AIC</fixed-case> <fixed-case>CTU</fixed-case> system at <fixed-case>AV</fixed-case>eri<fixed-case>T</fixed-case>e<fixed-case>C</fixed-case>: Re-framing automated fact-checking as a simple <fixed-case>RAG</fixed-case> task HerbertUllrich - TomášMlynářCzech Technical Univeresity in Prague, Czech Technical University of Prague - JanDrchalCzech Technical Univeresity in Prague, Czech Technical University of Prague + TomášMlynářCzech Technical Univeresity in Prague, Czech Technical University of Prague + JanDrchalCzech Technical Univeresity in Prague, Czech Technical University of Prague 137-150 This paper describes our 3^{rd} place submission in the AVeriTeC shared task in which we attempted to address the challenge of fact-checking with evidence retrieved in the wild using a simple scheme of Retrieval-Augmented Generation (RAG) designed for the task, leveraging the predictive power of Large Language Models.We release our codebase and explain its two modules - the Retriever and the Evidence & Label generator - in detail, justifying their features such as MMR-reranking and Likert-scale confidence estimation.We evaluate our solution on AVeriTeC dev and test set and interpret the results, picking the GPT-4o as the most appropriate model for our pipeline at the time of our publication, with Llama 3.1 70B being a promising open-source alternative.We perform an empirical error analysis to see that faults in our predictions often coincide with noise in the data or ambiguous fact-checks, provoking further research and data augmentation. 2024.fever-1.16 @@ -225,9 +225,9 @@ Enhancing Fact Verification with Causal Knowledge Graphs and Transformer-Based Retrieval for Deductive Reasoning - Fiona AntingTan + Fiona AntingTan JayDesaiAmazon - Srinivasan H.SengameduAmazon + Srinivasan H.SengameduAmazon 151-169 The ability to extract and verify factual information from free-form text is critical in an era where vast amounts of unstructured data are available, yet unreliable sources abound. This paper focuses on enhancing causal deductive reasoning, a key component of factual verification, through the lens of accident investigation, where determining the probable causes of events is paramount. Deductive reasoning refers to the task of drawing conclusions based on a premise. While some deductive reasoning benchmarks exist, none focus on causal deductive reasoning and are from real-world applications. Recently, large language models (LLMs) used with prompt engineering techniques like retrieval-augmented generation (RAG) have demonstrated remarkable performance across various natural language processing benchmarks. However, adapting these techniques to handle scenarios with no knowledge bases and to different data structures, such as graphs, remains an ongoing challenge. In our study, we introduce a novel framework leveraging LLMs’ decent ability to detect and infer causal relations to construct a causal Knowledge Graph (KG) which represents knowledge that the LLM recognizes. Additionally, we propose a RoBERTa-based Transformer Graph Neural Network (RoTG) specifically designed to select relevant nodes within this KG. Integrating RoTG-retrieved causal chains into prompts effectively enhances LLM performance, demonstrating usefulness of our approach in advancing LLMs’ causal deductive reasoning capabilities. 
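The causal-KG entry above retrieves causal chains from an LLM-constructed graph to include in the prompt. The retrieval half reduces to path search over directed cause-effect edges; a toy breadth-first version over an invented graph:

```python
from collections import deque

# "A causes B" adjacency list; the edges below are invented for illustration.
causal_edges = {
    "fuel leak": ["fire"],
    "fire": ["engine failure"],
    "engine failure": ["emergency landing"],
}

def causal_chain(start: str, goal: str) -> list[str] | None:
    queue = deque([[start]])
    while queue:
        path = queue.popleft()
        if path[-1] == goal:
            return path
        for nxt in causal_edges.get(path[-1], []):
            if nxt not in path:  # avoid cycles
                queue.append(path + [nxt])
    return None

print(causal_chain("fuel leak", "emergency landing"))
# ['fuel leak', 'fire', 'engine failure', 'emergency landing']
```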
2024.fever-1.20 @@ -256,7 +256,7 @@ YotamIntrator RegevCohenGoogle OriKelner - RomanGoldenberg + RomanGoldenberg EhudRivlinTechnion, Technion DanielFreedmanVerily 186-191 @@ -267,12 +267,12 @@ Improving Explainable Fact-Checking via Sentence-Level Factual Reasoning - FrancielleVargas - IsadoraSalles + FrancielleVargas + IsadoraSalles DiegoAlvesUniversität des Saarlandes AmeetaAgrawalPortland State University Thiago A. S.PardoUniversidade de São Paulo - FabrícioBenevenuto + FabrícioBenevenuto 192-204 Most existing fact-checking systems are unable to explain their decisions by providing relevant rationales (justifications) for their predictions. It highlights a lack of transparency that poses significant risks, such as the prevalence of unexpected biases, which may increase political polarization due to limitations in impartiality. To address this critical gap, we introduce SEntence-Level FActual Reasoning (SELFAR), aimed at improving explainable fact-checking. SELFAR relies on fact extraction and verification by predicting the news source reliability and factuality (veracity) of news articles or claims at the sentence level, generating post-hoc explanations using SHAP/LIME and zero-shot prompts. Our experiments show that unreliable news stories predominantly consist of subjective statements, in contrast to reliable ones. Consequently, predicting unreliable news articles at the sentence level by analyzing impartiality and subjectivity is a promising approach for fact extraction and improving explainable fact-checking. Furthermore, LIME outperforms SHAP in explaining predictions on reliability. Additionally, while zero-shot prompts provide highly readable explanations and achieve an accuracy of 0.71 in predicting factuality, their tendency to hallucinate remains a challenge. Lastly, this paper also presents the first study on explainable fact-checking in the Portuguese language. 2024.fever-1.23 @@ -292,7 +292,7 @@ Question-Based Retrieval using Atomic Units for Enterprise <fixed-case>RAG</fixed-case> - VatsalRaina + VatsalRaina MarkGalesUniversity of Cambridge 219-233 Enterprise retrieval augmented generation (RAG) offers a highly flexible framework for combining powerful large language models (LLMs) with internal, possibly temporally changing, documents. In RAG, documents are first chunked. Relevant chunks are then retrieved for a user query, which are passed as context to a synthesizer LLM to generate the query response. However, the retrieval step can limit performance, as incorrect chunks can lead the synthesizer LLM to generate a false response. This work applies a zero-shot adaptation of standard dense retrieval steps for more accurate chunk recall. Specifically, a chunk is first decomposed into atomic statements. A set of synthetic questions are then generated on these atoms (with the chunk as the context). Dense retrieval involves finding the closest set of synthetic questions, and associated chunks, to the user query. It is found that retrieval with the atoms leads to higher recall than retrieval with chunks. Further performance gain is observed with retrieval using the synthetic questions generated over the atoms. Higher recall at the retrieval step enables higher performance of the enterprise LLM using the RAG pipeline. 
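The question-based retrieval entry above indexes synthetic questions generated over each chunk's atomic statements and maps hits back to the parent chunk. A miniature of that index under obviously simplified assumptions: the "synthetic" questions are hand-written, and a cheap string ratio stands in for dense retrieval:

```python
from difflib import SequenceMatcher

# (synthetic question over an atom, parent chunk id)
index = [
    ("When was the company founded?", "chunk-1"),
    ("Who is the current CEO?", "chunk-1"),
    ("What products does the company sell?", "chunk-2"),
]

def retrieve_chunk(user_query: str) -> str:
    def sim(q: str) -> float:
        return SequenceMatcher(None, user_query.lower(), q.lower()).ratio()
    # Return the chunk behind the closest synthetic question.
    return max(index, key=lambda pair: sim(pair[0]))[1]

print(retrieve_chunk("when was the company founded"))  # -> chunk-1
```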
@@ -302,9 +302,9 @@ <fixed-case>AMRE</fixed-case>x: <fixed-case>AMR</fixed-case> for Explainable Fact Verification - ChathuriJayaweeraUniversity of Florida - SangpilYoumUniversity of Florida - Bonnie JDorrUniversity of Florida + ChathuriJayaweeraUniversity of Florida + SangpilYoumUniversity of Florida + Bonnie JDorrUniversity of Florida 234-244 With the advent of social media networks and the vast amount of information circulating through them, automatic fact verification is an essential component to prevent the spread of misinformation. It is even more useful to have fact verification systems that provide explanations along with their classifications to ensure accurate predictions. To address both of these requirements, we implement AMREx, an Abstract Meaning Representation (AMR)-based veracity prediction and explanation system for fact verification using a combination of Smatch, an AMR evaluation metric to measure meaning containment and textual similarity, and demonstrate its effectiveness in producing partially explainable justifications using two community standard fact verification datasets, FEVER and AVeriTeC. AMREx surpasses the AVeriTec baseline accuracy showing the effectiveness of our approach for real-world claim verification. It follows an interpretable pipeline and returns an explainable AMR node mapping to clarify the system’s veracity predictions when applicable. We further demonstrate that AMREx output can be used to prompt LLMs to generate natural-language explanations using the AMR mappings as a guide to lessen the probability of hallucinations. 2024.fever-1.26 @@ -325,7 +325,7 @@ Contrastive Learning to Improve Retrieval for Real-World Fact Checking AniruddhSriram FangyuanXuUniversity of Texas at Austin and University of Texas at Austin - EunsolChoiNew York University + EunsolChoiNew York University GregDurrettUniversity of Texas, Austin 264-279 Recent work on fact-checking addresses a realistic setting where models incorporate evidence retrieved from the web to decide the veracity of claims. A bottleneck in this pipeline is in retrieving relevant evidence: traditional methods may surface documents directly related to a claim, but fact-checking complex claims requires more inferences. For instance, a document about how a vaccine was developed is relevant to addressing claims about what it might contain, even if it does not address them directly. We present Contrastive Fact-Checking Reranker (CFR), an improved retriever for this setting. By leveraging the AVeriTeC dataset, which annotates subquestions for claims with human written answers from evidence documents, we fine-tune Contriever with a contrastive objective based on multiple training signals, including distillation from GPT-4, evaluating subquestion answers, and gold labels in the dataset. We evaluate our model on both retrieval and end-to-end veracity judgments about claims. On the AVeriTeC dataset, we find a 6% improvement in veracity classification accuracy. We also show our gains can be transferred to FEVER, ClaimDecomp, HotpotQA, and a synthetic dataset requiring retrievers to make inferences. @@ -339,7 +339,7 @@ Paul Yu-ChunChangappliedAI Initiative GmbH MingyangMa BernhardPflugfelderappliedAI Initiative GmbH - FilipMiletićUniversity of Stuttgart + FilipMiletićUniversity of Stuttgart 280-296 The escalating challenge of misinformation, particularly in political discourse, requires advanced fact-checking solutions; this is even clearer in the more complex scenario of multimodal claims. 
We tackle this issue using a multimodal large language model in conjunction with retrieval-augmented generation (RAG), and introduce two novel reasoning techniques: Chain of RAG (CoRAG) and Tree of RAG (ToRAG). They fact-check multimodal claims by extracting both textual and image content, retrieving external information, and reasoning subsequent questions to be answered based on prior evidence. We achieve a weighted F1-score of 0.85, surpassing a baseline reasoning technique by 0.14 points. Human evaluation confirms that the vast majority of our generated fact-check explanations contain all information from gold standard data. 2024.fever-1.29 @@ -348,8 +348,8 @@ <fixed-case>F</fixed-case>act<fixed-case>G</fixed-case>enius: Combining Zero-Shot Prompting and Fuzzy Relation Mining to Improve Fact Verification with Knowledge Graphs - SushantGautam - RoxanaPop + SushantGautam + RoxanaPop 297-306 Fact-checking is a crucial natural language processing (NLP) task that verifies the truthfulness of claims by considering reliable evidence. Traditional methods are labour-intensive, and most automatic approaches focus on using documents as evidence. In this paper, we focus on the relatively understudied fact-checking with Knowledge Graph data as evidence and experiment on the recently introduced FactKG benchmark. We present FactGenius, a novel method that enhances fact-checking by combining zero-shot prompting of large language models (LLMs) with fuzzy text matching on knowledge graphs (KGs). Our method employs LLMs for filtering relevant connections from the graph and validates these connections via distance-based matching. The evaluation of FactGenius on an existing benchmark demonstrates its effectiveness, as we show it significantly outperforms state-of-the-art methods. The code and materials are available at https://github.com/SushantGautam/FactGenius. 2024.fever-1.30 diff --git a/data/xml/2024.findings.xml b/data/xml/2024.findings.xml index 2fdfd13b76..282e105c3b 100644 --- a/data/xml/2024.findings.xml +++ b/data/xml/2024.findings.xml @@ -6118,7 +6118,7 @@ Controllable Data Augmentation for Few-Shot Text Mining with Chain-of-Thought Attribute Manipulation LetianPeng - YuweiZhangUniversity of California, San Diego + YuweiZhangUniversity of California, San Diego JingboShangUniversity of California, San Diego 1-16 Prompting large language models (LLMs) for data augmentation has recently become a common practice in few-shot NLP tasks. In this paper, we propose Chain-of-Thought Attribute Manipulation (CoTAM), a novel approach that generates new data from existing examples by only tweaking in the user-provided, task-specific attribute, e.g., sentiment polarity or topic in movie reviews. Instead of conventional latent representation controlling, we leverage the chain-of-thought prompting to directly edit the text in three steps, (1) attribute decomposition, (2) manipulation proposal, and (3) sentence reconstruction. Extensive results on various tasks, such as text (pair) classification and aspect-based sentiment analysis, verify the superiority of CoTAM over other LLM-based augmentation methods with the same number of training examples for both fine-tuning and in-context learning. Remarkably, the 2D visualization of the augmented dataset using principal component analysis revealed a human-recognizable decision boundary that is likely hinted by the attribute manipulation, demonstrating the potential of our proposed approach. @@ -6128,9 +6128,9 @@ Match More, Extract Better!
Hybrid Matching Model for Open Domain Web Keyphrase Extraction - MingyangSongTencent + MingyangSongTencent LipingJingBeijing Jiaotong University - YiFeng + YiFeng 17-27 Keyphrase extraction aims to automatically extract salient phrases representing the critical information in the source document. Identifying salient phrases is challenging because there is a lot of noisy information in the document, leading to wrong extraction. To address this issue, in this paper, we propose a hybrid matching model for keyphrase extraction, which combines representation-focused and interaction-based matching modules into a unified framework for improving the performance of the keyphrase extraction task. Specifically, HybridMatch comprises (1) a PLM-based Siamese encoder component that represents both candidate phrases and documents, (2) an interaction-focused matching (IM) component that estimates word matches between candidate phrases and the corresponding document at the word level, and (3) a representation-focused matching (RM) component captures context-aware semantic relatedness of each candidate keyphrase at the phrase level. Extensive experimental results on the OpenKP dataset demonstrate that the performance of the proposed model HybridMatch outperforms the recent state-of-the-art keyphrase extraction baselines. Furthermore, we discuss the performance of large language models in keyphrase extraction based on recent studies and our experiments. 2024.findings-acl.2 @@ -6143,7 +6143,7 @@ SichengZhang ShijieCaoMicrosoft Research Asia DaYouDuHKUST(GZ) - JianyuWei + JianyuWei TingCaoMicrosoft Research NingyiXuShanghai Jiaotong University 28-36 @@ -6166,7 +6166,7 @@ Overcoming Catastrophic Forgetting by Exemplar Selection in Task-oriented Dialogue System ChenChenNanyang Technological University - RuizheLiUniversity of Aberdeen + RuizheLiUniversity of Aberdeen YuchenHu YuanyuanChenNanyang Technological University ChengweiQinNanyang Technological University @@ -6191,7 +6191,7 @@ AlexGuMassachusetts Institute of Technology Wen-DingLiCornell University NamanJainUniversity of California, Berkeley - TheoOlaussonMassachusetts Institute of Technology + TheoOlaussonMassachusetts Institute of Technology CelineLeeCornell University KoushikSenUC Berkeley, University of California, Berkeley ArmandoSolar-LezamaMassachusetts Institute of Technology @@ -6209,7 +6209,7 @@ BaileyKuehl ChenhaoTanUniversity of Chicago DavidWaddenAllen Institute for Artificial Intelligence - LucyWangUniversity of Washington and Allen Institute for Artificial Intelligence + LucyWangUniversity of Washington and Allen Institute for Artificial Intelligence AakankshaNaikAllen Institute for Artificial Intelligence and National Institutes of Health 118-132 Literature review requires researchers to synthesize a large amount of information and is increasingly challenging as the scientific literature expands. In this work, we investigate the potential of LLMs for producing hierarchical organizations of scientific studies to assist researchers with literature review. We define hierarchical organizations as tree structures where nodes refer to topical categories and every node is linked to the studies assigned to that category. Our naive LLM-based pipeline for hierarchy generation from a set of studies produces promising yet imperfect hierarchies, motivating us to collect CHIME, an expert-curated dataset for this task focused on biomedicine. 
Given the challenging and time-consuming nature of building hierarchies from scratch, we use a human-in-the-loop process in which experts correct errors (both links between categories and study assignment) in LLM-generated hierarchies. CHIME contains 2,174 LLM-generated hierarchies covering 472 topics, and expert-corrected hierarchies for a subset of 100 topics. Expert corrections allow us to quantify LLM performance, and we find that while they are quite good at generating and organizing categories, their assignment of studies to categories could be improved. We attempt to train a corrector model with human feedback which improves study assignment by 12.6 F1 points. We release our dataset and models to encourage research on developing better assistive tools for literature review. @@ -6219,16 +6219,16 @@ Which Side Are You On? A Multi-task Dataset for End-to-End Argument Summarisation and Evaluation - HaoLi - YupingWu + HaoLi + YupingWu ViktorSchlegelImperial College London RizaBatista-NavarroUniversity of Manchester - TharinduMadusanka + TharinduMadusanka IqraZahid JiayanZeng XiaochiWang XinranHe - YizhiLiUniversity of Manchester and University of Sheffield + YizhiLiUniversity of Manchester and University of Sheffield GoranNenadicUniversity of Manchester 133-150 With the recent advances of large language models (LLMs), it is no longer infeasible to build an automated debate system that helps people to synthesise persuasive arguments. Previous work attempted this task by integrating multiple components. In our work, we introduce an argument mining dataset that captures the end-to-end process of preparing an argumentative essay for a debate, which covers the tasks of claim and evidence identification (Task 1 ED), evidence convincingness ranking (Task 2 ECR), argumentative essay summarisation and human preference ranking (Task 3 ASR) and metric learning for automated evaluation of resulting essays, based on human feedback along argument quality dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are fully annotated with various properties supporting the aforementioned tasks. We evaluate multiple generative baselines for each of these tasks, including representative LLMs. We find, that while they show promising results on individual tasks in our benchmark, their end-to-end performance on all four tasks in succession deteriorates significantly, both in automated measures as well as in human-centred evaluation. This challenge presented by our proposed dataset motivates future research on end-to-end argument mining and summarisation. The repository of this project is available at https://github.com/HarrywillDr/ArgSum-Datatset. 
@@ -6243,7 +6243,7 @@ SarathkrishnaSwaminathanInternational Business Machines AsafYehudai SubhajitChaudhuryInternational Business Machines - RaduFlorianInternational Business Machines + RaduFlorianInternational Business Machines RamónAstudilloInternational Business Machines AsimMunawarInternational Business Machines 151-162 @@ -6254,16 +6254,16 @@ Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs - BowenJin + BowenJin ChulinXieUniversity of Illinois, Urbana Champaign JiaweiZhang Kashob KumarRoyDepartment of Computer Science - YuZhangTexas A&M University - College Station + YuZhangTexas A&M University - College Station ZhengLiAmazon RuiruiLi XianfengTangAmazon - SuhangWangPennsylvania State University - YuMengUniversity of Virginia + SuhangWangPennsylvania State University + YuMengUniversity of Virginia JiaweiHan 163-184 Large language models (LLMs), while exhibiting exceptional performance, suffer from hallucinations, especially on knowledge-intensive tasks. Existing works propose to augment LLMs with individual text units retrieved from external knowledge corpora to alleviate the issue. However, in many domains, texts are interconnected (e.g., academic papers in a bibliographic graph are linked by citations and co-authorships) which form a (text-attributed) graph. The knowledge in such graphs is encoded not only in single texts/nodes but also in their associated connections. To facilitate the research of augmenting LLMs with graphs, we manually construct a Graph Reasoning Benchmark dataset called GRBench, containing 1,740 questions that can be answered with the knowledge from 10 domain graphs. Then, we propose a simple and effective framework called Graph Chain-of-thought (Graph-CoT) to augment LLMs with graphs by encouraging LLMs to reason on the graph iteratively. Each Graph-CoT iteration consists of three sub-steps: LLM reasoning, LLM-graph interaction, and graph execution. We conduct systematic experiments with three LLM backbones on GRBench, where Graph-CoT outperforms the baselines consistently. The code is available at https://github.com/PeterGriffinJin/Graph-CoT/. @@ -6275,7 +6275,7 @@ <fixed-case>T</fixed-case>ext2<fixed-case>DB</fixed-case>: Integration-Aware Information Extraction with Large Language Model Agents YizhuJiaoUIUC ShaLiUniversity of Illinois, Urbana Champaign - SizheZhou + SizheZhou HengJiUniversity of Illinois, Urbana-Champaign JiaweiHan 185-205 @@ -6302,7 +6302,7 @@ MahmoudSalemCerebras Systems, Inc ShreyasSaxenaCerebras Systems, Inc Chen-YuLeong - JoelHestnessCerebras Systems, Inc + JoelHestnessCerebras Systems, Inc SeanLieCerebras Systems, Inc 214-230 Large language models (LLMs) are typically trained on general source data for various domains, but a recent surge in domain-specific LLMs has shown their potential to outperform general-purpose models in domain-specific tasks (e.g., biomedicine). Although domain-specific pre-training enhances efficiency and leads to smaller models, the computational costs of training these LLMs remain high, posing budgeting challenges. We introduce MediSwift, a suite of biomedical LMs that leverage sparse pre-training on domain-specific biomedical text data. By inducing up to 75% weight sparsity during the pre-training phase, MediSwift achieves a 2-2.5x reduction in training FLOPs.
Notably, all sparse pre-training was performed on the Cerebras CS-2 system, which is specifically designed to realize the acceleration benefits from unstructured weight sparsity, thereby significantly enhancing the efficiency of the MediSwift models. Through subsequent dense fine-tuning and strategic soft prompting, MediSwift models outperform existing LLMs up to 7B parameters on biomedical tasks, setting new benchmarks w.r.t. efficiency-accuracy on tasks such as PubMedQA. Our results show that sparse pre-training, along with dense fine-tuning and soft prompting, offers an effective method for creating high-performing, computationally efficient models in specialized domains. @@ -6323,11 +6323,11 @@ <fixed-case>P</fixed-case>-<fixed-case>TA</fixed-case>: Using Proximal Policy Optimization to Enhance Tabular Data Augmentation via Large Language Models - ShuoYang - ChenchenYuan + ShuoYang + ChenchenYuan YaoRong - FelixSteinbauerDepartment of Informatics, Technische Universität München - GjergjiKasneciTechnische Universität München and University of Tuebingen + FelixSteinbauerDepartment of Informatics, Technische Universität München + GjergjiKasneciTechnische Universität München and University of Tuebingen 248-264 A multitude of industries depend on accurate and reasonable tabular data augmentation for their business processes. Contemporary methodologies in generating tabular data revolve around utilizing Generative Adversarial Networks (GAN) or fine-tuning Large Language Models (LLM). However, GAN-based approaches are documented to produce samples with common-sense errors attributed to the absence of external knowledge. On the other hand, LLM-based methods exhibit a limited capacity to capture the disparities between synthesized and actual data distribution due to the absence of feedback from a discriminator during training. Furthermore, the decoding of LLM-based generation introduces gradient breakpoints, impeding the backpropagation of loss from a discriminator, thereby complicating the integration of these two approaches. To solve this challenge, we propose using proximal policy optimization (PPO) to apply GANs, guiding LLMs to enhance the probability distribution of tabular features. This approach enables the utilization of LLMs as generators for GANs in synthesizing tabular data. Our experiments demonstrate that PPO leads to an approximately 4% improvement in the accuracy of models trained on synthetically generated data over state-of-the-art across three real-world datasets. 2024.findings-acl.16 @@ -6351,7 +6351,7 @@ ShuohangWang YangLiu ChenguangZhuZoom - JulianMcAuleyUniversity of California, San Diego, University of California, San Diego + JulianMcAuleyUniversity of California, San Diego, University of California, San Diego 283-294 Large language models (LLMs) such as GPT-3 and GPT-4 are powerful but their weights are often publicly unavailable and their immense sizes make the models difficult to be tuned with common hardware. As a result, effectively tuning these models with large-scale supervised data can be challenging. As an alternative, In-Context Learning (ICL) can only use a small number of supervised examples due to context length limits. In this paper, we propose Super In-Context Learning (SuperICL) which allows black-box LLMs to work with locally fine-tuned smaller models, resulting in superior performance on supervised tasks.
Our experiments demonstrate that SuperICL can improve performance beyond state-of-the-art fine-tuned models while addressing the instability problem of in-context learning. 2024.findings-acl.18 @@ -6360,7 +6360,7 @@ Are self-explanations from Large Language Models faithful? - AndreasMadsenMontreal Institute for Learning Algorithms, École Polytechnique de Montréal, Université de Montréal and Mila + AndreasMadsenMontreal Institute for Learning Algorithms, École Polytechnique de Montréal, Université de Montréal and Mila SarathChandarPolytechnique Montreal SivaReddyMila, McGill University and Mila, McGill University 295-337 @@ -6374,10 +6374,10 @@ HenryZou VinaySamuel YueZhouUniversity of Illinois at Chicago - WeizhiZhangUniversity of Illinois at Chicago + WeizhiZhangUniversity of Illinois at Chicago LianchengFangUniversity of Illinois at Chicago ZiheSong - PhilipYuUniversity of Illinois, Chicago + PhilipYuUniversity of Illinois, Chicago CorneliaCarageaUniversity of Illinois, Chicago 338-354 Existing datasets for attribute value extraction (AVE) predominantly focus on explicit attribute values while neglecting the implicit ones, lack product images, are often not publicly available, and lack an in-depth human inspection across diverse domains. To address these limitations, we present ImplicitAVE, the first, publicly available multimodal dataset for implicit attribute value extraction. ImplicitAVE, sourced from the MAVE dataset, is carefully curated and expanded to include implicit AVE and multimodality, resulting in a refined dataset of 68k training and 1.6k testing data across five domains. We also explore the application of multimodal large language models (MLLMs) to implicit AVE, establishing a comprehensive benchmark for MLLMs on the ImplicitAVE dataset. Six recent MLLMs with eleven variants are evaluated across diverse settings, revealing that implicit value extraction remains a challenging task for MLLMs. The contributions of this work include the development and release of ImplicitAVE, and the exploration and benchmarking of various MLLMs for implicit AVE, providing valuable insights and potential future research directions. Dataset and code are available at https://github.com/HenryPengZou/ImplicitAVE. @@ -6405,7 +6405,7 @@ UtkarshTyagi SSakshi SanjoyChowdhuryUniversity of Maryland, College Park - DineshManochaUniversity of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 386-406 Neural image classifiers can often learn to make predictions by overly relying on non-predictive features that are spuriously correlated with the class labels in the training data. This leads to poor performance in real-world atypical scenarios where such features are absent. This paper presents ASPIRE (Language-guided Data Augmentation for SPurIous correlation REmoval), a simple yet effective solution for supplementing the training dataset with images without spurious features, for robust learning against spurious correlations via better generalization. ASPIRE, guided by language at various steps, can generate non-spurious images without requiring any group labeling or existing non-spurious images in the training set. Precisely, we employ LLMs to first extract foreground and background features from textual descriptions of an image, followed by advanced language-guided image editing to discover the features that are spuriously correlated with the class label. 
Finally, we personalize a text-to-image generation model using the edited images to generate diverse in-domain images without spurious features. ASPIRE is complementary to all prior robust training methods in literature, and we demonstrate its effectiveness across 4 datasets and 9 baselines and show that ASPIRE improves the worst-group classification accuracy of prior methods by 1% - 38%. We also contribute a novel test set for the challenging Hard ImageNet dataset. 2024.findings-acl.22 @@ -6414,14 +6414,14 @@ Tables as Texts or Images: Evaluating the Table Reasoning Ability of <fixed-case>LLM</fixed-case>s and <fixed-case>MLLM</fixed-case>s - NaihaoDeng + NaihaoDeng ZhenjieSun RuiqiHe AmanSikkaUniversity of Michigan - Ann Arbor YulongChenUniversity of Cambridge - LinMaUniversity of Michigan - Ann Arbor - YueZhangWestlake University - RadaMihalceaUniversity of Michigan + LinMaUniversity of Michigan - Ann Arbor + YueZhangWestlake University + RadaMihalceaUniversity of Michigan 407-426 Tables contrast with unstructured text data by its structure to organize the information. In this paper, we investigate the efficiency of various LLMs in interpreting tabular data through different prompting strategies and data formats. Our analysis extends across six benchmarks for table-related tasks such as question-answering and fact-checking. We pioneer in the assessment of LLMs’ performance on image-based table representation. Specifically, we compare five text-based and three image-based table representations, revealing the influence of representation and prompting on LLM performance. We hope our study provides researchers insights into optimizing LLMs’ application in table-related tasks. 2024.findings-acl.23 @@ -6460,7 +6460,7 @@ <fixed-case>LLM</fixed-case>-<fixed-case>QAT</fixed-case>: Data-Free Quantization Aware Training for Large Language Models ZechunLiuMeta Inc. BarlasOguzMeta - ChangshengZhaoMeta Inc. + ChangshengZhaoMeta Inc. ErnieChangMeta AI PierreStockFacebook YasharMehdadFacebook @@ -6477,15 +6477,15 @@ <fixed-case>I</fixed-case>nfi<fixed-case>MM</fixed-case>: Advancing Multimodal Understanding with an Open-Sourced Visual Language Model HaogengLiu QuanzengYouByteDance - YiqiWang - XiaotianHanByteDance + YiqiWang + XiaotianHanByteDance BohanZhaiSnowflake YongfeiLiuBytedance WentaoChenByteDance Inc. YirenJianByteDance Inc. YunzheTaoByteDance JianboYuanBytedance - RanHeInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + RanHeInstitute of automation, Chinese academy of science, Chinese Academy of Sciences HongxiaYang 485-492 In this work, we present InfiMM, an advanced Multimodal Large Language Model that adapts to intricate vision-language tasks. InfiMM, inspired by the Flamingo architecture, distinguishes itself through the utilization of large-scale training data, comprehensive training strategies, and diverse large language models. This approach ensures the preservation of Flamingo’s foundational strengths while simultaneously introducing augmented capabilities. Empirical evaluations across a variety of benchmarks underscore InfiMM’s remarkable capability in multimodal understanding. The code can be found at: https://anonymous.4open.science/r/infimm-zephyr-F60C/.
@@ -6499,7 +6499,7 @@ YixinCaoFudan University LiangmingPan YuboMaSchool of Computer Science and Engineering, Nanyang Technological University - AixinSunNanyang Technological University + AixinSunNanyang Technological University 493-516 Although achieving great success, Large Language Models (LLMs) usually suffer from unreliable hallucinations. Although language attribution can be a potential solution, there are no suitable benchmarks and evaluation metrics to attribute LLMs to structured knowledge. In this paper, we define a new task of Knowledge-aware Language Model Attribution (KaLMA) that improves upon three core concerns with conventional attributed LMs. First, we extend attribution source from unstructured texts to Knowledge Graph (KG), whose rich structures benefit both the attribution performance and working scenarios. Second, we propose a new “Conscious Incompetence” setting considering the incomplete knowledge repository, where the model identifies the need for supporting knowledge beyond the provided KG. Third, we propose a comprehensive automatic evaluation metric encompassing text quality, citation quality, and text citation alignment. To implement the above innovations, we build a dataset in biography domain BioKaLMA via evolutionary question generation strategy, to control the question complexity and necessary knowledge to the answer. For evaluation, we develop a baseline solution and demonstrate the room for improvement in LLMs’ citation generation, emphasizing the importance of incorporating the “Conscious Incompetence” setting, and the critical role of retrieval accuracy. 2024.findings-acl.28 @@ -6513,7 +6513,7 @@ VipulRahejaColumbia University, Grammarly and International Institute of Information Technology Hyderabad Jong InnPark Zae MyungKimUniversity of Minnesota - Twin Cities - DongyeopKangUniversity of Minnesota + DongyeopKangUniversity of Minnesota 517-545 Large Language Models (LLMs) have recently been shown to be effective as automatic evaluators with simple prompting and in-context learning. In this work, we assemble 16 LLMs encompassing four different size ranges and evaluate their output responses by preference ranking from the other LLMs as evaluators, such as System Star is better than System Square. We then evaluate the quality of ranking outputs introducing the Cognitive Bias Benchmark for LLMs as Evaluators (CoBBLer), a benchmark to measure six different cognitive biases in LLM evaluation outputs, such as the Egocentric bias where a model prefers to rank its own outputs highly in evaluation. We find that LLMs are biased text quality evaluators, exhibiting strong indications on our bias benchmark (40% of comparisons made by all models) within each of their evaluations that question their robustness as evaluators. Furthermore, we examine the correlation between human and machine preferences and calculate the average Rank-Biased Overlap (RBO) score to be 44%, indicating that machine preferences are misaligned with humans. According to our findings, LLMs may still be unable to be utilized for automatic annotation aligned with human preferences. 
2024.findings-acl.29 @@ -6525,7 +6525,7 @@ ChongLiInstitute of automation, Chinese Academy of Sciences WenYangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences JiajunZhangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - JinliangLuInstitute of automation, Chinese Academy of Sciences + JinliangLuInstitute of automation, Chinese Academy of Sciences ShaonanWang ChengqingZongInstitute of automation, Chinese academy of science, Chinese Academy of Sciences 546-566 @@ -6536,11 +6536,11 @@ Muffin: Mitigating Unhelpfulness in Emotional Support Conversations with Multifaceted <fixed-case>AI</fixed-case> Feedback - JiashuoWang + JiashuoWang ChunpuXu Chak TouLeongHong Kong Polytechnic University - WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University - JingLiThe Hong Kong Polytechnic University + WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University + JingLiThe Hong Kong Polytechnic University 567-585 2024.findings-acl.31 wang-etal-2024-muffin @@ -6552,7 +6552,7 @@ IvanKobyzevHuawei Noah’s Ark Lab PengLuUniversity of Montreal MehdiRezagholizadeh - BangLiuUniversity of Montreal + BangLiuUniversity of Montreal 586-598 This paper addresses the challenge of train-short-test-long (TSTL) scenarios in Large Language Models (LLMs) equipped with Rotary Position Embedding (RoPE), where models pre-trained on shorter sequences face difficulty with out-of-distribution (OOD) token positions in longer sequences. We introduce Resonance RoPE, a novel approach designed to narrow the generalization gap in TSTL scenarios by refining the interpolation of RoPE features for OOD positions, significantly improving the model performance without additional online computational costs. Furthermore, we present PosGen, a new synthetic benchmark specifically designed for fine-grained behavior analysis in TSTL scenarios, aiming to isolate the constantly increasing difficulty of token generation on long contexts from the challenges of recognizing new token positions. Our experiments on synthetic tasks show that after applying Resonance RoPE, Transformers recognize OOD position better and more robustly. Our extensive LLM experiments also show superior performance after applying Resonance RoPE to the current state-of-the-art RoPE scaling method, YaRN, on both upstream language modeling tasks and a variety of downstream long-text applications. 2024.findings-acl.32 @@ -6562,13 +6562,13 @@ <fixed-case>M</fixed-case>ed<fixed-case>A</fixed-case>gents: Large Language Models as Collaborators for Zero-shot Medical Reasoning XiangruTangYale University - AnniZou - ZhuoshengZhangShanghai Jiao Tong University + AnniZou + ZhuoshengZhangShanghai Jiao Tong University ZimingLi YilunZhaoYale University XingyaoZhangAlibaba Group ArmanCohanYale University and Allen Institute for Artificial Intelligence - MarkGersteinYale University + MarkGersteinYale University 599-621 Large language models (LLMs), despite their remarkable progress across various general domains, encounter significant barriers in medicine and healthcare. This field faces unique challenges such as domain-specific terminologies and reasoning over specialized knowledge. To address these issues, we propose MedAgents, a novel multi-disciplinary collaboration framework for the medical domain. MedAgents leverages LLM-based agents in a role-playing setting that participate in a collaborative multi-round discussion, thereby enhancing LLM proficiency and reasoning capabilities. 
This training-free framework encompasses five critical steps: gathering domain experts, proposing individual analyses, summarising these analyses into a report, iterating over discussions until a consensus is reached, and ultimately making a decision. Our work focuses on the zero-shot setting, which is applicable in real-world scenarios. Experimental results on nine datasets (MedQA, MedMCQA, PubMedQA, and six subtasks from MMLU) establish that our proposed MedAgents framework excels at mining and harnessing the medical expertise within LLMs, as well as extending its reasoning abilities. Our code can be found at https://github.com/gersteinlab/MedAgents. 2024.findings-acl.33 @@ -6577,11 +6577,11 @@ Meta-Reasoning: Semantics-Symbol Deconstruction for Large Language Models - YimingWangShanghai Jiao Tong University - ZhuoshengZhangShanghai Jiao Tong University + YimingWangShanghai Jiao Tong University + ZhuoshengZhangShanghai Jiao Tong University PeiZhangAlibaba Group BaosongYang - RuiWangShanghai Jiao Tong University + RuiWangShanghai Jiao Tong University 622-643 Neural-symbolic methods have demonstrated efficiency in enhancing the reasoning abilities of large language models (LLMs). However, existing methods mainly rely on syntactically mapping natural languages to complete formal languages like Python and SQL. Those methods require that reasoning tasks be convertible into programs, which cater to the computer execution mindset and deviate from human reasoning habits. To broaden symbolic methods’ applicability and adaptability in the real world, we propose Meta-Reasoning from a linguistic perspective. This method empowers LLMs to deconstruct reasoning-independent semantic information into generic symbolic representations, thereby efficiently capturing more generalized reasoning knowledge. We conduct extensive experiments on more than ten datasets encompassing conventional reasoning tasks like arithmetic, symbolic, and logical reasoning, and the more complex interactive reasoning tasks like theory-of-mind reasoning. Experimental results demonstrate that Meta-Reasoning significantly enhances in-context reasoning accuracy, learning efficiency, out-of-domain generalization, and output stability compared to the Chain-of-Thought technique. 2024.findings-acl.34 @@ -6591,13 +6591,13 @@ <fixed-case>DPDLLM</fixed-case>: A Black-box Framework for Detecting Pre-training Data from Large Language Models BaohangZhouNankai University - ZezhongWang - LingzhiWangThe Chinese University of Hong Kong - HongruWangThe Chinese University of Hong Kong + ZezhongWang + LingzhiWangThe Chinese University of Hong Kong + HongruWangThe Chinese University of Hong Kong YingZhangNankai University KehuiSong XuhuiSui - Kam-FaiWongThe Chinese University of Hong Kong + Kam-FaiWongThe Chinese University of Hong Kong 644-653 The success of large language models (LLM) benefits from large-scale model parameters and large amounts of pre-training data. However, the textual data for training LLM can not be confirmed to be legal because they are crawled from different web sites. For example, there are copyrighted articles, personal reviews and information in the pre-training data for LLM which are illegal. To address the above issue and develop legal LLM, we propose to detect the pre-training data from LLM in a pure black-box way because the existing LLM services only return the generated text. The previous most related works are the membership inference attack (MIA) on machine learning models to detect the training data from them. 
But the existing methods are based on analyzing the output probabilities of models which are unrealistic to LLM services. To tackle the problem, we firstly construct the benchmark datasets by collecting textual data from different domains as the seen and unseen pre-training data for LLMs. Then, we investigate a black-box framework named DPDLLM, with the only access to the generated texts from LLM for detecting textual data whether was used to train it. In the proposed framework, we exploit GPT-2 as the reference model to fit the textual data and feed the generated text from LLM into it to acquire sequence probabilities as the significant feature for detection. The experimental results on the benchmark datasets demonstrate that DPDLLM is effective on different popular LLMs and outperforms the existing methods. 2024.findings-acl.35 @@ -6608,9 +6608,9 @@ <fixed-case>PACIT</fixed-case>: Unlocking the Power of Examples for Better In-Context Instruction Tuning TianciXue ZiqiWang - YixiaLi + YixiaLi YunChenShanghai University of Finance and Economics - GuanhuaChenSouthern University of Science and Technology + GuanhuaChenSouthern University of Science and Technology 654-665 Instruction tuning enhances the instruction following ability of large language models by finetuning with supervised instruction data. Previous work proposes in-context instruction tuning (ICIT) where specific positive or negative examples are incorporated into the prompt for better performance. In this work, we propose PACIT, a simple and effective in-context instruction tuning method, inspired by the pedagogical concept of desirable difficulty. The PACIT method unlocks the power of examples by encouraging the model to actively learn to grasp the distinctions between the positive and negative examples instead of merely reading. The model is expected to first verify the correctness of the provided example according to the task description, which is then set as the condition for generating a better response to the task instance. Our extensive experiments prove the effectiveness of PACIT, outperforming ICIT baseline on both in-domain and out-domain tasks up to 9.16 and 3.14 average ROUGE-L scores, respectively. Moreover, PACIT can notably enhance the performance of instruction tuning even when all positive and negative examples are generated with a self-instruct method. 2024.findings-acl.36 @@ -6624,7 +6624,7 @@ ChengweiQinNanyang Technological University QiushiZhu EngSiongChngNanyang Technological University - RuizheLiUniversity of Aberdeen + RuizheLiUniversity of Aberdeen 666-679 Recent advances in large language models (LLMs) have promoted generative error correction (GER) for automatic speech recognition (ASR), which aims to predict the ground-truth transcription from the decoded N-best hypotheses. Thanks to the strong language generation ability of LLMs and rich information in the N-best list, GER shows great effectiveness in enhancing ASR results. However, it still suffers from two limitations: 1) LLMs are unaware of the source speech during GER, which may lead to results that are grammatically correct but violate the source speech content, 2) N-best hypotheses usually only vary in a few tokens, making it redundant to send all of them for GER, which could confuse LLM about which tokens to focus on and thus lead to increased miscorrection. In this paper, we propose ClozeGER, a new paradigm for ASR generative error correction. 
First, we introduce a multimodal LLM (i.e., SpeechGPT) to receive source speech as extra input to improve the fidelity of correction output. Then, we reformat GER as a cloze test with logits calibration to remove the input information redundancy and simplify GER with clear instructions. Experiments show that ClozeGER achieves a new breakthrough over vanilla GER on 9 popular ASR datasets. 2024.findings-acl.37 @@ -6636,7 +6636,7 @@ HaoYue ShaopengLaiAlibaba Group ChengyiYang - LiangZhang + LiangZhang JunfengYaoXiamen University JinsongSuXiamen University 680-691 @@ -6660,15 +6660,15 @@ <fixed-case>C</fixed-case>ode<fixed-case>M</fixed-case>: Less Data Yields More Versatility via Ability Matrix - DaoguangZan + DaoguangZan AilunYu WeiLiu BoShen ShaoxinLin - YongshunGongShandong University + YongshunGongShandong University YafenYao YanLiu - BeiGuan + BeiGuan WeihuaLuoAlibaba Group YongjiWang QianxiangWangPeking University @@ -6683,9 +6683,9 @@ Do <fixed-case>LVLM</fixed-case>s Understand Charts? Analyzing and Correcting Factual Errors in Chart Captioning Kung-HsiangHuangSalesForce.com MingyangZhou - Hou PongChanAlibaba Group + Hou PongChanAlibaba Group YiFung - ZhenhailongWang + ZhenhailongWang LingyuZhangDuke University Shih-FuChangColumbia, Columbia University, Columbia University, Columbia University, Columbia University, Columbia University and Columbia University HengJiUniversity of Illinois, Urbana-Champaign @@ -6697,10 +6697,10 @@ <fixed-case>BIDER</fixed-case>: Bridging Knowledge Inconsistency for Efficient Retrieval-Augmented <fixed-case>LLM</fixed-case>s via Key Supporting Evidence - JiajieJinRenmin University of China - YutaoZhu - YujiaZhouTsinghua University, Tsinghua University - ZhichengDouRenmin University of China + JiajieJinRenmin University of China + YutaoZhu + YujiaZhouTsinghua University, Tsinghua University + ZhichengDouRenmin University of China 750-761 Retrieval-augmented large language models (LLMs) have demonstrated efficacy in knowledge-intensive tasks such as open-domain QA, addressing inherent challenges in knowledge update and factual inadequacy. However, inconsistencies between retrieval knowledge and the necessary knowledge for LLMs, leading to a decline in LLM’s answer quality. This paper introduces BIDER, an approach that refines retrieval documents into Key Supporting Evidence (KSE) through knowledge synthesis, supervised fine-tuning (SFT), and preference alignment. We train BIDER by learning from crafting KSE, while maximizing its output to align with LLM’s information acquisition preferences through reinforcement learning. Evaluations across five datasets show BIDER boosts LLMs’ answer quality by 7% while reducing input content length in retrieval documents by 80%, outperforming existing methods. The proposed KSE simulation effectively equips LLMs with essential information for accurate question answering.
2024.findings-acl.42 @@ -6709,7 +6709,7 @@ Beyond Literal Descriptions: Understanding and Locating Open-World Objects Aligned with Human Intentions - WenxuanWangNational Lab of Pattern Recognition, Institute of Automation,Chinese Academy of Sciences and Beijing Academy of Artificial Intelligence + WenxuanWangNational Lab of Pattern Recognition, Institute of Automation,Chinese Academy of Sciences and Beijing Academy of Artificial Intelligence YisiZhangUniversity of Science and Technology Beijing XingjianHe, Institute of automation, Chinese academy of science YichenYan @@ -6725,10 +6725,10 @@ Incremental Sequence Labeling: A Tale of Two Shifts ShengjieQiu - JunhaoZheng + JunhaoZheng ZhenLiu YichengLuo - QianliMaSouth China University of Technology + QianliMaSouth China University of Technology 777-791 The incremental sequence labeling task involves continuously learning new classes over time while retaining knowledge of the previous ones. Our investigation identifies two significant semantic shifts: E2O (where the model mislabels an old entity as a non-entity) and O2E (where the model labels a non-entity or old entity as a new entity). Previous research has predominantly focused on addressing the E2O problem, neglecting the O2E issue. This negligence results in a model bias towards classifying new data samples as belonging to the new class during the learning process. To address these challenges, we propose a novel framework, Incremental Sequential Labeling without Semantic Shifts (IS3). Motivated by the identified semantic shifts (E2O and O2E), IS3 aims to mitigate catastrophic forgetting in models. As for the E2O problem, we use knowledge distillation to maintain the model’s discriminative ability for old entities. Simultaneously, to tackle the O2E problem, we alleviate the model’s bias towards new entities through debiased loss and optimization levels.Our experimental evaluation, conducted on three datasets with various incremental settings, demonstrates the superior performance of IS3 compared to the previous state-of-the-art method by a significant margin. 2024.findings-acl.44 @@ -6743,8 +6743,8 @@ TingjianZhangTsinghua University, Tsinghua University LunyiuNieUniversity of Texas at Austin LinmeiHuBeijing Institute of Technology - LeiHouTsinghua University, Tsinghua University - JuanziLi + LeiHouTsinghua University, Tsinghua University + JuanziLi 792-815 Knowledge Base Question Answering (KBQA) aims to answer natural language questions based on facts in knowledge bases. A typical approach to KBQA is semantic parsing, which translates a question into an executable logical form in a formal language. Recent works leverage the capabilities of large language models (LLMs) for logical form generation to improve performance. However, although it is validated that LLMs are capable of solving some KBQA problems, there has been little discussion on the differences in LLMs’ proficiency in formal languages used in semantic parsing. In this work, we propose to evaluate the understanding and generation ability of LLMs to deal with differently structured logical forms by examining the inter-conversion of natural and formal language through in-context learning of LLMs. Extensive experiments with models of different sizes show that state-of-the-art LLMs can understand formal languages as well as humans, but generating correct logical forms given a few examples remains a challenge. Most importantly, our results also indicate that LLMs exhibit considerable sensitivity. 
In general, the formal language with a lower formalization level, i.e., the more similar it is to natural language, is more friendly to LLMs. Code and data can be found at https://github.com/Matthewlliu/structure_probe. 2024.findings-acl.45 @@ -6782,7 +6782,7 @@ BlaineHillUniversity of Illinois at Urbana-Champaign BoxinDuAmazon FeiWangAmazon - HanghangTong + HanghangTong 839-850 Conversational question answering (ConvQA) over knowledge graphs (KGs) involves answering multi-turn natural language questions about information contained in a KG. State-of-the-art methods of ConvQA often struggle with inexplicit question-answer pairs. These inputs are easy for human beings to understand given a conversation history, but hard for a machine to interpret, which can degrade ConvQA performance. To address this problem, we propose a reinforcement learning (RL) based model, CoRnNet, which utilizes question reformulations generated by large language models (LLMs) to improve ConvQA performance. CoRnNet adopts a teacher-student architecture where a teacher model learns question representations using human writing reformulations, and a student model to mimic the teacher model’s output via reformulations generated by LLMs. The learned question representation is then used by a RL model to locate the correct answer in a KG. Extensive experimental results show that CoRnNet outperforms state-of-the-art ConvQA models. 2024.findings-acl.48 @@ -6792,7 +6792,7 @@ Debug like a Human: A Large Language Model Debugger via Verifying Runtime Execution Step by Step LiZhong - ZilongWangUniversity of California, San Diego + ZilongWangUniversity of California, San Diego JingboShangUniversity of California, San Diego 851-870 Large language models (LLMs) are leading significant progress in code generation. Beyond one-pass code generation, recent works further integrate unit tests and program verifiers into LLMs to iteratively refine the generated programs. However, these works consider the generated programs as an indivisible entity, which falls short for LLMs in debugging the programs, especially when the programs contain complex logic flows and data operations. In contrast, when human developers debug programs, they typically set breakpoints and selectively examine runtime execution information. The execution flow and the intermediate variables play a crucial role in the debugging process, yet they are underutilized in the existing literature on code generation. In this study, we introduce Large Language Model Debugger (LDB), a novel debugging framework that enables LLMs to refine their generated programs with the runtime execution information. Specifically, LDB segments the programs into basic blocks and tracks the values of intermediate variables after each block throughout the runtime execution. This allows LLMs to concentrate on simpler code units within the overall execution flow, verify their correctness against the task description block by block, and efficiently pinpoint any potential errors. Experiments demonstrate that LDB consistently enhances the baseline performance by up to 9.8% across the HumanEval, MBPP, and TransCoder benchmarks, archiving new state-of-the-art performance in code debugging for various LLM selections. @@ -6802,10 +6802,10 @@ Effective In-Context Example Selection through Data Compression - ZhongXiangSun + ZhongXiangSun KepuZhang HaoyuWang - XiaoZhang + XiaoZhang JunXuRenmin University of China 871-877 In-context learning has been extensively validated in large language models. 
However, the mechanism and selection strategy for in-context example selection, which is a crucial ingredient in this approach, lacks systematic and in-depth research. In this paper, we propose a data compression approach to the selection of in-context examples. We introduce a two-stage method that can effectively choose relevant examples and retain sufficient information about the training dataset within the in-context examples. Our method shows a significant improvement of an average of 5.90% across five different real-world datasets using four language models. @@ -6819,7 +6819,7 @@ ChongYangByteDance Inc. TuHu XinhaoChen - ManLan + ManLan LiCaiGuizhou University XinlinZhuang XuanLinAnt Group @@ -6833,9 +6833,9 @@ Knowledgeable Preference Alignment for <fixed-case>LLM</fixed-case>s in Domain-specific Question Answering - YichiZhang - ZhuoChenZhejiang University - YinFang + YichiZhang + ZhuoChenZhejiang University + YinFang YanxiLu LiFangming WenZhangZhejiang University @@ -6849,7 +6849,7 @@ <fixed-case>MARIO</fixed-case>: <fixed-case>MA</fixed-case>th Reasoning with code Interpreter Output - A Reproducible Pipeline MinpengLiao - ChengxiLiAlibaba Group + ChengxiLiAlibaba Group WeiLuo WuJingAlibaba Group KaiFanAlibaba Group @@ -6861,8 +6861,8 @@ <fixed-case>D</fixed-case>iffus<fixed-case>P</fixed-case>oll: Conditional Text Diffusion Model for Poll Generation - LeCheng - ShuangyinLi + LeCheng + ShuangyinLi 925-935 Online social media platforms often gather user feedback through polls to enhance user engagement. Automatically generating polls from social media and its context can decrease the labor expenses of media workers and enhance workplace productivity. However, on social media platforms, there are internet water armies that manipulate public opinion through sheer numbers and causing the comments to be biased, drowning out minority views. In such circumstances, polls created based on biased comments often have limited types of options and poor coverage. Therefore, it is crucial to diversify the poll options and try to listen to the voices of the minority. To achieve this, we introduce DiffusPoll, a novel paradigm for poll generation based on a non-autoregressive diffusion model that can generate diversified and high-quality samples. Under the new paradigm, we design a task-specific mask strategy tailored to the inherent logic of polls to optimize controlled generation. Furthermore, we also leverage additional attribute tags from comments to enhance the generation quality. Experimental results indicate that DiffusPoll has achieved state-of-the-art performance in both the quality and diversity of poll generation tasks, and is more likely to hit the voices of minority. 2024.findings-acl.54 @@ -6873,7 +6873,7 @@ Exploring Mathematical Extrapolation of Large Language Models with Synthetic Data HaolongLi YuMaByteDance Inc. - YinqiZhangByteDance Inc. and East China Normal University + YinqiZhangByteDance Inc. and East China Normal University ChenYeTongji University JieChen 936-946 @@ -6885,7 +6885,7 @@ Implanting <fixed-case>LLM</fixed-case>’s Knowledge via Reading Comprehension Tree for Toxicity Detection HankunKang - TieyunQianWuhan University + TieyunQianWuhan University 947-962 Toxicity detection plays a crucial role in maintaining the peace of the society. Existing methods can be roughly categorized as small language model (SLM) based and large language model (LLM) based. 
However, due to the limitation of SLMs on general knowledge and the potential embedded bias in LLMs despite their large amount of knowledge, it is not a good idea to detect toxicity only with either SLM or LLM based method. In this work, we propose to implant LLM’s knowledge into SLM based methods such that we can stick to both types of models’ strengths. To this end, we develop a reading comprehension (RC) tree to transfer knowledge between two models. Specifically, we first construct the RC tree, from an extensive to intensive reading perspective, to capture the local and global information in the text. We then model samples encoded by SLM and knowledge extracted from LLM as two distributions using the constructed RC tree. We finally transfer knowledge via optimal transportation between two distributions. Extensive experiments prove the effectiveness of our method on real-world and machine-generated datasets. 2024.findings-acl.56 @@ -6896,17 +6896,17 @@ <fixed-case>LLML</fixed-case>ingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression ZhuoshiPan QianhuiWuMicrosoft - HuiqiangJiangMicrosoft + HuiqiangJiangMicrosoft MenglinXiaMicrosoft XufangLuoMicrosoft Research JueZhangMicrosoft - QingweiLinMicrosoft Research - VictorRühleMicrosoft + QingweiLinMicrosoft Research + VictorRühleMicrosoft YuqingYangResearch, Microsoft Chin-YewLinMicrosoft H. VickyZhaoTsinghua University, Tsinghua University LiliQiuMicrosoft - DongmeiZhangMicrosoft and Microsoft + DongmeiZhangMicrosoft and Microsoft 963-981 This paper focuses on task-agnostic prompt compression for better generalizability and efficiency. Considering the redundancy in natural language, existing approaches compress prompts by removing tokens or lexical units according to their information entropy obtained from a causal language model such as LLaMa-7B. The challenge is that information entropy may be a suboptimal compression metric: (i) it only leverages unidirectional context and may fail to capture all essential information needed for prompt compression; (ii) it is not aligned with the prompt compression objective. To address these issues, we propose a data distillation procedure to derive knowledge from an LLM to compress prompts without losing crucial information, and meantime, introduce an extractive text compression dataset. We formulate prompt compression as a token classification problem to guarantee the faithfulness of the compressed prompt to the original one, and use a Transformer encoder as the base architecture to capture all essential information for prompt compression from the full bidirectional context. Our approach leads to lower latency by explicitly learning the compression objective with smaller models such as XLM-RoBERTa-large and mBERT. We evaluate our method on both in-domain and out-of-domain datasets, including MeetingBank, LongBench, ZeroScrolls, GSM8K, and BBH. Despite its small size, our model shows significant performance gains over strong baselines and demonstrates robust generalization ability across different LLMs. Additionally, our model is 3x-6x faster than existing prompt compression methods, while accelerating the end-to-end latency by 1.6x-2.9x with compression ratios of 2x-5x.
2024.findings-acl.57
@@ -6915,7 +6915,7 @@

<fixed-case>E</fixed-case>con<fixed-case>NLI</fixed-case>: Evaluating Large Language Models on Economics Reasoning
YueGuoHong Kong University of Science and Technology
YiYangHong Kong University of Science and Technology
982-994
Large Language Models (LLMs) are widely used for writing economic analysis reports or providing financial advice, but their ability to understand economic knowledge and reason about potential results of specific economic events lacks systematic evaluation. To address this gap, we propose a new dataset, natural language inference on economic events (EconNLI), to evaluate LLMs’ knowledge and reasoning abilities in the economic domain. We evaluate LLMs on (1) their ability to correctly classify whether a premise event will cause a hypothesis event and (2) their ability to generate reasonable events resulting from a given premise. Our experiments reveal that LLMs are not sophisticated in economic reasoning and may generate wrong or hallucinated answers. Our study raises awareness of the limitations of using LLMs for critical decision-making involving economic reasoning and analysis. The dataset and codes are available at https://github.com/Irenehere/EconNLI.
@@ -6926,7 +6926,7 @@
Better Late Than Never: Model-Agnostic Hallucination Post-Processing Framework Towards Clinical Text Summarization
SongdaLi
YunqiZhang
ChunyuanDengRice University
YakeNiu
HuiZhaoEast China Normal University
@@ -6940,9 +6940,9 @@
Finding and Editing Multi-Modal Neurons in Pre-Trained Transformers
HaowenPanUniversity of Science and Technology of China
YixinCaoFudan University
XiaozhiWangDepartment of Computer Science and Technology, Tsinghua University
XunYangUniversity of Science and Technology of China
MengWangHefei University of Technology
1012-1037
Understanding the internal mechanisms by which multi-modal large language models (LLMs) interpret different modalities and integrate cross-modal representations is becoming increasingly critical for continuous improvements in both academia and industry. In this paper, we propose a novel method to identify key neurons for interpretability — how multi-modal LLMs bridge visual and textual concepts for captioning. Our method improves on conventional works in efficiency and applicability by removing the need for costly gradient computation. Based on those identified neurons, we further design a multi-modal knowledge editing method, beneficial for mitigating sensitive words or hallucination. For the rationale of our design, we provide a theoretical assumption. For empirical evaluation, we have conducted extensive quantitative and qualitative experiments. The results not only validate the effectiveness of our methods, but also offer insightful findings that highlight three key properties of multi-modal neurons: sensitivity, specificity and causal-effect, to shed light on future research.
2024.findings-acl.60


@@ -6965,7 +6965,7 @@
Controllable Text Generation with Residual Memory Transformer
HanqingZhang
SiSunTsinghua University, Tsinghua University
HaimingWuBeijing Institute of Technology
DaweiSongBeijing Institute of Technology and Open University
1048-1066
Large-scale Causal Language Models (CLMs), e.g., GPT3 and ChatGPT, have brought great success in text generation. However, it is still an open challenge to effectively control the generation process of a CLM while balancing the flexibility, control granularity, and generation efficiency. In this paper, we provide a new alternative for controllable text generation (CTG), by designing a non-intrusive, lightweight control plugin, namely Residual Memory Transformer (RMT), to accompany the generation of CLM at arbitrary time steps. With an encoder-decoder setup, RMT can accept any type of control condition and cooperate with the base CLM through a residual learning paradigm, to achieve a more flexible, general, and efficient CTG. Extensive experiments are carried out on various control tasks, in the form of both automatic and human evaluations. The results demonstrate the superiority of RMT over a wide range of state-of-the-art CTG approaches. The code implementation of our work is available at: https://github.com/Residual_Memory_Transformer.


@@ -6976,10 +6976,10 @@
Prompt-Based Length Controlled Generation with Multiple Control Types
RenlongJieNorthwest Polytechnical University Xi’an
XiaojunMengNoah’s Ark Lab, Huawei Technologies Ltd.
LifengShangHuawei Technologies Ltd.
XinJiang
QunLiuHuawei Noah’s Ark Lab
1067-1085
Large language models (LLMs) have attracted great attention given their strong performance on a wide range of NLP tasks. In practice, users often expect generated texts to fall within a specific length range, making length controlled generation an important topic, especially for GPT-style models. Existing length control methods mostly focus on a simple control type of “equal to” a target length. Different from them, we propose a prompt-based method to achieve length controlled generation under different control types with high accuracy. In particular, we adopt reinforcement learning (RL) and sample filtering with the reward signal given by rule-based reward models, which enhances the length control ability of models by rewarding outputs that follow certain control instructions. In addition, we introduce a standard prompt extractor to parse arbitrary users’ input into standard control instructions. Experiments show that our method significantly improves the accuracy of prompt-based length control on popular summarization datasets like CNNDM and NYT under multiple control types. Moreover, both the standard prompt extractor and RL-tuned model show strong generalization to unseen control prompt templates.
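The rule-based reward model mentioned in the abstract above can be pictured as a small function over control types. The sketch below is an assumed shape, not the paper's exact reward: four hypothetical control types ("equal", "at_most", "at_least", "between"), a reward of 1.0 when the generated length satisfies the instruction, and a smooth decay with the size of the violation.

def length_reward(control: str, target, generated_len: int) -> float:
    """Length-control reward; `target` is an int, or a (low, high) pair for "between"."""
    if control == "equal":
        violation = abs(generated_len - target)
    elif control == "at_most":
        violation = max(0, generated_len - target)
    elif control == "at_least":
        violation = max(0, target - generated_len)
    elif control == "between":
        low, high = target
        violation = max(0, low - generated_len, generated_len - high)
    else:
        raise ValueError(f"unknown control type: {control}")
    return 1.0 / (1.0 + violation)  # 1.0 when satisfied, decaying with the violation

print(length_reward("at_most", 50, 47))        # 1.0: constraint met
print(length_reward("between", (40, 60), 72))  # ~0.08: 12 tokens over the cap

A reward of this shape gives the RL objective a gradient toward the feasible length range rather than an all-or-nothing signal, which is one plausible reason rule-based reward models work for sample filtering as well.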
2024.findings-acl.63


@@ -6991,7 +6991,7 @@
LiangChen
YichiZhang
ShuhuaiRen
HaozheZhao
ZefanCai
YuchiWang
PeiyiWang
@@ -7010,10 +7010,10 @@
MinjuKim
HanaKimYonsei University
Beong-wooKwakYonsei University
SeongKuKangUniversity of Illinois Urbana-Champaign
YoungjaeYuYonsei University
JinyoungYeoYonsei University
DonghaLeeYonsei University
1105-1120
Conversational recommender systems are an emerging area that has garnered increasing interest in the community, especially with the advancements in large language models (LLMs) that enable sophisticated handling of conversational input. Despite the progress, the field still has many aspects left to explore. The currently available public datasets for conversational recommendation lack specific user preferences and explanations for recommendations, hindering high-quality recommendations. To address such challenges, we present a novel conversational recommendation dataset named PEARL, synthesized with persona- and knowledge-augmented LLM simulators. We obtain detailed persona and knowledge from real-world reviews and construct a large-scale dataset with over 57k dialogues. Our experimental results demonstrate that PEARL contains more specific user preferences, shows expertise in the target domain, and provides recommendations more relevant to the dialogue context than those in prior datasets. Furthermore, we demonstrate the utility of PEARL by showing that our downstream models outperform baselines in both human and automatic evaluations. We release our dataset and code.
2024.findings-acl.65


@@ -7024,8 +7024,8 @@
<fixed-case>C</fixed-case>o<fixed-case>LL</fixed-case>a<fixed-case>VO</fixed-case>: Crayon Large Language and Vision m<fixed-case>O</fixed-case>del
Byung-KwanLeeKorea Advanced Institute of Science and Technology
BeomchanParkKAIST
Chae WonKim
Yong ManRoKorea Advanced Institute of Science and Technology
1121-1138
The remarkable success of Large Language Models (LLMs) and instruction tuning drives the evolution of Vision Language Models (VLMs) towards a versatile general-purpose model. Yet, it remains unexplored whether current VLMs genuinely possess quality object-level image understanding capabilities determined from ‘what objects are in the image?’ or ‘which object corresponds to a specified bounding box?’. Our findings reveal that the image understanding capabilities of current VLMs are strongly correlated with their zero-shot performance on vision language (VL) tasks. This suggests that prioritizing basic image understanding is crucial for VLMs to excel at VL tasks. To enhance object-level image understanding, we propose Crayon Large Language and Vision mOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a new visual prompt tuning scheme based on panoptic color maps. Furthermore, we present a learning strategy of Dual QLoRA to preserve object-level image understanding without forgetting it during visual instruction tuning, thereby achieving a significant leap in numerous VL benchmarks in a zero-shot setting.
2024.findings-acl.66 @@ -7034,7 +7034,7 @@ Modelling Variability in Human Annotator Simulation - WenWu + WenWu WenlinChenUniversity of Cambridge and Max Planck Institute for Intelligent Systems ChaoZhangTsinghua University and University College London PhilWoodlandUniversity of Cambridge @@ -7049,7 +7049,7 @@ SheikhShafayatKAIST HHasan MinhajurMahim - RifkiPutriKorea Advanced Institute of Science & Technology + RifkiPutriKorea Advanced Institute of Science & Technology JamesThorneKAIST AliceOhKorea Advanced Institute of Science and Technology 1158-1177 @@ -7061,7 +7061,7 @@ <fixed-case>MORE</fixed-case>: Multi-m<fixed-case>O</fixed-case>dal <fixed-case>RE</fixed-case>trieval Augmented Generative Commonsense Reasoning WanqingCui - KepingBiChinese Academy of Sciences + KepingBiChinese Academy of Sciences JiafengGuoInstitute of Computing Technolgy, Chinese Academy of Sciences XueqiCheng, Chinese Academy of Sciences 1178-1192 @@ -7088,15 +7088,15 @@ <fixed-case>B</fixed-case>io<fixed-case>T</fixed-case>5+: Towards Generalized Biological Understanding with <fixed-case>IUPAC</fixed-case> Integration and Multi-task Tuning - QizhiPei - LijunWu - KaiyuanGao + QizhiPei + LijunWu + KaiyuanGao XiaozhuanLiangZhejiang University - YinFang + YinFang JinhuaZhu ShufangXieRenmin University of China TaoQin - RuiYanRenmin University of China + RuiYanRenmin University of China 1216-1240 Recent research trends in computational biology have increasingly focused on integrating text and bio-entity modeling, especially in the context of molecules and proteins. However, previous efforts like BioT5 faced challenges in generalizing across diverse tasks and lacked a nuanced understanding of molecular structures, particularly in their textual representations (e.g., IUPAC). This paper introduces BioT5+, an extension of the BioT5 framework, tailored to enhance biological research and drug discovery. BioT5+ incorporates several novel features: integration of IUPAC names for molecular understanding, inclusion of extensive bio-text and molecule data from sources like bioRxiv and PubChem, the multi-task instruction tuning for generality across tasks, and a numerical tokenization technique for improved processing of numerical data. These enhancements allow BioT5+ to bridge the gap between molecular representations and their textual descriptions, providing a more holistic understanding of biological entities, and largely improving the grounded reasoning of bio-text and bio-sequences. The model is pre-trained and fine-tuned with a large number of experiments, including 3 types of problems (classification, regression, generation), 15 kinds of tasks, and 21 total benchmark datasets, demonstrating the remarkable performance and state-of-the-art results in most cases. BioT5+ stands out for its ability to capture intricate relationships in biological data, thereby contributing significantly to bioinformatics and computational biology. Our code is available at https://github.com/QizhiPei/BioT5. 2024.findings-acl.71 @@ -7105,9 +7105,9 @@ <fixed-case>SIBO</fixed-case>: A Simple Booster for Parameter-Efficient Fine-Tuning - ZhihaoWen + ZhihaoWen JieZhang - YuanFangSingapore Management University + YuanFangSingapore Management University 1241-1257 Fine-tuning all parameters of large language models (LLMs) necessitates substantial computational power and extended time. 
Latest advancements in parameter-efficient fine-tuning (PEFT) techniques, such as Adapter tuning and LoRA, allow for adjustments to only a minor fraction of the parameters of these LLMs. Concurrently, it has been noted that the issue of over-smoothing diminishes the effectiveness of these Transformer-based LLMs, resulting in suboptimal performance in downstream tasks. In this paper, we present SIBO, which is a SImple BOoster to enhance PEFT, by injecting an initial residual. SIBO is straightforward and readily extensible to a range of state-of-the-art PEFT techniques to alleviate over-smoothing and enhance performance. Extensive experiments on 22 benchmark datasets demonstrate that SIBO significantly enhances the performance of various strong baselines, achieving up to 15.7% and 23.5% improvement over existing PEFT methods on the arithmetic and commonsense reasoning tasks, respectively.
2024.findings-acl.72


@@ -7116,11 +7116,11 @@
<fixed-case>G</fixed-case>eo<fixed-case>E</fixed-case>val: Benchmark for Evaluating <fixed-case>LLM</fixed-case>s and Multi-Modal Models on Geometry Problem-Solving
JiaxinZhangUniversity of Strathclyde
Zhong-ZhiLi
Ming-LiangZhang
FeiYin, Institute of automation, Chinese academy of science
Cheng-LinLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
YasharMoshfeghiUniversity of Strathclyde
1258-1276
Recent advancements in large language models (LLMs) and multi-modal models (MMs) have demonstrated their remarkable capabilities in problem-solving. Yet, their proficiency in tackling geometry math problems, which necessitates an integrated understanding of both textual and visual information, has not been thoroughly evaluated. To address this gap, we introduce the GeoEval benchmark, a comprehensive collection that includes a main subset of 2,000 problems, a 750-problem subset focusing on backward reasoning, an augmented subset of 2,000 problems, and a hard subset of 300 problems. This benchmark facilitates a deeper investigation into the performance of LLMs and MMs in solving geometry math problems. Our evaluation of ten LLMs and MMs across these varied subsets reveals that the WizardMath model excels, achieving a 55.67% accuracy rate on the main subset but only a 6.00% accuracy on the hard subset. This highlights the critical need for testing models against datasets on which they have not been pre-trained. Additionally, our findings indicate that GPT-series models perform more effectively on problems they have rephrased, suggesting a promising method for enhancing model capabilities.
@@ -7131,13 +7131,13 @@
Boosting Textural <fixed-case>NER</fixed-case> with Synthetic Image and Instructive Alignment
JiahaoWang
WenjunKeSoutheast University
PengWang
HangZhang
DongNieMeta Inc.
JiajunLiuSoutheast University
GuozhengLiSoutheast University
ZiyuShang
1277-1287
Named entity recognition (NER) is a pivotal task reliant on textual data, where the absence of context often impedes the disambiguation of entities. To tackle this challenge, conventional methods often incorporate images crawled from the internet as auxiliary information. However, the images often lack sufficient entities or would introduce noise.
Even with high-quality images, it is still challenging to efficiently use images as auxiliaries (i.e., fine-grained alignment with texts). We introduce a novel method named InstructNER to address these issues. Leveraging the rich real-world knowledge and image synthesis capabilities of a large pre-trained stable diffusion (SD) model, InstructNER transforms the text-only NER into a multimodal NER (MNER) task. A selection process automatically identifies the best synthetic image by comparing fine-grained similarities with internet-crawled images through a visual bag-of-words strategy. Note that during the image synthesis, a cross-attention matrix between synthetic images and raw text emerges, which inspires a soft attention guidance alignment (AGA) mechanism. AGA optimizes the MNER task and concurrently facilitates instructive alignment in MNER. Empirical experiments on prominent MNER datasets show that our method surpasses all text-only baselines, improving F1-score by 1.4% to 2.3%. Remarkably, even when compared to fully multimodal baselines, our approach remains competitive. Furthermore, we open-source a comprehensive synthetic image dataset and the code to supplement the existing raw dataset. The code and datasets are available at https://github.com/Heyest/InstructNER.
2024.findings-acl.74


@@ -7150,7 +7150,7 @@
Neurons in Large Language Models: Dead, N-gram, Positional
ElenaVoitaFAIR at Meta AI and University of Amsterdam
JavierFerrando
ChristoforosNalmpantis
1288-1301
We analyze a family of large language models in a lightweight manner that can be done on a single GPU. Specifically, we focus on the OPT family of models ranging from 125m to 66b parameters and rely only on whether an FFN neuron is activated or not. First, we find that the early part of the network is sparse and represents many discrete features. Here, many neurons (a large fraction in some layers of the 66b model) are “dead”, i.e. they never activate on a large collection of diverse data. At the same time, many of the alive neurons are reserved for discrete features and act as token and n-gram detectors. Interestingly, their corresponding FFN updates not only promote next token candidates as could be expected, but also explicitly focus on removing the information about the tokens that trigger them, i.e., the current input. To the best of our knowledge, this is the first example of mechanisms specialized at removing (rather than adding) information from the residual stream. With scale, models become more sparse in the sense that they have more dead neurons and token detectors. Finally, some neurons are positional: whether or not they are activated depends largely (or solely) on position and less so (or not at all) on textual data. We find that smaller models have sets of neurons acting as position range indicators while larger models operate in a less explicit manner.
2024.findings-acl.75


@@ -7178,7 +7178,7 @@
ThanitTativannarat
ChawanPiansaddhayanonChulalongkorn University
AttapolRutherfordChulalongkorn University
EkapolChuangsuwanichChulalongkorn University
1319-1329
Learning job title representation is a vital process for developing automatic human resource tools. To do so, existing methods primarily rely on learning the title representation through skills extracted from the job description, neglecting the rich and diverse content within.
Thus, we propose an alternative framework for learning job titles through their respective job description (JD) and utilize a Job Description Aggregator component to handle the lengthy description and bidirectional contrastive loss to account for the bidirectional relationship between the job title and its description. We evaluated the performance of our method in both in-domain and out-of-domain settings, achieving superior performance over the skill-based approach.
2024.findings-acl.77


@@ -7203,9 +7203,9 @@
Flexible Weight Tuning and Weight Fusion Strategies for Continual Named Entity Recognition
YahanYuKyoto University, Kyoto University
DuzhenZhang
XiuyiChen
ChenhuiChuKyoto University
1351-1358
Continual Named Entity Recognition (CNER) is dedicated to sequentially learning new entity types while mitigating catastrophic forgetting of old entity types. Traditional CNER approaches commonly employ knowledge distillation to retain old knowledge within the current model. However, because only the representations of old and new models are constrained to be consistent, the reliance solely on distillation in existing methods still suffers from catastrophic forgetting. To further alleviate the forgetting issue of old entity types, this paper introduces flexible Weight Tuning (WT) and Weight Fusion (WF) strategies for CNER. The WT strategy, applied at each training step, employs a learning rate schedule on the parameters of the current model. After learning the current task, the WF strategy dynamically integrates knowledge from both the current and previous models for inference. Notably, these two strategies are model-agnostic and seamlessly integrate with existing State-Of-The-Art (SOTA) models. Extensive experiments demonstrate that the WT and WF strategies consistently enhance the performance of previous SOTA methods across ten CNER settings in three datasets.
2024.findings-acl.79


@@ -7215,11 +7215,11 @@
Unveiling the Achilles’ Heel of <fixed-case>NLG</fixed-case> Evaluators: A Unified Adversarial Framework Driven by Large Language Models
YimingChenNational University of Singapore, National University of Singapore
ChenZhangNational University of Singapore
DanqingLuoNational University of Singapore
Luis FernandoD’HaroUniversidad Politécnica de Madrid
RobbyTanNational University of Singapore
HaizhouLiThe Chinese University of Hong Kong (Shenzhen); National University of Singapore and National University of Singapore
1359-1375
The automatic evaluation of natural language generation (NLG) systems presents a long-lasting challenge. Recent studies have highlighted various neural metrics that align well with human evaluations. Yet, the robustness of these evaluators against adversarial perturbations remains largely under-explored due to the unique challenges in obtaining adversarial data for different NLG evaluation tasks. To address the problem, we introduce AdvEval, a novel black-box adversarial framework against NLG evaluators. AdvEval is specially tailored to generate data that yield strong disagreements between human and victim evaluators.
Specifically, inspired by the recent success of large language models (LLMs) in text generation and evaluation, we adopt strong LLMs as both the data generator and gold evaluator. Adversarial data are automatically optimized with feedback from the gold and victim evaluator. We conduct experiments on 12 victim evaluators and 11 NLG datasets, spanning tasks including dialogue, summarization, and question evaluation. The results show that AdvEval can lead to significant performance degradation of various victim metrics, thereby validating its efficacy.
2024.findings-acl.80


@@ -7254,7 +7254,7 @@
ShenZhou
YongqiLi
XinMiao
TieyunQianWuhan University
1410-1423
Continual relation extraction (CRE) aims to continuously learn relations in new tasks without forgetting old relations in previous tasks. Current CRE methods are all rehearsal-based, which need to store samples and thus may encounter privacy and security issues. This paper targets rehearsal-free continual relation extraction for the first time and decomposes it into task identification and within-task prediction sub-problems. Existing rehearsal-free methods focus on training a model (expert) for within-task prediction yet neglect to enhance models’ capability of task identification. In this paper, we propose an Ensemble-of-Experts (EoE) framework for rehearsal-free continual relation extraction. Specifically, we first discriminatively train each expert by augmenting analogous relations across tasks to enhance the expert’s task identification ability. We then propose a cascade voting mechanism to form an ensemble of experts for effectively aggregating their abilities. Extensive experiments demonstrate that our method outperforms current rehearsal-free methods and is even better than rehearsal-based CRE methods.
2024.findings-acl.83


Temporal Validity Change Prediction
GeorgWenzel
AdamJatowt
1424-1446
Temporal validity is an important property of text that has many downstream applications, such as recommender systems, conversational AI, and user status tracking. Existing benchmarking tasks often require models to identify the temporal validity duration of a single statement. However, many data sources contain additional context, such as successive sentences in a story or posts on a social media profile. This context may alter the duration for which the originally collected statement is expected to be valid. We propose Temporal Validity Change Prediction, a natural language processing task benchmarking the capability of machine learning models to detect context statements that induce such change. We create a dataset consisting of temporal target statements sourced from Twitter and crowdsource corresponding context statements. We then benchmark a set of transformer-based language models on our dataset. Finally, we experiment with a multitasking approach to improve the state-of-the-art performance.
@@ -7274,7 +7274,7 @@
<fixed-case>RIFF</fixed-case>: Learning to Rephrase Inputs for Few-shot Fine-tuning of Language Models
SaeedNajafiUniversity of Alberta
AlonaFysheUniversity of Alberta
1447-1466
Pre-trained Language Models (PLMs) can be accurately fine-tuned for downstream text processing tasks. Recently, researchers have introduced several parameter-efficient fine-tuning methods that optimize input prompts or adjust a small number of model parameters (e.g., LoRA).
In this study, we explore the impact of altering the input text of the original task in conjunction with parameter-efficient fine-tuning methods. To most effectively rewrite the input text, we train a few-shot paraphrase model with a Maximum-Marginal Likelihood objective. Using six few-shot text classification datasets, we show that enriching data with paraphrases at train and test time enhances the performance beyond what can be achieved with parameter-efficient fine-tuning alone. The code used for our experiments can be found at https://github.com/SaeedNajafi/RIFF.
2024.findings-acl.85


@@ -7284,9 +7284,9 @@
Modelling Commonsense Commonalities with Multi-Facet Concept Embeddings
HananeKteich
NaLiSchool of Optical-Electrical and Computer Engineering, University of Shanghai for Science and Technology
UsashiChatterjeeCardiff University
ZiedBouraouiCRIL Univ-Artois & CNRS
StevenSchockaertCardiff University
1467-1480
Concept embeddings offer a practical and efficient mechanism for injecting commonsense knowledge into downstream tasks. Their core purpose is often not to predict the commonsense properties of concepts themselves, but rather to identify commonalities, i.e. sets of concepts which share some property of interest. Such commonalities are the basis for inductive generalisation, hence high-quality concept embeddings can make learning easier and more robust. Unfortunately, standard embeddings primarily reflect basic taxonomic categories, making them unsuitable for finding commonalities that refer to more specific aspects (e.g. the colour of objects or the materials they are made of). In this paper, we address this limitation by explicitly modelling the different facets of interest when learning concept embeddings. We show that this leads to embeddings which capture a more diverse range of commonsense properties, and consistently improves results in downstream tasks such as ultra-fine entity typing and ontology completion.
@@ -7321,7 +7321,7 @@
SaifeiLiao
VictoriaNg
SimonDe Montigny
GeraldPennDepartment of Computer Science, University of Toronto
1521-1533
The task of temporal relation extraction (TRE) involves identifying and extracting temporal relations between events from narratives. We identify two primary issues with TRE systems. First, by formulating TRE as a simple text classification task where every temporal relation is independent, it is hard to enhance the TRE model’s representation of the meaning of temporal relations, and its facility with the underlying temporal calculus. We solve the issue by proposing a novel Temporally Contrastive learning model (ConTempo) that increases the model’s awareness of the meaning of temporal relations by leveraging their symmetric or antisymmetric properties. Second, the reusability of innovations has been limited due to incompatibilities in model architectures. Therefore, we propose a unified framework and show that ConTempo is compatible with all three main branches of TRE research. Our results demonstrate that the performance gains of ConTempo are more pronounced, with the total combination achieving state-of-the-art performance on the widely used MATRES and TBD corpora.
We furthermore identified and corrected a large number of annotation errors present in the test set of MATRES, after which the performance increase brought by ConTempo becomes more apparent. 2024.findings-acl.89 @@ -7331,10 +7331,10 @@ <fixed-case>CHARP</fixed-case>: Conversation History <fixed-case>A</fixed-case>wa<fixed-case>R</fixed-case>eness Probing for Knowledge-grounded Dialogue Systems AbbasGhaddarHuawei Technologies Ltd. - DavidAlfonso-HermeloHuawei Technologies Ltd. - PhilippeLanglaisUniversité de Montréal + DavidAlfonso-HermeloHuawei Technologies Ltd. + PhilippeLanglaisUniversité de Montréal MehdiRezagholizadeh - BoxingChenHuawei Technologies Ltd. + BoxingChenHuawei Technologies Ltd. PrasannaParthasarathiHuawei Technologies Ltd. 1534-1551 In this work, we dive deep into one of the popular knowledge-grounded dialogue benchmarks that focus on faithfulness, FaithDial. We show that a significant portion of the FaithDial data contains annotation artifacts, which may bias models towards completely ignoring the conversation history. We therefore introduce CHARP, a testbed, designed for evaluating supposedly non-hallucinatory models trained on the FaithDial dataset. Our extensive analysis reveals that models primarily exhibit poor performance on CHARP due to their inability to effectively attend to and reason over the conversation history. Furthermore, the evaluation methods of FaithDial fail to capture these shortcomings, neglecting the conversational history. Our findings indicate that there is substantial room for contribution in both dataset creation and hallucination evaluation for knowledge-grounded dialogue, and that CHARP can serve as a tool for monitoring the progress in this particular research area. Data, models, and source code will be publicly available upon acceptance. @@ -7347,9 +7347,9 @@ ZichengLin ZhibinGou TianLiang - RuilinLuo + RuilinLuo HaoweiLiuUniversity of Hong Kong - YujiuYangGraduate School at Shenzhen,Tsinghua University + YujiuYangGraduate School at Shenzhen,Tsinghua University 1552-1587 The ability of Large Language Models (LLMs) to critique and refine their reasoning is crucial for their application in evaluation, feedback provision, and self-improvement. This paper introduces CriticBench, a comprehensive benchmark designed to assess LLMs’ abilities to critique and rectify their reasoning across a variety of tasks. CriticBench encompasses five reasoning domains: mathematical, commonsense, symbolic, coding, and algorithmic. It compiles 15 datasets and incorporates responses from three LLM families. Utilizing CriticBench, we evaluate and dissect the performance of 17 LLMs in generation, critique, and correction reasoning, i.e., GQC reasoning. Our findings reveal: (1) a linear relationship in GQC capabilities, with critique-focused training markedly enhancing performance; (2) a task-dependent variation in correction effectiveness, with logic-oriented tasks being more amenable to correction; (3) GQC knowledge inconsistencies that decrease as model size increases; and (4) an intriguing inter-model critiquing dynamic, where stronger models are better at critiquing weaker ones, while weaker models can surprisingly surpass stronger ones in their self-critique. We hope these insights into the nuanced critique-correct reasoning of LLMs will foster further research in LLM critique and self-improvement. 
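The generation-critique-correction (GQC) protocol that CriticBench probes can be outlined as a three-call loop. In the sketch below, ask is a hypothetical stand-in for an LLM call, and the prompts are illustrative, not the benchmark's own.

def ask(prompt: str) -> str:
    """Hypothetical LLM call; replace with a real model client."""
    return "stub response"

def gqc(question: str) -> dict:
    # Generate: produce an initial answer.
    answer = ask(f"Solve step by step:\n{question}")
    # Critique: judge the answer and point out errors.
    critique = ask(f"Question: {question}\nAnswer: {answer}\n"
                   f"Is this answer correct? Point out any errors.")
    # Correct: revise the answer using the critique.
    revised = ask(f"Question: {question}\nAnswer: {answer}\n"
                  f"Critique: {critique}\nGive a corrected answer.")
    return {"answer": answer, "critique": critique, "revised": revised}

print(gqc("What is 17 * 24?"))

Scoring each of the three stages separately is what lets the benchmark separate generation quality from critique and correction quality.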
2024.findings-acl.91


@@ -7359,10 +7359,10 @@
<fixed-case>DAFN</fixed-case>et: Dynamic Auxiliary Fusion for Sequential Model Editing in Large Language Models
TaolinZhangAlibaba Group
QizhouChen
DongyangLiEast China Normal University
ChengyuWangAlibaba Group
XiaofengHeEast China Normal University
LongtaoHuangAlibaba Group
HuiXue’
JunHuang
@@ -7374,7 +7374,7 @@

Controllable Text Summarization: Unraveling Challenges, Approaches, and Prospects - A Survey
AshokUrlanaTata Consultancy Services Limited, India
PruthwikMishraIIIT-Hyderabad
TathagatoRoy
RahulMishraInternational Institute of Information Technology Hyderabad
@@ -7392,7 +7392,7 @@
SongtaoWang
HongfuLiuNational University of Singapore
HaoWangRutgers University
YeWang
1624-1637
Traditional applications of natural language processing (NLP) in healthcare have predominantly focused on patient-centered services, enhancing patient interactions and care delivery, such as through medical dialogue systems. However, the potential of NLP to benefit inexperienced doctors, particularly in areas such as communicative medical coaching, remains largely unexplored. We introduce “ChatCoach”, a human-AI cooperative framework designed to assist medical learners in practicing their communication skills during patient consultations. ChatCoach differentiates itself from conventional dialogue systems by offering a simulated environment where medical learners can practice dialogues with a patient agent, while a coach agent provides immediate, structured feedback. This is facilitated by our proposed Generalized Chain-of-Thought (GCoT) approach, which fosters the generation of structured feedback and enhances the utilization of external knowledge sources. Additionally, we have developed a dataset specifically for evaluating Large Language Models (LLMs) within the ChatCoach framework on communicative medical coaching tasks. Our empirical results validate the effectiveness of ChatCoach.
2024.findings-acl.94


@@ -7406,11 +7406,11 @@
LuWangMicrosoft
YongXu
MinghuaMaMicrosoft
WeiZhangEast China Normal University
SiQinMicrosoft
SaravanRajmohanMicrosoft
QingweiLinMicrosoft Research
DongmeiZhangMicrosoft and Microsoft
1638-1662
This paper introduces a novel thought prompting approach called ”Everything of Thoughts” (XoT) for Large Language Models (LLMs) to defy the law of ”Penrose triangle” of existing thought paradigms, to achieve three key perspectives in thought generation simultaneously: performance, efficiency, and flexibility. XoT leverages pretrained reinforcement learning and Monte Carlo Tree Search (MCTS) to incorporate external domain knowledge and planning capability into thoughts, thereby enhancing LLMs’ decision-making capabilities. Through the MCTS-LLM collaborative thought revision framework, XoT autonomously produces high-quality comprehensive cognitive mappings with minimal LLM interactions. Additionally, XoT empowers LLMs to utilize flexible cognitive mappings for solving problems with multiple solutions. We evaluate XoT on several challenging problem-solving tasks, including Game of 24, 8-Puzzle, and Pocket Cube. Our results demonstrate that XoT significantly outperforms existing approaches in various dimensions, showcasing its remarkable proficiency in addressing complex problems across diverse domains.
The data and code are available at https://github.com/microsoft/Everything-of-Thoughts-XoT.
2024.findings-acl.95


@@ -7420,10 +7420,10 @@
<fixed-case>SPAGHETTI</fixed-case>: Open-Domain Question Answering from Heterogeneous Data Sources with Retrieval and Semantic Parsing
HeidiZhangStanford University
SinaSemnaniStanford University
FarhadGhassemiComputer Science Department, Stanford University
JialiangXu
ShichengLiuStanford University
MonicaLamStanford University
1663-1678
We introduce SPAGHETTI: Semantic Parsing Augmented Generation for Hybrid English information from Text Tables and Infoboxes, a hybrid question-answering (QA) pipeline that utilizes information from heterogeneous knowledge sources, including knowledge base, text, tables, and infoboxes. Our LLM-augmented approach achieves state-of-the-art performance on the Compmix dataset, the most comprehensive heterogeneous open-domain QA dataset, with 56.5% exact match (EM) rate. More importantly, manual analysis on a sample of the dataset suggests that SPAGHETTI is more than 90% accurate, indicating that EM is no longer suitable for assessing the capabilities of QA systems today.
@@ -7438,7 +7438,7 @@
RuochenZhao
TianzeLuo
XinzeLiSchool of Computer Science and Engineering, Nanyang Technological University
GuizhenChen
WenhanXia
JunjieHuUniversity of Wisconsin, Madison
Anh TuanLuuNanyang Technological University
@@ -7488,7 +7488,7 @@
<fixed-case>C</fixed-case>ee<fixed-case>BERT</fixed-case>: Cross-Domain Inference in Early Exit <fixed-case>BERT</fixed-case>
Divya JyotiBajpaiIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology
ManjeshHanawalIndian Institute of Technology Bombay
1736-1748
Pre-trained Language Models (PLMs), like BERT, with self-supervision objectives exhibit remarkable performance and generalization across various tasks. However, they suffer from high inference latency due to their large size. To address this issue, side branches are attached at intermediate layers, enabling early inference of samples without requiring them to pass through all layers. However, the challenge is to decide which layer to infer and exit each sample so that the accuracy and latency are balanced. Moreover, the distribution of the samples to be inferred may differ from that used for training, necessitating cross-domain adaptation. We propose an online learning algorithm named Cross-Domain Inference in Early Exit BERT (CeeBERT) that dynamically determines early exits of samples based on the level of confidence at each exit point. CeeBERT learns optimal thresholds from domain-specific confidence observed at intermediate layers on the fly, eliminating the need for labeled data. Experimental results on five distinct datasets with BERT and ALBERT models demonstrate CeeBERT’s ability to improve latency by reducing unnecessary computations with minimal drop in performance. By adapting to the threshold values, CeeBERT can speed up the BERT/ALBERT models by 2x-3.1x with minimal drop in accuracy. The anonymized source code is available at https://github.com/Div290/CeeBERT.
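At inference time, the confidence-based early exiting that CeeBERT builds on reduces to a short loop over the side branches. The sketch below uses fixed placeholder thresholds; CeeBERT's contribution is learning these thresholds online from domain-specific confidences, which is not reproduced here.

import torch

def early_exit_predict(exit_logits, thresholds):
    """exit_logits: one logits tensor per side branch, ordered by depth;
    thresholds: one confidence threshold per branch."""
    for depth, (logits, tau) in enumerate(zip(exit_logits, thresholds)):
        probs = torch.softmax(logits, dim=-1)
        confidence, label = probs.max(dim=-1)
        if confidence.item() >= tau:   # confident enough: stop at this branch
            return label.item(), depth
    return label.item(), len(exit_logits) - 1  # fell through: last branch decides

# Two side branches: the first is unsure, the second is confident.
logits_per_exit = [torch.tensor([0.2, 0.3]), torch.tensor([0.1, 2.5])]
print(early_exit_predict(logits_per_exit, thresholds=[0.9, 0.5]))  # -> (1, 1)

Easy samples exit at shallow branches and skip the remaining layers entirely, which is where the 2x-3.1x speedup reported in the abstract comes from.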
2024.findings-acl.101


@@ -7511,7 +7511,7 @@
MehakDhaliwalUniversity of California, Santa Barbara
PeterFrischAmazon
TobiasDomhanAmazon
MarcelloFedericoAmazon
1763-1775
We show that content on the web is often translated into many languages, and the low quality of these multi-way translations indicates they were likely created using Machine Translation (MT). Multi-way parallel, machine generated content not only dominates the translations in lower resource languages; it also constitutes a large fraction of the total web content in those languages. We also find evidence of a selection bias in the type of content which is translated into many languages, consistent with low quality English content being translated en masse into many lower resource languages, via MT. Our work raises serious concerns about training models such as multilingual large language models on both monolingual and bilingual data scraped from the web.
2024.findings-acl.103


@@ -7520,7 +7520,7 @@
<fixed-case>R</fixed-case>ank<fixed-case>M</fixed-case>ean: Module-Level Importance Score for Merging Fine-tuned <fixed-case>LLM</fixed-case> Models
GabrielPerin
XuxiChenUniversity of Texas at Austin
ShusenLiuLawrence Livermore National Labs
BhavyaKailkhuraLawrence Livermore National Laboratory
@@ -7562,11 +7562,11 @@
Towards Safer Large Language Models through Machine Unlearning
ZheyuanLiuUniversity of Notre Dame
GuangyaoDou
ZhaoxuanTanUniversity of Notre Dame
YijunTian
MengJiangUniversity of Notre Dame
1817-1829
The rapid advancement of Large Language Models (LLMs) has demonstrated their vast potential across various domains, attributed to their extensive pretraining knowledge and exceptional generalizability. However, LLMs often encounter challenges in generating harmful content when faced with problematic prompts. To address this problem, existing work attempted to implement a gradient ascent based approach to prevent LLMs from producing harmful output. While these methods can be effective, they frequently impact the model utility in responding to normal prompts. To address this gap, we introduce Selective Knowledge negation Unlearning (SKU), a novel unlearning framework for LLMs, designed to eliminate harmful knowledge while preserving utility on normal prompts. Specifically, SKU consists of two stages: harmful knowledge acquisition stage and knowledge negation stage. The first stage aims to identify and acquire harmful knowledge within the model, whereas the second is dedicated to removing this knowledge. SKU selectively isolates and removes harmful knowledge in model parameters, ensuring the model’s performance remains robust on normal prompts. Our experiments conducted across various LLM architectures demonstrate that SKU identifies a good balance point between removing harmful information and preserving utility.
2024.findings-acl.107


@@ -7581,7 +7581,7 @@
HaiyanZhaoNew Jersey Institute of Technology
WenyueHuaRutgers University, New Brunswick
YandaMengUniversity of Exeter
YongfengZhangRutgers University
MengnanDuNew Jersey Institute of Technology
1830-1842
Chain of Thought (CoT) is significant in improving the reasoning abilities of large language models (LLMs). However, the correlation between the effectiveness of CoT and the length of reasoning steps in prompts remains largely unknown.
To shed light on this, we have conducted several empirical experiments to explore the relations. Specifically, we design experiments that expand and compress the rationale reasoning steps within CoT demonstrations, while keeping all other factors constant. We have the following key findings. First, the results indicate that lengthening the reasoning steps in prompts, even without adding new information into the prompt, considerably enhances LLMs’ reasoning abilities across multiple datasets. Alternatively, shortening the reasoning steps, even while preserving the key information, significantly diminishes the reasoning abilities of models. This finding highlights the importance of the number of steps in CoT prompts and provides practical guidance to make better use of LLMs’ potential in complex problem-solving scenarios. Second, we also investigated the relationship between the performance of CoT and the rationales used in demonstrations. Surprisingly, the result shows that even incorrect rationales can yield favorable outcomes if they maintain the requisite length of inference. Third, we observed that the advantages of increasing reasoning steps are task-dependent: simpler tasks require fewer steps, whereas complex tasks gain significantly from longer inference sequences.
2024.findings-acl.108


@@ -7607,14 +7607,14 @@
<fixed-case>SKGS</fixed-case>um: Structured Knowledge-Guided Document Summarization
QiqiWangUniversity of Auckland
RuofanWang
KaiqiZhaoUniversity of Auckland
RobertAmorUniversity of Auckland
BenjaminLiu
JiamouLiuThe University of Auckland
XiandaZhengUniversity of Auckland
ZijianHuangUniversity of Auckland
1857-1871
A summary structure is inherent to certain types of texts according to the Genre Theory of Linguistics. Such structures aid readers in efficiently locating information within summaries. However, most existing automatic summarization methods overlook the importance of summary structure, resulting in summaries that emphasize the most prominent information while omitting essential details from other sections. While a few summarizers recognize the importance of summary structure, they rely heavily on the predefined labels of summary structures in the source document and ground truth summaries. To address these shortcomings, we developed a Structured Knowledge-Guided Summarization (SKGSum) and its variant, SKGSum-W, which do not require structure labels. Instead, these methods rely on a set of automatically extracted summary points to generate summaries. We evaluate the proposed methods using three real-world datasets. The results indicate that our methods not only improve the quality of summaries, in terms of ROUGE and BERTScore, but also broaden the types of documents that can be effectively summarized.
2024.findings-acl.110


@@ -7651,7 +7651,7 @@
YixinYangPeking University
ZhengLi
QingxiuDong
HemingXia
ZhifangSuiPeking University
1898-1912
Understanding the deep semantics of images is essential in the era dominated by social media. However, current research focuses primarily on the superficial description of images, revealing a notable deficiency in the systematic investigation of the inherent deep semantics. In this work, we introduce DEEPEVAL, a comprehensive benchmark to assess Large Multimodal Models’ (LMMs) capacities of visual deep semantics.
DEEPEVAL includes a human-annotated dataset and three progressive subtasks: fine-grained description selection, in-depth title matching, and deep semantics understanding. Utilizing DEEPEVAL, we evaluate 9 open-source LMMs and GPT-4V(ision). Our evaluation demonstrates a substantial gap between the deep semantic comprehension capabilities of existing LMMs and humans. For example, GPT-4V is 30% behind humans in understanding deep semantics, even though it achieves human-comparable performance in image description. Further analysis reveals that LMM performance on DEEPEVAL varies according to the specific facets of deep semantics explored, indicating the fundamental challenges remaining in developing LMMs.


@@ -7663,7 +7663,7 @@
Harvesting Events from Multiple Sources: Towards a Cross-Document Event Extraction Paradigm
QiangGao
ZixiangMeng
BoboLiWuhan University
JunZhouWuhan University
FeiLiWuhan University
ChongTeng
@@ -7679,7 +7679,7 @@
EunJeongHwangUniversity of British Columbia
VeredShwartz
DanGutfreundMIT-IBM Watson AI Lab
VeronikaThostInternational Business Machines
1928-1942
Reasoning about subjective natural language descriptions, such as opinions and preferences, is a challenging topic that largely remains unsolved to date. In particular, state-of-the-art large language models (LLMs) perform disappointingly in this task, show strong biases, and do not meet the interpretability requirements often needed in these kinds of applications. We propose a novel approach for reasoning about subjective knowledge that integrates potential and implicit meanings and explicitly models the relational nature of the information. We apply supervised graph learning, offer explanations for the model’s reasoning, and show that our model performs well across all 15 topics of OpinionQA, outperforming several prominent LLMs. Our detailed analysis further shows its unique advantages and the complementary nature it offers in comparison to LLMs.
2024.findings-acl.115


@@ -7695,8 +7695,8 @@
ZhiyuanLiuNational University of Singapore
SihangLi
KunWangUniversity of Science and Technology of China
WenjieDu
XiangWangUniversity of Science and Technology of China
1943-1958
Molecular Relational Learning (MRL), aiming to understand interactions between molecular pairs, plays a pivotal role in advancing biochemical research. Recently, the adoption of large language models (LLMs), known for their vast knowledge repositories and advanced logical inference capabilities, has emerged as a promising way for efficient and effective MRL. Despite their potential, these methods predominantly rely on textual data, thus not fully harnessing the wealth of structural information inherent in molecular graphs. Moreover, the absence of a unified framework exacerbates the issue of insufficient data exploitation, as it hinders the sharing of interaction mechanisms learned across various datasets. To address these challenges, this work proposes a novel LLM-based multi-modal framework for molecular interaction modeling following Chain-of-Thought (CoT) theory, termed MolTC, which effectively integrates the graphical information of the two molecules in a pair.
To train this integrated framework efficiently, we introduce a *multi-hierarchical CoT theory* to refine its training paradigm, and construct a comprehensive *Molecular Interactive Instructions* dataset for the development of biochemical LLMs involving MRL. Our experiments, conducted across various datasets involving over 4,000,000 molecular pairs, exhibit the superiority of our method over current GNN and LLM-based baselines. Code is available at https://github.com/MangoKiller/MolTC.
2024.findings-acl.116


@@ -7728,7 +7728,7 @@
<fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> Meets Dropout under a Unified Framework
ShengWang
LihengChen
JiyueJiang
BoyangXue
@@ -7742,7 +7742,7 @@
Enhancing Text-to-<fixed-case>SQL</fixed-case> Parsing through Question Rewriting and Execution-Guided Refinement
WenxinMao
RuiqiWang
JiyuGuo
JichuanZeng
@@ -7759,7 +7759,7 @@
The Knowledge Alignment Problem: Bridging Human and External Knowledge for Large Language Models
ShuoZhang
LiangmingPan
JunzhouZhaoXi’an Jiaotong University
William YangWangUC Santa Barbara
2025-2038
Large language models often necessitate grounding on external knowledge to generate faithful and reliable answers. Yet even with the correct groundings in the reference, they can ignore them and rely on wrong groundings or their inherent biases to hallucinate when users, being largely unaware of the specifics of the stored information, pose questions that might not directly correlate with the retrieved groundings. In this work, we formulate this knowledge alignment problem and introduce MixAlign, a framework that interacts with both the human user and the knowledge base to obtain and integrate clarifications on how the user question relates to the stored information. MixAlign employs a language model to achieve automatic knowledge alignment and, if necessary, further enhances this alignment through human user clarifications. Experimental results highlight the crucial role of knowledge alignment in boosting model performance and mitigating hallucination, with improvements noted up to 22.2% and 27.1% respectively. We also demonstrate the effectiveness of MixAlign in improving knowledge alignment by producing high-quality, user-centered clarifications.


@@ -7769,17 +7769,17 @@
<fixed-case>C</fixed-case>hat<fixed-case>KBQA</fixed-case>: A Generate-then-Retrieve Framework for Knowledge Base Question Answering with Fine-tuned Large Language Models
HaoranLuo
HaihongEBeijing University of Post and Telecommunication
ZichenTangBeijing University of Posts and Telecommunications
ShiyaoPeng
YikaiGuo
WentaiZhang
ChenghaoMa
GuantingDongRenmin University of China
MeinaSongBeijing University of Posts and Telecommunications
WeiLin
YifanZhuBeijing University of Posts and Telecommunications
Anh TuanLuuNanyang Technological University
2039-2056
Knowledge Base Question Answering (KBQA) aims to answer natural language questions over large-scale knowledge bases (KBs), which can be summarized into two crucial steps: knowledge retrieval and semantic parsing. However, three core challenges remain: inefficient knowledge retrieval, mistakes of retrieval adversely impacting semantic parsing, and the complexity of previous KBQA methods.
To tackle these challenges, we introduce ChatKBQA, a novel and simple generate-then-retrieve KBQA framework, which proposes first generating the logical form with fine-tuned LLMs, then retrieving and replacing entities and relations with an unsupervised retrieval method, to improve both generation and retrieval more directly. Experimental results show that ChatKBQA achieves new state-of-the-art performance on standard KBQA datasets, WebQSP, and CWQ. This work can also be regarded as a new paradigm for combining LLMs with knowledge graphs (KGs) for interpretable and knowledge-required question answering.


@@ -7805,10 +7805,10 @@
<fixed-case>INTERVENOR</fixed-case>: Prompting the Coding Ability of Large Language Models with the Interactive Chain of Repair
HanbinWang
ZhenghaoLiuNortheastern University
ShuoWang
GanquCui
NingDingTsinghua University, Tsinghua University
ZhiyuanLiuTsinghua University
GeYu
2081-2107
This paper introduces INTERVENOR (INTERactiVE chaiN Of Repair), a system designed to emulate the interactive code repair processes observed in humans, encompassing both code diagnosis and code repair. INTERVENOR prompts Large Language Models (LLMs) to play distinct roles during the code repair process, functioning as both a Code Learner and a Code Teacher. Specifically, the Code Learner is tasked with adhering to instructions to generate or repair code, while the Code Teacher is responsible for crafting a Chain-of-Repair (CoR) to serve as guidance for the Code Learner. While generating the CoR, the Code Teacher needs to check the generated codes from the Code Learner and reassess how to address code bugs based on error feedback received from compilers. Experimental results demonstrate that INTERVENOR surpasses baseline models, exhibiting improvements of approximately 18% and 4.3% over GPT-3.5 in code generation and code translation tasks, respectively. Our further analyses show that CoR is effective in illuminating the reasons behind bugs and outlining solution plans in natural language. With the feedback of code compilers, INTERVENOR can accurately identify syntax errors and assertion errors and provide precise instructions to repair codes. All data and codes are available at [https://github.com/NEUIR/INTERVENOR](https://github.com/NEUIR/INTERVENOR).


@@ -7820,8 +7820,8 @@
<fixed-case>S</fixed-case>ocial<fixed-case>B</fixed-case>ench: Sociality Evaluation of Role-Playing Conversational Agents
HongzhanChenSUN YAT-SEN UNIVERSITY
HehongChen
MingYan
WenshenXu
GaoXing
WeizhouShen
XiaojunQuanSUN YAT-SEN UNIVERSITY
@@ -7836,9 +7836,9 @@

From Model-centered to Human-Centered: Revision Distance as a Metric for Text Evaluation in <fixed-case>LLM</fixed-case>s-based Applications
YongqiangMaWuhan University
LizhiQingAlibaba Group
JiaweiLiuWuhan University
YangyangKangAlibaba Group
YueZhangAlibaba Group
WeiLu
@@ -7858,7 +7858,7 @@
XuanLinAnt Group
LiubinWang
DaqianLiDaqianLi
YongruiChen
2138-2148
Incomplete utterance rewriting (IUR) aims to reconstruct the utterance with omitted information and pronouns to be standalone and complete based on the context. The existing works predominantly focus on simple ellipsis and coreference problems in brief multi-turn dialogues. But in actual scenarios: 1) the context of the dialogues frequently comprises multiple similar candidates for ellipsis and coreference resolution, which are easy to confuse.
2) the number of turns tends to be more extensive, while the content with various topics also grows more complex. This paper proposes a novel method called CaT to address these issues. In particular, we first devise a tracker model, distilled from GPT4-turbo, to adopt Context Tracking that dynamically updates a list of key phrases turn by turn, as accurate candidates for ellipsis and coreference resolution. Second, we further present the Dynamic Context Introduction mechanism to filter irrelevant preceding contexts that are not relied on by any element within the key phrase list to condense extended dialogues. Comprehensive experiments indicate that our solution provides a significant improvement over the existing baselines, and achieves state-of-the-art on three benchmarks. 2024.findings-acl.127 @@ -7869,9 +7869,9 @@ <fixed-case>E</fixed-case>motion<fixed-case>Q</fixed-case>ueen: A Benchmark for Evaluating Empathy of Large Language Models YuyanChen SongzhouYan - SijiaLiu + SijiaLiu YuezeLi - YanghuaXiaoFudan University + YanghuaXiaoFudan University 2149-2176 Emotional intelligence in large language models (LLMs) is of great importance in Natural Language Processing. However, previous research has mainly focused on basic sentiment analysis tasks, such as emotion recognition, which is not enough to evaluate LLMs’ overall emotional intelligence. Therefore, this paper presents a novel framework named EmotionQueen for evaluating the emotional intelligence of LLMs. The framework includes four distinctive tasks: Key Event Recognition, Mixed Event Recognition, Implicit Emotional Recognition, and Intention Recognition. LLMs are requested to recognize important events or implicit emotions and generate empathetic responses. We also design two metrics to evaluate LLMs’ capabilities in recognition and response for emotion-related statements. Experiments yield significant conclusions about LLMs’ capabilities and limitations in emotion intelligence. 2024.findings-acl.128 @@ -7880,15 +7880,15 @@ Plum: Prompt Learning using Metaheuristics - RuiPanThe Hong Kong University of Science and Technology + RuiPanThe Hong Kong University of Science and Technology ShuoXingTexas A&M University - College Station ShizheDiaoHong Kong University of Science and Technology WenheSun XiangLiu - KaShunShum + KaShunShum JipengZhang RenjiePi - TongZhangUIUC + TongZhangUIUC 2177-2197 Since the emergence of large language models, prompt learning has become a popular method for optimizing and customizing these models. Special prompts, such as Chain-of-Thought, have even revealed previously unknown reasoning capabilities within these models. However, the progress of discovering effective prompts has been slow, driving a desire for general prompt optimization methods. Unfortunately, few existing prompt learning methods satisfy the criteria of being truly “general”, i.e., automatic, discrete, black-box, gradient-free, and interpretable all at once. In this paper, we introduce metaheuristics, a branch of discrete non-convex optimization methods with over 100 options, as a promising approach to prompt learning. Within our paradigm, we test six typical methods: hill climbing, simulated annealing, genetic algorithms with/without crossover, tabu search, and harmony search, demonstrating their effectiveness in white-box and black-box prompt learning.
Furthermore, we show that these methods can be used to discover more human-understandable prompts that were previously unknown in both reasoning and image generation tasks, opening the door to a cornucopia of possibilities in prompt optimization. 2024.findings-acl.129 @@ -7902,7 +7902,7 @@ QingpeiGuoAnt Group JiyuanJiasouthern university of science and technology ZhixuLi - YanghuaXiaoFudan University + YanghuaXiaoFudan University 2198-2224 In the era of social media video platforms, popular “hot-comments” play a crucial role in attracting user impressions of short-form videos, making them vital for marketing and branding purposes. However, existing research predominantly focuses on generating descriptive comments or “danmaku” in English, offering immediate reactions to specific video moments. Addressing this gap, our study introduces HOTVCOM, the largest Chinese video hot-comment dataset, comprising 94k diverse videos and 137 million comments. We also present the ComHeat framework, which synergistically integrates visual, auditory, and textual data to generate influential hot-comments on the Chinese video dataset. Empirical evaluations highlight the effectiveness of our framework, demonstrating its excellence on both the newly constructed and existing datasets. 2024.findings-acl.130 @@ -7914,9 +7914,9 @@ YuyanChen YuezeLi SongzhouYan - SijiaLiu + SijiaLiu JiaqingLiangFudan University - YanghuaXiaoFudan University + YanghuaXiaoFudan University 2225-2238 The evaluation of the problem-solving capability under incomplete information scenarios of Large Language Models (LLMs) is increasingly important, encompassing capabilities such as questioning, knowledge search, error detection, and path planning. Current research mainly focuses on LLMs’ problem-solving capability in games such as “Twenty Questions”. However, these kinds of games do not require recognizing misleading cues, which are necessary in the incomplete information scenario. Moreover, existing games such as “Who is undercover” are highly subjective, making them challenging to evaluate. Therefore, in this paper, we introduce a novel game named BrainKing based on “Who is undercover” and “Twenty Questions” for evaluating LLM capabilities under incomplete information scenarios. It requires LLMs to identify target entities with limited yes-or-no questions and potential misleading answers. By setting up easy, medium, and hard difficulty modes, we comprehensively assess the performance of LLMs across various aspects. Our results reveal the capabilities and limitations of LLMs in BrainKing, providing significant insights into LLM problem-solving levels.
2024.findings-acl.131 @@ -7936,7 +7936,7 @@ Into the Unknown: Generating Geospatial Descriptions for New Environments TzufPaz-ArgamanBar-Ilan University - JohnPalowitchGoogle + JohnPalowitchGoogle SayaliKulkarniResearch, Google and Google ReutTsarfatyGoogle and Bar-Ilan University, Technion JasonBaldridgeGoogle @@ -7962,14 +7962,14 @@ Length-aware Byte Pair Encoding for Mitigating Over-segmentation in <fixed-case>K</fixed-case>orean Machine Translation - JungseobLeeKorea University - HyeonseokMoonKorea University + JungseobLeeKorea University + HyeonseokMoonKorea University SeungjunLeeKorea University - ChanjunParkUpstage - SugyeongEoKorea University + ChanjunParkUpstage + SugyeongEoKorea University HyunwoongKo - JaehyungSeo - SeungyoonLeeKorea University + JaehyungSeo + SeungyoonLeeKorea University HeuiseokLimKorea University 2287-2303 Byte Pair Encoding is an effective approach in machine translation across several languages. However, our analysis indicates that BPE is prone to over-segmentation in the morphologically rich language, Korean, which can erode word semantics and lead to semantic confusion during training. This semantic confusion, stemming from over-segmentation, ultimately contributes to a degradation of overall translation quality. To address this issue, we introduce Length-aware Subword Vocabulary Construction (LeVoC), a novel approach strategically incorporating longer words into the vocabulary. By utilizing an external monolingual Korean corpus, LeVoC extracts and integrates long words, effectively preserving morphological information and reducing semantic confusion. Our experiments demonstrate that LeVoC not only significantly outperforms BPE, but also can be applied to and surpass current state-of-the-art morpheme-aware subword tokenization methods. We provide evidence that the difficulty in translating sentences with long words in Korean is associated with morphological compositionality, and LeVoC’s ability to reduce semantic confusion during training leads to improved translation quality. @@ -7997,8 +7997,8 @@ ShitaoXiao PeitianZhang KunLuo - DefuLianUniversity of Science and Technology of China - ZhengLiu + DefuLianUniversity of Science and Technology of China + ZhengLiu 2318-2335 In this paper, we introduce a new embedding model called M3-Embedding, which is distinguished for its versatility in Multi-Linguality, Multi-Functionality, and Multi-Granularity. It provides a uniform support for the semantic retrieval of more than 100 working languages. It can simultaneously accomplish the three common retrieval functionalities: dense retrieval, multi-vector retrieval, and sparse retrieval. Besides, it is also capable of processing inputs of different granularities, spanning from short sentences to long documents of up to 8,192 tokens. The effective training of M3-Embedding presents a series of technical contributions. Notably, we propose a novel self-knowledge distillation approach, where the relevance scores from different retrieval functionalities can be integrated as the teacher signal to enhance the training quality. We also optimize the batching strategy, which enables a large batch size and high training throughput to improve the discriminativeness of embeddings. M3-Embedding exhibits a superior performance in our experiment, leading to new state-of-the-art results on multilingual, cross-lingual, and long-document retrieval benchmarks. 
2024.findings-acl.137 @@ -8012,7 +8012,7 @@ ZhengWangUniversity of Leeds HongyuZhangUniversity of Newcastle, Australia BatuGuan - FangxinLu + FangxinLu ZiliZhang YuleiSuiUniversity of New South Wales HaiJinHuazhong University of Science and Technology @@ -8026,8 +8026,8 @@ An Element is Worth a Thousand Words: Enhancing Legal Case Retrieval by Incorporating Legal Elements ChenlongDengRenmin University of China - ZhichengDouRenmin University of China - YujiaZhouTsinghua University, Tsinghua University + ZhichengDouRenmin University of China + YujiaZhouTsinghua University, Tsinghua University PeitianZhang KelongMao 2354-2365 @@ -8041,10 +8041,10 @@ XinnongZhangFudan University HaoyuKuangFudan University XinyiMou - HanjiaLyuUniversity of Rochester + HanjiaLyuUniversity of Rochester KunWu SimingChenFudan University - JieboLuoUniversity of Rochester and University of Rochester + JieboLuoUniversity of Rochester and University of Rochester XuanjingHuangFudan University ZhongyuWeiFudan University 2366-2389 @@ -8055,10 +8055,10 @@ <fixed-case>K</fixed-case>o<fixed-case>C</fixed-case>ommon<fixed-case>GEN</fixed-case> v2: A Benchmark for Navigating <fixed-case>K</fixed-case>orean Commonsense Reasoning Challenges in Large Language Models - JaehyungSeo - JaewookLeeKorea University - ChanjunParkUpstage - SeongTaeHongKorea University + JaehyungSeo + JaewookLeeKorea University + ChanjunParkUpstage + SeongTaeHongKorea University SeungjunLeeKorea University HeuiseokLimKorea University 2390-2415 @@ -8098,11 +8098,11 @@ Integrating Physician Diagnostic Logic into Large Language Models: Preference Learning from Process Feedback ChengfengDou - YingZhang - ZhiJinPeking University and Peking University - WenpinJiaoPeking University + YingZhang + ZhiJinPeking University and Peking University + WenpinJiaoPeking University HaiyanZhaoPeking University - YongqiangZhao + YongqiangZhao ZhengweiTao 2453-2473 The utilization of large language models for medical dialogue generation has attracted considerable attention due to its potential to enhance response richness and coherence. While previous studies have made strides in optimizing model performance, there is a pressing need to bolster the model’s capacity for diagnostic logic to ensure patient safety. In response to this need, we propose an approach termed preference learning from process feedback (PLPF), which involves integrating the doctor’s diagnostic logic into LLMs. PLPF encompasses three key components: rule modeling, preference data generation, and preference alignment. These components collectively serve to train the model to adhere to the diagnostic process. Our experimental results, utilizing Standardized Patient Testing, demonstrate that PLPF enhances the diagnostic accuracy of the baseline model in medical conversations by 17.6%, surpassing the performance of traditional approaches. Moreover, PLPF exhibits effectiveness in both multi-round and single-round dialogue tasks, thereby highlighting its potential in improving medical dialogue generation. Our dataset is available at https://github.com/Chengfeng-Dou/SpTesting. @@ -8113,7 +8113,7 @@ <fixed-case>LM</fixed-case>-Cocktail: Resilient Tuning of Language Models via Model Merging ShitaoXiao - ZhengLiu + ZhengLiu PeitianZhang XingrunXing 2474-2488 @@ -8127,7 +8127,7 @@ XinMiao YongqiLi ShenZhou - TieyunQianWuhan University + TieyunQianWuhan University 2489-2511 Large language models (LLMs) have achieved satisfactory performance in counterfactual generation. 
However, confined by the stochastic generation process of LLMs, there are often misalignments between LLMs and humans which hinder LLMs from handling complex tasks like relation extraction. As a result, LLMs may generate commonsense-violated counterfactuals like ‘eggs were produced by a box’. To bridge this gap, we propose to mimic episodic memory retrieval, the working mechanism of the human hippocampus, to align LLMs’ generation process with that of humans. In this way, LLMs can derive experience from their extensive memory, which keeps in line with the way humans gain commonsense. We then implement two central functions of the hippocampus, i.e., pattern separation and pattern completion, to retrieve the episodic memory from LLMs and generate commonsense counterfactuals for relation extraction. Experimental results demonstrate the improvements of our framework over existing methods in terms of the quality of counterfactuals. 2024.findings-acl.146 @@ -8137,32 +8137,32 @@ <fixed-case>S</fixed-case>em<fixed-case>R</fixed-case>el2024: A Collection of Semantic Textual Relatedness Datasets for 13 Languages NedjmaOusidhoumCardiff University - ShamsuddeenMuhammadBayero University, Kano-Nigeria + ShamsuddeenMuhammadBayero University, Kano-Nigeria MohamedAbdallaUniversity of Alberta - IdrisAbdulmuminAhmadu Bello University - IbrahimAhmadNortheastern University + IdrisAbdulmuminAhmadu Bello University + IbrahimAhmadNortheastern University SanchitAhujaResearch, Microsoft AlhamAjiMohamed bin Zayed University of Artificial Intelligence and Amazon - VladimirAraujoKU Leuven - AbinewAyeleBahir Dar University, Universität Hamburg + VladimirAraujoKU Leuven + AbinewAyeleBahir Dar University, Universität Hamburg PavanBaswani MeriemBeloucifUppsala University ChrisBiemannU Hamburg SofiaBourhim - ChristineKockUniversity of Melbourne + ChristineKockUniversity of Melbourne GenetDekebo OumaimaHourrane GopichandKanumolu LokeshMadasu SamuelRutunda - ManishShrivastavaInternational Institute of Information Technology Hyderabad, India + ManishShrivastavaInternational Institute of Information Technology Hyderabad, India ThamarSolorioMohamed bin Zayed University of Artificial Intelligence and University of Houston NirmalSurangeInternational Institute of Information Technology Hyderabad - HailegnawTilayeKotebe University of Education + HailegnawTilayeKotebe University of Education KrishnapriyaVishnubhotla GentaWinataCapital One AI Foundations - SeidYimamUniversität Hamburg - SaifMohammadNational Research Council Canada + SeidYimamUniversität Hamburg + SaifMohammadNational Research Council Canada 2512-2530 Exploring and quantifying semantic relatedness is central to representing language and holds significant implications across various NLP tasks. While earlier NLP research primarily focused on semantic similarity, often within the English language context, we instead investigate the broader phenomenon of semantic relatedness. In this paper, we present SemRel, a new semantic relatedness dataset collection annotated by native speakers across 13 languages: Afrikaans, Algerian Arabic, Amharic, English, Hausa, Hindi, Indonesian, Kinyarwanda, Marathi, Moroccan Arabic, Modern Standard Arabic, Spanish, and Telugu. These languages originate from five distinct language families and are predominantly spoken in Africa and Asia – regions characterised by a relatively limited availability of NLP resources.
Each instance in the SemRel datasets is a sentence pair associated with a score that represents the degree of semantic textual relatedness between the two sentences. The scores are obtained using a comparative annotation framework. We describe the data collection and annotation processes, challenges when building the datasets, baseline experiments, and their impact and utility in NLP. 2024.findings-acl.147 @@ -8181,7 +8181,7 @@ <fixed-case>VISP</fixed-case>ool: Enhancing Transformer Encoders with Vector Visibility Graph Neural Networks - TunaAlikaşifoğlu + TunaAlikaşifoğlu ArdaAras AykutKocBilkent University 2547-2556 @@ -8195,7 +8195,7 @@ KrishnapriyaVishnubhotla AdamHammondUniversity of Toronto GraemeHirstUniversity of Toronto - SaifMohammadNational Research Council Canada + SaifMohammadNational Research Council Canada 2557-2574 Stories are rich in the emotions they exhibit in their narratives and evoke in the readers. The emotional journeys of the various characters within a story are central to their appeal. Computational analysis of the emotions of novels, however, has rarely examined the variation in the emotional trajectories of the different characters within them, instead considering the entire novel to represent a single story arc. In this work, we use character dialogue to distinguish between the emotion arcs of the narration and the various characters. We analyze the emotion arcs of the various characters in a dataset of English literary novels using the framework of Utterance Emotion Dynamics. Our findings show that the narration and the dialogue largely express disparate emotions through the course of a novel, and that the commonalities or differences in the emotional arcs of stories are more accurately captured by those associated with individual characters. 2024.findings-acl.150 @@ -8215,8 +8215,8 @@ Dictionary-Aided Translation for Handling Multi-Word Expressions in Low-Resource Languages AntoniosDimakisUniversity of Athens - StellaMarkantonatou - AntoniosAnastasopoulosAthena Research Center and George Mason University + StellaMarkantonatou + AntoniosAnastasopoulosAthena Research Center and George Mason University 2588-2595 Multi-word expressions (MWEs) present unique challenges in natural language processing (NLP), particularly within the context of translation systems, due to their inherent scarcity, non-compositional nature, and other distinct lexical and morphosyntactic characteristics, issues that are exacerbated in low-resource settings.In this study, we elucidate and attempt to address these challenges by leveraging a substantial corpus of human-annotated Greek MWEs. To address the complexity of translating such phrases, we propose a novel method leveraging an available out-of-context lexicon.We assess the translation capabilities of current state-of-the-art systems on this task, employing both automated metrics and human evaluators.We find that by using our method when applicable, the performance of current systems can be significantly improved, however these models are still unable to produce translations comparable to those of a human speaker. 
2024.findings-acl.152 @@ -8228,7 +8228,7 @@ Zhong-ZhiLi Ming-LiangZhang FeiYin, Institute of automation, Chinese academy of science - Cheng-LinLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences + Cheng-LinLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences 2596-2608 Geometry problem solving (GPS) is a challenging mathematical reasoning task requiring multi-modal understanding, fusion, and reasoning. Existing neural solvers take GPS as a vision-language task but are short in the representation of geometry diagrams that carry rich and complex layout information. In this paper, we propose a layout-aware neural solver named LANS, integrated with two new modules: multimodal layout-aware pre-trained language module (MLA-PLM) and layout-aware fusion attention (LA-FA). MLA-PLM adopts structural-semantic pre-training (SSP) to implement global relationship modeling, and point-match pre-training (PMP) to achieve alignment between visual points and textual points. LA-FA employs a layout-aware attention mask to realize point-guided cross-modal fusion for further boosting layout awareness of LANS. Extensive experiments on datasets Geometry3K and PGPS9K validate the effectiveness of the layout-aware modules and superior problem-solving performance of our LANS solver, over existing symbolic and neural solvers. We have made our code and data publicly available. 2024.findings-acl.153 @@ -8238,12 +8238,12 @@ Knowledge Crosswords: Geometric Knowledge Reasoning with Large Language Models WenxuanDingHong Kong University of Science and Technology - ShangbinFengUniversity of Washington + ShangbinFengUniversity of Washington YuhanLiu - ZhaoxuanTanUniversity of Notre Dame + ZhaoxuanTanUniversity of Notre Dame VidhishaBalachandranResearch, Microsoft TianxingHe - YuliaTsvetkovDepartment of Computer Science, University of Washington + YuliaTsvetkovDepartment of Computer Science, University of Washington 2609-2636 We propose Knowledge Crosswords, a geometric knowledge reasoning benchmark consisting of incomplete knowledge networks bounded by structured factual constraints, where LLMs are tasked with inferring the missing facts to meet all constraints. The novel setting of geometric knowledge reasoning necessitates new LM abilities beyond existing atomic/linear multi-hop QA, such as backtracking, verifying facts and constraints, reasoning with uncertainty, and more. Knowledge Crosswords contains 2,101 individual problems, covering diverse knowledge domains, and is further divided into three difficulty levels. We conduct extensive experiments to evaluate existing LLMs and approaches on Knowledge Crosswords. Results demonstrate that baseline approaches struggle with larger knowledge networks and semantically-equivalent entity distractors. In light of their limitations, we propose two new approaches, Staged Prompting and Verify-All, to augment LLMs’ abilities for error-aware backtracking and constraint verification. Our Verify-All significantly outperforms prior methods and is more robust towards problems in the hard subset. Further analysis shows that geometric knowledge reasoning poses new challenges to LLMs’ knowledge abilities, particularly in robustness towards varying option orders, complex structural constraints in knowledge networks, “none of the above” scenarios, and more. 
2024.findings-acl.154 @@ -8253,11 +8253,11 @@ <fixed-case>DELL</fixed-case>: Generating Reactions and Explanations for <fixed-case>LLM</fixed-case>-Based Misinformation Detection HerunWan - ShangbinFengUniversity of Washington - ZhaoxuanTanUniversity of Notre Dame + ShangbinFengUniversity of Washington + ZhaoxuanTanUniversity of Notre Dame HengWang - YuliaTsvetkovDepartment of Computer Science, University of Washington - MinnanLuoXi’an Jiaotong University + YuliaTsvetkovDepartment of Computer Science, University of Washington + MinnanLuoXi’an Jiaotong University 2637-2667 Large language models are limited by challenges in factuality and hallucinations to be directly employed off-the-shelf for judging the veracity of news articles, where factual accuracy is paramount. In this work, we propose DELL that identifies three key stages in misinformation detection where LLMs could be incorporated as part of the pipeline: 1) LLMs could generate news reactions to represent diverse perspectives and simulate user-news interaction networks; 2) LLMs could generate explanations for proxy tasks (e.g., sentiment, stance) to enrich the contexts of news articles and produce experts specializing in various aspects of news understanding; 3) LLMs could merge task-specific experts and provide an overall prediction by incorporating the predictions and confidence scores of varying experts. Extensive experiments on seven datasets with three LLMs demonstrate that DELL outperforms state-of-the-art baselines by up to 16.8% in macro f1-score. Further analysis reveals that the generated reactions and explanations are greatly helpful in misinformation detection, while our proposed LLM-guided expert merging helps produce better-calibrated predictions. 2024.findings-acl.155 @@ -8273,7 +8273,7 @@ JingyuZhangJohns Hopkins University HaoranXuJohns Hopkins University BoyuanZhengOhio State University, Columbus - PhilippKoehnJohns Hopkins University + PhilippKoehnJohns Hopkins University DanielKhashabiJohns Hopkins University 2668-2680 As the influence of large language models (LLMs) spans across global communities, their safety challenges in multilingual settings become paramount for alignment research. This paper examines the variations in safety challenges faced by LLMs across different languages and discusses approaches to alleviating such concerns. By comparing how state-of-the-art LLMs respond to the same set of malicious prompts written in higher- vs. lower-resource languages, we observe that (1) LLMs tend to generate unsafe responses much more often when a malicious prompt is written in a lower-resource language, and (2) LLMs tend to generate more irrelevant responses to malicious prompts in lower-resource languages. To understand where the discrepancy can be attributed, we study the effect of instruction tuning with reinforcement learning from human feedback (RLHF) or supervised finetuning (SFT) on the HH-RLHF dataset. Surprisingly, while training with high-resource languages improves model alignment, training in lower-resource languages yields minimal improvement. This suggests that the bottleneck of cross-lingual alignment is rooted in the pretraining stage. Our findings highlight the challenges in cross-lingual LLM safety, and we hope they inform future research in this direction.
@@ -8285,9 +8285,9 @@ Self-Specialization: Uncovering Latent Expertise within Large Language Models JunmoKangGeorgia Institute of Technology HongyinLuoMassachusetts Institute of Technology - YadaZhuIBM Research + YadaZhuIBM Research JacobHansen - JamesGlass + JamesGlass DavidCoxInternational Business Machines AlanRitterGeorgia Institute of Technology RogerioFerisInternational Business Machines @@ -8303,8 +8303,8 @@ FredXuUniversity of California, Los Angeles SongJiangFAIR ZijieHuangUniversity of California, Los Angeles - XiaoLuoUniversity of California, Los Angeles - ShichangZhangHarvard Business School + XiaoLuoUniversity of California, Los Angeles + ShichangZhangHarvard Business School YuanzhouChen, University of California, Los Angeles YizhouSunUniversity of California, Los Angeles 2707-2720 @@ -8315,7 +8315,7 @@ Chain of Logic: Rule-Based Reasoning with Large Language Models - SergioServantez + SergioServantez JoeBarrowPattern Data KristianHammond RajivJainAdobe Systems @@ -8350,11 +8350,11 @@ Simulated Misinformation Susceptibility (<fixed-case>SMISTS</fixed-case>): Enhancing Misinformation Research with Large Language Model Simulations - WeichengMaDartmouth College + WeichengMaDartmouth College ChunyuanDengRice University AramMoossavi LiliWang - SoroushVosoughiDartmouth College + SoroushVosoughiDartmouth College DiyiYangStanford University 2774-2788 Psychological inoculation, a strategy designed to build resistance against persuasive misinformation, has shown efficacy in curbing its spread and mitigating its adverse effects at early stages. Despite its effectiveness, the design and optimization of these inoculations typically demand substantial human and financial resources, primarily due to the need for repeated experimental trials. To address these challenges, this paper introduces Simulated Misinformation Susceptibility Tests (SMISTs), leveraging Large Language Models (LLMs) to simulate participant responses in misinformation studies. SMIST employs a life experience-driven simulation methodology, which accounts for various aspects of participants’ backgrounds, to mitigate common issues of caricatures and stereotypes in LLM simulations and enhance response diversity. Our extensive experimentation demonstrates that SMIST, utilizing GPT-4 as the backend model, yields results that align closely with those obtained from human-subject studies in misinformation susceptibility. This alignment suggests that LLMs can effectively serve as proxies in evaluating the impact of psychological inoculations. Moreover, SMIST offers the critical benefit of being applicable to emerging or anticipated misinformation scenarios without exposing human participants to potentially harmful content. This characteristic of SMIST not only preserves participant safety but also expands the scope of misinformation research to include more sensitive or speculative topics. @@ -8388,8 +8388,8 @@ <fixed-case>MODABS</fixed-case>: Multi-Objective Learning for Dynamic Aspect-Based Summarization - XiaoboGuo - SoroushVosoughiDartmouth College + XiaoboGuo + SoroushVosoughiDartmouth College 2814-2827 The rapid proliferation of online content necessitates effective summarization methods, among which dynamic aspect-based summarization stands out. Unlike its traditional counterpart, which assumes a fixed set of known aspects, this approach adapts to the varied aspects of the input text. We introduce a novel multi-objective learning framework employing a Longformer-Encoder-Decoder for this task. 
The framework optimizes aspect number prediction, minimizes disparity between generated and reference summaries for each aspect, and maximizes dissimilarity across aspect-specific summaries. Extensive experiments show our method significantly outperforms baselines on three diverse datasets, largely due to the effective alignment of generated and reference aspect counts without sacrificing single-aspect summarization quality. 2024.findings-acl.165 @@ -8409,15 +8409,15 @@ Medical Dialogue System: A Survey of Categories, Methods, Evaluation and Challenges XiaomingShiEast China Normal University - ZemingLiu + ZemingLiu LiDu YuxuanWangZhejiang Lab, Zhejiang Lab - HongruWangThe Chinese University of Hong Kong + HongruWangThe Chinese University of Hong Kong YuhangGuo TongRuan - JieXu + JieXu XiaofanZhangShanghai Jiaotong University - ShaotingZhangUniversity of North Carolina at Charlotte + ShaotingZhangUniversity of North Carolina at Charlotte 2840-2861 This paper surveys and organizes research works of medical dialog systems, which is an important yet challenging task. Although these systems have been surveyed in the medical community from an application perspective, a systematic review from a rigorous technical perspective has to date remained noticeably absent. As a result, an overview of the categories, methods, and evaluation of medical dialogue systems remains limited and underspecified, hindering the further improvement of this area. To fill this gap, we investigate an initial pool of 325 papers from well-known computer science, natural language processing conferences and journals, and make an overview. Recently, large language models have shown strong model capacity on downstream tasks, which also reshape medical dialog systems’ foundation. Despite the alluring practical application value, current medical dialogue systems still suffer from problems. To this end, this paper lists grand challenges of medical dialog systems, especially of large language models.
2024.findings-acl.167 @@ -8426,8 +8426,8 @@ Direct Evaluation of Chain-of-Thought in Multi-hop Reasoning with Knowledge Graphs - Minh-VuongNguyen - LinhaoLuo + Minh-VuongNguyen + LinhaoLuo FatemehShiri DinhPhung Yuan-FangLi @@ -8454,9 +8454,9 @@ Self-Supervised Position Debiasing for Large Language Models ZhongkunLiu - ZhengChen - MengqiZhangShandong University - ZhaochunRenLeiden University + ZhengChen + MengqiZhangShandong University + ZhaochunRenLeiden University PengjieRenShandong University ZhuminChenShandong University 2897-2917 @@ -8469,8 +8469,8 @@ <fixed-case>H</fixed-case>yper<fixed-case>CL</fixed-case>: A Contrastive Learning Framework for Hyper-Relational Knowledge Graph Embedding with Hierarchical Ontology YuhuanLu WeijianYu - XinJing - DingqiYangUniversity of Macau + XinJing + DingqiYangUniversity of Macau 2918-2929 2024.findings-acl.171 lu-etal-2024-hypercl @@ -8479,8 +8479,8 @@ Encoding Hierarchical Schema via Concept Flow for Multifaceted Ideology Detection SongtaoLiu - BangWangHuazhong University of Science and Technology - WeiXiangHuazhong University of Science and Technology + BangWangHuazhong University of Science and Technology + WeiXiangHuazhong University of Science and Technology HanXuHuazhong University of Science and Technology MinghuaXuHuazhong University of Science and Technology 2930-2942 @@ -8501,9 +8501,9 @@ <fixed-case>A</fixed-case>lign<fixed-case>RE</fixed-case>: An Encoding and Semantic Alignment Approach for Zero-Shot Relation Extraction - ZehanLiNortheastern University - FuZhangNortheastern University - JingweiChengNortheastern University, China + ZehanLiNortheastern University + FuZhangNortheastern University + JingweiChengNortheastern University, China 2957-2966 Zero-shot Relation Extraction (ZSRE) aims to predict unseen relations between entity pairs from input sentences. Existing prototype-based ZSRE methods encode relation descriptions into prototype embeddings and predict by measuring the similarity between sentence embeddings and prototype embeddings. However, these methods often overlook abundant side information of relations and suffer from a significant encoding gap between prototypes and sentences, limiting performance. To this end, we propose a framework named AlignRE, based on two Alignment methods for ZSRE. Specifically, we present a novel perspective centered on encoding schema alignment to enhance prototype-based ZSRE methods. We utilize well-designed prompt-tuning to bridge the encoding gap. To improve prototype quality, we explore and leverage multiple side information and propose a prototype aggregation method based on semantic alignment to create comprehensive relation prototype representations. We conduct experiments on FewRel and Wiki-ZSL datasets and consistently outperform state-of-the-art methods. Moreover, our method exhibits substantially faster performance and reduces the need for extensive manual labor in prototype construction. Code is available at https://github.com/lizehan1999/AlignRE. 2024.findings-acl.174 @@ -8516,7 +8516,7 @@ DengCaiTencent AI Lab LemaoLiuTencent ShumingShiTencent AI Lab - RuiYanRenmin University of China + RuiYanRenmin University of China 2967-2985 Supervised fine-tuning (SFT) on instruction-following corpus is a crucial approach toward the alignment of large language models (LLMs). However, the performance of LLMs on standard knowledge and reasoning benchmarks tends to suffer from deterioration at the latter stage of the SFT process, echoing the phenomenon of alignment tax. 
Through our pilot study, we put a hypothesis that the data biases are probably one cause behind the phenomenon. To address the issue, we introduce a simple disperse-then-merge framework. To be concrete, we disperse the instruction-following data into portions and then train multiple sub-models using different data portions. Lastly, we merge multiple models into a single one via model merging techniques. Despite its simplicity, our framework outperforms various sophisticated methods such as data curation and training regularization on a series of standard knowledge and reasoning benchmarks. 2024.findings-acl.175 @@ -8541,8 +8541,8 @@ Towards Precise Localization of Critical Errors in Machine Translation - DahyunJungKorea University - SugyeongEoKorea University + DahyunJungKorea University + SugyeongEoKorea University HeuiseokLimKorea University 3000-3012 The advent of large language models has experienced a remarkable improvement in the field of machine translation. However, machine translation is still vulnerable to critical meaning deviations, which may incur catastrophic issues in social or ethical contexts. In particular, existing critical error detection primarily focuses on identifying sentence-level errors, leaving the precise localization of such errors within the sentence unaddressed. In this paper, we introduce a new task, word-level critical error detection (WCED), to detect critical errors at a fine-grained level in machine translation sentences. The task aims to identify the parts of a machine translation that contain catastrophic meaning distortions. We hypothesize that the ability to determine errors at the sentence level will positively influence the detection of more granular errors. We propose a sentence-level error detection module to predict which words in a sentence have critical errors. Experimental results demonstrate that our method outperforms existing methodologies and LLM in En-De, Zh-En, En-Ru, and En-Ko. Our method is helpful for determining the fine-grained location of errors. We hope that such studies will improve the capacity to address critical errors adeptly. @@ -8568,7 +8568,7 @@ Speculative Decoding via Early-exiting for Faster <fixed-case>LLM</fixed-case> Inference with <fixed-case>T</fixed-case>hompson Sampling Control Mechanism JiahaoLiuMeituan - QifanWangMeta AI + QifanWangMeta AI JingangWangMeituan XunliangCai 3027-3043 @@ -8597,8 +8597,8 @@ MingdaoLiu RuiLu BowenWangTsinghua University, Tsinghua University - XiaoLiu - YuxiaoDongTsinghua University + XiaoLiu + YuxiaoDongTsinghua University JieTangTsinghua University, Tsinghua University 3053-3077 Open large language models (LLMs) with great performance in various tasks have significantly advanced the development of LLMs. However, they are far inferior to commercial models such as ChatGPT and GPT-4 when acting as agents to tackle complex tasks in the real world. These agent tasks employ LLMs as the central controller responsible for planning, memorization, and tool utilization, necessitating both fine-grained prompting methods and robust LLMs to achieve satisfactory performance. Though many prompting methods have been proposed to complete particular agent tasks, there is lack of research focusing on improving the agent capabilities of LLMs themselves without compromising their general abilities. In this work, we present AgentTuning, a simple and general method to enhance the agent abilities of LLMs while maintaining their general LLM capabilities. 
We construct AgentInstruct, a lightweight instruction-tuning dataset containing high-quality interaction trajectories. We employ a hybrid instruction-tuning strategy by combining AgentInstruct with open-source instructions from general domains. AgentTuning is used to instruction-tune the Llama 2 series, resulting in AgentLM. Our evaluations show that AgentTuning enables LLMs’ agent capabilities without compromising general abilities. The AgentLM-70B is comparable to GPT-3.5-turbo on unseen agent tasks, demonstrating generalized agent capabilities. We open source the AgentInstruct and AgentLM-7B, 13B, and 70B models at https://anonymous.4open.science/r/AgentTuning, serving open and powerful alternatives to commercial LLMs for agent tasks. @@ -8634,13 +8634,13 @@ A <fixed-case>C</fixed-case>hinese Dataset for Evaluating the Safeguards in Large Language Models YuxiaWang ZenanZhai - HaonanLi - XudongHanUniversity of Melbourne + HaonanLi + XudongHanUniversity of Melbourne ShomLin ZhenxuanZhang AngelaZhao - PreslavNakovMohamed bin Zayed University of Artificial Intelligence - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + PreslavNakovMohamed bin Zayed University of Artificial Intelligence + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne 3106-3119 Many studies have demonstrated that large language models (LLMs) can produce harmful responses, exposing users to unexpected risks. Previous studies have proposed comprehensive taxonomies of LLM risks, as well as corresponding prompts that can be used to examine LLM safety. However, the focus has been almost exclusively on English. We aim to broaden LLM safety research by introducing a dataset for the safety evaluation of Chinese LLMs, and extending it to better identify false negative and false positive examples in terms of risky prompt rejections. We further present a set of fine-grained safety assessment criteria for each risk type, facilitating both manual annotation and automatic evaluation in terms of LLM response harmfulness. Our experiments over five LLMs show that region-specific risks are the prevalent risk type. Warning: this paper contains example data that may be offensive, harmful, or biased. Our data is available at https://github.com/Libr-AI/do-not-answer. 2024.findings-acl.184 @@ -8651,7 +8651,7 @@ <fixed-case>LLMF</fixed-case>actor: Extracting Profitable Factors through Prompts for Explainable Stock Movement Prediction MeiyunWang KiyoshiIzumi - HirokiSakajiHokkaido University + HirokiSakajiHokkaido University 3120-3131 Recently, Large Language Models (LLMs) have attracted significant attention for their exceptional performance across a broad range of tasks, particularly in text analysis. However, the finance sector presents a distinct challenge due to its dependence on time-series data for complex forecasting tasks. In this study, we introduce a novel framework called LLMFactor, which employs Sequential Knowledge-Guided Prompting (SKGP) to identify factors that influence stock movements using LLMs. Unlike previous methods that relied on keyphrases or sentiment analysis, this approach focuses on extracting factors more directly related to stock market dynamics, providing clear explanations for complex temporal changes. Our framework directs the LLMs to create background knowledge through a fill-in-the-blank strategy and then discerns potential factors affecting stock prices from related news. 
Guided by background knowledge and identified factors, we leverage historical stock prices in textual format to predict stock movement. An extensive evaluation of the LLMFactor framework across four benchmark datasets from both the U.S. and Chinese stock markets demonstrates its superiority over existing state-of-the-art methods and its effectiveness in financial time-series forecasting. 2024.findings-acl.185 @@ -8660,7 +8660,7 @@ You Only Look at Screens: Multimodal Chain-of-Action Agents - ZhuoshengZhangShanghai Jiao Tong University + ZhuoshengZhangShanghai Jiao Tong University AstonZhangMeta 3132-3149 Autonomous graphical user interface (GUI) agents aim to facilitate task automation by interacting with the user interface without manual intervention. Recent studies have investigated eliciting the capabilities of large language models (LLMs) for effective engagement in diverse environments. To align with the input-output requirement of LLMs, most existing approaches are developed under a sandbox setting where they rely on external tools and application-specific APIs to parse the environment into textual elements and interpret the predicted actions. Consequently, those approaches often grapple with inference inefficiency and error propagation risks. To mitigate the challenges, we introduce Auto-GUI, a multimodal solution that directly interacts with the interface, bypassing the need for environment parsing or reliance on application-dependent APIs. Moreover, we propose a chain-of-action technique—leveraging a series of intermediate previous action histories and future action plans—to help the agent decide what action to execute. We evaluate our approach on a new device-control benchmark AITW with 30K unique instructions, spanning multi-step tasks such as application operation, web searching, and web shopping. Experimental results show that Auto-GUI achieves state-of-the-art performance with an action type prediction accuracy of 90% and an overall action success rate of 74%. Code is publicly available at https://github.com/cooelf/Auto-GUI. @@ -8685,7 +8685,7 @@ <fixed-case>GENDEX</fixed-case>: Generative Data Augmentation Strategy Leveraging External Data for Abstractive Dialogue Summarization SangwonParkGwangju Institute of Science and Technology - HongseokChoiElectronics and Telecommunications Research Institute + HongseokChoiElectronics and Telecommunications Research Institute DonghaChoiGwangju Institute of Science and Technology HyunjuLeeGwangju Institute of Science and Technology 3171-3185 @@ -8719,7 +8719,7 @@ Refine, Align, and Aggregate: Multi-view Linguistic Features Enhancement for Aspect Sentiment Triplet Extraction GuixinSu - MingminWu + MingminWu ZhongqiangHuang YongchengZhang TongguanWang @@ -8733,8 +8733,8 @@ Pro-Woman, Anti-Man? Identifying Gender Bias in Stance Detection - YingjieLiWestlake University - YueZhangWestlake University + YingjieLiWestlake University + YueZhangWestlake University 3229-3236 Gender bias has been widely observed in NLP models, which has the potential to perpetuate harmful stereotypes and discrimination. In this paper, we construct a dataset GenderStance of 36k samples to measure gender bias in stance detection, determining whether models consistently predict the same stance for a particular gender group. We find that all models are gender-biased and prone to classify sentences that contain male nouns as Against and those with female nouns as Favor. 
Moreover, extensive experiments indicate that sources of gender bias stem from the fine-tuning data and the foundation model itself. We will publicly release our code and dataset. 2024.findings-acl.192 @@ -8744,7 +8744,7 @@ Likelihood-based Mitigation of Evaluation Bias in Large Language Models MasanariOhi - MasahiroKanekoMohamed bin Zayed University of Artificial Intelligence and Tokyo Institute of Technology, Tokyo Institute of Technology + MasahiroKanekoMohamed bin Zayed University of Artificial Intelligence and Tokyo Institute of Technology, Tokyo Institute of Technology RyutoKoike MengsayLoemSansan, Inc. NaoakiOkazakiTokyo Institute of Technology @@ -8785,7 +8785,7 @@ From Role-Play to Drama-Interaction: An <fixed-case>LLM</fixed-case> Solution - WeiqiWu + WeiqiWu HongqiuWu LaiJiang XingyuanLiu @@ -8802,10 +8802,10 @@ JaewooAhnSeoul National University TaehyunLeeSeoul National University JunyoungLimSeoul National University - Jin-HwaKimSeoul National University and NAVER + Jin-HwaKimSeoul National University and NAVER SangdooYunNAVER - HwaranLeeNAVER AI Lab - GunheeKimSeoul National University + HwaranLeeNAVER AI Lab + GunheeKimSeoul National University 3291-3325 While Large Language Models (LLMs) can serve as agents to simulate human behaviors (i.e., role-playing agents), we emphasize the importance of point-in-time role-playing. This situates characters at specific moments in the narrative progression for three main reasons: (i) enhancing users’ narrative immersion, (ii) avoiding spoilers, and (iii) fostering engagement in fandom role-playing. To accurately represent characters at specific time points, agents must avoid character hallucination, where they display knowledge that contradicts their characters’ identities and historical timelines. We introduce TimeChara, a new benchmark designed to evaluate point-in-time character hallucination in role-playing LLMs. Comprising 10,895 instances generated through an automated pipeline, this benchmark reveals significant hallucination issues in current state-of-the-art LLMs (e.g., GPT-4o). To counter this challenge, we propose Narrative-Experts, a method that decomposes the reasoning steps and utilizes narrative experts to reduce point-in-time character hallucinations effectively. Still, our findings with TimeChara highlight the ongoing challenges of point-in-time character hallucination, calling for further study. 2024.findings-acl.197 @@ -8815,11 +8815,11 @@ Red Teaming Visual Language Models MukaiLi - LeiLiUniversity of Hong Kong - YuweiYin + LeiLiUniversity of Hong Kong + YuweiYin MasoodAhmed ZhenguangLiuZhejiang University - QiLiuUniversity of Hong Kong + QiLiuUniversity of Hong Kong 3326-3342 VLMs (Vision-Language Models) extend the capabilities of LLMs (Large Language Models) to accept multimodal inputs. Since it has been verified that LLMs can be induced to generate harmful or inaccurate content through specific test cases (termed as Red Teaming), how VLMs perform in similar scenarios, especially with their combination of textual and visual inputs, remains a question. To explore this problem, we present a novel red teaming dataset RTVLM, which encompasses 12 subtasks (e.g., image misleading, multi-modal jailbreaking, face fairness, etc) under 4 primary aspects (faithfulness, privacy, safety, fairness). Our RTVLM is the first red teaming dataset to benchmark current VLMs in terms of these 4 different aspects. 
Detailed analysis shows that 10 prominent open-sourced VLMs struggle with the red teaming in different degrees and have up to 31% performance gap with GPT-4V. Additionally, we simply apply red teaming alignment to LLaVA-v1.5 with Supervised Fine-tuning (SFT) using RTVLM, and this bolsters the models’ performance with 10% in RTVLM test set, 13% in MM-hallu, and without noticeable decline in MM-Bench, overpassing other LLaVA-based models in similar size with regular alignment data. This reveals that current open-sourced VLMs still lack red teaming alignment. Our code and datasets will be open-sourced. 2024.findings-acl.198 @@ -8832,7 +8832,7 @@ DapengChenHuawei Technologies Ltd. YajingSun RongjunLi - ZhiyongFengTianjin University + ZhiyongFengTianjin University WeiPengHuawei Technologies Ltd. 3343-3353 A Large Language Model (LLM) tends to generate inconsistent and sometimes contradictory outputs when presented with a prompt that has equivalent semantics but is expressed differently from the original prompt. To achieve semantic consistency of an LLM, one of the key approaches is to finetune the model with prompt-output pairs with semantically equivalent meanings. Despite its effectiveness, a data-driven finetuning method incurs substantial computation costs in data preparation and model optimization. In this regime, an LLM is treated as a “black box”, restricting our ability to gain deeper insights into its internal mechanism. In this paper, we are motivated to enhance the semantic consistency of LLMs through a more interpretable method (i.e., model editing) to this end. We first identify the model components (i.e., attention heads) that have a key impact on the semantic consistency of an LLM. We subsequently inject biases into the output of these model components along the semantic-consistency activation direction. It is noteworthy that these modifications are cost-effective, without reliance on mass manipulations of the original model parameters. Through comprehensive experiments on the constructed NLU and open-source NLG datasets, our method demonstrates significant improvements in the semantic consistency and task performance of LLMs. Additionally, our method exhibits promising generalization capabilities by performing well on tasks beyond the primary tasks. @@ -8846,7 +8846,7 @@ SeungHyunKim YoungsooJangLG AI Research MoontaeLeeUniversity of Illinois, Chicago - HongukWoo + HongukWoo 3354-3376 In embodied instruction-following (EIF), the integration of pretrained language models (LMs) as task planners emerges as a significant branch, where tasks are planned at the skill level by prompting LMs with pretrained skills and user instructions. However, grounding these pretrained skills in different domains remains challenging due to their intricate entanglement with the domain-specific knowledge. To address this challenge, we present a semantic skill grounding (SemGro) framework that leverages the hierarchical nature of semantic skills. SemGro recognizes the broad spectrum of these skills, ranging from short-horizon low-semantic skills that are universally applicable across domains to long-horizon rich-semantic skills that are highly specialized and tailored for particular domains. The framework employs an iterative skill decomposition approach, starting from the higher levels of semantic skill hierarchy and then moving downwards, so as to ground each planned skill to an executable level within the target domain. 
To do so, we use the reasoning capabilities of LMs for composing and decomposing semantic skills, as well as their multi-modal extension for assessing the skill feasibility in the target domain. Our experiments in the VirtualHome benchmark show the efficacy of SemGro in 300 cross-domain EIF scenarios. 2024.findings-acl.200 @@ -8857,7 +8857,7 @@ <fixed-case>LIRE</fixed-case>: listwise reward enhancement for preference alignment MingyeZhu YiLiuState Key Laboratory of Communication Content Cognition - LeiZhangUniversity of Science and Technology of China + LeiZhangUniversity of Science and Technology of China JunboGuoPeople’s Daily Online ZhendongMaoUniversity of Science and Technology of China 3377-3394 @@ -8873,7 +8873,7 @@ Seung HwanKimLG AI Research SoonyoungLee BumsooKimLG AI Research - GunheeKimSeoul National University + GunheeKimSeoul National University 3395-3405 3D dense captioning is a task to localize objects in a 3D scene and generate descriptive sentences for each object. Recent approaches in 3D dense captioning have adopted transformer encoder-decoder frameworks from object detection to build an end-to-end pipeline without hand-crafted components. However, these approaches struggle with contradicting objectives where a single query attention has to simultaneously view both the tightly localized object regions and contextual environment. To overcome this challenge, we introduce SIA (See-It-All), a transformer pipeline that engages in 3D dense captioning with a novel paradigm called late aggregation. SIA simultaneously decodes two sets of queries—context query and instance query. The instance query focuses on localization and object attribute descriptions, while the context query versatilely captures the region-of-interest of relationships between multiple objects or with the global scene, then aggregated afterwards (i.e., late aggregation) via simple distance-based measures. To further enhance the quality of contextualized caption generation, we design a novel aggregator to generate a fully informed caption based on the surrounding context, the global environment, and object instances. Extensive experiments on two of the most widely-used 3D dense captioning datasets demonstrate that our proposed method achieves a significant improvement over prior methods. 2024.findings-acl.202 @@ -8883,7 +8883,7 @@ <tex-math>\texttt{DARA}</tex-math>: Decomposition-Alignment-Reasoning Autonomous Language Agent for Question Answering over Knowledge Graphs HaishuoFangTechnische Universität Darmstadt - XiaodanZhuQueen’s University + XiaodanZhuQueen’s University IrynaGurevychMohamed bin Zayed University of Artificial Intelligence and Technical University of Darmstadt 3406-3432 Answering Questions over Knowledge Graphs (KGQA) is key to well-functioning autonomous language agents in various real-life applications. To improve the neural-symbolic reasoning capabilities of language agents powered by Large Language Models (LLMs) in KGQA, we propose the Decomposition-Alignment-Reasoning Agent (DARA) framework. DARA effectively parses questions into formal queries through a dual mechanism: high-level iterative task decomposition and low-level task grounding. Importantly, DARA can be efficiently trained with a small number of high-quality reasoning trajectories. Our experimental results demonstrate that DARA fine-tuned on LLMs (e.g. 
Llama-2-7B, Mistral) outperforms both in-context learning-based agents with GPT-4 and alternative fine-tuned agents, across different benchmarks, making such models more accessible for real-life applications. We also show that DARA attains performance comparable to state-of-the-art enumerating-and-ranking-based methods for KGQA. @@ -8905,7 +8905,7 @@ Compositional Generalization with Grounded Language Models SondreWold - ÉtienneSimon + ÉtienneSimon LucasCharpentierUniversity of Oslo EgorKostylevUniversity of Oslo, Norway ErikVelldalUniversity of Oslo @@ -8919,10 +8919,10 @@ Rethinking Negative Instances for Generative Named Entity Recognition YuyangDing - JuntaoLiSoochow University, China + JuntaoLiSoochow University, China PinzhengWang - ZechengTangSoochow University - YanBowen + ZechengTangSoochow University + YanBowen MinZhangHarbin Institute of Technology, Shenzhen 3461-3475 Large Language Models (LLMs) have demonstrated impressive capabilities for generalizing in unseen tasks. In the Named Entity Recognition (NER) task, recent advancements have seen the remarkable improvement of LLMs in a broad range of entity domains via instruction tuning, by adopting entity-centric schema. In this work, we explore the potential enhancement of the existing methods by incorporating negative instances into training. Our experiments reveal that negative instances contribute to remarkable improvements by (1) introducing contextual information, and (2) clearly delineating label boundaries. Furthermore, we introduce an efficient longest common subsequence (LCS) matching algorithm, which is tailored to transform unstructured predictions into structured entities. By integrating these components, we present GNER, a Generative NER system that shows improved zero-shot performance across unseen entity domains. Our comprehensive evaluation illustrates our system’s superiority, surpassing state-of-the-art (SoTA) methods by 9 F_1 score in zero-shot evaluation. @@ -8970,9 +8970,9 @@ How Much Does Nonverbal Communication Conform to Entropy Rate Constancy?: A Case Study on Listener Gaze in Interaction YuWangUniversität Bielefeld - YangXuSouthern University of Science and Technology - GabrielSkantzeKTH Royal Institute of Technology, Stockholm, Sweden - HendrikBuschmeierUniversität Bielefeld + YangXuSouthern University of Science and Technology + GabrielSkantzeKTH Royal Institute of Technology, Stockholm, Sweden + HendrikBuschmeierUniversität Bielefeld 3533-3545 According to the Entropy Rate Constancy (ERC) principle, the information density of a text is approximately constant over its length. Whether this principle also applies to nonverbal communication signals is still under investigation. We perform empirical analyses of video-recorded dialogue data and investigate whether listener gaze, as an important nonverbal communication signal, adheres to the ERC principle. Results show (1) that the ERC principle holds for listener gaze; and (2) that the two linguistic factors syntactic complexity and turn transition potential are weakly correlated with local entropy of listener gaze. 
2024.findings-acl.210


@@ -9013,12 +9013,12 @@
      Measuring Bargaining Abilities of <fixed-case>LLM</fixed-case>s: A Benchmark and A Buyer-Enhancement Method
      TianXia
-      ZhiweiHeShanghai Jiao Tong University
+      ZhiweiHeShanghai Jiao Tong University
      TongRen
      YiboMiao
-      ZhuoshengZhangShanghai Jiao Tong University
+      ZhuoshengZhangShanghai Jiao Tong University
      YangYang
-      RuiWangShanghai Jiao Tong University
+      RuiWangShanghai Jiao Tong University
      3579-3602
      Bargaining is an important and unique part of negotiation between humans. As LLM-driven agents learn to negotiate and act like real humans, how to evaluate agents’ bargaining abilities remains an open problem. For the first time, we formally described the Bargaining task as an asymmetric incomplete information game, defining the gains of the Buyer and Seller in multiple bargaining processes. It allows us to quantitatively assess an agent’s performance in the Bargain task. We collected a real product price dataset, AmazonHistoryPrice, and conducted evaluations of various LLM agents’ bargaining abilities. We find that playing a Buyer is much harder than a Seller, and increasing model size cannot effectively improve the Buyer’s performance. To address the challenge, we propose a novel approach called OG-Narrator that integrates a deterministic Offer Generator to control the price range of Buyer’s offers, and an LLM Narrator to create natural language sentences for generated offers. Experimental results show that OG-Narrator improves the buyer’s deal rates from 26.67% to 88.88% and brings a tenfold increase in profits on all baselines, even a model that has not been aligned.
      2024.findings-acl.213


@@ -9041,7 +9041,7 @@
      XuanmingZhang
      YuqiZhu
      YihongDongPeking University
-      ZhiJinPeking University and Peking University
+      ZhiJinPeking University and Peking University
      BinhuaLi
      FeiHuangAlibaba Group
      YongbinLiAlibaba Group
@@ -9070,7 +9070,7 @@
      Aligning Speech Segments Beyond Pure Semantics
      KevinHeffernanFacebook
      ArtyomKozhevnikov
-      LoicBarrault
+      LoicBarrault
      AlexandreMourachkoResearch, Facebook
      HolgerSchwenk
      3626-3635
@@ -9088,7 +9088,7 @@
      YicongLi
      Jay ZhangjieWuNational University of Singapore
      Cong-DuyNguyenSchool of Computer Science and Engineering, Nanyang Technological University
-      See-KiongNgNational University of Singapore
+      See-KiongNgNational University of Singapore
      Anh TuanLuuNanyang Technological University
      3636-3657
      Humans use multiple senses to comprehend the environment. Vision and language are two of the most vital senses since they allow us to easily communicate our thoughts and perceive the world around us. There has been a lot of interest in creating video-language understanding systems with human-like senses since a video-language pair can mimic both our linguistic medium and visual environment with temporal dynamics. In this survey, we review the key tasks of these systems and highlight the associated challenges. Based on the challenges, we summarize their methods from model architecture, model training, and data perspectives. We also conduct a performance comparison among the methods and discuss promising directions for future research.
@@ -9111,12 +9111,12 @@ A + <fixed-case>B</fixed-case>: A General Generator-Reader Framework for Optimizing <fixed-case>LLM</fixed-case>s to Unleash Synergy Potential - WeiTang + WeiTang YixinCaoFudan University JiahaoYing - BoWangSchool of Computer Science & Technology, Beijing Institute of Technology + BoWangSchool of Computer Science & Technology, Beijing Institute of Technology YuyueZhao - YongLiaoUniversity of Science and Technology of China and China Academic of Electronics and Information Technology + YongLiaoUniversity of Science and Technology of China and China Academic of Electronics and Information Technology PengZhouAarhus University 3670-3685 Retrieval-Augmented Generation (RAG) is an effective solution to supplement necessary knowledge to large language models (LLMs). Targeting its bottleneck of retriever performance, “generate-then-read” pipeline is proposed to replace the retrieval stage with generation from the LLM itself. Although promising, this research direction is underexplored and still cannot work in the scenario when source knowledge is given. In this paper, we formalize a general “A + B” framework with varying combinations of foundation models and types for systematic investigation. We explore the efficacy of the base and chat versions of LLMs and found their different functionalities suitable for generator A and reader B, respectively. Their combinations consistently outperform single models, especially in complex scenarios. Furthermore, we extend the application of the “A + B” framework to scenarios involving source documents through continuous learning, enabling the direct integration of external knowledge into LLMs. This approach not only facilitates effective acquisition of new knowledge but also addresses the challenges of safety and helpfulness post-adaptation. The paper underscores the versatility of the “A + B” framework, demonstrating its potential to enhance the practical application of LLMs across various domains. @@ -9137,7 +9137,7 @@ Adversarial Preference Optimization: Enhancing Your Alignment via <fixed-case>RM</fixed-case>-<fixed-case>LLM</fixed-case> Game - PengyuChengTencent + PengyuChengTencent YifanYangTencent AI Lab JianLiTencent YongDaiTencent AI Lab @@ -9159,7 +9159,7 @@ ChenweiZhangUniversity of Hong Kong ZhechaoZhu ZehaiZhou - XiangjieKong + XiangjieKong 3717-3726 Aspect sentiment quad prediction (ASQP) has garnered significant attention in aspect-based sentiment analysis (ABSA). Current ASQP research primarily relies on pre-trained generative language models to produce templated sequences, often complemented by grid-based auxiliary methods. Despite these efforts, the persistent challenge of generation instability remains unresolved and the effectiveness of grid methods remains underexplored in current studies. To this end, we introduce Grid Noise Diffusion Pinpoint Network (GDP), a T5-based generative model aiming to tackle the issue of generation instability. The model consists of three novel modules, including Diffusion Vague Learning (DVL) to facilitate effective model learning and enhance overall robustness; Consistency Likelihood Learning (CLL) to discern the characteristics and commonalities of sentiment elements and thus reduce the impact of distributed noise; and GDP-FOR, a novel generation template, to enable models to generate outputs in a more natural way. Extensive experiments on four datasets demonstrate the remarkable effectiveness of our approach in addressing ASQP tasks. 
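[Editorial sketch] The GDP entry above has a generative model emit sentiment quads as templated sequences that are then parsed back into structure. A small sketch of such a parsing step; the [A]/[C]/[O]/[S] template below is a hypothetical stand-in for illustration, not the paper's GDP-FOR format.

"""Minimal parser for templated ASQP output of the kind the GDP
abstract describes. The tag-based template is an assumption."""

import re
from typing import List, Tuple

QUAD_PATTERN = re.compile(
    r"\[A\]\s*(?P<aspect>.+?)\s*"
    r"\[C\]\s*(?P<category>.+?)\s*"
    r"\[O\]\s*(?P<opinion>.+?)\s*"
    r"\[S\]\s*(?P<sentiment>positive|negative|neutral)"
)


def parse_quads(generated: str) -> List[Tuple[str, str, str, str]]:
    """Extract (aspect, category, opinion, sentiment) quads; malformed spans are skipped."""
    return [
        (m["aspect"], m["category"], m["opinion"], m["sentiment"])
        for m in QUAD_PATTERN.finditer(generated)
    ]


if __name__ == "__main__":
    out = ("[A] battery life [C] laptop#battery [O] lasts forever [S] positive ; "
           "[A] keyboard [C] laptop#design [O] feels mushy [S] negative")
    for quad in parse_quads(out):
        print(quad)

Skipping malformed spans rather than failing is one simple way to absorb the generation instability the abstract targets; the paper's diffusion-based modules address it at training time instead.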
2024.findings-acl.222 @@ -9170,9 +9170,9 @@ Continual Contrastive Spoken Language Understanding UmbertoCappellazzo EnricoFiniApple - MuqiaoYang + MuqiaoYang DanieleFalavignaFondazione Bruno Kessler - AlessioBruttiFondazione Bruno Kessler + AlessioBruttiFondazione Bruno Kessler BhikshaRajCarnegie Mellon University, Carnegie Mellon University and Mohamed bin Zayed University of Artificial Intelligence 3727-3741 Recently, neural networks have shown impressive progress across diverse fields, with speech processing being no exception. However, recent breakthroughs in this area require extensive offline training using large datasets and tremendous computing resources. Unfortunately, these models struggle to retain their previously acquired knowledge when learning new tasks continually. In this paper, we investigate the problem of learning sequence-to-sequence models for spoken language understanding in a class-incremental learning (CIL) setting and we propose COCONUT, a CIL method that relies on the combination of experience replay and contrastive learning. Through a modified version of the standard supervised contrastive loss, COCONUT preserves the learned representations by pulling closer samples from the same class and pushing away the others. Moreover, we leverage a multimodal contrastive loss that helps the model learn more discriminative representations of the new data by aligning audio and text features. We also investigate different contrastive designs to combine the strengths of the contrastive loss with teacher-student architectures used for distillation. Experiments on two established SLU datasets reveal the effectiveness of our proposed approach and significant improvements over the baselines. We also show that COCONUT can be combined with methods that operate on the decoder side of the model, resulting in further metrics improvements. @@ -9185,7 +9185,7 @@ KaiWang YuweiXu ZhiyongWuShanghai Artificial Intelligence Laboratory - SiqiangLuoNanyang Technological University + SiqiangLuoNanyang Technological University 3742-3759 Knowledge Graph (KG) inductive reasoning, which aims to infer missing facts from new KGs that are not seen during training, has been widely adopted in various applications. One critical challenge of KG inductive reasoning is handling low-resource scenarios with scarcity in both textual and structural aspects. In this paper, we attempt to address this challenge with Large Language Models (LLMs). Particularly, we utilize the state-of-the-art LLMs to generate a graph-structural prompt to enhance the pre-trained Graph Neural Networks (GNNs), which brings us new methodological insights into the KG inductive reasoning methods, as well as high generalizability in practice. On the methodological side, we introduce a novel pretraining and prompting framework ProLINK, designed for low-resource inductive reasoning across arbitrary KGs without requiring additional training. On the practical side, we experimentally evaluate our approach on 36 low-resource KG datasets and find that ProLINK outperforms previous methods in three-shot, one-shot, and zero-shot reasoning tasks, exhibiting average performance improvements by 20%, 45%, and 147%, respectively. Furthermore, ProLINK demonstrates strong robustness for various LLM promptings as well as full-shot scenarios. 
2024.findings-acl.224 @@ -9195,8 +9195,8 @@ Unsupervised Parsing by Searching for Frequent Word Sequences among Sentences with Equivalent Predicate-Argument Structures JunjieChenthe University of Tokyo - XianghengHe - DanushkaBollegalaAmazon and University of Liverpool + XianghengHe + DanushkaBollegalaAmazon and University of Liverpool YusukeMiyaoThe University of Tokyo 3760-3772 Unsupervised constituency parsing focuses on identifying word sequences that form a syntactic unit (i.e., constituents) in target sentences. Linguists identify the constituent by evaluating a set of Predicate-Argument Structure (PAS) equivalent sentences where we find the constituent appears more frequently than non-constituents (i.e., the constituent corresponds to a frequent word sequence within the sentence set). However, such frequency information is unavailable in previous parsing methods that identify the constituent by observing sentences with diverse PAS. In this study, we empirically show that constituents correspond to frequent word sequences in the PAS-equivalent sentence set. We propose a frequency-based parser, span-overlap, that (1) computes the span-overlap score as the word sequence’s frequency in the PAS-equivalent sentence set and (2) identifies the constituent structure by finding a constituent tree with the maximum span-overlap score. The parser achieves state-of-the-art level parsing accuracy, outperforming existing unsupervised parsers in eight out of ten languages. Additionally, we discover a multilingual phenomenon: participant-denoting constituents tend to have higher span-overlap scores than equal-length event-denoting constituents, meaning that the former tend to appear more frequently in the PAS-equivalent sentence set than the latter. The phenomenon indicates a statistical difference between the two constituent types, laying the foundation for future labeled unsupervised parsing research. @@ -9208,9 +9208,9 @@ Data-Centric Explainable Debiasing for Improving Fairness in Pre-trained Language Models YingjiLiJilin University MengnanDuNew Jersey Institute of Technology - RuiSongJilin University - XinWangJilin University - YingWangJilin University + RuiSongJilin University + XinWangJilin University + YingWangJilin University 3773-3786 Human-like social bias of pre-trained language models (PLMs) on downstream tasks have attracted increasing attention. The potential flaws in the training data are the main factor that causes unfairness in PLMs. Existing data-centric debiasing strategies mainly leverage explicit bias words (defined as sensitive attribute words specific to demographic groups) for counterfactual data augmentation to balance the training data. However, they lack consideration of implicit bias words potentially associated with explicit bias words in complex distribution data, which indirectly harms the fairness of PLMs. To this end, we propose a **Data**-Centric **Debias**ing method (named Data-Debias), which uses an explainability method to search for implicit bias words to assist in debiasing PLMs. Specifically, we compute the feature attributions of all tokens using the Integrated Gradients method, and then treat the tokens that have a large impact on the model’s decision as implicit bias words. To make the search results more precise, we iteratively train a biased model to amplify the bias with each iteration. Finally, we use the implicit bias words searched in the last iteration to assist in debiasing PLMs. 
Extensive experimental results on multiple PLMs debiasing on three different classification tasks demonstrate that Data-Debias achieves state-of-the-art debiasing performance and strong generalization while maintaining predictive abilities. 2024.findings-acl.226 @@ -9220,7 +9220,7 @@ Knowledge-Driven Cross-Document Relation Extraction MonikaJainIndraprastha Institute of Information Technology, Delhi - RaghavaMutharajuIndraprastha Institute of Information Technology, Delhi, India + RaghavaMutharajuIndraprastha Institute of Information Technology, Delhi, India KuldeepSinghCerence GmbH RamakanthKavuluruUniversity of Kentucky 3787-3797 @@ -9241,9 +9241,9 @@ <fixed-case>KG</fixed-case>-Adapter: Enabling Knowledge Graph Integration in Large Language Models through Parameter-Efficient Fine-Tuning - ShiyuTian + ShiyuTian YangyangLuoAlibaba Group - TianzeXuBeijing University of Posts and Telecommunications + TianzeXuBeijing University of Posts and Telecommunications CaixiaYuan HuixingJiangLi Auto ChenWei @@ -9261,7 +9261,7 @@ PengliLiukuaishou QingyangLi YanGong - JunchenWan + JunchenWan FuzhengZhang ZhongyuanWangKuaishou Inc. and Kuaishou DiZhangKuaishou Technology @@ -9288,11 +9288,11 @@ Improving In-Context Learning with Prediction Feedback for Sentiment Analysis HonglingXuHarbin Institute of Technology - QianlongWang + QianlongWang YiceZhang MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences XiZeng - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology RuifengXuHarbin Institute of Technology 3879-3890 Large language models (LLMs) have achieved promising results in sentiment analysis through the in-context learning (ICL) paradigm. However, their ability to distinguish subtle sentiments still remains a challenge. Inspired by the human ability to adjust understanding via feedback, this paper enhances ICL by incorporating prior predictions and feedback, aiming to rectify sentiment misinterpretation of LLMs. Specifically, the proposed framework consists of three steps: (1) acquiring prior predictions of LLMs, (2) devising predictive feedback based on correctness, and (3) leveraging a feedback-driven prompt to refine sentiment understanding. Experimental results across nine sentiment analysis datasets demonstrate the superiority of our framework over conventional ICL methods, with an average F1 improvement of 5.95%. @@ -9304,10 +9304,10 @@ Can Large Language Models Mine Interpretable Financial Factors More Effectively? A Neural-Symbolic Factor Mining Agent Model ZhiweiLiRenmin University of China RanSongKunmimg University of Science and Technology - CaihongSunRenmin University of China + CaihongSunRenmin University of China WeiXu - ZhengtaoYuKunming University of Science and Technology - Ji-RongWenRenmin University of China + ZhengtaoYuKunming University of Science and Technology + Ji-RongWenRenmin University of China 3891-3902 Finding interpretable factors for stock returns is the most vital issue in the empirical asset pricing domain. As data-driven methods, existing factor mining models can be categorized into symbol-based and neural-based models. Symbol-based models are interpretable but inefficient, while neural-based approaches are efficient but lack interpretability. Hence, mining interpretable factors effectively presents a significant challenge. 
Inspired by the success of Large Language Models (LLMs) in various tasks, we propose a FActor Mining Agent (FAMA) model that enables LLMs to integrate the strengths of both neural and symbolic models for factor mining. In this paper, FAMA consists of two main components: Cross-Sample Selection (CSS) and Chain-of-Experience (CoE). CSS addresses the homogeneity challenges in LLMs during factor mining by assimilating diverse factors as in-context samples, whereas CoE enables LLMs to leverage past successful mining experiences, expediting the mining of effective factors. Experimental evaluations on real-world stock market data demonstrate the effectiveness of our approach by surpassing the SOTA RankIC by 0.006 and RankICIR by 0.105 in predicting S&P 500 returns. Furthermore, the investment simulation shows that our model can achieve superior performance with an annualized return of 38.4% and a Sharpe ratio of 667.2%.
      2024.findings-acl.233


@@ -9331,12 +9331,12 @@
      <fixed-case>SALAD</fixed-case>-Bench: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models
      LijunLiShanghai Artificial Intelligence Laboratory
-      BowenDong
+      BowenDong
      RuohuiWang
      XuhaoHu
      WangmengZuoHarbin Institute of Technology
      DahuaLinThe Chinese University of Hong Kong
-      YuQiao
+      YuQiao
      JingShaoShanghai AI Laboratory
      3923-3954
      In the rapidly evolving landscape of Large Language Models (LLMs), ensuring robust safety measures is paramount. To meet this crucial need, we propose SALAD-Bench, a safety benchmark specifically designed for evaluating LLMs, attack, and defense methods. Distinguished by its breadth, SALAD-Bench transcends conventional benchmarks through its large scale, rich diversity, intricate taxonomy spanning three levels, and versatile functionalities. SALAD-Bench is crafted with a meticulous array of questions, from standard queries to complex ones enriched with attack, defense modifications and multiple-choice. To effectively manage the inherent complexity, we introduce an innovative evaluator: the LLM-based MD-Judge for QA pairs with a particular focus on attack-enhanced queries, ensuring a seamless and reliable evaluation. The above components extend SALAD-Bench from standard LLM safety evaluation to both LLM attack and defense methods evaluation, ensuring the joint-purpose utility. Our extensive experiments shed light on the resilience of LLMs against emerging threats and the efficacy of contemporary defense tactics. Data and evaluator are released under https://github.com/OpenSafetyLab/SALAD-BENCH


@@ -9348,9 +9348,9 @@
      Extracting and Encoding: Leveraging Large Language Models and Medical Knowledge to Enhance Radiological Text Representation
      PabloMessina
      ReneVidalUniversity of Pennsylvania and Amazon
-      DenisParraPontificia Universidad Catolica de Chile
+      DenisParraPontificia Universidad Catolica de Chile
      AlvaroSoto
-      VladimirAraujoKU Leuven
+      VladimirAraujoKU Leuven
      3955-3986
      Advancing representation learning in specialized fields like medicine remains challenging due to the scarcity of expert annotations for text and images. To tackle this issue, we present a novel two-stage framework designed to extract high-quality factual statements from free-text radiology reports in order to improve the representations of text encoders and, consequently, their performance on various downstream tasks. In the first stage, we propose a Fact Extractor that leverages large language models (LLMs) to identify factual statements from well-curated domain-specific datasets.
In the second stage, we introduce a Fact Encoder (CXRFE) based on a BERT model fine-tuned with objective functions designed to improve its representations using the extracted factual data. Our framework also includes a new embedding-based metric (CXRFEScore) for evaluating chest X-ray text generation systems, leveraging both stages of our approach. Extensive evaluations show that our fact extractor and encoder outperform current state-of-the-art methods in tasks such as sentence ranking, natural language inference, and label extraction from radiology reports. Additionally, our metric proves to be more robust and effective than existing metrics commonly used in the radiology report generation literature. The code of this project is available at https://github.com/PabloMessina/CXR-Fact-Encoder. 2024.findings-acl.236 @@ -9360,8 +9360,8 @@ <fixed-case>GNN</fixed-case>avi: Navigating the Information Flow in Large Language Models by Graph Neural Network ShuzhouYuan - ErcongNie - MichaelFärberTechnische Universität Dresden + ErcongNie + MichaelFärberTechnische Universität Dresden HelmutSchmidCenter for Information and Language Processing HinrichSchuetze 3987-4001 @@ -9372,12 +9372,12 @@ <fixed-case>M</fixed-case>-<fixed-case>QALM</fixed-case>: A Benchmark to Assess Clinical Reading Comprehension and Knowledge Recall in Large Language Models via Question Answering - AnandSubramanian + AnandSubramanian ViktorSchlegelImperial College London AbhinavRamesh Kashyap Thanh-TungNguyenasus Vijay PrakashDwivedi - StefanWinklerNational University of Singapore + StefanWinklerNational University of Singapore 4002-4042 There is vivid research on adapting Large Language Models (LLMs) to perform a variety of tasks in high-stakes domains such as healthcare. Despite their popularity, there is a lack of understanding of the extent and contributing factors that allow LLMs to recall relevant knowledge and combine it with presented information in the clinical and biomedical domain: a fundamental pre-requisite for success on down-stream tasks.Addressing this gap, we use Multiple Choice and Abstractive Question Answering to conduct a large-scale empirical study on 22 datasets in three generalist and three specialist biomedical sub-domains. Our multifaceted analysis of the performance of 15 LLMs, further broken down by sub-domain, source of knowledge and model architecture, uncovers success factors such as instruction tuning that lead to improved recall and comprehension. We further show that while recently proposed domain-adapted models may lack adequate knowledge, directly fine-tuning on our collected medical knowledge datasets shows encouraging results, even generalising to unseen specialist sub-domains. We complement the quantitative results with a skill-oriented manual error analysis, which reveals a significant gap between the models’ capabilities to simply recall necessary knowledge and to integrate it with the presented context.To foster research and collaboration in this field we share M-QALM, our resources, standardised methodology, and evaluation results, with the research community to facilitate further advancements in clinical knowledge representation learning within language models. 
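[Editorial sketch] M-QALM above reports multiple-choice accuracy broken down by sub-domain. A toy scorer in that spirit; the record fields and the answer-letter heuristic are assumptions for illustration, not the benchmark's schema or official scorer.

"""Toy multiple-choice scorer with per-domain breakdown."""

from collections import defaultdict
from typing import Optional


def first_option_letter(text: str, options: str = "ABCDE") -> Optional[str]:
    """Heuristic: first standalone option letter found in the model output."""
    for token in text.replace(".", " ").replace(")", " ").split():
        if len(token) == 1 and token.upper() in options:
            return token.upper()
    return None


def accuracy_by_domain(records):
    """records: iterable of dicts with 'domain', 'gold', 'prediction' keys."""
    hits, totals = defaultdict(int), defaultdict(int)
    for rec in records:
        totals[rec["domain"]] += 1
        if first_option_letter(rec["prediction"]) == rec["gold"]:
            hits[rec["domain"]] += 1
    return {domain: hits[domain] / totals[domain] for domain in totals}


if __name__ == "__main__":
    demo = [
        {"domain": "clinical", "gold": "B", "prediction": "B) metformin"},
        {"domain": "clinical", "gold": "A", "prediction": "The answer is C."},
        {"domain": "biomedical", "gold": "D", "prediction": "D"},
    ]
    print(accuracy_by_domain(demo))  # {'clinical': 0.5, 'biomedical': 1.0}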
2024.findings-acl.238 @@ -9387,7 +9387,7 @@ <fixed-case>M</fixed-case>ovie<fixed-case>S</fixed-case>um: An Abstractive Summarization Dataset for Movie Screenplays RohitSaxenaUniversity of Edinburgh, University of Edinburgh - FrankKellerUniversity of Edinburgh + FrankKellerUniversity of Edinburgh 4043-4050 Movie screenplay summarization is challenging, as it requires an understanding of long input contexts and various elements unique to movies. Large language models have shown significant advancements in document summarization, but they often struggle with processing long input contexts. Furthermore, while television transcripts have received attention in recent studies, movie screenplay summarization remains underexplored. To stimulate research in this area, we present a new dataset, MovieSum, for abstractive summarization of movie screenplays. This dataset comprises 2200 movie screenplays accompanied by their Wikipedia plot summaries. We manually formatted the movie screenplays to represent their structural elements. Compared to existing datasets, MovieSum possesses several distinctive features: 1) It includes movie screenplays which are longer than scripts of TV episodes. 2) It is twice the size of previous movie screenplay datasets. 3) It provides metadata with IMDb IDs to facilitate access to additional external knowledge. We also show the results of recently released large language models applied to summarization on our dataset to provide a detailed baseline. 2024.findings-acl.239 @@ -9396,18 +9396,18 @@ Autonomous Workflow for Multimodal Fine-Grained Training Assistants Towards Mixed Reality - JiahuanPeiCentrum voor Wiskunde en Informatica - IreneViola + JiahuanPeiCentrum voor Wiskunde en Informatica + IreneViola HaochenHuang - JunxiaoWangKing Abdullah University of Science and Technology + JunxiaoWangKing Abdullah University of Science and Technology MoonisaAhsan FanghuaYe JiangYiming YaoSai - DiWangKAUST + DiWangKAUST ZhuminChenShandong University PengjieRenShandong University - PabloCesarDelft University of Technology and Centrum Wiskunde & Informatica (CWI) + PabloCesarDelft University of Technology and Centrum Wiskunde & Informatica (CWI) 4051-4066 Autonomous artificial intelligence (AI) agents have emerged as promising protocols for automatically understanding the language-based environment, particularly with the exponential development of large language models (LLMs). However, a fine-grained, comprehensive understanding of multimodal environments remains under-explored. This work designs an autonomous workflow tailored for integrating AI agents seamlessly into extended reality (XR) applications for fine-grained training. We present a demonstration of a multimodal fine-grained training assistant for LEGO brick assembly in a pilot XR environment. Specifically, we design a cerebral language agent that integrates LLM with memory, planning, and interaction with XR tools and a vision-language agent, enabling agents to decide their actions based on past experiences. Furthermore, we introduce LEGO-MRTA, a multimodal fine-grained assembly dialogue dataset synthesized automatically in the workflow served by a commercial LLM. This dataset comprises multimodal instruction manuals, conversations, XR responses, and vision question answering. Last, we present several prevailing open-resource LLMs as benchmarks, assessing their performance with and without fine-tuning on the proposed dataset. 
We anticipate that the broader impact of this workflow will advance the development of smarter assistants for seamless user interaction in XR environments, fostering research in both AI and HCI communities.
      2024.findings-acl.240


@@ -9431,7 +9431,7 @@
      AbhaySheshadri
      VictorLevoso
      PaulSwobodaHeinrich-Heine University Düsseldorf
-      ChristianBarteltUniversität Mannheim
+      ChristianBarteltUniversität Mannheim
      4082-4102
      Transformers demonstrate impressive performance on a range of reasoning benchmarks. To evaluate the degree to which these abilities are a result of actual reasoning, existing work has focused on developing sophisticated benchmarks for behavioral studies. However, these studies do not provide insights into the internal mechanisms driving the observed capabilities. To improve our understanding of the internal mechanisms of transformers, we present a comprehensive mechanistic analysis of a transformer trained on a synthetic reasoning task. We identify a set of interpretable mechanisms the model uses to solve the task, and validate our findings using correlational and causal evidence. Our results suggest that it implements a depth-bounded recurrent mechanism that operates in parallel and stores intermediate results in selected token positions. We anticipate that the motifs we identified in our synthetic setting can provide valuable insights into the broader operating principles of transformers and thus provide a basis for understanding more complex models.
      2024.findings-acl.242


@@ -9441,11 +9441,11 @@
      Optimal Transport Guided Correlation Assignment for Multimodal Entity Linking
      ZefengZhang
-      JiaweiShengInstitute of Information Engineering, Chinese Academy of Sciences
+      JiaweiShengInstitute of Information Engineering, Chinese Academy of Sciences
      ZhangChuang
-      LiangyunzhiLiangyunzhi
-      WenyuanZhang
-      SiqiWang
+      LiangyunzhiLiangyunzhi
+      WenyuanZhang
+      SiqiWang
      TingwenLiuInstitute of Information Engineering, Chinese Academy of Sciences
      4103-4117
      Multimodal entity linking (MEL) aims to link ambiguous mentions in multimodal contexts to entities in a multimodal knowledge graph. A pivotal challenge is to fully leverage multi-element correlations between mentions and entities to bridge the modality gap and enable fine-grained semantic matching. Existing methods attempt several local correlative mechanisms, relying heavily on the automatically learned attention weights, which may over-concentrate on partial correlations. To mitigate this issue, we formulate the correlation assignment problem as an optimal transport (OT) problem, and propose a novel MEL framework, namely OT-MEL, with OT-guided correlation assignment. Thereby, we exploit the correlation between multimodal features to enhance multimodal fusion, and the correlation between mentions and entities to enhance fine-grained matching. To accelerate model prediction, we further leverage knowledge distillation to transfer OT assignment knowledge to the attention mechanism. Experimental results show that our model significantly outperforms previous state-of-the-art baselines and confirm the effectiveness of the OT-guided correlation assignment.


@@ -9456,7 +9456,7 @@
      On Efficiently Representing Regular Languages as <fixed-case>RNN</fixed-case>s
      AnejSveteDepartment of Computer Science, ETHZ - ETH Zurich
-      RobinChan
+      RobinChan
      RyanCotterellSwiss Federal Institute of Technology
      4118-4135
      Recent work by Hewitt et al. (2020) provides an interpretation of the empirical success of recurrent neural networks (RNNs) as language models (LMs).
It shows that RNNs can efficiently represent bounded hierarchical structures that are prevalent in human language. This suggests that RNNs’ success might be linked to their ability to model hierarchy. However, a closer inspection of Hewitt et al.’s (2020) construction shows that it is not inherently limited to hierarchical structures. This poses a natural question: What other classes of LMs can RNNs efficiently represent? To this end, we generalize Hewitt et al.’s (2020) construction and show that RNNs can efficiently represent a larger class of LMs than previously claimed—specifically, those that can be represented by a pushdown automaton with a bounded stack and a specific stack update function. Altogether, the efficiency of representing this diverse class of LMs with RNN LMs suggests novel interpretations of their inductive bias.


@@ -9469,7 +9469,7 @@
      InesReinig
      MariaBeckerRuprecht-Karls-Universität Heidelberg
      InesRehbeinUniversität Mannheim
-      SimonePonzettoUniversity of Mannheim
+      SimonePonzettoUniversity of Mannheim
      4136-4155
      In this survey, we provide a systematic review of recent work on modelling morality in text, an area of research that has garnered increasing attention in recent years. Our survey is motivated by the importance of modelling decisions on the created resources, the models trained on these resources and the analyses that result from the models’ predictions. We review work at the interface of NLP, Computational Social Science and Psychology and give an overview of the different goals and research questions addressed in the papers, their underlying theoretical backgrounds and the methods that have been applied to pursue these goals. We then identify and discuss challenges and research gaps, such as the lack of a theoretical framework underlying the operationalisation of morality in text, the low IAA reported for many human-annotated resulting resources and the lack of validation of newly proposed resources and analyses.
      2024.findings-acl.245


@@ -9499,12 +9499,12 @@
      YiningYe
      YujiaQin
      XinCong
-      YankaiLinRenmin University of China
+      YankaiLinRenmin University of China
      YinxuPan
      YesaiWu
      HuiHaotian
      LiuWeichuanSiemens Corporate Research
-      ZhiyuanLiuTsinghua University
+      ZhiyuanLiuTsinghua University
      MaosongSun
      4173-4198
      Large Language Models (LLMs) have demonstrated exceptional coding capability. However, as another critical component of programming proficiency, the debugging capability of LLMs remains relatively unexplored. Previous evaluations of LLMs’ debugging ability are significantly limited by the risk of data leakage, the scale of the dataset, and the variety of tested bugs. To overcome these deficiencies, we introduce ‘DebugBench’, an LLM debugging benchmark consisting of 4,253 instances. It covers four major bug categories and 18 minor types in C++, Java, and Python. To construct DebugBench, we collect code snippets from the LeetCode community, implant bugs into source data with GPT-4, and ensure rigorous quality checks. We evaluate two commercial and four open-source models in a zero-shot scenario. We find that (1) while closed-source models exhibit inferior debugging performance compared to humans, open-source models achieve relatively lower pass rate scores; (2) the complexity of debugging notably fluctuates depending on the bug category; (3) incorporating runtime feedback has a clear impact on debugging performance, which is not always helpful.
As an extension, we also compare LLM debugging and code generation, revealing a strong correlation between them for closed-source models. These findings will benefit the development of LLMs in debugging.
      2024.findings-acl.247


@@ -9515,9 +9515,9 @@
      <fixed-case>POP</fixed-case>-<fixed-case>CEE</fixed-case>: Position-oriented Prompt-tuning Model for Causal Emotion Entailment
      ZhihanZhouJilin University
-      XueGuUniversidade do Minho
-      YujieZhao
-      HaoXuJilin University
+      XueGuUniversidade do Minho
+      YujieZhao
+      HaoXuJilin University
      4199-4210
      The objective of the Causal Emotion Entailment (CEE) task is to identify the causes of the target emotional utterances in a given conversation. Most existing studies have focused on a fine-tuning paradigm based on a pretrained model, e.g., the BERT model. However, there are gaps between the pretrained task and the CEE task. Although a pretrained model enhances contextual comprehension to some extent, it cannot acquire specific knowledge that is relevant to the CEE task. In addition, in a typical CEE task, there are peculiarities in the distribution of the positions with different emotion types of emotion utterances and cause utterances in conversations. Existing methods employ a fixed-size window to capture the relationship between neighboring conversations; however, these methods ignore the specific semantic associations between emotions and cause utterances. To address these issues, we propose the Position-oriented Prompt-tuning (POP-CEE) model to solve the CEE task in an end-to-end manner. Specifically, we can model the CEE task by designing prompts with multiple unified goals and by exploring the positional relationship between emotion and cause utterances using a position constraint module. Experimental results demonstrate that the proposed POP-CEE model achieves state-of-the-art performance on a benchmark dataset. Our code and data can be found at: https://github.com/Zh0uzh/POP-CEE.
      2024.findings-acl.248


@@ -9526,8 +9526,8 @@
      Context Length Extension via Generalized Extrapolation Scale
-      LinhanLi
-      ZhangHuapingBeijing Institute of Technology
+      LinhanLi
+      ZhangHuapingBeijing Institute of Technology
      4211-4218
      2024.findings-acl.249
      li-huaping-2024-context


@@ -9537,8 +9537,8 @@
      Selectively Answering Visual Questions
      JulianEisenschlosGoogle DeepMind
      HernánMainaUniversidad Nacional de Córdoba, Argentina
-      GuidoIvettaUniversidad Nacional de Córdoba
-      LucianaBenottiUniversidad nacional de Córdoba
+      GuidoIvettaUniversidad Nacional de Córdoba
+      LucianaBenottiUniversidad nacional de Córdoba
      4219-4229
      Recently, large multi-modal models (LMMs) have emerged with the capacity to perform vision tasks such as captioning and visual question answering (VQA) with unprecedented accuracy. Applications such as helping the blind or visually impaired have a critical need for precise answers. It is especially important for models to be well calibrated and be able to quantify their uncertainty in order to selectively decide when to answer and when to abstain or ask for clarifications. We perform the first in-depth analysis of calibration methods and metrics for VQA with in-context learning LMMs. Studying VQA on two answerability benchmarks, we show that the likelihood score of visually grounded models is better calibrated than in their text-only counterparts for in-context learning, where sampling-based methods are generally superior, but no clear winner arises. We propose Avg BLEU, a calibration score combining the benefits of both sampling and likelihood methods across modalities.
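[Editorial sketch] The entry above combines sampling-based agreement with a likelihood score to decide when to answer. A rough sketch of one such combination, with a clipped unigram precision standing in for BLEU and a free mixing weight; the paper's actual Avg BLEU definition may differ.

"""Selective answering from sample agreement plus likelihood."""

from collections import Counter
from itertools import combinations
from statistics import mean


def unigram_overlap(a: str, b: str) -> float:
    """Clipped unigram precision between two strings (crude BLEU-1 stand-in)."""
    ca, cb = Counter(a.lower().split()), Counter(b.lower().split())
    if not ca:
        return 0.0
    return sum(min(ca[w], cb[w]) for w in ca) / sum(ca.values())


def confidence(samples, likelihood: float, alpha: float = 0.5) -> float:
    """Blend pairwise sample agreement with model likelihood; alpha is a free knob."""
    pairs = list(combinations(samples, 2))
    agreement = mean(unigram_overlap(a, b) for a, b in pairs) if pairs else 1.0
    return alpha * agreement + (1 - alpha) * likelihood


if __name__ == "__main__":
    samples = ["a red bus", "the red bus", "a red double-decker bus"]
    conf = confidence(samples, likelihood=0.72)
    print(f"confidence={conf:.3f}", "-> answer" if conf > 0.5 else "-> abstain")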
2024.findings-acl.250


@@ -9552,8 +9552,8 @@
      JinzhengHeZhejiang University
      GangSun
      RanShen
-      XizeCheng
-      ZhouZhaoZhejiang University and Zhejiang University
+      XizeCheng
+      ZhouZhaoZhejiang University and Zhejiang University
      4230-4242
      We release a multi-accent dataset and propose speech-programming and gradient reversal classifier to improve the generalization. Abstract: Speech-to-SQL (S2SQL) aims to convert spoken questions into SQL queries given relational databases, which has been traditionally implemented in a cascaded manner while facing the following challenges: 1) model training is faced with the major issue of data scarcity, where limited parallel data is available; and 2) the systems should be robust enough to handle diverse out-of-domain speech samples that differ from the source data. In this work, we propose the direct generalizable speech-to-SQL parsing model Wav2SQL which avoids error compounding across cascaded systems. Specifically, 1) to accelerate speech-driven SQL parsing research in the community, we release a large-scale and multi-accent dataset MASpider; 2) leveraging the recent progress in large-scale pre-training, we show that it alleviates the data scarcity issue and allows for direct speech-to-SQL parsing; and 3) we include the speech re-programming and gradient reversal classifier techniques to reduce acoustic variance and learn style-agnostic representations, improving generalization to unseen out-of-domain custom data. Experimental results demonstrate that Wav2SQL avoids error compounding and achieves state-of-the-art results by up to 4.7% accuracy improvement over the baseline.
      2024.findings-acl.251


@@ -9564,7 +9564,7 @@
      E2-<fixed-case>LLM</fixed-case>: Efficient and Extreme Length Extension of Large Language Models
      JiahengLiu
      ZhiqiBaiZhiqiBai
-      YuanxingZhang
+      YuanxingZhang
      ChenchenZhangBeijing University of Posts and Telecommunications
      YuangZhYuangZh
      GeZhang
      HaoranQue
      YukangChen
      WenboSu
-      TiezhengGeAlibaba Group
-      JieFuHong Kong University of Science and Technology
+      TiezhengGeAlibaba Group
+      JieFuHong Kong University of Science and Technology
      WenhuChenUniversity of Waterloo and Google
-      BoZhengAlibaba Group
+      BoZhengAlibaba Group
      4243-4253
      Training Large Language Models (LLMs) to process extensive context lengths incurs prohibitive computational costs. Prevailing techniques for extending context capabilities in LLMs typically require not only additional training procedures but also access to datasets with long context (e.g., sequences of 32K tokens), presupposing substantial GPU expenditures. To address the aforementioned issues, we introduce a novel solution named Efficient and Extreme length extension for Large Language Models (E2-LLM). E2-LLM entails a singular training process over considerably short sequences (e.g., 4K tokens), which greatly mitigates the cost of continual-pretraining or fine-tuning. Within the training phase, we incorporate a dual augmentation strategy with Rotary Position Embeddings (RoPE) that adjusts the scale and position indices across distinct training samples. E2-LLM is meticulously designed to enhance the model’s robustness to diverse relative positions. The experimental results on multiple benchmark datasets demonstrate the superior performance of E2-LLM on demanding tasks of processing long contexts.
      2024.findings-acl.252


@@ -9586,7 +9586,7 @@
      Are Female Carpenters like Blue Bananas?
A Corpus Investigation of Occupation Gender Typicality DaJuFacebook KarenUllrichMeta AI - AdinaWilliamsFAIR (Meta Platforms Inc.) + AdinaWilliamsFAIR (Meta Platforms Inc.) 4254-4274 People tend to use language to mention surprising properties of events: for example, when a banana is blue, we are more likely to mention color than when it is yellow. This fact is taken to suggest that yellowness is somehow a typical feature of bananas, and blueness is exceptional. Similar to how a yellow color is typical of bananas, there may also be genders that are typical of occupations. In this work, we explore this question using information theoretic techniques coupled with corpus statistic analysis. In two distinct large corpora, we do not find strong evidence that occupations and gender display the same patterns of mentioning as do bananas and color. Instead, we find that gender mentioning is correlated with femaleness of occupation in particular, suggesting perhaps that woman-dominated occupations are seen as somehow “more gendered” than male-dominated ones, and thereby they encourage more gender mentioning overall. 2024.findings-acl.253 @@ -9598,13 +9598,13 @@ SitaoCheng ZiyuanZhuang YongXu - FangkaiYangMicrosoft + FangkaiYangMicrosoft ChaoyunZhang - XiaotingQinMicrosoft + XiaotingQinMicrosoft XiangHuang LingChen - QingweiLinMicrosoft Research - DongmeiZhangMicrosoft and Microsoft + QingweiLinMicrosoft Research + DongmeiZhangMicrosoft and Microsoft SaravanRajmohanMicrosoft QiZhang 4275-4295 @@ -9615,12 +9615,12 @@ Legal Judgment Reimagined: <fixed-case>P</fixed-case>red<fixed-case>E</fixed-case>x and the Rise of Intelligent <fixed-case>AI</fixed-case> Interpretation in <fixed-case>I</fixed-case>ndian Courts - Shubham KumarNigamIIT Kanpur + Shubham KumarNigamIIT Kanpur AnuragSharmaIISER Kolkata DanushKhanna NoelShallumSymbiosis Law School Pune KripabandhuGhoshIndian Institute of Science Education and Research Kolkata - ArnabBhattacharyaIIT Kanpur + ArnabBhattacharyaIIT Kanpur 4296-4315 In the era of Large Language Models (LLMs), predicting judicial outcomes poses significant challenges due to the complexity of legal proceedings and the scarcity of expert-annotated datasets. Addressing this, we introduce Prediction with Explanation (PredEx), the largest expert-annotated dataset for legal judgment prediction and explanation in the Indian context, featuring over 15,000 annotations. This groundbreaking corpus significantly enhances the training and evaluation of AI models in legal analysis, with innovations including the application of instruction tuning to LLMs. This method has markedly improved the predictive accuracy and explanatory depth of these models for legal judgments. We employed various transformer-based models, tailored for both general and Indian legal contexts. Through rigorous lexical, semantic, and expert assessments, our models effectively leverage PredEx to provide precise predictions and meaningful explanations, establishing it as a valuable benchmark for both the legal profession and the NLP community. 
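[Editorial sketch] PredEx above applies instruction tuning to judgment prediction with explanation. A sketch of how one annotated case might be turned into a supervised record; the field names and prompt wording are invented for illustration, not the dataset's actual schema.

"""Build an instruction-tuning record for prediction-with-explanation."""

import json


def to_instruction_record(case_text: str, verdict: str, rationale: str) -> dict:
    """One supervised example: predict the outcome, then justify it."""
    return {
        "instruction": ("Read the case proceedings and predict the judgment, "
                        "then explain the prediction."),
        "input": case_text,
        "output": f"Judgment: {verdict}\nExplanation: {rationale}",
    }


if __name__ == "__main__":
    record = to_instruction_record(
        case_text="The appellant challenges the order dated ...",
        verdict="Appeal allowed",
        rationale="The lower court misapplied the limitation period ...",
    )
    print(json.dumps(record, indent=2))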
2024.findings-acl.255 @@ -9643,7 +9643,7 @@ Multi-Objective Linguistic Control of Large Language Models DangNguyenUniversity of Maryland, College Park JiuhaiChen - TianyiZhouUniversity of Maryland, College Park + TianyiZhouUniversity of Maryland, College Park 4336-4347 Large language models (LLMs), despite their breakthroughs on many challenging benchmark tasks, prefer to generate verbose responses and lack the controllability of output complexity, which is usually preferred by human users in practice. In this paper, we study how to precisely control multiple linguistic complexities of LLM output by finetuning using off-the-shelf data. To this end, we propose multi-control tuning (MCTune), which includes multiple linguistic complexity values of ground-truth responses as controls in the input for instruction tuning. We finetune LLaMA2-7B on Alpaca-GPT4 and WizardLM datasets. Evaluations on widely used benchmarks demonstrate that our method does not only improve LLMs’ multi-complexity controllability substantially but also retains or even enhances the quality of the responses as a side benefit. 2024.findings-acl.257 @@ -9653,7 +9653,7 @@ Evaluating the Smooth Control of Attribute Intensity in Text Generation with <fixed-case>LLM</fixed-case>s ShangZhou - FengYao + FengYao ChengyuDongUniversity of California, San Diego ZihanWang JingboShangUniversity of California, San Diego @@ -9670,14 +9670,14 @@ JianqiaoLu QiZhu JiahuiGao - WeiwenLiuHuawei Technologies Ltd. - YutaiHou + WeiwenLiuHuawei Technologies Ltd. + YutaiHou XingshanZengHuawei Technologies Ltd. YashengWang LifengShangHuawei Technologies Ltd. - XinJiang + XinJiang RuifengXuHarbin Institute of Technology - QunLiuHuawei Noah’s Ark Lab + QunLiuHuawei Noah’s Ark Lab 4363-4400 The recent trend of using Large Language Models (LLMs) as tool agents in real-world applications underscores the necessity for comprehensive evaluations of their capabilities, particularly in complex scenarios involving planning, creating, and using tools. However, existing benchmarks typically focus on simple synthesized queries that do not reflect real-world complexity, thereby offering limited perspectives in evaluating tool utilization. To address this issue, we present UltraTool, a novel benchmark designed to improve and evaluate LLMs’ ability in tool utilization within real-world scenarios. UltraTool focuses on the entire process of using tools - from planning and creating to applying them in complex tasks. It emphasizes real-world complexities, demanding accurate, multi-step planning for effective problem-solving. A key feature of UltraTool is its independent evaluation of planning with natural language, which happens before tool usage and simplifies the task solving by mapping out the intermediate steps. Thus, unlike previous work, it eliminates the restriction of pre-defined toolset. Through extensive experiments on various LLMs, we offer novel insights into the evaluation of capabilities of LLMs in tool utilization, thereby contributing a fresh perspective to this rapidly evolving field. The benchmark is publicly available at https://github.com/JoeYing1019/UltraTool. 2024.findings-acl.259 @@ -9688,7 +9688,7 @@ Do Androids Know They’re Only Dreaming of Electric Sheep? SkyCH-WangColumbia University BenjaminVan DurmeJohns Hopkins University, Johns Hopkins University, Johns Hopkins University and Microsoft - JasonEisnerMicrosoft and Johns Hopkins University + JasonEisnerMicrosoft and Johns Hopkins University ChrisKedzieRasa Technologies, Inc. 
4401-4420 We design probes trained on the internal representations of a transformer language model to predict its hallucinatory behavior on three grounded generation tasks. To train the probes, we annotate for span-level hallucination on both sampled (organic) and manually edited (synthetic) reference outputs. Our probes are narrowly trained and we find that they are sensitive to their training domain: they generalize poorly from one task to another or from synthetic to organic hallucinations. However, on in-domain data, they can reliably detect hallucinations at many transformer layers, achieving 95% of their peak performance as early as layer 4. Here, probing proves accurate for evaluating hallucination, outperforming several contemporary baselines and even surpassing an expert human annotator in response-level detection F1. Similarly, on span-level labeling, probes are on par or better than the expert annotator on two out of three generation tasks. Overall, we find that probing is a feasible and efficient alternative to language model hallucination evaluation when model states are available. @@ -9699,9 +9699,9 @@ <fixed-case>URG</fixed-case>: A Unified Ranking and Generation Method for Ensembling Language Models BoLv - ChenTang - YananZhang - XinLiu + ChenTang + YananZhang + XinLiu PingLuoInstitute of Computing Technology, Chinese Academy of Sciences YueYuNational University of Defense Technology and PengCheng Lab 4421-4434 @@ -9728,7 +9728,7 @@ <fixed-case>L</fixed-case>ora<fixed-case>R</fixed-case>etriever: Input-Aware <fixed-case>L</fixed-case>o<fixed-case>RA</fixed-case> Retrieval and Composition for Mixed Tasks in the Wild - ZiyuZhao + ZiyuZhao LeileiGanZhejiang University GuoyinWangBytedance WangchunshuZhouAIWaves Inc. @@ -9743,11 +9743,11 @@ <fixed-case>ELAD</fixed-case>: Explanation-Guided Large Language Models Active Distillation - YifeiZhangEmory University - BoPan - ChenLing - YuntongHuEmory University - LiangZhaoEmory University + YifeiZhangEmory University + BoPan + ChenLing + YuntongHuEmory University + LiangZhaoEmory University 4463-4475 The deployment and application of Large Language Models (LLMs) is hindered by their memory inefficiency, computational demands, and the high costs of API inferences. Traditional distillation methods, which transfer the capabilities of LLMs to smaller models, often fail to determine whether the knowledge has been sufficiently transferred, potentially resulting in high costs or incomplete distillation. In this paper, we propose an Explanation-Guided LLMs Active Distillation (ELAD) framework that employs an active learning strategy to optimize the balance between annotation costs and model performance. To improve the efficiency of sample selection, we introduce an explanation-guided sample selection method that identifies samples challenging its reasoning by exploiting uncertainties in reasoning explanation steps. Additionally, we present a customized LLM-annotated explanation revision technique where the teacher model detects and corrects flaws in the student model’s reasoning. Our experiments across various reasoning datasets demonstrate that our framework significantly enhances the efficiency of LLMs knowledge distillation. 
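[Editorial sketch] ELAD above selects the samples whose reasoning explanations the student model is least certain about. A minimal sketch of that selection step, assuming per-step confidences are already available; the weakest-step rule is an illustrative choice, not the paper's exact criterion.

"""Explanation-uncertainty-based active selection."""


def explanation_uncertainty(step_confidences):
    """A reasoning chain is only as certain as its weakest step."""
    return 1.0 - min(step_confidences)


def select_for_annotation(pool, budget):
    """pool: list of (sample_id, [per-step confidences]); return most uncertain ids."""
    ranked = sorted(pool, key=lambda item: explanation_uncertainty(item[1]),
                    reverse=True)
    return [sample_id for sample_id, _ in ranked[:budget]]


if __name__ == "__main__":
    pool = [
        ("q1", [0.95, 0.91, 0.88]),
        ("q2", [0.97, 0.42, 0.90]),  # one shaky step -> high uncertainty
        ("q3", [0.60, 0.65, 0.70]),
    ]
    print(select_for_annotation(pool, budget=2))  # ['q2', 'q3']

The selected ids would then go to the teacher model for explanation revision, which is the part of the framework not sketched here.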
2024.findings-acl.264 @@ -9756,8 +9756,8 @@ Evaluating the Elementary Multilingual Capabilities of Large Language Models with <fixed-case>M</fixed-case>ulti<fixed-case>Q</fixed-case> - CarolinHoltermannUniversität Hamburg - PaulRöttgerBocconi University + CarolinHoltermannUniversität Hamburg + PaulRöttgerBocconi University TimmDillUniversität Hamburg AnneLauscherUniversität Hamburg 4476-4494 @@ -9770,7 +9770,7 @@ Semantics or spelling? Probing contextual word embeddings with orthographic noise JacobMatthewsCornell University JohnStarr - MartenSchijndelCornell University + MartenSchijndelCornell University 4495-4504 Pretrained language model (PLM) hidden states are frequently employed as contextual word embeddings (CWE): high-dimensional representations that encode semantic information given linguistic context. Across many areas of computational linguistics research, similarity between CWEs is interpreted as semantic similarity. However, it remains unclear exactly what information is encoded in PLM hidden states. We investigate this practice by probing PLM representations using minimal orthographic noise. We expect that if CWEs primarily encode semantic information, a single character swap in the input word will not drastically affect the resulting representation, given sufficient linguistic context. Surprisingly, we find that CWEs generated by popular PLMs are highly sensitive to noise in input data, and that this sensitivity is related to subword tokenization: the fewer tokens used to represent a word at input, the more sensitive its corresponding CWE. This suggests that CWEs capture information unrelated to word-level meaning and can be manipulated through trivial modifications of input data. We conclude that these PLM-derived CWEs may not be reliable semantic proxies, and that caution is warranted when interpreting representational similarity. 2024.findings-acl.266 @@ -9784,10 +9784,10 @@ PengfeiHeMichigan State University YidingLiuBaidu YueXingMichigan State University - HanXuUniversity of Arizona + HanXuUniversity of Arizona JieRenBaidu and Michigan State University - YiChangJilin University, China - ShuaiqiangWang + YiChangJilin University, China + ShuaiqiangWang DaweiYinBaidu JiliangTangMichigan State University 4505-4524 @@ -9798,13 +9798,13 @@ <fixed-case>E</fixed-case>mpathic<fixed-case>S</fixed-case>tories++: A Multimodal Dataset for Empathy Towards Personal Experiences - JocelynShenMassachusetts Institute of Technology - YubinKimMassachusetts Institute of Technology + JocelynShenMassachusetts Institute of Technology + YubinKimMassachusetts Institute of Technology MohitHulse WazeerZulfikar SharifaAlghowinem - CynthiaBreazeal - HaeParkAmazon and Massachusetts Institute of Technology + CynthiaBreazeal + HaeParkAmazon and Massachusetts Institute of Technology 4525-4536 Modeling empathy is a complex endeavor that is rooted in interpersonal and experiential dimensions of human interaction, and remains an open problem within AI. Existing empathy datasets fall short in capturing the richness of empathy responses, often being confined to in-lab or acted scenarios, lacking longitudinal data, and missing self-reported labels. We introduce a new multimodal dataset for empathy during personal experience sharing: the EmpathicStories++ dataset containing 53 hours of video, audio, and text data of 41 participants sharing vulnerable experiences and reading empathically resonant stories with an AI agent. 
EmpathicStories++ is the first longitudinal dataset on empathy, collected over a month-long deployment of social robots in participants’ homes, as participants engage in natural, empathic storytelling interactions with AI agents. We then introduce a novel task of predicting individuals’ empathy toward others’ stories based on their personal experiences, evaluated in two contexts: participants’ own personal shared story context and their reflections on stories they read. We benchmark this task using state-of-the-art models to pave the way for future improvements in contextualized and longitudinal empathy modeling. Our work provides a valuable resource for further research in developing empathetic AI systems and understanding the intricacies of human empathy within genuine, real-world settings. 2024.findings-acl.268 @@ -9825,9 +9825,9 @@ <fixed-case>S</fixed-case>yntax<fixed-case>S</fixed-case>hap: Syntax-aware Explainability Method for Text Generation - KenzaAmara + KenzaAmara RitaSevastjanovaETHZ - ETH Zurich - MennatallahEl-AssadyDepartment of Computer Science, ETHZ - ETH Zurich + MennatallahEl-AssadyDepartment of Computer Science, ETHZ - ETH Zurich 4551-4566 To harness the power of large language models in safety-critical domains, we need to ensure the explainability of their predictions. However, despite the significant attention to model interpretability, there remains an unexplored domain in explaining sequence-to-sequence tasks using methods tailored for textual data. This paper introduces *SyntaxShap*, a local, model-agnostic explainability method for text generation that takes into consideration the syntax in the text data. The presented work extends Shapley values to account for parsing-based syntactic dependencies. Taking a game theoric approach, SyntaxShap only considers coalitions constraint by the dependency tree. We adopt a model-based evaluation to compare SyntaxShap and its weighted form to state-of-the-art explainability methods adapted to text generation tasks, using diverse metrics including faithfulness, coherency, and semantic alignment of the explanations to the model. We show that our syntax-aware method produces explanations that help build more faithful and coherent explanations for predictions by autoregressive models. Confronted with the misalignment of human and AI model reasoning, this paper also highlights the need for cautious evaluation strategies in explainable AI. 2024.findings-acl.270 @@ -9837,11 +9837,11 @@ Automated Detection and Analysis of Data Practices Using A Real-World Corpus MukundSrinath - PranavNarayanan Venkit + PranavNarayanan Venkit MariaBadillo FlorianSchaubUniversity of Michigan - Ann Arbor C.GilesPennsylvania State University - ShomirWilsonPennsylvania State University + ShomirWilsonPennsylvania State University 4567-4574 Privacy policies are crucial for informing users about data practices, yet their length and complexity often deter users from reading them. In this paper, we propose an automated approach to identify and visualize data practices within privacy policies at different levels of detail. Leveraging crowd-sourced annotations from the ToS;DR platform, we experiment with various methods to match policy excerpts with predefined data practice descriptions. We further conduct a case study to evaluate our approach on a real-world policy, demonstrating its effectiveness in simplifying complex policies. 
Experiments show that our approach accurately matches data practice descriptions with policy excerpts, facilitating the presentation of simplified privacy information to users. 2024.findings-acl.271 @@ -9852,7 +9852,7 @@ Enhancing Hyperbolic Knowledge Graph Embeddings via Lorentz Transformations XiranFanVISA MinghuaXu - HuiyuanChenVISA + HuiyuanChenVISA YuzhongChen MahashwetaDas HaoYangVisa Research @@ -9876,7 +9876,7 @@ Probing the Uniquely Identifiable Linguistic Patterns of Conversational <fixed-case>AI</fixed-case> Agents IqraZahid - TharinduMadusanka + TharinduMadusanka RizaBatista-NavarroUniversity of Manchester YouchengSunThe University of Manchester 4612-4628 @@ -9898,7 +9898,7 @@ <fixed-case>X</fixed-case>-Shot: A Unified System to Handle Frequent, Few-shot and Zero-shot Learning Simultaneously in Classification HanziXu - MuhaoChenUniversity of California, Davis and University of Southern California + MuhaoChenUniversity of California, Davis and University of Southern California LifuHuangVirginia Tech SlobodanVuceticTemple University and Temple University WenpengYinPennsylvania State University @@ -9911,10 +9911,10 @@ <fixed-case>SPIN</fixed-case>: Sparsifying and Integrating Internal Neurons in Large Language Models for Text Classification DifanJiao - YilunLiuTechnische Universität München - ZhenweiTangUniversity of Toronto - DanielMatterTechnische Universität München - JürgenPfefferTechnische Universität München + YilunLiuTechnische Universität München + ZhenweiTangUniversity of Toronto + DanielMatterTechnische Universität München + JürgenPfefferTechnische Universität München AshtonAndersonDepartment of Computer Science, University of Toronto 4666-4682 Among the many tasks that Large Language Models (LLMs) have revolutionized is text classification. Current text classification paradigms, however, rely solely on the output of the final layer in the LLM, with the rich information contained in internal neurons largely untapped. In this study, we present SPIN: a model-agnostic framework that sparsifies and integrates internal neurons of intermediate layers of LLMs for text classification. Specifically, SPIN sparsifies internal neurons by linear probing-based salient neuron selection layer by layer, avoiding noise from unrelated neurons and ensuring efficiency. The cross-layer salient neurons are then integrated to serve as multi-layered features for the classification head. Extensive experimental results show our proposed SPIN significantly improves text classification accuracy, efficiency, and interpretability. @@ -9925,8 +9925,8 @@ Decomposing Co-occurrence Matrices into Interpretable Components as Formal Concepts AkihiroMaedaJapan Advanced Institute of Science and Technology - TakumaToriiTokyo Denki University, Tokyo Institute of Technology - ShoheiHidakaJapan Advanced Institute of Science and Technology, Tokyo Institute of Technology + TakumaToriiTokyo Denki University, Tokyo Institute of Technology + ShoheiHidakaJapan Advanced Institute of Science and Technology, Tokyo Institute of Technology 4683-4700 This study addresses the interpretability of word representations through an investigation of a count-based co-occurrence matrix. Employing the mathematical methodology of Formal Concept Analysis, we reveal an underlying structure that is amenable to human interpretation. Furthermore, we unveil the emergence of hierarchical and geometrical structures within word vectors as consequences of word usage. 
Our experiments on the PPMI matrix demonstrate that the formal concepts that we identified align with interpretable categories, as shown in the category completion task. 2024.findings-acl.278 @@ -9959,7 +9959,7 @@ YanmingLiu XinyuePeng XuhongZhangZhejiang University - WeihaoLiu + WeihaoLiu JianweiYinZhejiang University JiannanCao TianyuDuZhejiang University @@ -9973,7 +9973,7 @@ <fixed-case>M</fixed-case>r<fixed-case>R</fixed-case>ank: Improving Question Answering Retrieval System through Multi-Result Ranking Model DanupatKhamnuansinChulalongkorn University and KASIKORN Business-Technology Group TawunratChalothornKASIKORN Business-Technology Group - EkapolChuangsuwanichChulalongkorn University + EkapolChuangsuwanichChulalongkorn University 4750-4762 Large Language Models (LLMs) often struggle with hallucinations and outdated information. To address this, Information Retrieval (IR) systems can be employed to augment LLMs with up-to-date knowledge. However, existing IR techniques contain deficiencies, posing a performance bottleneck. Given the extensive array of IR systems, combining diverse approaches presents a viable strategy. Nevertheless, prior attempts have yielded restricted efficacy. In this work, we propose an approach that leverages learning-to-rank techniques to combine heterogeneous IR systems. We demonstrate the method on two Retrieval Question Answering (ReQA) tasks. Our empirical findings exhibit a significant performance enhancement, outperforming previous approaches and achieving state-of-the-art results on ReQA SQuAD. 2024.findings-acl.282 @@ -9984,7 +9984,7 @@ Chain-of-Question: A Progressive Question Decomposition Approach for Complex Knowledge Base Question Answering PengYixingUniversity of Science and Technology of China QuanWangBeijing University of Posts and Telecommunications - LichengZhang + LichengZhang YiLiuState Key Laboratory of Communication Content Cognition ZhendongMaoUniversity of Science and Technology of China 4763-4776 @@ -10032,7 +10032,7 @@ Locating and Extracting Relational Concepts in Large Language Models ZijianWang BritneyWhyteUniversity of New South Wales - ChangXuUniversity of Sydney + ChangXuUniversity of Sydney 4818-4832 Relational concepts are indeed foundational to the structure of knowledge representation, as they facilitate the association between various entity concepts, allowing us to express and comprehend complex world knowledge. By expressing relational concepts in natural language prompts, people can effortlessly interact with large language models (LLMs) and recall desired factual knowledge. However, the process of knowledge recall lacks interpretability, and representations of relational concepts within LLMs remain unknown to us. In this paper, we identify hidden states that can express entity and relational concepts through causal mediation analysis in fact recall processes. Our finding reveals that at the last token position of the input prompt, there are hidden states that solely express the causal effects of relational concepts. Based on this finding, we assume that these hidden states can be treated as relational representations and we can successfully extract them from LLMs. The experimental results demonstrate high credibility of the relational representations: they can be flexibly transplanted into other fact recall processes, and can also be used as robust entity connectors. Moreover, we also show that the relational representations exhibit significant potential for controllable fact recall through relation rewriting.
2024.findings-acl.287 @@ -10055,8 +10055,8 @@ <fixed-case>S</fixed-case>entic<fixed-case>V</fixed-case>ec: Toward Robust and Human-Centric Neurosymbolic Sentiment Analysis XulangZhang - RuiMao - ErikCambriaNanyang Technological University + RuiMao + ErikCambriaNanyang Technological University 4851-4863 The success of state-of-the-art Natural Language Processing (NLP) systems heavily depends on deep neural networks, which excel in various tasks through strong data fitting and latent feature modeling abilities. However, certain challenges linked to deep neural networks and supervised deep learning deserve consideration, e.g., extensive computing resources, knowledge forgetting, etc. Previous research attempted to tackle these challenges individually through unrelated techniques. However, they do not instigate fundamental shifts in the learning paradigm. In this work, we propose a novel neurosymbolic method for sentiment analysis to tackle these issues. We also propose a novel sentiment-pragmatic knowledge base that places emphasis on human subjectivity within varying domain annotations. We conducted extensive experiments to show that our neurosymbolic framework for sentiment analysis stands out for its lightweight nature, robustness across domains and languages, efficient few-shot training, and rapid convergence. 2024.findings-acl.289 @@ -10068,10 +10068,10 @@ ChenQian JieZhang WeiYao - DongruiLiuShanghai Artificial Intelligence Laboratory - ZhenfeiYinUniversity of Sydney and Shanghai AI Laboratory - YuQiao - YongLiuRenmin University of China and Institute of information engineering, CAS + DongruiLiuShanghai Artificial Intelligence Laboratory + ZhenfeiYinUniversity of Sydney and Shanghai AI Laboratory + YuQiao + YongLiuRenmin University of China and Institute of information engineering, CAS JingShaoShanghai AI Laboratory 4864-4888 Ensuring the trustworthiness of large language models (LLMs) is crucial. Most studies concentrate on fully pre-trained LLMs to better understand and improve LLMs’ trustworthiness. In this paper, to reveal the untapped potential of pre-training, we pioneer the exploration of LLMs’ trustworthiness during this period, focusing on five key dimensions: reliability, privacy, toxicity, fairness, and robustness. To begin with, we apply linear probing to LLMs. The high probing accuracy suggests that LLMs in early pre-training can already distinguish concepts in each trustworthiness dimension. Therefore, to further uncover the hidden possibilities of pre-training, we extract steering vectors from an LLM’s pre-training checkpoints to enhance the LLM’s trustworthiness. Finally, inspired by the theoretical result that mutual information estimation is bounded by linear probing accuracy, we also probe LLMs with mutual information to investigate the dynamics of trustworthiness during pre-training. We are the first to observe a similar two-phase phenomenon: fitting and compression. This research provides an initial exploration of trustworthiness modeling during LLM pre-training, seeking to unveil new insights and spur further developments in the field.
@@ -10082,9 +10082,9 @@ Language Models can Evaluate Themselves via Probability Discrepancy TingyuXia - BowenYuAlibaba Group + BowenYuAlibaba Group YuanWuJilin University - YiChangJilin University, China + YiChangJilin University, China ChangZhou 4889-4901 In this paper, we begin by illustrating that, when presented with a query, Large Language Models (LLMs) capable of providing accurate responses tend to exhibit a more uniform probability distribution compared to their less proficient counterparts. Building upon this observation, we introduce a novel self-assessment criterion termed ProbDiff for evaluating the performance of diverse LLMs. This method eliminates the need for training an additional evaluation model or relying on external proprietary models such as GPT-4 as a judger. Instead, it solely relies on the LLMs under evaluation to compute the probability discrepancy between the original response generation and its revised versions. A higher discrepancy in two LLMs for the same query suggests a relatively weaker ability. We discover that ProbDiff yields comparable results to mainstream GPT-4-based evaluations on various scenarios including NLG tasks like translation and summarization, as well as LLM evaluation benchmarks such as AlignBench, MT-Bench, and AlpacaEval, across LLMs of different sizes. @@ -10094,12 +10094,12 @@ Evaluating the Validity of Word-level Adversarial Attacks with Large Language Models - HuichiZhou + HuichiZhou ZhaoyangWangMicrosoft HongtaoWangNorth China Electric Power University DongpingChen WenhanMu - FangyuanZhang + FangyuanZhang 4902-4922 Deep neural networks exhibit vulnerability to word-level adversarial attacks in natural language processing. Most of these attack methods adopt synonymous substitutions to perturb original samples for crafting adversarial examples while attempting to maintain semantic consistency with the originals. Some of them claim that they could achieve over 90% attack success rate, thereby raising serious safety concerns. However, our investigation reveals that many purportedly successful adversarial examples are actually invalid due to significant changes in semantic meanings compared to their originals. Even when equipped with semantic constraints such as BERTScore, existing attack methods can generate up to 87.9% invalid adversarial examples. Building on this insight, we first curate a 13K dataset for adversarial validity evaluation with the help of GPT-4. Then, an open-source large language model is fine-tuned to offer an interpretable validity score for assessing the semantic consistency between original and adversarial examples. Finally, this validity score can serve as a guide for existing adversarial attack methods to generate valid adversarial examples. Comprehensive experiments demonstrate the effectiveness of our method in evaluating and refining the quality of adversarial examples. 
2024.findings-acl.292 @@ -10113,11 +10113,11 @@ ZhiZhongSony Group Corporation Chieh-HsinLaiSony AI YuhtaTakidaSony AI - NaokiMurataSony AI and Sony Group Corporation + NaokiMurataSony AI and Sony Group Corporation Wei-HsiangLiaoSony Corporation - TakashiShibuyaSony AI + TakashiShibuyaSony AI HiromiWakakiSony Group Corporation - YukiMitsufujiSony AI, Sony Group Corporation, Tokyo Institute of Technology, Tokyo Institute of Technology and Sony Group Corporation + YukiMitsufujiSony AI, Sony Group Corporation, Tokyo Institute of Technology, Tokyo Institute of Technology and Sony Group Corporation 4923-4940 Contrastive cross-modal models such as CLIP and CLAP aid various vision-language (VL) and audio-language (AL) tasks. However, there has been limited investigation of and improvement in their language encoder – the central component of encoding natural language descriptions of image/audio into vector representations. We extensively evaluate how unsupervised and supervised sentence embedding training affect language encoder quality and cross-modal task performance. In VL pretraining, we found that sentence embedding training enhances language encoder quality and aids in cross-modal tasks, improving contrastive VL models such as CyCLIP. Sentence embedding training benefits AL tasks when the amount of training data is large. We analyze the representation spaces to understand the strengths of sentence embedding training, and find that it improves text-space uniformity, at the cost of decreased cross-modal alignment. 2024.findings-acl.293 @@ -10140,10 +10140,10 @@ Anchor-based Large Language Models JianhuiPang FanghuaYe - DerekWongUniversity of Macau + DerekWongUniversity of Macau XinHe WanshunChen - LongyueWang + LongyueWang 4958-4976 Large language models (LLMs) predominantly employ decoder-only transformer architectures, necessitating the retention of keys/values information for historical tokens to provide contextual information and avoid redundant computation. However, the substantial size and parameter volume of these LLMs require massive GPU memory. This memory demand increases with the length of the input text, leading to an urgent need for more efficient methods of information storage and processing. This study introduces Anchor-based LLMs (AnLLMs), which utilize an innovative anchor-based self-attention network (AnSAN) and also an anchor-based inference strategy. This approach enables LLMs to compress sequence information into an anchor token, reducing the keys/values cache and enhancing inference efficiency. Experiments on question-answering benchmarks reveal that AnLLMs maintain similar accuracy levels while achieving up to 99% keys/values cache reduction and up to 3.5 times faster inference. Despite a minor compromise in accuracy, the substantial enhancements of AnLLMs employing the AnSAN technique in resource utilization and computational efficiency underscore their potential for practical LLM applications. 
2024.findings-acl.295 @@ -10152,15 +10152,15 @@ <fixed-case>ML</fixed-case>e<fixed-case>VLM</fixed-case>: Improve Multi-level Progressive Capabilities based on Multimodal Large Language Model for Medical Visual Question Answering - DexuanXu - YanyuanChen - JieyiWang + DexuanXu + YanyuanChen + JieyiWang YueHuang HanpinWangPeking University - ZhiJinPeking University and Peking University - HongxingWangCapital Medical University + ZhiJinPeking University and Peking University + HongxingWangCapital Medical University WeihuaYue - JingHe + JingHe HangLiPeking University First Hospital YuHuangPeking University 4977-4997 @@ -10185,8 +10185,8 @@ <fixed-case>MIKE</fixed-case>: A New Benchmark for Fine-grained Multimodal Entity Knowledge Editing JiaqiLiSoutheast University MiaozengDu - ChuanyiZhangHohai University - YongruiChen + ChuanyiZhangHohai University + YongruiChen NanHuSoutheast University GuilinQi HaiyunJiangSUN YAT-SEN UNIVERSITY @@ -10215,8 +10215,8 @@ <fixed-case>M</fixed-case>eme<fixed-case>MQA</fixed-case>: Multimodal Question Answering for Memes via Rationale-Based Inferencing SiddhantAgarwal ShivamSharmaIndian Institute of Technology, Delhi - PreslavNakovMohamed bin Zayed University of Artificial Intelligence - TanmoyChakrabortyIndian Institute of Technology, Delhi + PreslavNakovMohamed bin Zayed University of Artificial Intelligence + TanmoyChakrabortyIndian Institute of Technology, Delhi 5042-5078 Memes have evolved as a prevalent medium for diverse communication, ranging from humour to propaganda. With the rising popularity of image-focused content, there is a growing need to explore its potential harm from different aspects. Previous studies have analyzed memes in closed settings - detecting harm, applying semantic labels, and offering natural language explanations. To extend this research, we introduce MemeMQA, a multimodal question-answering framework aiming to solicit accurate responses to structured questions while providing coherent explanations. We curate MemeMQACorpus, a new dataset featuring 1,880 questions related to 1,122 memes with corresponding answer-explanation pairs. We further propose ARSENAL, a novel two-stage multimodal framework that leverages the reasoning capabilities of LLMs to address MemeMQA. We benchmark MemeMQA using competitive baselines and demonstrate its superiority - ~18% enhanced answer prediction accuracy and distinct text generation lead across various metrics measuring lexical and semantic alignment over the best baseline. We analyze ARSENAL’s robustness through diversification of question-set, confounder-based evaluation regarding MemeMQA’s generalizability, and modality-specific assessment, enhancing our understanding of meme interpretation in the multimodal communication landscape. 2024.findings-acl.300 @@ -10227,7 +10227,7 @@ Improving Attributed Text Generation of Large Language Models via Preference Learning DongfangLiHarbin Institute of Technology ZetianSun - BaotianHuHarbin Institute of Technology, Shenzhen + BaotianHuHarbin Institute of Technology, Shenzhen ZhenyuLiu XinshuoHu XueboLiuHarbin Institute of Technolgy, Shenzhen @@ -10243,7 +10243,7 @@ SungHoKimKorea University JuhyeongParkKorea University YeachanKimKorea University - SangKeunLeeKorea University + SangKeunLeeKorea University 5102-5119 The Korean writing system, Hangeul, has a unique character representation rigidly following the invention principles recorded in Hunminjeongeum. However, existing pre-trained language models (PLMs) for Korean have overlooked these principles. 
In this paper, we introduce a novel framework for Korean PLMs called KOMBO, which is the first to bring the invention principles of Hangeul to character representation. Our proposed method, KOMBO, exhibits notable experimental proficiency across diverse NLP tasks. In particular, our method outperforms the state-of-the-art Korean PLM by an average of 2.11% in five Korean natural language understanding tasks. Furthermore, extensive experiments demonstrate that our proposed method is suitable for comprehending the linguistic features of the Korean language. Consequently, we shed light on the superiority of using subcharacters over the typical subword-based approach for Korean PLMs. Our code is available at: https://github.com/SungHo3268/KOMBO. 2024.findings-acl.302 @@ -10254,7 +10254,7 @@ Tree-Planted Transformers: Unidirectional Transformer Language Models with Implicit Syntactic Supervision RyoYoshidaThe University of Tokyo TaigaSomeya - YoheiOsekiUniversity of Tokyo + YoheiOsekiUniversity of Tokyo 5120-5134 Syntactic Language Models (SLMs) can be trained efficiently to reach relatively high performance; however, they have trouble with inference efficiency due to the explicit generation of syntactic structures. In this paper, we propose a new method dubbed tree-planting: instead of explicitly generating syntactic structures, we “plant” trees into attention weights of unidirectional Transformer LMs to implicitly reflect syntactic structures of natural language. Specifically, unidirectional Transformer LMs trained with tree-planting will be called Tree-Planted Transformers (TPT), which inherit the training efficiency from SLMs without changing the inference efficiency of their underlying Transformer LMs. Targeted syntactic evaluations on the SyntaxGym benchmark demonstrated that TPTs, despite the lack of explicit generation of syntactic structures, significantly outperformed not only vanilla Transformer LMs but also various SLMs that generate hundreds of syntactic structures in parallel. This result suggests that TPTs can learn human-like syntactic knowledge as data-efficiently as SLMs while maintaining the modeling space of Transformer LMs unchanged. 2024.findings-acl.303 @@ -10268,7 +10268,7 @@ YiLiu JunjieWang QingWangInstitute of Software, Chinese Academy of Sciences - YangLiuNanyang Technological University + YangLiuNanyang Technological University 5135-5147 With the development of LLMs, the security threats of LLMs are receiving more and more attention. Numerous jailbreak attacks have been proposed to assess the security defense of LLMs. Current jailbreak attacks primarily utilize scenario camouflage techniques. However, their explicit mention of malicious intent is easily recognized and defended against by LLMs. In this paper, we propose an indirect jailbreak attack approach, Puzzler, which can bypass the LLM’s defensive strategies and obtain malicious responses by implicitly providing LLMs with some clues about the original malicious query. In addition, inspired by the wisdom of “When unable to attack, defend” from Sun Tzu’s Art of War, we adopt a defensive stance to gather clues about the original malicious query through LLMs. The experimental results indicate that the Query Success Rate of Puzzler is 14.0%-82.7% higher than that of baselines on the most prominent LLMs. Furthermore, when tested against the state-of-the-art jailbreak detection approaches, Puzzler proves to be more effective at evading detection compared to baselines.
2024.findings-acl.304 @@ -10278,18 +10278,18 @@ Publicly Shareable Clinical Large Language Model Built on Synthetic Clinical Notes SunjunKweon - JunuKimKorea Advanced Institute of Science & Technology + JunuKimKorea Advanced Institute of Science & Technology JiyounKimKorea Advanced Institute of Science & Technology SujeongImKorea Advanced Institute of Science & Technology EunbyeolChoKorea Advanced Institute of Science & Technology SeongsuBaeKorea Advanced Institute of Science and Technology - JungwooOhKorea Advanced Institute of Science and Technology + JungwooOhKorea Advanced Institute of Science and Technology GyubokLeeKorea Advanced Institute of Science and Technology Jong HakMoonKorea Advanced Institute of Science & Technology Seng ChanYouYonsei University SeungjinBaekYonsei university - Chang HoonHan - Yoon BinJungYonsei University + Chang HoonHan + Yoon BinJungYonsei University YohanJoSeoul National University EdwardChoiKorea Advanced Institute of Science and Technology 5148-5168 @@ -10301,12 +10301,12 @@ Extending Context Window of Large Language Models via Semantic Compression WeizhiFeiThe Department of Mathematics, Tsinghua University - XueyanNiuHuawei Technologies Ltd. + XueyanNiuHuawei Technologies Ltd. PingyiZhouHuawei Technologies Ltd. LuHouHuawei Technologies Ltd. BoBai LeiDeng - WeiHanHuawei Tech. Investment Co., Limited + WeiHanHuawei Tech. Investment Co., Limited 5169-5181 Transformer-based Large Language Models (LLMs) often impose limitations on the length of the text input to ensure the generation of fluent and relevant responses due to the quadratic complexity. These constraints restrict their applicability in long text scenarios. In this paper, we propose a novel semantic compression method that enables generalization to texts that are 6-8 times longer without incurring significant computational costs or requiring fine-tuning. Our proposed framework draws inspiration from source coding in information theory and employs a pre-trained model to reduce the semantic redundancy of long inputs before passing them to the LLMs for downstream tasks. Experimental results demonstrate that our method effectively extends the context window of LLMs across a range of tasks including question answering, summarization, few-shot learning, and information retrieval. Furthermore, the proposed semantic compression method exhibits consistent fluency in text generation while reducing the associated computational overhead. 2024.findings-acl.306 @@ -10317,7 +10317,7 @@ Plausible Extractive Rationalization through Semi-Supervised Entailment Signal YeoWei JieSchool of Computer Science and Engineering, Nanyang Technological University RanjanSatapathy - ErikCambriaNanyang Technological University + ErikCambriaNanyang Technological University 5182-5192 The increasing use of complex and opaque black box models requires the adoption of interpretable measures; one such option is extractive rationalizing models, which serve as a more interpretable alternative. These models, also known as Explain-Then-Predict models, employ an explainer model to extract rationales and subsequently condition the predictor with the extracted information. Their primary objective is to provide precise and faithful explanations, represented by the extracted rationales. In this paper, we take a semi-supervised approach to optimize for the plausibility of extracted rationales. We adopt a pre-trained natural language inference (NLI) model and further fine-tune it on a small set of supervised rationales (10%).
The NLI predictor is leveraged as a source of supervisory signals to the explainer via entailment alignment. We show that, by enforcing the alignment agreement between the explanation and answer in a question-answering task, the performance can be improved without access to ground truth labels. We evaluate our approach on the ERASER dataset and show that our approach achieves comparable results with supervised extractive models and outperforms unsupervised approaches by > 100%. 2024.findings-acl.307 @@ -10327,7 +10327,7 @@ Translation Deserves Better: Analyzing Translation Artifacts in Cross-lingual Visual Question Answering ChaeHunPark - KoanhoLeeKorea Advanced Institute of Science & Technology + KoanhoLeeKorea Advanced Institute of Science & Technology HyesuLimKorea Advanced Institute of Science & Technology JaeseokKimKorea Telecom Research JunmoParkSaltlux @@ -10356,7 +10356,7 @@ Fast Randomized Low-Rank Adaptation of Pre-trained Language Models with <fixed-case>PAC</fixed-case> Regularization ZijianLeiHong Kong Baptist University DongQianLinköping University - WilliamCheungHong Kong Baptist University + WilliamCheungHong Kong Baptist University 5236-5249 Low-rank adaptation (LoRA) achieves parameter-efficient fine-tuning for large language models (LLMs) by decomposing the model weight update into a pair of low-rank projection matrices. Yet, the memory overhead restricts its ability to scale up as the model size increases. We propose Randomized LoRA (RLoRA) which adopts the Randomized Walsh-Hadamard Transform to achieve a significant reduction in the size of trainable parameters compared to LoRA. At the same time, it allows a PAC-Bayes regularizer to be efficiently incorporated to improve generalization. We evaluate the effectiveness of RLoRA on LLMs RoBERTa, GPT-2 and LLaMA-7B using GLUE, E2E and math reasoning benchmarks. With a much lower memory requirement, RLoRA can give similar performance to the SOTA low-rank adaptation methods for these three tasks and significantly better performance under few-shot settings. 2024.findings-acl.310 @@ -10366,8 +10366,8 @@ <fixed-case>SDA</fixed-case>: Semantic Discrepancy Alignment for Text-conditioned Image Retrieval YuchenYang - YuWangShanghai Jiao Tong University - YanfengWangShanghai Jiao Tong University + YuWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University 5250-5261 In the realm of text-conditioned image retrieval, models utilize a query composed of a reference image and modification text to retrieve corresponding images. Despite its significance, this task is fraught with challenges, including small-scale datasets due to labeling costs and the complexity of attributes in modification texts. These challenges often result in models learning a generalized representation of the query, thereby missing the semantic correlations of image and text attributes. In this paper, we introduce a general boosting framework designed to address these issues by employing semantic discrepancy alignment. Our framework first leverages ChatGPT to augment text data by modifying the original modification text’s attributes. The augmented text is then combined with the original reference image to create an augmented composed query. Then we generate corresponding images using GPT-4 for the augmented composed query. We realize the cross-modal semantic discrepancy alignment by formulating distance consistency and neighbor consistency between the image and text domains.
Through this novel approach, attributes in the text domain can be more effectively transferred to the image domain, enhancing retrieval performance. Extensive experiments on three prominent datasets validate the effectiveness of our approach, with state-of-the-art results on a majority of evaluation metrics compared to various baseline methods. 2024.findings-acl.311 @@ -10393,7 +10393,7 @@ Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding HanlingYi - FengLinIntelliFusion Co., Ltd + FengLinIntelliFusion Co., Ltd HongbinLi NingPeiyangIntellifusion Inc. XiaotianYu @@ -10424,7 +10424,7 @@ XinweiWu WeilongDong ShaoyangXu - DeyiXiongTianjin University + DeyiXiongTianjin University 5319-5332 Protecting against privacy leakage in large language models remains a paramount challenge. In this paper, we reveal Privacy Seesaw in LLM privacy safeguarding, a phenomenon where measures to secure specific private information inadvertently heighten exposure risks for other private information. Through comprehensive analysis, we identify the amount of targeted privacy data and the volume of edited privacy neurons as the two central triggers to this issue. To mitigate privacy seesaw, we propose Augmented Privacy Neuron Editing via Activation Patching (APNEAP), a novel framework designed to balance model performance with privacy protection. The proposed APNEAP augments collected private data by automatically synthesizing new private data, which deactivates the first trigger to the privacy seesaw issue. Additionally, it adapts activation patching to privacy neuron editing for switching off the second trigger to the privacy seesaw problem. Experimental results show that the proposed APNEAP is capable of alleviating the privacy seesaw phenomenon and offers a more stable and reliable approach to privacy protection in LLMs than previous methods. 2024.findings-acl.315 @@ -10446,7 +10446,7 @@ <fixed-case>B</fixed-case>ad<fixed-case>A</fixed-case>cts: A Universal Backdoor Defense in the Activation Space BiaoYi SishuoChenAlibaba Group - YimingLiNanyang Technological University + YimingLiNanyang Technological University TongLiNankai University BaoleiZhang ZheliLiu @@ -10462,10 +10462,10 @@ YaoruiShi AnZhangNational University of Singapore SihangLi - EnzhiZhang - XiangWangUniversity of Science and Technology of China + EnzhiZhang + XiangWangUniversity of Science and Technology of China KenjiKawaguchiNational University of Singapore - Tat-SengChuaNational University of Singapore + Tat-SengChuaNational University of Singapore 5353-5377 Molecule-text modeling, which aims to facilitate molecule-relevant tasks with a textual interface and textual knowledge, is an emerging research direction. Beyond single molecules, studying reaction-text modeling holds promise for aiding the synthesis of new materials and drugs. However, previous works mostly neglect reaction-text modeling: they primarily focus on modeling individual molecule-text pairs or learning chemical reactions without texts in context. Additionally, one key task of reaction-text modeling – experimental procedure prediction – is less explored due to the absence of an open-source dataset. The task is to predict step-by-step actions of conducting chemical experiments and is crucial to automating chemical synthesis. To resolve the challenges above, we propose a new pretraining method, ReactXT, for reaction-text modeling, and a new dataset, OpenExp, for experimental procedure prediction.
Specifically, ReactXT features three types of input contexts to incrementally pretrain LMs. Each of the three input contexts corresponds to a pretraining task to improve the text-based understanding of either reactions or single molecules. ReactXT demonstrates consistent improvements in experimental procedure prediction and molecule captioning and offers competitive results in retrosynthesis. Our code is available at https://github.com/syr-cn/ReactXT. 2024.findings-acl.318 @@ -10476,7 +10476,7 @@ Multi-modal Concept Alignment Pre-training for Generative Medical Visual Question Answering QuanYanCentral South University JunwenDuanCentral South University, China - JianxinWangCentral South University + JianxinWangCentral South University 5378-5389 Medical Visual Question Answering (Med-VQA) seeks to accurately respond to queries regarding medical images, a task particularly challenging for open-ended questions. This study unveils the Multi-modal Concept Alignment Pre-training (MMCAP) approach for generative Med-VQA, leveraging a knowledge graph sourced from medical image-caption datasets and the Unified Medical Language System. MMCAP advances the fusion of visual and textual medical knowledge via a graph attention network and a transformer decoder. Additionally, it incorporates a Type Conditional Prompt in the fine-tuning phase, markedly boosting the accuracy and relevance of answers to open-ended questions. Our tests on benchmark datasets illustrate MMCAP’s superiority over existing methods, demonstrating its high efficiency in data-limited settings and effective knowledge-image alignment capability. 2024.findings-acl.319 @@ -10504,8 +10504,8 @@ HangJiang RuiYang QingchengZengNorthwestern University - JinghuiLuByteDance Inc. - MoritzBlum + JinghuiLuByteDance Inc. + MoritzBlum TianweiSheThe University of Tokyo, Tokyo Institute of Technology YuangJiang IreneLi @@ -10518,8 +10518,8 @@ The Butterfly Effect of Model Editing: Few Edits Can Trigger Large Language Models Collapse WanliYang - FeiSunInstitute of Computing Technology, Chinese Academy of Sciences - XinyuMaBaidu + FeiSunInstitute of Computing Technology, Chinese Academy of Sciences + XinyuMaBaidu XunLiu DaweiYinBaidu XueqiCheng, Chinese Academy of Sciences @@ -10546,7 +10546,7 @@ BowenLi BowenQinShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences NanHuo - ChenhaoMaThe Chinese University of Hong Kong, Shenzhen + ChenhaoMaThe Chinese University of Hong Kong, Shenzhen ReynoldCheng 5456-5471 Large Language Models (LLMs) driven by In-Context Learning (ICL) have significantly improved the performance of text-to-SQL. Previous methods generally employ a two-stage reasoning framework, namely 1) schema linking and 2) logical synthesis, making the framework not only effective but also interpretable. Despite these advancements, the inherently imperfect generalization of LLMs often results in hallucinations, which limits the full potential of LLMs. In this work, we first identify and categorize the common types of hallucinations at each stage in text-to-SQL. We then introduce a novel strategy, Task Alignment (TA), designed to mitigate hallucinations at each stage. TA encourages LLMs to take advantage of experiences from similar tasks rather than starting the tasks from scratch. This can help LLMs reduce the burden of generalization, thereby mitigating hallucinations effectively. We further propose TA-SQL, a text-to-SQL framework based on this strategy.
The experimental results and comprehensive analysis demonstrate the effectiveness and robustness of our framework. Specifically, it enhances the performance of the GPT-4 baseline by a relative 21.23% on the BIRD dev set, and it yields significant improvements across six models and four mainstream, complex text-to-SQL benchmarks. @@ -10558,8 +10558,8 @@ Translatotron-<fixed-case>V</fixed-case>(ison): An End-to-End Model for In-Image Machine Translation ZhibinLan LiqiangNiu - FandongMengWeChat AI, Tencent Inc. - JieZhou + FandongMengWeChat AI, Tencent Inc. + JieZhou MinZhangHarbin Institute of Technology, Shenzhen JinsongSuXiamen University 5472-5485 @@ -10572,9 +10572,9 @@ FarhadNooralahzadehUniversity of Zurich and ZHAW - Zürcher Hochschule für Angewandte Wissenschaften YiZhangUniversity of Zurich and ZHAW - Zürcher Hochschule für Angewandte Wissenschaften EllerySmith - SabineMaennelETHZ - ETH Zurich - CyrilMatthey-DoretEPFL - EPF Lausanne - RaphaëlDe FondevilleFederal Office of Statistics + SabineMaennelETHZ - ETH Zurich + CyrilMatthey-DoretEPFL - EPF Lausanne + RaphaëlDe FondevilleFederal Office of Statistics KurtStockingerZHAW - Zürcher Hochschule für Angewandte Wissenschaften 5486-5507 The potential for improvements brought by Large Language Models (LLMs) in Text-to-SQL systems is mostly assessed on monolingual English datasets. However, LLMs’ performance for other languages remains vastly unexplored. In this work, we release the StatBot.Swiss dataset, the first bilingual benchmark for evaluating Text-to-SQL systems based on real-world applications. The StatBot.Swiss dataset contains 455 natural language/SQL pairs over 35 large databases with varying levels of complexity for both English and German. We evaluate the performance of state-of-the-art LLMs such as GPT-3.5-Turbo and mixtral-8x7b-instruct for the Text-to-SQL translation task using an in-context learning approach. Our experimental analysis illustrates that current LLMs struggle to generalize well in generating SQL queries on our novel bilingual dataset. @@ -10610,7 +10610,7 @@ Improving the Robustness of Distantly-Supervised Named Entity Recognition via Uncertainty-Aware Teacher Learning and Student-Student Collaborative Learning ShuzhengSiTsinghua University HelanHu - HaozheZhao + HaozheZhao ShuangZeng KaikaiAn ZefanCai @@ -10626,7 +10626,7 @@ HarriRowlandsInfluenceMap GakuMorioHitachi America, Ltd., Stanford University and Hitachi, ltd. DylanTanner - ChristopherManningComputer Science Department, Stanford University + ChristopherManningComputer Science Department, Stanford University 5547-5558 Social media advertising offers a platform for fossil fuel value chain companies and their agents to reinforce their narratives, often emphasizing economic, labor market, and energy security benefits to promote oil and gas policy and products. Whether such narratives can be detected automatically and the extent to which the cost of human annotation can be reduced is our research question. We introduce a task of classifying narratives into seven categories, based on existing definitions and data. Experiments showed that RoBERTa-large outperforms other methods, while GPT-4 Turbo can serve as a viable annotator for the task, thereby reducing human annotation costs. Our findings and insights provide guidance to automate climate-related ad analysis and lead to more scalable ad scrutiny.
2024.findings-acl.330 @@ -10637,10 +10637,10 @@ <fixed-case>SSS</fixed-case>: Editing Factual Knowledge in Language Models towards Semantic Sparse Space HuazhengWang HaifengSunBeijing University of Posts and Telecommunications - JingyuWangBeijing University of Post and Telecommunication, Tsinghua University + JingyuWangBeijing University of Post and Telecommunication, Tsinghua University QiQiBeijing University of Posts and Telecommunications ZixuanXiaBeijing University of Posts and Telecommunications - MenghaoZhangBeijing University of Posts and Telecommunications + MenghaoZhangBeijing University of Posts and Telecommunications JianxinLiao 5559-5570 Language Models (LMs) acquire factual knowledge during pre-training and store it in the parameters, which can be valuable for downstream tasks. As the world evolves, some facts may be incorrectly induced or become obsolete over time. Various model editing methods have been proposed to modify specific examples in LMs. However, existing training-based methods still suffer from sub-optimal locality, where irrelevant neighborhood examples can be adversely influenced. The model’s gradients still struggle to identify the appropriate direction when updating the parameters. To address this issue, we find that directing the hidden state of the edit example towards spaces where semantics are sparse tends to help preserve the semantics of irrelevant neighborhood examples. Based on this hypothesis, we propose a novel metric, named SSS, to evaluate the degree of sparsity around a sentence embedding in the semantic space without any human or machine annotation. Subsequently, we incorporate SSS into the original loss function of the existing training-based methods to enhance locality. Experiments conducted on two datasets across various models demonstrate that SSS is effective in improving both locality and reasoning capability. @@ -10664,8 +10664,8 @@ Unveiling Selection Biases: Exploring Order and Token Sensitivity in Large Language Models Sheng-LunWeiDepartment of computer science and informational engineering, National Taiwan University Cheng-KuangWuAppier - Hen-HsenHuangInstitute of Information Science, Academia Sinica - Hsin-HsiChenNational Taiwan University + Hen-HsenHuangInstitute of Information Science, Academia Sinica + Hsin-HsiChenNational Taiwan University 5598-5621 In this paper, we investigate the phenomena of “selection biases” in Large Language Models (LLMs), focusing on problems where models are tasked with choosing the optimal option from an ordered sequence. We delve into biases related to option order and token usage, which significantly impact LLMs’ decision-making processes. We also quantify the impact of these biases through an extensive empirical analysis across multiple models and tasks. Furthermore, we propose mitigation strategies to enhance model performance. Our key contributions are threefold: 1) Precisely quantifying the influence of option order and token on LLMs, 2) Developing strategies to mitigate the impact of token and order sensitivity to enhance robustness, and 3) Offering a detailed analysis of sensitivity across models and tasks, which informs the creation of more stable and reliable LLM applications for selection problems.
2024.findings-acl.333 @@ -10675,7 +10675,7 @@ <fixed-case>A</fixed-case>rabic<fixed-case>MMLU</fixed-case>: Assessing Massive Multitask Language Understanding in <fixed-case>A</fixed-case>rabic FajriKotoMohamed bin Zayed University of Artificial Intelligence - HaonanLi + HaonanLi SaraShatnawi JadDoughman AbdelrahmanSadallah @@ -10683,10 +10683,10 @@ KhalidAlmubarakPrince Sattam bin Abdulaziz University ZaidAlyafeai NehaSengupta - ShadyShehataMohamed bin Zayed University of Artificial Intelligence - NizarHabashNew York University Abu Dhabi - PreslavNakovMohamed bin Zayed University of Artificial Intelligence - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + ShadyShehataMohamed bin Zayed University of Artificial Intelligence + NizarHabashNew York University Abu Dhabi + PreslavNakovMohamed bin Zayed University of Artificial Intelligence + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne 5622-5640 The focus of language model evaluation has transitioned towards reasoning and knowledge-intensive tasks, driven by advancements in pretraining large models. While state-of-the-art models are partially trained on large Arabic texts, evaluating their performance in Arabic remains challenging due to the limited availability of relevant datasets. To bridge this gap, we present ArabicMMLU, the first multi-task language understanding benchmark for the Arabic language, sourced from school exams across diverse educational levels in different countries spanning North Africa, the Levant, and the Gulf regions. Our data comprises 40 tasks and 14,575 multiple-choice questions in Modern Standard Arabic (MSA) and is carefully constructed by collaborating with native speakers in the region. Our comprehensive evaluations of 35 models reveal substantial room for improvement, particularly among the best open-source models. Notably, BLOOMZ, mT0, LLama2, and Falcon struggle to achieve a score of 50%, while even the top-performing Arabic-centric model only achieves a score of 62.3%. 2024.findings-acl.334 @@ -10696,10 +10696,10 @@ On the Relationship Between <fixed-case>RNN</fixed-case> Hidden-State Vectors and Semantic Structures EdiMuskardin - MartinTapplerTechnische Universität Wien + MartinTapplerTechnische Universität Wien IngoPillTechnische Universität Graz - BernhardAichernigTechnische Universität Graz - ThomasPockGraz University of Technology + BernhardAichernigTechnische Universität Graz + ThomasPockGraz University of Technology 5641-5658 We examine the assumption that hidden-state vectors of recurrent neural networks (RNNs) tend to form clusters of semantically similar vectors, which we dub the clustering hypothesis. While this hypothesis has been assumed in RNN analyses in recent years, its validity has not been studied thoroughly on modern RNN architectures. We first consider RNNs that were trained to recognize regular languages. This enables us to draw on perfect ground-truth automata in our evaluation, against which we can compare the RNN’s accuracy and the distribution of the hidden-state vectors. Then, we consider context-free languages to examine if RNN states form clusters for more expressive languages. For our analysis, we fit (generalized) linear models to classify RNN states into automata states and we apply different unsupervised clustering techniques.
With a new ambiguity score, derived from information entropy, we measure how well an abstraction function maps the hidden state vectors to abstract clusters. Our evaluation supports the validity of the clustering hypothesis for regular languages, especially if RNNs are well-trained, i.e., clustering techniques succeed in finding clusters of similar state vectors. However, the clustering accuracy decreases substantially for context-free languages. This suggests that clustering is not a reliable abstraction technique for RNNs used in tasks like natural language processing. 2024.findings-acl.335 @@ -10710,11 +10710,11 @@ <fixed-case>XMC</fixed-case>-Agent : Dynamic Navigation over Scalable Hierarchical Index for Incremental Extreme Multi-label Classification YanjiangLiu TianyunZhong - YaojieLuInstitute of Software, Chinese Academy of Sciences + YaojieLuInstitute of Software, Chinese Academy of Sciences HongyuLinInstitute of Software, Chinese Academy of Sciences BenHe ShuhengZhouAnt Group - HuijiaZhu + HuijiaZhu WeiqiangWangAnt Group ZhongyiLiuAnt Group XianpeiHanInstitute of Software, CAS @@ -10741,11 +10741,11 @@ Improving Large Language Models via Fine-grained Reinforcement Learning with Minimum Editing Constraint ZhipengChen KunZhouRenmin University of China - XinZhaoRenmin University of China - JunchenWan + XinZhaoRenmin University of China + JunchenWan FuzhengZhang DiZhangKuaishou Technology - Ji-RongWenRenmin University of China + Ji-RongWenRenmin University of China 5694-5711 Reinforcement learning (RL) has been widely used in training large language models (LLMs) for preventing unexpected outputs, e.g., reducing harmfulness and errors. However, existing RL methods mainly adopt instance-level rewards, which cannot provide fine-grained supervision for complex reasoning tasks. As a result, the RL training cannot be fully aware of the specific part or step that actually leads to the incorrectness in the model response. To address it, we propose a new RL method named RLMEC that incorporates a generative model as the reward model, which is trained by the erroneous solution rewriting task under the minimum editing constraint, and can produce token-level supervision for RL training. Based on the generative reward model, we design the token-level RL objective for training and an imitation-based regularization for stabilizing the RL process. These two objectives focus on the revision of the key tokens for the erroneous solution, reducing the effect of other unimportant tokens. Experimental results on 8 tasks have demonstrated the effectiveness of our approach. Our code and data will be publicly released. 2024.findings-acl.338 @@ -10755,7 +10755,7 @@ Definition generation for lexical semantic change detection MariiaFedorova - AndreyKutuzovUniversity of Oslo + AndreyKutuzovUniversity of Oslo YvesScherrerUniversity of Oslo 5712-5724 We use contextualized word definitions generated by large language models as semantic representations in the task of diachronic lexical semantic change detection (LSCD). In short, generated definitions are used as ‘senses’, and the change score of a target word is retrieved by comparing their distributions in two time periods under comparison. On the material of five datasets and three languages, we show that generated definitions are indeed specific and general enough to convey a signal sufficient to rank sets of words by the degree of their semantic change over time. Our approach is on par with or outperforms prior non-supervised sense-based LSCD methods.
At the same time, it preserves interpretability and allows us to inspect the reasons behind a specific shift in terms of discrete definitions-as-senses. This is another step in the direction of explainable semantic change modeling. 2024.findings-acl.339 @@ -10767,8 +10767,8 @@ <fixed-case>M</fixed-case>u<fixed-case>T</fixed-case>ox: Universal <fixed-case>MU</fixed-case>ltilingual Audio-based <fixed-case>TOX</fixed-case>icity Dataset and Zero-shot Detector MartaCosta-jussàMeta MarianoMeglioliMeta - PierreAndrews - DavidDaleFAIR at Meta + PierreAndrews + DavidDaleFAIR at Meta PrangthipHansanti ElaheKalbassi AlexandreMourachkoResearch, Facebook @@ -10782,8 +10782,8 @@ Phased Instruction Fine-Tuning for Large Language Models - WeiPang - ChuanZhouPeking University + WeiPang + ChuanZhouPeking University Xiao-HuaZhou XiaojieWangBeijing University of Post and Telecommunication 5735-5748 @@ -10802,11 +10802,11 @@ XinhaoChen TuHu YangChen - YupeiRen + YupeiRen YadongZhang YouqiSong BinxuanLiu - ManLan + ManLan 5749-5765 Topic relevance of an essay demands that the composition adheres to a clear theme and aligns well with the essay prompt requirements, a critical aspect of essay quality evaluation. However, existing research on Automatic Essay Scoring (AES) for Chinese essays has overlooked topic relevance and lacks detailed feedback, while Automatic Essay Comment Generation (AECG) faces much complexity and difficulty. Additionally, current Large Language Models, including GPT-4, often make incorrect judgments and provide overly impractical feedback when evaluating topic relevance. This paper introduces TOREE (Topic Relevance Evaluation), a comprehensive dataset developed to assess topic relevance in Chinese primary and middle school students’ essays, which is beneficial for AES, AECG and other applications. Moreover, our proposed two-step method utilizes TOREE through a combination of Supervised Fine-tuning and Preference Learning. Experimental results demonstrate that TOREE is of high quality, and our method significantly enhances models’ performance on two designed tasks for topic relevance evaluation, improving both automatic and human evaluations across four diverse LLMs. 2024.findings-acl.342 @@ -10815,13 +10815,13 @@ Predicting the Unpredictable: Uncertainty-Aware Reasoning over Temporal Knowledge Graphs via Diffusion Process - YuxiangCai - QiaoLiuUESTC - YangleiGan - ChanglinLiUniversity of Electronic Science and Technology of China - XueyiLiu - RunLin - DaLuo + YuxiangCai + QiaoLiuUESTC + YangleiGan + ChanglinLiUniversity of Electronic Science and Technology of China + XueyiLiu + RunLin + DaLuo JiayeYangJiayeYang 5766-5778 Temporal Knowledge Graph (TKG) reasoning seeks to predict future incomplete facts by leveraging historical data. While existing approaches have shown effectiveness in addressing the task through various perspectives, such as graph learning and logic rules, they are limited in capturing the indeterminacy in future events, particularly in the case of rare/unseen facts. To tackle the highlighted issues, we introduce a novel approach by conceptualizing TKG reasoning as a sequence denoising process for future facts, namely DiffuTKG. Concretely, we first encode the historical events as the conditional sequence. Then we gradually introduce Gaussian noise to corrupt target facts during the forward process and then employ a transformer-based conditional denoiser to restore them in the reverse phase.
Moreover, we introduce an uncertainty regularization loss to mitigate the risk of prediction biases by favoring frequent scenarios over rare/unseen facts. Empirical results on four real-world datasets show that DiffuTKG outperforms state-of-the-art methods across multiple evaluation metrics. @@ -10845,12 +10845,12 @@ Controlled Text Generation for Large Language Model with Dynamic Attribute Graphs XunLiangRenmin University of China HanyuWang - ShichaoSong - MengtingHuNankai University + ShichaoSong + MengtingHuNankai University XunzhiWangNankai University - ZhiyuLi - FeiyuXiongInstitute for Advanced Algorithms Research, Shanghai - BoTang + ZhiyuLi + FeiyuXiongInstitute for Advanced Algorithms Research, Shanghai + BoTang 5797-5814 Controlled Text Generation (CTG) aims to produce texts that exhibit specific desired attributes. In this study, we introduce a pluggable CTG framework for Large Language Models (LLMs) named Dynamic Attribute Graphs-based controlled text generation (DATG). This framework utilizes an attribute scorer to evaluate the attributes of sentences generated by LLMs and constructs dynamic attribute graphs. DATG modulates the occurrence of key attribute words and key anti-attribute words, achieving effective attribute control without compromising the original capabilities of the model. We conduct experiments across four datasets in two tasks: toxicity mitigation and sentiment transformation, employing five LLMs as foundational models. Our findings highlight a remarkable enhancement in control accuracy, achieving a peak improvement of 19.29% over baseline methods in the most favorable task across four datasets. Additionally, we observe a significant decrease in perplexity, markedly improving text fluency. 2024.findings-acl.345 @@ -10859,10 +10859,10 @@ Coconut: Contextualized Commonsense Unified Transformers for Graph-Based Commonsense Augmentation of Language Models - Jun-HyungPark + Jun-HyungPark MingyuLeeKorea University JunhoKimKorea University - SangKeunLeeKorea University + SangKeunLeeKorea University 5815-5830 In this paper, we introduce COCONUT to effectively guide the contextualization of structured commonsense knowledge based on large language models. COCONUT employs a contextualized knowledge prompting scheme to gather high-quality contextualization examples from a large language model. These examples are subsequently distilled into small language models to enhance their contextualization capability. Extensive evaluations show that COCONUT considerably improves commonsense reasoning performance across diverse benchmarks, models, and settings, exhibiting its flexibility and universality in generating contextualized commonsense knowledge. Notably, COCONUT consistently outperforms the state-of-the-art technique by an average of 5.8%. 2024.findings-acl.346 @@ -10874,7 +10874,7 @@ DanielTamayo AitorGonzalez-Agirre JavierHernando - MartaVillegas + MartaVillegas 5831-5847 Recent research has explored methods for updating and modifying factual knowledge in large language models, often focusing on specific multi-layer perceptron blocks. This study expands on this work by examining the effectiveness of existing knowledge editing methods across languages and delving into the role of attention mechanisms in this process. Drawing from the insights gained, we propose Mass-Editing Memory with Attention in Transformers (MEMAT), a method that achieves significant improvements in all metrics while requiring minimal parameter modifications.
MEMAT delivers a remarkable 10% increase in magnitude metrics, benefits languages not included in the training data and also demonstrates a high degree of portability. Our code and data are at https://github.com/dtamayo-nlp/MEMAT.
 2024.findings-acl.347


 <fixed-case>B</fixed-case>io<fixed-case>M</fixed-case>istral: A Collection of Open-Source Pretrained Large Language Models for Medical Domains
- YanisLabrak
- AdrienBazogeNantes Université
+ YanisLabrak
+ AdrienBazogeNantes Université
 EmmanuelMorin
- Pierre-AntoineGourraudUniversité de Nantes
+ Pierre-AntoineGourraudUniversité de Nantes
 MickaelRouvierUniversité d’Avignon
- RichardDufourNantes University
+ RichardDufourNantes University
 5848-5864
 Large Language Models (LLMs) have demonstrated remarkable versatility in recent years, offering potential applications across specialized domains such as healthcare and medicine. Despite the availability of various open-source LLMs tailored for health contexts, adapting general-purpose LLMs to the medical domain presents significant challenges. In this paper, we introduce BioMistral, an open-source LLM tailored for the biomedical domain, utilizing Mistral as its foundation model and further pre-trained on PubMed Central. We conduct a comprehensive evaluation of BioMistral on a benchmark comprising 10 established medical question-answering (QA) tasks in English. We also explore lightweight models obtained through quantization and model merging approaches. Our results demonstrate BioMistral’s superior performance compared to existing open-source medical models and its competitive edge against proprietary counterparts. Finally, to address the limited availability of data beyond English and to assess the multilingual generalization of medical LLMs, we automatically translated and evaluated this benchmark into 7 other languages. This marks the first large-scale multilingual evaluation of LLMs in the medical domain. Datasets, multilingual evaluation benchmarks, scripts, and all the models obtained during our experiments are freely released.
 2024.findings-acl.348


 ZhaopengTuTencent AI Lab
 ChangChen
 YouliangYuanThe Chinese University of Hong Kong-Shenzhen
- Jen-tseHuang
+ Jen-tseHuang
 WenxiangJiaoTencent AI Lab
- MichaelLyuThe Chinese University of Hong Kong
+ MichaelLyuThe Chinese University of Hong Kong
 5865-5877
 Safety lies at the core of developing and deploying large language models (LLMs). However, previous safety benchmarks only concern the safety in one language, e.g. the majority language in the pretraining data such as English. In this work, we build the first multilingual safety benchmark for LLMs, XSafety, in response to the global deployment of LLMs in practice. XSafety covers 14 kinds of commonly used safety issues across 10 languages that span several language families. We utilize XSafety to empirically study the multilingual safety for 4 widely-used LLMs, including both close-API and open-source models. Experimental results show that all LLMs produce significantly more unsafe responses for non-English queries than English ones, indicating the necessity of developing safety alignment for non-English languages. In addition, we propose a simple and effective prompting method to improve the multilingual safety of ChatGPT by enhancing cross-lingual generalization of safety alignment. Our prompting method can significantly reduce the ratio of unsafe responses by 42% for non-English queries.
We will release all the data and results to facilitate future research on LLMs’ safety. 2024.findings-acl.349 @@ -10915,7 +10915,7 @@ YuanZhang WanhongHuang YiFengNanjing University - ChuanyiLinanjing university + ChuanyiLinanjing university ZhiweiFeiFudan University, Harbin Institute of Technology, Dalian University of Technology, Shanghai Jiaotong University, Shandong University, Peking University, Zhejiang University, University of Science and Technology of China, Hunan University, Beijing Institute of Technology, University of the Chinese Academy of Sciences, Southeast University, Sichuan University, Monash University, Malaysia Campus, Tianjin University, Beijing University of Aeronautics and Astronautics, Wuhan University of Technology, Yale University, Technische Universität München, Wuhan University, nanjing university, Tsinghua University and Wuhan University JidongGeNanjing University BinLuonanjing university @@ -10930,7 +10930,7 @@ <fixed-case>CMDL</fixed-case>: A Large-Scale <fixed-case>C</fixed-case>hinese Multi-Defendant Legal Judgment Prediction Dataset WanhongHuang YiFengNanjing University - ChuanyiLinanjing university + ChuanyiLinanjing university HonghanWu JidongGeNanjing University VincentNgUniversity of Texas at Dallas @@ -10952,18 +10952,18 @@ <fixed-case>A</fixed-case>bstract <fixed-case>M</fixed-case>eaning <fixed-case>R</fixed-case>epresentation-Based Logic-Driven Data Augmentation for Logical Reasoning - QimingBao + QimingBao Alex YuxuanPeng ZhenyunDeng WanjunZhong - GaëlGendron + GaëlGendron TimothyPistotti NeşetTan NathanYoung YangChen YonghuaZhu PaulDenny - MichaelWitbrock + MichaelWitbrock JiamouLiu 5914-5934 Combining large language models with logical reasoning enhances their capacity to address problems in a robust and reliable manner. Nevertheless, the intricate nature of logical reasoning poses challenges when gathering reliable data from the web to build comprehensive training datasets, subsequently affecting performance on downstream tasks. To address this, we introduce a novel logic-driven data augmentation approach, AMR-LDA. AMR-LDA converts the original text into an Abstract Meaning Representation (AMR) graph, a structured semantic representation that encapsulates the logical structure of the sentence, upon which operations are performed to generate logically modified AMR graphs. The modified AMR graphs are subsequently converted back into text to create augmented data. Notably, our methodology is architecture-agnostic and enhances both generative large language models, such as GPT-3.5 and GPT-4, through prompt augmentation, and discriminative large language models through contrastive learning with logic-driven data augmentation. Empirical evidence underscores the efficacy of our proposed method with improvement in performance across seven downstream tasks, such as reading comprehension requiring logical reasoning, textual entailment, and natural language inference. Furthermore, our method leads on the ReClor leaderboard at https://eval.ai/web/challenges/challenge-page/503/leaderboard/1347. The source code and data are publicly available at https://github.com/Strong-AI-Lab/Logical-Equivalence-driven-AMR-Data-Augmentation-for-Representation-Learning. 
@@ -10985,7 +10985,7 @@ <fixed-case>V</fixed-case>i<fixed-case>H</fixed-case>ate<fixed-case>T</fixed-case>5: Enhancing Hate Speech Detection in <fixed-case>V</fixed-case>ietnamese With a Unified Text-to-Text Transformer Model - LuanThanh NguyenUniversity of Information Technology, Vietnam National University Ho Chi Minh City + LuanThanh NguyenUniversity of Information Technology, Vietnam National University Ho Chi Minh City 5948-5961 Recent advancements in hate speech detection (HSD) in Vietnamese have made significant progress, primarily attributed to the emergence of transformer-based pre-trained language models, particularly those built on the BERT architecture. However, the necessity for specialized fine-tuned models has resulted in the complexity and fragmentation of developing a multitasking HSD system. Moreover, most current methodologies focus on fine-tuning general pre-trained models, primarily trained on formal textual datasets like Wikipedia, which may not accurately capture human behavior on online platforms. In this research, we introduce ViHateT5, a T5-based model pre-trained on our proposed large-scale domain-specific dataset named VOZ-HSD. By harnessing the power of a text-to-text architecture, ViHateT5 can tackle multiple tasks using a unified model and achieve state-of-the-art performance across all standard HSD benchmarks in Vietnamese. Our experiments also underscore the significance of label distribution in pre-training data on model efficacy. We provide our experimental materials for research purposes, including the VOZ-HSD dataset, pre-trained checkpoint, the unified HSD-multitask ViHateT5 model, and related source code on GitHub publicly. 2024.findings-acl.355 @@ -11009,7 +11009,7 @@ HanxingDing YuexiangXieAlibaba Group QiCaoInstitute of Computing Technology, Chinese Academy of Sciences, China - FeiSunInstitute of Computing Technology, Chinese Academy of Sciences + FeiSunInstitute of Computing Technology, Chinese Academy of Sciences JinyangGao HuaweiShenInstitute of Computing Technology, Chinese Academy of Sciences BolinDingAlibaba Group @@ -11021,8 +11021,8 @@ Zero-shot Cross-lingual Alignment for Embedding Initialization - XiAi - ZhiyongHuangNUS School of Computing + XiAi + ZhiyongHuangNUS School of Computing 5997-6007 For multilingual training, we present CrossInit, an initialization method that initializes embeddings into similar geometrical structures across languages in an unsupervised manner. CrossInit leverages a common cognitive linguistic mechanism, Zipf’s law, which indicates that similar concepts across languages have similar word ranks or frequencies in their monolingual corpora. Instead of considering point-to-point alignments based on ranks, CrossInit considers the same span of consecutive ranks in each language as the Positive pairs for alignment, while others out of the span are used as Negative pairs. CrossInit then employs Contrastive Learning to iteratively refine randomly initialized embeddings for similar geometrical structures across languages. Our experiments on Unsupervised NMT, XNLI, and MLQA showed significant gains in low-resource and dissimilar languages after applying CrossInit. 2024.findings-acl.358 @@ -11045,10 +11045,10 @@ It takes two to borrow: a donor and a recipient. Who’s who? 
LiviuDinuUniversity of Bucharest AnaUbanUniversitatea Bucuresti - AncaDinu + AncaDinu Ioan-BogdanIordache SimonaGeorgescuUniversity of Bucharest - LaurentiuZoicasUniversity of Bucharest + LaurentiuZoicasUniversity of Bucharest 6023-6035 We address the open problem of automatically identifying the direction of lexical borrowing, given word pairs in the donor and recipient languages. We propose strong benchmarks for this task, by applying a set of machine learning models. We extract and publicly release a comprehensive borrowings dataset from the recent RoBoCoP cognates and borrowings database for five Romance languages. We experiment on this dataset with both graphic and phonetic representations and with different features, models and architectures. We interpret the results, in terms of F1 score, commenting on the influence of features and model choice, of the imbalanced data and of the inherent difficulty of the task for particular language pairs. We show that automatically determining the direction of borrowing is a feasible task, and propose additional directions for future work. 2024.findings-acl.360 @@ -11058,7 +11058,7 @@ Advancing Post-<fixed-case>OCR</fixed-case> Correction: A Comparative Study of Synthetic Data ShuhaoGuan - DerekGreeneUniversity College Dublin + DerekGreeneUniversity College Dublin 6036-6047 This paper explores the application of synthetic data in the post-OCR domain on multiple fronts by conducting experiments to assess the impact of data volume, augmentation, and synthetic data generation methods on model performance. Furthermore, we introduce a novel algorithm that leverages computer vision feature detection algorithms to calculate glyph similarity for constructing post-OCR synthetic data. Through experiments conducted across a variety of languages, including several low-resource ones, we demonstrate that models like ByT5 can significantly reduce Character Error Rates (CER) without the need for manually annotated data, and our proposed synthetic data generation method shows advantages over traditional methods, particularly in low-resource languages. 
2024.findings-acl.361 @@ -11067,11 +11067,11 @@ <fixed-case>G</fixed-case>eo<fixed-case>A</fixed-case>gent: To Empower <fixed-case>LLM</fixed-case>s using Geospatial Tools for Address Standardization - ChenghuaHuangFudan University + ChenghuaHuangFudan University ShisongChen ZhixuLi JianfengQuSoochow University - YanghuaXiaoFudan University + YanghuaXiaoFudan University JiaxinLiu ZhigangCheniFLYTEK Research 6048-6063 @@ -11083,7 +11083,7 @@ <fixed-case>HQP</fixed-case>: A Human-Annotated Dataset for Detecting Online Propaganda AbdurahmanMaarouf - DominikBärLudwig-Maximilians-Universität München + DominikBärLudwig-Maximilians-Universität München DominiqueGeisslerLudwig-Maximilians-Universität München StefanFeuerriegelLMU Munich 6064-6089 @@ -11107,7 +11107,7 @@ Exploring Spatial Schema Intuitions in Large Language and Vision Models - PhilippWickeLudwig-Maximilians-Universität München + PhilippWickeLudwig-Maximilians-Universität München LennartWachowiakKing’s College London, University of London 6102-6117 Despite the ubiquity of large language models (LLMs) in AI research, the question of embodiment in LLMs remains underexplored, distinguishing them from embodied systems in robotics where sensory perception directly informs physical action.Our investigation navigates the intriguing terrain of whether LLMs, despite their non-embodied nature, effectively capture implicit human intuitions about fundamental, spatial building blocks of language. We employ insights from spatial cognitive foundations developed through early sensorimotor experiences, guiding our exploration through the reproduction of three psycholinguistic experiments. Surprisingly, correlations between model outputs and human responses emerge, revealing adaptability without a tangible connection to embodied experiences. Notable distinctions include polarized language model responses and reduced correlations in vision language models. This research contributes to a nuanced understanding of the interplay between language, spatial experiences, and the computations made by large language models.Project Website: https://cisnlp.github.io/Spatial_Schemas/ @@ -11120,7 +11120,7 @@ YiboMiao HongchengGao HaoZhangUniversity of California, San Diego, Petuum, Inc and Carnegie Mellon University - ZhijieDengShanghai Jiaotong University + ZhijieDengShanghai Jiaotong University 6118-6130 The detection of machine-generated text, especially from large language models (LLMs), is crucial in preventing serious social problems resulting from their misuse. Some methods train dedicated detectors on specific datasets but fall short in generalizing to unseen test data, while other zero-shot ones often yield suboptimal performance. Although the recent DetectGPT has shown promising detection performance, it suffers from significant inefficiency issues, as detecting a single candidate requires querying the source LLM with hundreds of its perturbations. This paper aims to bridge this gap. Concretely, we propose to incorporate a Bayesian surrogate model, which allows us to select typical samples based on Bayesian uncertainty and interpolate scores from typical samples to other samples, to improve query efficiency. Empirical results demonstrate that our method significantly outperforms existing approaches under a low query budget. Notably, when detecting the text generated by LLaMA family models, our method with just 2 or 3 queries can outperform DetectGPT with 200 queries. 
2024.findings-acl.366


 Decoding the Narratives: Analyzing Personal Drug Experiences Shared on <fixed-case>R</fixed-case>eddit
- LaylaBouzoubaaDrexel University
+ LaylaBouzoubaaDrexel University
 ElhamAghakhani
 MaxSong
 QuangTrinh
 6131-6148


 ShaoboCuiEPFL - EPF Lausanne
 YiyangFeng
 YisongMao
- YifanHouDepartment of Computer Science, Swiss Federal Institute of Technology
+ YifanHouDepartment of Computer Science, Swiss Federal Institute of Technology
 BoiFaltings
 6149-6174
 Crafting an appealing heading is crucial for attracting readers and marketing work or products. A popular way is to summarize the main idea with a refined description and a memorable acronym. However, a systematic study and a formal benchmark, including datasets and metrics, have been lacking. Motivated by this absence, we introduce LOgogram, a novel benchmark comprising 6,653 paper abstracts with corresponding descriptions and acronyms. To measure the quality of heading generation, we propose a set of evaluation metrics from three aspects: summarization, neology, and algorithm. Additionally, we explore three strategies for heading generation (generation ordering, tokenization of acronyms, and framework design) under various prevalent learning paradigms (supervised fine-tuning, in-context learning with Large Language Models (LLMs), and reinforcement learning) on our benchmark. Our experimental results indicate the difficulty in identifying a practice that excels across all summarization, neologistic, and algorithmic aspects.
 2024.findings-acl.368


 Understanding Fine-grained Distortions in Reports of Scientific Findings
 AmelieWuehrlUniversity of Stuttgart, Universität Stuttgart
- DustinWrightUniversity of Copenhagen
- RomanKlingerOtto-Friedrich Universität Bamberg
- IsabelleAugensteinUniversity of Copenhagen
+ DustinWrightUniversity of Copenhagen
+ RomanKlingerOtto-Friedrich Universität Bamberg
+ IsabelleAugensteinUniversity of Copenhagen
 6175-6191
 Distorted science communication harms individuals and society as it can lead to unhealthy behavior change and decrease trust in scientific institutions. Given the rapidly increasing volume of science communication in recent years, a fine-grained understanding of how findings from scientific publications are reported to the general public, and methods to detect distortions from the original work automatically, are crucial. Prior work focused on individual aspects of distortions or worked with unpaired data. In this work, we make three foundational contributions towards addressing this problem: (1) annotating 1,600 instances of scientific findings from academic papers paired with corresponding findings as reported in news articles and tweets w.r.t. four characteristics: causality, certainty, generality and sensationalism; (2) establishing baselines for automatically detecting these characteristics; and (3) analyzing the prevalence of changes in these characteristics in both human-annotated and large-scale unlabeled data. Our results show that scientific findings frequently undergo subtle distortions when reported. Tweets distort findings more often than science news reports. Detecting fine-grained distortions automatically poses a challenging task. In our experiments, fine-tuned task-specific models consistently outperform few-shot LLM prompting.
2024.findings-acl.369 @@ -11167,11 +11167,11 @@ <fixed-case>MM</fixed-case>-<fixed-case>SOC</fixed-case>: Benchmarking Multimodal Large Language Models in Social Media Platforms - YiqiaoJin + YiqiaoJin MinjeChoiGeorgia Institute of Technology - GauravVermaGeorgia Institute of Technology + GauravVermaGeorgia Institute of Technology JindongWangMicrosoft Research - SrijanKumarGeorgia Institute of Technology + SrijanKumarGeorgia Institute of Technology 6192-6210 Social media platforms are hubs for multimodal information exchange, encompassing text, images, and videos, making it challenging for machines to comprehend the information or emotions associated with interactions in online spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising solution to address these challenges, yet struggle with accurately interpreting human emotions and complex contents like misinformation. This paper introduces MM-Soc, a comprehensive benchmark designed to evaluate MLLMs’ understanding of multimodal social media content. MM-Soc compiles prominent multimodal datasets and incorporates a novel large-scale YouTube tagging dataset, targeting a range of tasks from misinformation detection, hate speech detection, and social context generation. Through our exhaustive evaluation on ten size-variants of four open-source MLLMs, we have identified significant performance disparities, highlighting the need for advancements in models’ social understanding capabilities. Our analysis reveals that, in a zero-shot setting, various types of MLLMs generally exhibit difficulties in handling social media tasks. However, MLLMs demonstrate performance improvements post fine-tuning, suggesting potential pathways for improvement. 2024.findings-acl.370 @@ -11182,7 +11182,7 @@ Instances Need More Care: Rewriting Prompts for Instances with <fixed-case>LLM</fixed-case>s in the Loop Yields Better Zero-Shot Performance SaurabhSrivastavaGeorge Mason University ChengyueHuang - WeiguoFanUniversity of Iowa + WeiguoFanUniversity of Iowa ZiyuYaoGeorge Mason University 6211-6232 Large language models (LLMs) have revolutionized zero-shot task performance, mitigating the need for task-specific annotations while enhancing task generalizability. Despite its advancements, current methods using trigger phrases such as “Let’s think step by step” remain limited. This study introduces PRomPTed, an approach that optimizes the zero-shot prompts for individual task instances following an innovative manner of “LLMs in the loop”.Our comprehensive evaluation across 13 datasets and 10 task types based on GPT-4 reveals that PRomPTed significantly outperforms both the naive zero-shot approaches and a strong baseline (i.e., “Output Refinement”) which refines the task output instead of the input prompt. Our experimental results also confirmed the generalization of this advantage to the relatively weaker GPT-3.5. Even more intriguingly, we found that leveraging GPT-3.5 to rewrite prompts for the stronger GPT-4 not only matches but occasionally exceeds the efficacy of using GPT-4 as the prompt rewriter. Our research thus presents a huge value in not only enhancing zero-shot LLM performance but also potentially enabling supervising LLMs with their weaker counterparts, a capability attracting much interest recently. Finally, our additional experiments confirm the generalization of the advantages to open-source LLMs such as Mistral 7B and Mixtral 8x7B. 
@@ -11192,8 +11192,8 @@ Benchmarking Retrieval-Augmented Generation for Medicine - GuangzhiXiong - QiaoJinNational Institutes of Health + GuangzhiXiong + QiaoJinNational Institutes of Health ZhiyongLuNational Institutes of Health AidongZhang 6233-6251 @@ -11207,9 +11207,9 @@ RuibinYuan HanfengLinBeijing Jiaotong University YiWang - ZeyueTianHong Kong University of Science and Technology + ZeyueTianHong Kong University of Science and Technology ShangdaWu - TianhaoShen + TianhaoShen GeZhang YuhangWu CongLiu @@ -11218,19 +11218,19 @@ ZiyangMa QinLiu TianyuZheng - YizhiLiUniversity of Manchester and University of Sheffield - YinghaoMaQueen Mary University of London + YizhiLiUniversity of Manchester and University of Sheffield + YinghaoMaQueen Mary University of London YimingLiang XiaoweiChi RuiboLiuGoogle DeepMind ZiliWang ChenghuaLinUniversity of Manchester - QifengLiuThe Hong Kong University of Science and Technology + QifengLiuThe Hong Kong University of Science and Technology TaoJiang WenhaoHuang WenhuChenUniversity of Waterloo and Google - JieFuHong Kong University of Science and Technology - EmmanouilBenetos + JieFuHong Kong University of Science and Technology + EmmanouilBenetos GusXiaNew York University RogerDannenbergCarnegie Mellon University WeiXueHong Kong University of Science and Technology @@ -11266,7 +11266,7 @@ Knowledge Graph-Enhanced Large Language Models via Path Selection HaochenLiu - SongWangUniversity of Virginia + SongWangUniversity of Virginia YaochenZhu YushunDong JundongLiUniversity of Virginia @@ -11278,12 +11278,12 @@ <fixed-case>OTTAWA</fixed-case>: Optimal <fixed-case>T</fixed-case>ranspor<fixed-case>T</fixed-case> Adaptive Word Aligner for Hallucination and Omission Translation Errors Detection - ChenyangHuang + ChenyangHuang AbbasGhaddarHuawei Technologies Ltd. IvanKobyzevHuawei Noah’s Ark Lab MehdiRezagholizadeh - OsmarZaianeUniversity of Alberta - BoxingChenHuawei Technologies Ltd. + OsmarZaianeUniversity of Alberta + BoxingChenHuawei Technologies Ltd. 6322-6334 Recently, there has been considerable attention on detecting hallucinations and omissions in Machine Translation (MT) systems. The two dominant approaches to tackle this task involve analyzing the MT system’s internal states or relying on the output of external tools, such as sentence similarity or MT quality estimators. In this work, we introduce OTTAWA, a novel Optimal Transport (OT)-based word aligner specifically designed to enhance the detection of hallucinations and omissions in MT systems. Our approach explicitly models the missing alignments by introducing a “null” vector, for which we propose a novel one-side constrained OT setting to allow an adaptive null alignment. Our approach yields competitive results compared to state-of-the-art methods across 18 language pairs on the HalOmi benchmark. In addition, it shows promising features, such as the ability to distinguish between both error types and perform word-level detection without accessing the MT system’s internal states. 
2024.findings-acl.377 @@ -11294,7 +11294,7 @@ <fixed-case>ONSEP</fixed-case>: A Novel Online Neural-Symbolic Framework for Event Prediction Based on Large Language Model XuanqingYuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences WangtaoSun - JingweiLi + JingweiLi KangLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences ChengbaoLiuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences JieTan @@ -11320,7 +11320,7 @@ Too Big to Fail: Larger Language Models are Disproportionately Resilient to Induction of Dementia-Related Linguistic Anomalies - ChangyeLiUniversity of Washington + ChangyeLiUniversity of Washington ZhechengSheng TrevorCohenUniversity of Washington SergueiPakhomovUniversity of Minnesota - Twin Cities @@ -11345,7 +11345,7 @@ <fixed-case>TRAM</fixed-case>: Benchmarking Temporal Reasoning for Large Language Models YuqingWangStanford University - YunZhaoMeta Platforms, Inc + YunZhaoMeta Platforms, Inc 6389-6415 Reasoning about time is essential for understanding the nuances of events described in natural language. Previous research on this topic has been limited in scope, characterized by a lack of standardized benchmarks that would allow for consistent evaluations across different studies. In this paper, we introduce TRAM, a temporal reasoning benchmark composed of ten datasets, encompassing various temporal aspects of events such as order, arithmetic, frequency, and duration, designed to facilitate a comprehensive evaluation of the TeR capabilities of large language models (LLMs). We evaluate popular LLMs like GPT-4 and Llama2 in zero-shot and few-shot scenarios, and establish baselines with BERT-based and domain-specific models. Our findings indicate that the best-performing model lags significantly behind human performance. It is our aspiration that TRAM will spur further progress in enhancing the TeR capabilities of LLMs. 2024.findings-acl.382 @@ -11371,7 +11371,7 @@ LazarMilikicEPFL - EPF Lausanne YiyangFeng MeteIsmayilzadaEPFL - EPF Lausanne - DebjitPaulEPFL - EPF Lausanne + DebjitPaulEPFL - EPF Lausanne AntoineBosselutSwiss Federal Institute of Technology Lausanne BoiFaltings 6433-6452 @@ -11398,7 +11398,7 @@ YanzhengXiang HanqiYan LinGuiKing’s College London, University of London - YulanHeKing’s College London, University of London + YulanHeKing’s College London, University of London 6467-6481 In-context learning has become a popular paradigm in natural language processing. However, its performance can be significantly influenced by the order of in-context demonstration examples. In this paper, we found that causal language models (CausalLMs) are more sensitive to this order compared to prefix language models (PrefixLMs). We attribute this phenomenon to the auto-regressive attention masks within CausalLMs, which restrict each token from accessing information from subsequent tokens. This results in different receptive fields for samples at different positions, thereby leading to representation disparities across positions. To tackle this challenge, we introduce an unsupervised fine-tuning method, termed the Information-Augmented and Consistency-Enhanced approach. This approach utilizes contrastive learning to align representations of in-context examples across different positions and introduces a consistency loss to ensure similar representations for inputs with different permutations. This enhances the model’s predictive consistency across permutations. 
Experimental results on five benchmarks suggest that our proposed method can reduce the sensitivity of CausalLMs to the order of in-context examples and exhibit robust generalizability, particularly when demonstrations are sourced from a candidate pool different from that used in the training phase, or when the number of in-context examples differs from what is used during training.
 2024.findings-acl.386


 Perspective Taking through Generating Responses to Conflict Situations
 JoanPlepiRheinische Friedrich-Wilhelms Universität Bonn
- CharlesWelchMcMaster University
- LucieFlekRheinische Friedrich-Wilhelms Universität Bonn
+ CharlesWelchMcMaster University
+ LucieFlekRheinische Friedrich-Wilhelms Universität Bonn
 6482-6497
 Although language model performance across diverse tasks continues to improve, these models still struggle to understand and explain the beliefs of other people. This skill requires perspective-taking, the process of conceptualizing the point of view of another person. Perspective taking becomes challenging when the text reflects more personal and potentially more controversial beliefs. We explore this task through natural language generation of responses to conflict situations. We evaluate novel modifications to recent architectures for conditioning generation on an individual’s comments and self-disclosure statements. Our work extends the Social-Chem-101 corpus, using 95k judgements written by 6k authors from English Reddit data, for each of whom we obtained 20-500 self-disclosure statements. Our evaluation methodology borrows ideas from both personalized generation and theory of mind literature. Our proposed perspective-taking models outperform recent work, especially the twin encoder model conditioned on self-disclosures with high similarity to the conflict situation.
 2024.findings-acl.387


 ShengShenUniversity of California Berkeley
 GopalaAnumanchipalliUniversity of California, Berkeley
 MichaelMahoneyUniversity of California Berkeley
- KurtKeutzerUniversity of California Berkeley
+ KurtKeutzerUniversity of California Berkeley
 AmirGholamiUniversity of California Berkeley
 6498-6526
 Pretrained large language models (LLMs) are currently state-of-the-art for solving the vast majority of natural language processing tasks. While many real-world applications still require fine-tuning to reach satisfactory levels of performance, many of them are in the low-data regime, making fine-tuning challenging. To address this, we propose LLM2LLM, a targeted and iterative data augmentation strategy that uses a teacher LLM to enhance a small seed dataset by augmenting additional data that can be used for fine-tuning on a specific task. LLM2LLM (1) fine-tunes a baseline student LLM on the initial seed data, (2) evaluates and extracts data points that the model gets wrong, and (3) uses a teacher LLM to generate synthetic data based on these incorrect data points, which are then added back into the training data. This approach amplifies the signal from incorrectly predicted data points by the LLM during training and reintegrates them into the dataset to focus on more challenging examples for the LLM. Our results show that LLM2LLM significantly enhances the performance of LLMs in the low-data regime, outperforming both traditional fine-tuning and other data augmentation baselines.
LLM2LLM reduces the dependence on labor-intensive data curation and paves the way for more scalable and performant LLM solutions, allowing us to tackle data-constrained domains and tasks. We achieve improvements up to 24.2% on the GSM8K dataset, 32.6% on CaseHOLD, 32.0% on SNIPS, 52.6% on TREC and 39.8% on SST-2 over regular fine-tuning in the low-data regime using a Llama-2-7B student model. Our code is available at https://github.com/SqueezeAILab/LLM2LLM.
 2024.findings-acl.388


 SharonAdarAmazon
 MohitBansalUniversity of North Carolina at Chapel Hill
 JacobGoldbergerBar-Ilan University
- RanLevyAmazon
+ RanLevyAmazon
 IdoDaganBar-Ilan University
 6527-6548
 Multi-document summarization (MDS) is a challenging task, often decomposed to subtasks of salience and redundancy detection, followed by text generation. In this context, alignment of corresponding sentences between a reference summary and its source documents has been leveraged to generate training data for some of the component tasks. Yet, this enabling alignment step has usually been applied heuristically on the sentence level on a limited number of subtasks. In this paper, we propose extending the summary-source alignment framework by (1) applying it at the more fine-grained proposition span level, (2) annotating alignment manually in a multi-document setup, and (3) revealing the great potential of summary-source alignments to yield several datasets for at least six different tasks. Specifically, for each of the tasks, we release a manually annotated test set that was derived automatically from the alignment annotation. We also release development and train sets in the same way, but from automatically derived alignments. Using the datasets, each task is demonstrated with baseline models and corresponding evaluation metrics to spur future research on this broad challenge.
 2024.findings-acl.389


 Text Simplification via Adaptive Teaching
 Seyed AliBahrainian
 JonathanDou
- CarstenEickhoffEberhard-Karls-Universität Tübingen
+ CarstenEickhoffEberhard-Karls-Universität Tübingen
 6574-6584
 Text simplification is the process of rewriting a piece of text using simpler vocabulary and grammatical structure in order to make the text more accessible and understandable for a larger audience. In this paper, we introduce a new text simplification model based on the notion of adaptive teaching using a teacher network and a text generation network. We name this new model Simplification via Adaptive Teaching (SAT). Our proposed model sets a new state-of-the-art performance in terms of standard simplification metrics such as SARI and D-SARI with a significant improvement over the previous state of the art on the D-Wikipedia dataset and the Wiki-Doc benchmark dataset. Moreover, we conduct a human evaluation in terms of text simplicity, correctness, and fluency to substantiate SAT’s performance.
 2024.findings-acl.392


 A multi-level multi-label text classification dataset of 19th century Ottoman and <fixed-case>R</fixed-case>ussian literary and critical texts
 GokcenGokceogluMETU
 DevrimÇavuşoğlu
- EmreAkbasMiddle East Technical University
- ÖzenDolceroccaUniversity of Bologna
+ EmreAkbasMiddle East Technical University
+ ÖzenDolceroccaUniversity of Bologna
 6585-6596
 This paper introduces a multi-level, multi-label text classification dataset comprising over 3000 documents. The dataset features literary and critical texts from 19th-century Ottoman Turkish and Russian.
It is the first study to apply large language models (LLMs) to this dataset, sourced from prominent literary periodicals of the era. The texts have been meticulously organized and labeled. This was done according to a taxonomic framework that takes into account both their structural and semantic attributes. Articles are categorized and tagged with bibliometric metadata by human experts. We present baseline classification results using a classical bag-of-words (BoW) naive Bayes model and three modern LLMs: multilingual BERT, Falcon, and Llama-v2. We found that in certain cases, Bag of Words (BoW) outperforms Large Language Models (LLMs), emphasizing the need for additional research, especially in low-resource language settings. This dataset is expected to be a valuable resource for researchers in natural language processing and machine learning, especially for historical and low-resource languages. The dataset is publicly available.
 2024.findings-acl.393


 It is Simple Sometimes: A Study On Improving Aspect-Based Sentiment Analysis Performance
- LauraCabelloCopenhagen University
+ LauraCabelloCopenhagen University
 UchennaAkujuobiSony Research
 6597-6610
 Aspect-Based Sentiment Analysis (ABSA) involves extracting opinions from textual data about specific entities and their corresponding aspects through various complementary subtasks. Several prior studies have focused on developing ad hoc designs of varying complexities for these subtasks. In this paper, we build upon the instruction tuned model proposed by Scaria et al. (2023), who present an instruction-based model with task descriptions followed by in-context examples on ABSA subtasks. We propose PFInstruct, an extension to this instruction learning paradigm by appending an NLP-related task prefix to the task description. This simple approach leads to improved performance across all tested SemEval subtasks, surpassing previous state-of-the-art (SOTA) on the ATE subtask (Rest14) by +3.28 F1-score, and on the AOOE subtask by an average of +5.43 F1-score across SemEval datasets. Furthermore, we explore the impact of the prefix-enhanced prompt quality on the ABSA subtasks and find that even a noisy prefix enhances model performance compared to the baseline. Our method also achieves competitive results on a biomedical domain dataset (ERSA).
 2024.findings-acl.394


 Whose Emotions and Moral Sentiments do Language Models Reflect?
- ZihaoHe
- SiyiGuo
+ ZihaoHe
+ SiyiGuo
 AshwinRao
- KristinaLermanUniversity of Southern California and USC Information Sciences Institute
+ KristinaLermanUniversity of Southern California and USC Information Sciences Institute
 6611-6631
 Language models (LMs) are known to represent the perspectives of some social groups better than others, which may impact their performance, especially on subjective tasks such as content moderation and hate speech detection. To explore how LMs represent different perspectives, existing research focused on positional alignment, i.e., how closely the models mimic the opinions and stances of different groups, e.g., liberals or conservatives. However, human communication also encompasses emotional and moral dimensions. We define the problem of affective alignment, which measures how LMs’ emotional and moral tone represents those of different groups. By comparing the affect of responses generated by 36 LMs to the affect of Twitter messages written by two ideological groups, we observe significant misalignment of LMs with both ideological groups.
This misalignment is larger than the partisan divide in the U.S. Even after steering the LMs towards specific ideological perspectives, the misalignment and liberal tendencies of the model persist, suggesting a systemic bias within LMs.
 2024.findings-acl.395


 JinlanFu
 QinyuanCheng
 JiashengYe
- JunjieYe
- XipengQiuFudan University
+ JunjieYe
+ XipengQiuFudan University
 XuanjingHuangFudan University
 6632-6646
 In the realm of Large Language Models (LLMs), users commonly employ diverse decoding strategies and adjust hyperparameters to control the generated text. However, a critical question emerges: Are LLMs conscious of the existence of these decoding strategies and capable of regulating themselves? The current decoding generation process often relies on empirical and heuristic manual adjustments to hyperparameters based on types of tasks and demands. However, this process is typically cumbersome, and the decoding hyperparameters may not always be optimal for each sample. To address the aforementioned challenges, we propose a novel text generation paradigm termed Hyperparameter Aware Generation (HAG). By leveraging hyperparameter-aware instruction tuning, the LLM autonomously determines the optimal decoding strategy and configs based on the input samples, enabling self-regulation. Our approach eliminates the need for extensive manual tuning, offering a more autonomous, self-regulated model behavior. Experimental results spanning six datasets across reasoning, creativity, translation, and mathematics tasks demonstrate that hyperparameter-aware instruction tuning empowers the LLMs to self-regulate the decoding strategy and hyperparameter. HAG extends the current paradigm in the text generation process, highlighting the feasibility of endowing the LLMs with self-regulating decoding strategies.
 2024.findings-acl.396


 Towards Uncertainty-Aware Language Agent
 JiuzhouHan
- WrayBuntineVinUniversity
+ WrayBuntineVinUniversity
 EhsanShareghiMonash University and University of Cambridge
 6662-6685
 While Language Agents have achieved promising success by placing Large Language Models at the core of a more versatile design that dynamically interacts with the external world, the existing approaches neglect the notion of uncertainty during these interactions. We present the Uncertainty-Aware Language Agent (UALA), a framework that orchestrates the interaction between the agent and the external world using uncertainty quantification. Compared with other well-known counterparts like ReAct, our extensive experiments across 3 representative tasks (HotpotQA, StrategyQA, MMLU) and various LLM sizes demonstrate that UALA brings a significant improvement of performance, while having a substantially lower reliance on the external world (i.e., reduced number of tool calls and tokens). Our analyses provide various insights including the great potential of UALA compared with agent fine-tuning, and underscore the unreliability of verbalised confidence of LLMs as a proxy for uncertainty.
@@ -11570,7 +11570,7 @@ Detection and Positive Reconstruction of Cognitive Distortion Sentences: <fixed-case>M</fixed-case>andarin Dataset and Evaluation - ShuyaLin + ShuyaLin YuxiongWang JonathanDong ShiguangNiTsinghua University, Tsinghua University @@ -11583,8 +11583,8 @@ <fixed-case>P</fixed-case>i<fixed-case>V</fixed-case>e: Prompting with Iterative Verification Improving Graph-based Generative Capability of <fixed-case>LLM</fixed-case>s JiuzhouHan - NigelCollierUniversity of Cambridge - WrayBuntineVinUniversity + NigelCollierUniversity of Cambridge + WrayBuntineVinUniversity EhsanShareghiMonash University and University of Cambridge 6702-6718 Large language models (LLMs) have shown great abilities of solving various natural language tasks in different domains. Due to the training objective of LLMs and their pre-training data, LLMs are not very well equipped for tasks involving structured data generation. We propose a framework, Prompting with Iterative Verification (PiVe), to improve graph-based generative capability of LLMs. We show how a small language model could be trained to act as a verifier module for the output of an LLM(i.e., ChatGPT, GPT-4), and to iteratively improve its performance via fine-grained corrective instructions. We also show how the verifier module could apply iterative corrections offline for a more cost-effective solution to the text-to-graph generation task. Experiments on three graph-based datasets show consistent improvement gained via PiVe. Additionally, we create GenWiki-HIQ and highlight that the verifier module can be used as a data augmentation tool to help improve the quality of automatically generated parallel text-graph datasets. @@ -11594,12 +11594,12 @@ Two-stage Generative Question Answering on Temporal Knowledge Graph Using Large Language Models - YifuGaoNational University of Defense Technology + YifuGaoNational University of Defense Technology LinboQiao ZhigangKanNational University of Defense Technology - ZhihuaWenNational University of Defence Technology - YongquanHe - DongshengLi + ZhihuaWenNational University of Defence Technology + YongquanHe + DongshengLi 6719-6734 Temporal knowledge graph question answering (TKGQA) poses a significant challenge task, due to the temporal constraints hidden in questions and the answers sought from dynamic structured knowledge. Although large language models (LLMs) have made considerable progress in their reasoning ability over structured data, their application to the TKGQA task is a relatively unexplored area. This paper first proposes a novel generative temporal knowledge graph question answering framework, GenTKGQA, which guides LLMs to answer temporal questions through two phases: Subgraph Retrieval and Answer Generation. First, we exploit LLM’s intrinsic knowledge to mine temporal constraints and structural links in the questions without extra training, thus narrowing down the subgraph search space in both temporal and structural dimensions. Next, we design virtual knowledge indicators to fuse the graph neural network signals of the subgraph and the text representations of the LLM in a non-shallow way, which helps the open-source LLM deeply understand the temporal order and structural dependencies among the retrieved facts through instruction tuning. Experimental results on two widely used datasets demonstrate the superiority of our model. 
2024.findings-acl.401 @@ -11611,7 +11611,7 @@ Syeda NahidaAkter SangwuLeeUniversity of Rochester YingshanChang - YonatanBiskMeta and Carnegie Mellon University + YonatanBiskMeta and Carnegie Mellon University EricNybergCarnegie Mellon University 6735-6752 Verifying a question’s validity before answering is crucial in real-world applications, where users may provide imperfect instructions. In this scenario, an ideal model should address the discrepancies in the query and convey them to the users rather than generating the best possible answer. Addressing this requirement, we introduce a new compositional visual question-answering dataset, VisReas, that consists of answerable and unanswerable visual queries formulated by traversing and perturbing commonalities and differences among objects, attributes, and relations. VisReas contains 2.07M semantically diverse queries generated automatically using Visual Genome scene graphs. The unique feature of this task, validating question answerability with respect to an image before answering, and the poor performance of state-of-the-art models inspired the design of a new modular baseline, Logic2Vision that reasons by producing and executing pseudocode without any external modules to generate the answer. Logic2Vision outperforms generative models in VisReas (+4.82% over LLaVA-1.5; +12.23% over InstructBLIP) and achieves a significant gain in performance against the classification models. @@ -11622,7 +11622,7 @@ A Unified Generative Framework for Bilingual Euphemism Detection and Identification YuxueHu - JunsongLi + JunsongLi TongguanWang DongyuSu GuixinSu @@ -11638,7 +11638,7 @@ GaoxiangCong YuankaiQiMacquarie University LiangLi - AminBeheshtiMacquarie University + AminBeheshtiMacquarie University ZhedongZhang AntonHengelUniversity of Adelaide Ming-HsuanYangGoogle and University of California at Merced @@ -11652,8 +11652,8 @@ <fixed-case>ETAS</fixed-case>: Zero-Shot Transformer Architecture Search via Network Trainability and Expressivity - JiechaoYangRenmin University of China - YongLiuRenmin University of China and Institute of information engineering, CAS + JiechaoYangRenmin University of China + YongLiuRenmin University of China and Institute of information engineering, CAS 6780-6795 Transformer Architecture Search (TAS) methods aim to automate searching for the optimal Transformer architecture configurations for a given task. However, they are impeded by the prohibitive cost of evaluating Transformer architectures. Recently, several Zero-Shot TAS methods have been proposed to mitigate this problem by utilizing zero-cost proxies to evaluate Transformer architectures without training. Unfortunately, they are limited to specific computer vision or natural language processing tasks. Nonetheless, most of them are developed based on empirical observations and lack theoretical guarantees. To solve this problem, we develop a new zero-cost proxy called NTSR that combines two theoretically-inspired indicators to measure the trainability and expressivity of Transformer networks separately. We then integrate it into an effective regularized evolution framework called ETAS to demonstrate its efficacy on various tasks. The results show that our proposed NTSR proxy can consistently achieve a higher correlation with the true performance of Transformer networks on both computer vision and natural language processing tasks. Further, it can significantly accelerate the search process for finding the best-performing Transformer architecture configurations. 
2024.findings-acl.405


 KaishuaiXuHong Kong Polytechnic University
 YiChengThe Hong Kong Polytechnic University
 WenjunHou
- QiaoyuTanNew York University Shanghai
- WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University
+ QiaoyuTanNew York University Shanghai
+ WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University
 6796-6814
 Medical dialogue systems have attracted significant attention for their potential to act as medical assistants. Enabling these medical systems to emulate clinicians’ diagnostic reasoning process has been the long-standing research focus. Previous studies rudimentarily realized the simulation of clinicians’ diagnostic process by fine-tuning language models on high-quality dialogue datasets. Nonetheless, they overly focus on the outcomes of the clinician’s reasoning process while ignoring their internal thought processes and alignment with clinician preferences. Our work aims to build a medical dialogue system that aligns with clinicians’ diagnostic reasoning processes. We propose a novel framework, Emulation, designed to generate an appropriate response that relies on abductive and deductive diagnostic reasoning analyses and aligns with clinician preferences through thought process modeling. Experimental results on two datasets confirm the efficacy of Emulation. Crucially, our framework furnishes clear explanations for the generated responses, enhancing its transparency in medical consultations.
 2024.findings-acl.406


 <fixed-case>C</fixed-case>oncept<fixed-case>M</fixed-case>ath: A Bilingual Concept-wise Benchmark for Measuring Mathematical Reasoning of Large Language Models
 YananWu
- JieLiuThe Chinese University of Hong Kong
- XingyuanBuAlibaba Group
+ JieLiuThe Chinese University of Hong Kong
+ XingyuanBuAlibaba Group
 JiahengLiu
 ZhanhuiZhouShanghai Artificial Intelligence Laboratory
- YuanxingZhang
+ YuanxingZhang
 ChenchenZhangBeijing University of Posts and Telecommunications
 ZhiqiBaiZhiqiBai
 HaibinChen
- TiezhengGeAlibaba Group
- WanliOuyangShanghai AI Lab
+ TiezhengGeAlibaba Group
+ WanliOuyangShanghai AI Lab
 WenboSu
- BoZhengAlibaba Group
+ BoZhengAlibaba Group
 6815-6839
 This paper introduces ConceptMath, a bilingual (English and Chinese), fine-grained benchmark that evaluates concept-wise mathematical reasoning of Large Language Models (LLMs). Unlike traditional benchmarks that evaluate general mathematical reasoning with an average accuracy, ConceptMath systemically organizes math problems under a hierarchy of math concepts, so that mathematical reasoning can be evaluated at different granularity with concept-wise accuracies. Based on our ConceptMath, we then evaluate a broad range of LLMs, and we observe that existing LLMs, though achieving high average accuracies on traditional benchmarks, exhibit significant performance variations across different math concepts and may even fail catastrophically on the most basic ones. Besides, we also introduce an efficient fine-tuning strategy to enhance the weaknesses of existing LLMs. Finally, we hope ConceptMath could guide the developers to understand the fine-grained mathematical abilities of their models and facilitate the growth of foundation models. Code is available at https://github.com/conceptmath/conceptmath.
2024.findings-acl.407 @@ -11698,7 +11698,7 @@ <fixed-case>REI</fixed-case>nstruct: Building Instruction Data from Unlabeled Corpus ShuChen XinyanGuan - YaojieLuInstitute of Software, Chinese Academy of Sciences + YaojieLuInstitute of Software, Chinese Academy of Sciences HongyuLinInstitute of Software, Chinese Academy of Sciences XianpeiHanInstitute of Software, CAS LeSunInstitute of Software, Chinese Academy of Sciences @@ -11710,10 +11710,10 @@ Learning to Maximize Mutual Information for Chain-of-Thought Distillation - XinChenIntel Corp + XinChenIntel Corp HanxianHuang - YanjunGaoUniversity of Colorado Anschutz Medical Campus - YiWang + YanjunGaoUniversity of Colorado Anschutz Medical Campus + YiWang JishenZhaoUniversity of California, San Diego KeDingIntel 6857-6868 @@ -11727,7 +11727,7 @@ ZhishengLin HanFu ChenghaoLiuSalesForce.com - ZhuoLiZhejiang University + ZhuoLiZhejiang University JianlingSun 6869-6883 Parameter-efficient fine-tuning (PEFT) has emerged as an effective method for adapting pre-trained language models to various tasks efficiently. Recently, there has been a growing interest in transferring knowledge from one or multiple tasks to the downstream target task to achieve performance improvements. However, current approaches typically either train adapters on individual tasks or distill shared knowledge from source tasks, failing to fully exploit task-specific knowledge and the correlation between source and target tasks. To overcome these limitations, we propose PEMT, a novel parameter-efficient fine-tuning framework based on multi-task transfer learning. PEMT extends the mixture-of-experts (MoE) framework to capture the transferable knowledge as a weighted combination of adapters trained on source tasks. These weights are determined by a gated unit, measuring the correlation between the target and each source task using task description prompt vectors. To fully exploit the task-specific knowledge, we also propose the Task Sparsity Loss to improve the sparsity of the gated unit. We conduct experiments on a broad range of tasks over 17 datasets. The experimental results demonstrate our PEMT yields stable improvements over full fine-tuning, and state-of-the-art PEFT and knowledge transferring methods on various tasks. The results highlight the effectiveness of our method which is capable of sufficiently exploiting the knowledge and correlation features across multiple tasks. 
@@ -11739,14 +11739,14 @@ <fixed-case>M</fixed-case>ath<fixed-case>B</fixed-case>ench: Evaluating the Theory and Application Proficiency of <fixed-case>LLM</fixed-case>s with a Hierarchical Mathematics Benchmark HongweiLiu ZilongZheng - YuxuanQiao + YuxuanQiao HaodongDuanShanghai Artificial Intelligence Laboratory ZhiweiFeiFudan University, Harbin Institute of Technology, Dalian University of Technology, Shanghai Jiaotong University, Shandong University, Peking University, Zhejiang University, University of Science and Technology of China, Hunan University, Beijing Institute of Technology, University of the Chinese Academy of Sciences, Southeast University, Sichuan University, Monash University, Malaysia Campus, Tianjin University, Beijing University of Aeronautics and Astronautics, Wuhan University of Technology, Yale University, Technische Universität München, Wuhan University, nanjing university, Tsinghua University and Wuhan University FengzheZhou - WenweiZhangShanghai AI Laboratory + WenweiZhangShanghai AI Laboratory SongyangZhangShanghai AI Laboratory DahuaLinThe Chinese University of Hong Kong - KaiChenShanghai AI Laboratory + KaiChenShanghai AI Laboratory 6884-6915 Recent advancements in large language models (LLMs) have showcased significant improvements in mathematics. However, traditional math benchmarks like GSM8k offer a unidimensional perspective, which fall short in providing a holistic assessment of the LLMs’ math capabilities. To address this gap, we introduce MathBench, a new benchmark that rigorously assesses the mathematical capabilities of large language models. MathBench spans a wide range of mathematical disciplines, offering a detailed evaluation of both theoretical understanding and practical problem-solving skills. The benchmark progresses through five distinct stages, from basic arithmetic to college mathematics, and is structured to evaluate models at various depths of knowledge. Each stage includes theoretical questions and application problems, allowing us to measure a model’s mathematical proficiency and its ability to apply concepts in practical scenarios. MathBench aims to enhance the evaluation of LLMs’ mathematical abilities, providing a nuanced view of their knowledge understanding levels and problem solving skills in a bilingual context. 2024.findings-acl.411 @@ -11755,12 +11755,12 @@ Identifying Semantic Induction Heads to Understand In-Context Learning - JieRenShanghai Jiao Tong University + JieRenShanghai Jiao Tong University QipengGuoShanghai AI Laboratory HangYanAI lab - DongruiLiuShanghai Artificial Intelligence Laboratory + DongruiLiuShanghai Artificial Intelligence Laboratory QuanshiZhangShanghai Jiao Tong University - XipengQiuFudan University + XipengQiuFudan University DahuaLinThe Chinese University of Hong Kong 6916-6932 Although large language models (LLMs) have demonstrated remarkable performance, the lack of transparency in their inference logic raises concerns about their trustworthiness. To gain a better understanding of LLMs, we conduct a detailed analysis of the operations of attention heads and aim to better understand the in-context learning of LLMs. Specifically, we investigate whether attention heads encode two types of relationships between tokens present in natural languages: the syntactic dependency parsed from sentences and the relation within knowledge graphs. 
We find that certain attention heads exhibit a pattern where, when attending to subject tokens, they recall object tokens and increase the output logits of those object tokens. More crucially, the formulation of such semantic induction heads has a close correlation with the emergence of the in-context learning ability of language models. The study of semantic attention heads advances our understanding of the intricate operations of attention heads in transformers, and further provides new insights into the in-context learning of LLMs.
@@ -11783,7 +11783,7 @@

Logical Closed Loop: Uncovering Object Hallucinations in Large Vision-Language Models
JunfeiWu
QiangLiuInstitute of Automation, Chinese Academy of Sciences
DingWang
JinghaoZhang
ShuWuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences
@@ -11810,7 +11810,7 @@
XiChen
SongyangZhangShanghai AI Laboratory
QibingBai
KaiChenShanghai AI Laboratory
SatoshiNakamuraThe Chinese University of Hong Kong
6976-6987
We introduce ***LLaST***, a framework for building high-performance Large Language model based Speech-to-text Translation systems. We address the limitations of end-to-end speech translation (E2E ST) models by exploring model architecture design and optimization techniques tailored for LLMs. Our approach includes LLM-based speech translation architecture design, ASR-augmented training, multilingual data augmentation, and dual-LoRA optimization. Our approach demonstrates superior performance on the CoVoST-2 benchmark and showcases exceptional scaling capabilities powered by LLMs. We believe this effective method will serve as a strong baseline for speech translation and provide insights for future improvements of the LLM-based speech translation framework.
2024.findings-acl.416

@@ -11820,8 +11820,8 @@
Plan, Generate and Complicate: Improving Low-resource Dialogue State Tracking via Easy-to-Difficult Zero-shot Data Augmentation
MingGu
YanYangEast China Normal University
6988-7005
Data augmentation methods have been a promising direction to improve the performance of small models for low-resource dialogue state tracking. However, traditional methods rely on pre-defined user goals and neglect the importance of data complexity in this task. In this paper, we propose EDZ-DA, an Easy-to-Difficult Zero-shot Data Augmentation framework for low-resource dialogue state tracking that utilizes large language models to automatically catch the relationships of different domains and then generate the dialogue data. We also complicate the dialogues based on the domain relation to enhance the model’s capability for co-reference slot tracking. Furthermore, we permute slot values to mitigate the influence of output orders and the problem of incomplete value generation. Experimental results illustrate the superiority of our proposed method compared to previous strong data augmentation baselines on MultiWOZ.
2024.findings-acl.417

@@ -11830,7 +11830,7 @@
<fixed-case>DM</fixed-case>o<fixed-case>ERM</fixed-case>: Recipes of Mixture-of-Experts for Effective Reward Modeling
ShanghaoranQuan
7006-7028
The performance of the reward model (RM) is a critical factor in improving the effectiveness of the large language model (LLM) during alignment fine-tuning.
There remain two challenges in RM training: 1) training the same RM using various categories of data may cause its generalization performance to suffer from multi-task disturbance, and 2) the human annotation consistency rate is generally only 60% to 75%, causing training data to contain a lot of noise. To tackle these two challenges, we introduced the idea of Mixture-of-Experts (MoE) into the field of RM for the first time. We propose the Double-Layer MoE RM (DMoERM). The outer layer MoE is a sparse model. After classifying an input into task categories, we route it to the corresponding inner layer task-specific model. The inner layer MoE is a dense model. We decompose the specific task into multiple capability dimensions and individually fine-tune a LoRA expert on each one. Their outputs are then synthesized by an MLP to compute the final rewards. To minimize costs, we call a public LLM API to obtain the capability preference labels. The validation on manually labeled datasets confirms that our model attains superior consistency with human preference and outstrips advanced generative approaches. Meanwhile, through BoN sampling and RL experiments, we demonstrate that our model outperforms state-of-the-art ensemble methods of RM and mitigates the overoptimization problem. Our code is available at: https://github.com/quanshr/DMoERM. 2024.findings-acl.418 @@ -11851,10 +11851,10 @@ Comments as Natural Logic Pivots: Improve Code Generation via Comment Perspective YijieChen YijinLiuWechat AI - FandongMengWeChat AI, Tencent Inc. + FandongMengWeChat AI, Tencent Inc. YufengChen JinanXuBeijing Jiaotong University - JieZhou + JieZhou 7040-7051 Code generation aims to understand the problem description and generate corresponding code snippets, where existing works generally decompose such complex tasks into intermediate steps by prompting strategies, such as Chain-of-Thought and its variants. While these studies have achieved some success, their effectiveness is highly dependent on the capabilities of advanced Large Language Models (LLMs) such as GPT-4, particularly in terms of API calls, which significantly limits their practical applicability. Consequently, how to enhance the code generation capabilities of small and medium-scale code LLMs without significantly increasing training costs is an appealing challenge. In this paper, we suggest that code comments are the natural logic pivot between natural language and code language and propose using comments to boost the code generation ability of code LLMs. Concretely, we propose MANGO (comMents As Natural loGic pivOts), including a comment contrastive training strategy and a corresponding logical comment decoding strategy. Experiments are performed on HumanEval and MBPP, utilizing StarCoder and WizardCoder as backbone models, and encompassing model parameter sizes between 3B and 7B. The results indicate that MANGO significantly improves the code pass rate based on the strong baselines. Meanwhile, the robustness of the logical comment decoding strategy is notably higher than the Chain-of-thoughts prompting. 
2024.findings-acl.420 @@ -11863,15 +11863,15 @@ Cocktail: A Comprehensive Information Retrieval Benchmark with <fixed-case>LLM</fixed-case>-Generated Documents Integration - SunhaoDai - WeihaoLiuRenmin University of China - YuqiZhou - LiangPangInstitute of Computing Technology, Chinese Academy of Sciences + SunhaoDai + WeihaoLiuRenmin University of China + YuqiZhou + LiangPangInstitute of Computing Technology, Chinese Academy of Sciences RongjuRuanHuawei Technologies Ltd. - GangWangHuawei Technologies Ltd. + GangWangHuawei Technologies Ltd. ZhenhuaDong JunXuRenmin University of China - Ji-RongWenRenmin University of China + Ji-RongWenRenmin University of China 7052-7074 The proliferation of Large Language Models (LLMs) has led to an influx of AI-generated content (AIGC) on the internet, transforming the corpus of Information Retrieval (IR) systems from solely human-written to a coexistence with LLM-generated content. The impact of this surge in AIGC on IR systems remains an open question, with the primary challenge being the lack of a dedicated benchmark for researchers. In this paper, we introduce Cocktail, a comprehensive benchmark tailored for evaluating IR models in this mixed-sourced data landscape of the LLM era. Cocktail consists of 16 diverse datasets with mixed human-written and LLM-generated corpora across various text retrieval tasks and domains. Additionally, to avoid the potential bias from previously included dataset information in LLMs, we also introduce an up-to-date dataset, named NQ-UTD, with queries derived from recent events. Through conducting over 1,000 experiments to assess state-of-the-art retrieval models against the benchmarked datasets in Cocktail, we uncover a clear trade-off between ranking performance and source bias in neural retrieval models, highlighting the necessity for a balanced approach in designing future IR systems. We hope Cocktail can serve as a foundational resource for IR research in the LLM era, with all data and code publicly available at https://github.com/KID-22/Cocktail. 2024.findings-acl.421 @@ -11880,13 +11880,13 @@ Continual Dialogue State Tracking via Reason-of-Select Distillation - YujieFengHong Kong Polytechnic University + YujieFengHong Kong Polytechnic University BoLiu - XiaoyuDong - ZexinLuHong Kong Polytechnic University + XiaoyuDong + ZexinLuHong Kong Polytechnic University Li-MingZhanThe Hong Kong Polytechnic University Xiao-MingWuHong Kong Polytechnic University - AlbertLamUniversity of Hong Kong and Fano Labs + AlbertLamUniversity of Hong Kong and Fano Labs 7075-7087 An ideal dialogue system requires continuous skill acquisition and adaptation to new tasks while retaining prior knowledge. Dialogue State Tracking (DST), vital in these systems, often involves learning new services, confronting catastrophic forgetting and a critical capability loss termed the “Value Selection Quandary”. To address these challenges, we introduce the Reason-of-Select (RoS) distillation method by enhancing smaller models with a novel “meta-reasoning” capability. Meta-reasoning, employing an enhanced multi-domain perspective, combines fragments of meta-knowledge from domain-specific dialogues during continual learning, transcending traditional single-perspective reasoning. This domain bootstrapping process enhances the model’s ability to dissect intricate dialogues from multiple possible values, and its domain-agnostic property aligns data distribution across different domains, effectively mitigating forgetting. 
Besides, two novel improvements, “multi-value resolution” strategy and Semantic Contrastive Reasoning Selection method, significantly enhance RoS by generating DST-specific selection chains and mitigating hallucinations in teachers’ reasoning, ensuring effective and reliable knowledge transfer. Extensive experiments validate the exceptional performance and robust generalization capabilities of our method. 2024.findings-acl.422 @@ -11898,9 +11898,9 @@ YafuLiWestlake University ZhilinWang LeyangCui - WeiBiTencent AI Lab + WeiBiTencent AI Lab ShumingShiTencent AI Lab - YueZhangWestlake University + YueZhangWestlake University 7088-7107 AI-generated text detection has attracted increasing attention as powerful language models approach human-level generation. Limited work is devoted to detecting (partially) AI-paraphrased texts. However, AI paraphrasing is commonly employed in various application scenarios for text refinement and diversity. To this end, we propose a novel detection framework, paraphrased text span detection (PTD), aiming to identify paraphrased text spans within a text. Different from text-level detection, PTD takes in the full text and assigns each of the sentences with a score indicating the paraphrasing degree. We construct a dedicated dataset, PASTED, for paraphrased text span detection. Both in-distribution and out-of-distribution results demonstrate the effectiveness of PTD models in identifying AI-paraphrased text spans. Statistical and model analysis explains the crucial role of the surrounding context of the paraphrased text spans. Extensive experiments show that PTD models can generalize to versatile paraphrasing prompts as well as multiple paraphrased text spans. 2024.findings-acl.423 @@ -11910,8 +11910,8 @@ <fixed-case>S</fixed-case>o<fixed-case>FA</fixed-case>: Shielded On-the-fly Alignment via Priority Rule Following XinyuLu - BowenYuAlibaba Group - YaojieLuInstitute of Software, Chinese Academy of Sciences + BowenYuAlibaba Group + YaojieLuInstitute of Software, Chinese Academy of Sciences HongyuLinInstitute of Software, Chinese Academy of Sciences HaiyangYu LeSunInstitute of Software, Chinese Academy of Sciences @@ -11936,10 +11936,10 @@ Modeling Emotional Trajectories in Written Stories Utilizing Transformers and Weakly-Supervised Learning LukasChristUniversity of Augsburg, Universität Augsburg - ShahinAmiriparianTechnical University of Munich + ShahinAmiriparianTechnical University of Munich ManuelMillingUniversity of Augsburg - IlhanAslan - BjörnSchullerTechnische Universität München and Imperial College London + IlhanAslan + BjörnSchullerTechnische Universität München and Imperial College London 7144-7159 Telling stories is an integral part of human communication which can evoke emotions and influence the affective states of the audience. Automatically modeling emotional trajectories in stories has thus attracted considerable scholarly interest. However, as most existing works have been limited to unsupervised dictionary-based approaches, there is no benchmark for this task. We address this gap by introducing continuous valence and arousal labels for an existing dataset of children’s stories originally annotated with discrete emotion categories. We collect additional annotations for this data and map the categorical labels to the continuous valence and arousal space. For predicting the thus obtained emotionality signals, we fine-tune a DeBERTa model and improve upon this baseline via a weakly supervised learning approach. 
The best configuration achieves a Concordance Correlation Coefficient (CCC) of .8221 for valence and .7125 for arousal on the test set, demonstrating the efficacy of our proposed approach. A detailed analysis shows the extent to which the results vary depending on factors such as the author, the individual story, or the section within the story. In addition, we uncover the weaknesses of our approach by investigating examples that prove to be difficult to predict. 2024.findings-acl.426 @@ -11948,13 +11948,13 @@ <fixed-case>RAP</fixed-case>: Efficient Text-Video Retrieval with Sparse-and-Correlated Adapter - MengCaoMohamed bin Zayed University of Artificial Intelligence + MengCaoMohamed bin Zayed University of Artificial Intelligence HaoranTang JinfaHuang - PengJin - CanZhangTencent MediaLab + PengJin + CanZhangTencent MediaLab RuyangLiuPeking University - LongChenThe Hong Kong University of Science and Technology + LongChenThe Hong Kong University of Science and Technology XiaodanLiang LiYuanPeking University GeLiPeking University Shenzhen Graduate School @@ -11966,14 +11966,14 @@ Benchmarking and Improving Long-Text Translation with Large Language Models - LongyueWang + LongyueWang ZefengDu WenxiangJiaoTencent AI Lab ChenyangLyuMohamed bin Zayed University of Artificial Intelligence JianhuiPang LeyangCui KaiqiangSongTencent AI Lab - DerekWongUniversity of Macau + DerekWongUniversity of Macau ShumingShiTencent AI Lab ZhaopengTuTencent AI Lab 7175-7187 @@ -11984,11 +11984,11 @@ Personalized Topic Selection Model for Topic-Grounded Dialogue - ShixuanFan - WeiWeiHuazhong University of Science and Technology + ShixuanFan + WeiWeiHuazhong University of Science and Technology XiaofeiWen Xian-LingMaoBeijing Institute of Technology - JixiongChen + JixiongChen DangyangChenPingan Technology 7188-7202 Recently, the topic-grounded dialogue (TGD) system has become increasingly popular as its powerful capability to actively guide users to accomplish specific tasks through topic-guided conversations. Most existing works utilize side information (e.g. topics or personas) in isolation to enhance the topic selection ability. However, due to disregarding the noise within these auxiliary information sources and their mutual influence, current models tend to predict user-uninteresting and contextually irrelevant topics. To build user-engaging and coherent dialogue agent, we propose a personalized topic selection model for topic-grounded dialogue, named PETD, which takes account of the interaction of side information to selectively aggregate such information for more accurately predicting subsequent topics. Specifically, we evaluate the correlation between global topics and personas and selectively incorporate the global topics aligned with user personas. Furthermore, we propose a contrastive learning based persona selector to filter relevant personas under the constraint of lacking pertinent persona annotations. Throughout the selection and generation, diverse relevant side information is considered. Extensive experiments demonstrate that our proposed method can generate engaging and diverse responses, outperforming state-of-the-art baselines across various evaluation metrics. 
@@ -12001,10 +12001,10 @@
LvxueLi
JiaqiChen
XinyuLu
YaojieLuInstitute of Software, Chinese Academy of Sciences
HongyuLinInstitute of Software, Chinese Academy of Sciences
ShuhengZhouAnt Group
HuijiaZhu
WeiqiangWangAnt Group
ZhongyiLiuAnt Group
XianpeiHanInstitute of Software, CAS
@@ -12029,9 +12029,9 @@

<fixed-case>MS</fixed-case>2<fixed-case>SL</fixed-case>: Multimodal Spoken Data-Driven Continuous Sign Language Production
JianMa
WenguanWangZhejiang University
YiYangZhejiang University
FengZhengSouthern University of Science and Technology
7241-7254
Sign language understanding has made significant strides; however, there is still no viable solution for generating sign sequences directly from entire spoken content, e.g., text or speech. In this paper, we propose a unified framework for continuous sign language production, easing communication between sign and non-sign language users. In particular, a sequence diffusion model, utilizing embeddings extracted from text or speech, is crafted to generate sign predictions step by step. Moreover, by creating a joint embedding space for text, audio, and sign, we bind these modalities and leverage the semantic consistency among them to provide informative feedback for the model training. This embedding-consistency learning strategy minimizes the reliance on sign triplets and ensures continuous model refinement, even with a missing audio modality. Experiments on How2Sign and PHOENIX14T datasets demonstrate that our model achieves competitive performance in sign language production.
2024.findings-acl.432

@@ -12044,9 +12044,9 @@
XintingHuangTencent AI Lab
TingchenFu
QintongLi
ShansanGong
LemaoLiuTencent
WeiBiTencent AI Lab
LingpengKongDepartment of Computer Science, The University of Hong Kong
7255-7279
Multimodal reasoning stands as a pivotal capability for large vision-language models (LVLMs). The integration with Domain-Specific Languages (DSL), offering precise visual representations, equips these models with the opportunity to execute more accurate reasoning in complex and professional domains. However, the vanilla Chain-of-Thought (CoT) prompting method faces challenges in effectively leveraging the unique strengths of visual and DSL representations, primarily due to their differing reasoning mechanisms. Additionally, it often falls short in addressing critical steps in multi-step reasoning tasks. To mitigate these challenges, we introduce the Bi-Modal Behavioral Alignment (BBA) prompting method, designed to maximize the potential of DSL in augmenting complex multi-modal reasoning tasks. This method initiates by guiding LVLMs to create separate reasoning chains for visual and DSL representations. Subsequently, it aligns these chains by addressing any inconsistencies, thus achieving a cohesive integration of behaviors from different modalities. Our experiments demonstrate that BBA substantially improves the performance of GPT-4V(ision) on geometry problem solving (28.34% \to 34.22%), chess positional advantage prediction (42.08% \to 46.99%) and molecular property prediction (77.47% \to 83.52%).
@@ -12056,7 +12056,7 @@

<fixed-case>P</fixed-case>artial<fixed-case>F</fixed-case>ormer: Modeling Part Instead of Whole for Machine Translation
TongZheng
BeiLiMeituan
HuiwenBaoNortheastern University
JialeWang
@@ -12071,12 +12071,12 @@

Self-Consistent Reasoning-based Aspect-Sentiment Quad Prediction with Extract-Then-Assign Strategy
JieyongKim
RyangHeoYonsei University
YongsikSeoYonsei University
SeongKuKangUniversity of Illinois Urbana-Champaign
JinyoungYeoYonsei University
DonghaLeeYonsei University
7295-7303
In the task of aspect sentiment quad prediction (ASQP), generative methods for predicting sentiment quads have shown promising results. However, they still suffer from imprecise predictions and limited interpretability, caused by data scarcity and inadequate modeling of the quadruplet composition process. In this paper, we propose Self-Consistent Reasoning-based Aspect sentiment quadruple Prediction (SCRAP), optimizing its model to generate reasonings and the corresponding sentiment quadruplets in sequence. SCRAP adopts the Extract-Then-Assign reasoning strategy, which closely mimics human cognition. In the end, SCRAP significantly improves the model’s ability to handle complex reasoning tasks and correctly predict quadruplets through consistency voting, resulting in enhanced interpretability and accuracy in ASQP.
2024.findings-acl.435

@@ -12088,7 +12088,7 @@
YihongDongPeking University
KangchengLuoPeking University
XueJiangPeking University
ZhiJinPeking University and Peking University
GeLiPeking University Shenzhen Graduate School
7304-7323
Large language models (LLMs) have showcased remarkable potential across various tasks by conditioning on prompts. However, the quality of different human-written prompts leads to substantial discrepancies in LLMs’ performance, and improving prompts usually necessitates considerable human effort and expertise. To this end, this paper proposes Prompt with Actor-Critic Editing (PACE) for LLMs to enable automatic prompt editing. Drawing inspiration from the actor-critic algorithm in reinforcement learning, PACE leverages LLMs as the dual roles of actors and critics, conceptualizing prompt as a type of policy. PACE refines prompt, taking into account the feedback from both actors performing prompt and critics criticizing response. This process helps LLMs better align prompt to a specific task, thanks to real responses and thinking from LLMs. We conduct extensive experiments on 24 instruction induction tasks and 21 big-bench tasks. Experimental results indicate that PACE elevates the relative performance of medium/low-quality human-written prompts by up to 98%, which has comparable performance to high-quality human-written prompts. Moreover, PACE also exhibits notable efficacy for prompt generation.
@@ -12099,10 +12099,10 @@ Penetrative <fixed-case>AI</fixed-case>: Making <fixed-case>LLM</fixed-case>s Comprehend the Physical World HuataoXuHong Kong University of Science and Technology - LiyingHanUniversity of California, Los Angeles + LiyingHanUniversity of California, Los Angeles QiruiYangDepartment of Computer Science and Engineering, Hong Kong University of Science and Technology MoLiThe Hong Kong University of Science and Technology and National Technological University - ManiSrivastavaAmazon and University of California, Los Angeles + ManiSrivastavaAmazon and University of California, Los Angeles 7324-7341 Recent developments in Large Language Models (LLMs) have demonstrated their remarkable capabilities across a range of tasks. Questions, however, persist about the nature of LLMs and their potential to integrate common-sense human knowledge when performing tasks involving information about the real physical world. This paper delves into these questions by exploring how LLMs can be extended to interact with and reason about the physical world through IoT sensors and actuators, a concept that we term “Penetrative AI”. The paper explores such an extension at two levels of LLMs’ ability to penetrate into the physical world via the processing of sensory signals. Our preliminary findings indicate that LLMs, with ChatGPT being the representative example in our exploration, have considerable and unique proficiency in employing the embedded world knowledge for interpreting IoT sensor data and reasoning over them about tasks in the physical realm. Not only this opens up new applications for LLMs beyond traditional text-based tasks, but also enables new ways of incorporating human knowledge in cyber-physical systems. 2024.findings-acl.437 @@ -12112,11 +12112,11 @@ The Impact of Demonstrations on Multilingual In-Context Learning: A Multidimensional Analysis MiaoranZhangSaarland University - VagrantGautamSaarland University + VagrantGautamSaarland University MingyangWang - JesujobaAlabiUniversität des Saarlandes + JesujobaAlabiUniversität des Saarlandes XiaoyuShenAmazon - DietrichKlakowSaarland University + DietrichKlakowSaarland University MariusMosbachMcGill University and Mila - Quebec Artificial Intelligence Institute 7342-7371 In-context learning is a popular inference strategy where large language models solve a task using only a few labeled demonstrations without needing any parameter updates. Although there have been extensive studies on English in-context learning, multilingual in-context learning remains under-explored, and we lack an in-depth understanding of the role of demonstrations in this context. To address this gap, we conduct a multidimensional analysis of multilingual in-context learning, experimenting with 5 models from different model families, 9 datasets covering classification and generation tasks, and 56 typologically diverse languages. Our results reveal that the effectiveness of demonstrations varies significantly across models, tasks, and languages. We also find that strong instruction-following models including Llama 2-Chat, GPT-3.5, and GPT-4 are largely insensitive to the quality of demonstrations. Instead, a carefully crafted template often eliminates the benefits of demonstrations for some tasks and languages altogether. These findings show that the importance of demonstrations might be overestimated. Our work highlights the need for granular evaluation across multiple axes towards a better understanding of in-context learning. 
@@ -12126,10 +12126,10 @@ Rich Semantic Knowledge Enhanced Large Language Models for Few-shot <fixed-case>C</fixed-case>hinese Spell Checking - MingDongCentral China Normal University + MingDongCentral China Normal University YujingChenCentral China Normal University MiaoZhang - HaoSun + HaoSun TingtingHeCentral China Normal University 7372-7383 2024.findings-acl.439 @@ -12138,7 +12138,7 @@ An Empirical Study of In-context Learning in <fixed-case>LLM</fixed-case>s for Machine Translation - PranjalChitaleMicrosoft Research + PranjalChitaleMicrosoft Research JayGalaMohamed bin Zayed University of Artificial Intelligence RajDabreNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology 7384-7406 @@ -12153,9 +12153,9 @@ BoleiMaLudwig-Maximilians-Universität München ChengzhiHu LeonWeber-GenzelLudwig-Maximilians-Universität München - PaulRöttgerBocconi University + PaulRöttgerBocconi University FraukeKreuterUniversity of Maryland, College Park - DirkHovyBocconi University + DirkHovyBocconi University BarbaraPlankLudwig-Maximilians-Universität München and IT University of Copenhagen 7407-7416 The open-ended nature of language generation makes the evaluation of autoregressive large language models (LLMs) challenging. One common evaluation approach uses multiple-choice questions to limit the response space. The model is then evaluated by ranking the candidate answers by the log probability of the first token prediction. However, first-tokens may not consistently reflect the final response output, due to model’s diverse response styles such as starting with “Sure” or refusing to answer. Consequently, first-token evaluation is not indicative of model behaviour when interacting with users. But by how much? We evaluate how aligned first-token evaluation is with the text output along several dimensions, namely final option choice, refusal rate, choice distribution and robustness under prompt perturbation. Our results show that the two approaches are severely misaligned on all dimensions, reaching mismatch rates over 60%. Models heavily fine-tuned on conversational or safety data are especially impacted. Crucially, models remain misaligned even when we increasingly constrain prompts, i.e., force them to start with an option letter or example template. Our findings i) underscore the importance of inspecting the text output as well and ii) caution against relying solely on first-token evaluation. 
@@ -12191,7 +12191,7 @@ A Data-Driven Guided Decoding Mechanism for Diagnostic Captioning PanagiotisKaliosis - JohnPavlopoulosAthens University of Economics and Business + JohnPavlopoulosAthens University of Economics and Business FoivosCharalampakosAthens University of Economics and Business GeorgiosMoschovis IonAndroutsopoulosAthens University of Economics and Business @@ -12205,7 +12205,7 @@ HengyuanZhang YanruWu DaweiLi - SakYangUniversity of the Chinese Academy of Sciences + SakYangUniversity of the Chinese Academy of Sciences RuiZhaoQing Yuan Research Institute, Shanghai Jiao Tong University and SenseTime Research YongJiangTsinghua University FeiTanSensetime Research @@ -12231,8 +12231,8 @@ Light-<fixed-case>PEFT</fixed-case>: Lightening Parameter-Efficient Fine-Tuning via Early Pruning NaibinGu - PengFuInstitute of Information Engineering, Chinese Academy of Sciences - XiyuLiu + PengFuInstitute of Information Engineering, Chinese Academy of Sciences + XiyuLiu BowenShenUniversity of the Chinese Academy of Sciences ZhengLinInstitute of Information Engineering, Chinese Academy of Sciences WeipingWang @@ -12244,7 +12244,7 @@ Building Bridges: A Dataset for Evaluating Gender-Fair Machine Translation into <fixed-case>G</fixed-case>erman - ManuelLardelli + ManuelLardelli GiuseppeAttanasioInstituto de Telecomunicações AnneLauscherUniversität Hamburg 7542-7550 @@ -12257,8 +12257,8 @@ Prompt Chaining or Stepwise Prompt? Refinement in Text Summarization ShichaoSunThe Hong Kong Polytechnic University RuifengYuan - ZiqiangCao - WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University + ZiqiangCao + WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University PengfeiLiu 7551-7558 2024.findings-acl.449 @@ -12269,8 +12269,8 @@ Trust in Internal or External Knowledge? Generative Multi-Modal Entity Linking with Knowledge Retriever XinweiLong JialiZeng - FandongMengWeChat AI, Tencent Inc. - JieZhou + FandongMengWeChat AI, Tencent Inc. + JieZhou BowenZhouTsinghua University 7559-7569 Multi-modal entity linking (MEL) is a challenging task that requires accurate prediction of entities within extensive search spaces, utilizing multi-modal contexts. Existing generative approaches struggle with the knowledge gap between visual entity information and the intrinsic parametric knowledge of LLMs. To address this knowledge gap, we introduce a novel approach called GELR, which incorporates a knowledge retriever to enhance visual entity information by leveraging external sources. Additionally, we devise a prioritization scheme that effectively handles noisy retrieval results and manages conflicts arising from the integration of external and internal knowledge. Moreover, we propose a noise-aware instruction tuning technique during training to finely adjust the model’s ability to leverage retrieved information effectively. Through extensive experiments conducted on three benchmarks, our approach showcases remarkable improvements, ranging from 3.0% to 6.5%, across all evaluation metrics compared to strong baselines. These results demonstrate the effectiveness and superiority of our proposed method in tackling the complexities of multi-modal entity linking. 
@@ -12281,7 +12281,7 @@

A Semantic Distance Metric Learning approach for Lexical Semantic Change Detection
TaichiAidaTokyo Metropolitan University
DanushkaBollegalaAmazon and University of Liverpool
7570-7584
Detecting temporal semantic changes of words is an important task for various NLP applications that must make time-sensitive predictions. Lexical Semantic Change Detection (SCD) task involves predicting whether a given target word, w, changes its meaning between two different text corpora, C_1 and C_2. For this purpose, we propose a supervised two-staged SCD method that uses existing Word-in-Context (WiC) datasets. In the first stage, for a target word w, we learn two sense-aware encoders that represent the meaning of w in a given sentence selected from a corpus. Next, in the second stage, we learn a sense-aware distance metric that compares the semantic representations of a target word across all of its occurrences in C_1 and C_2. Experimental results on multiple benchmark datasets for SCD show that our proposed method achieves strong performance in multiple languages. Additionally, our method achieves significant improvements on WiC benchmarks compared to a sense-aware encoder with conventional distance functions.
2024.findings-acl.451

@@ -12294,7 +12294,7 @@
HuajianZhang
JianhaoYanWestlake University
YongjingYin
YueZhangWestlake University
7585-7606
Recent advances have made non-autoregressive (NAT) translation comparable to autoregressive methods (AT). However, their evaluation using BLEU has been shown to weakly correlate with human annotations. Limited research compares non-autoregressive translation and autoregressive translation comprehensively, leaving uncertainty about the true proximity of NAT to AT. To address this gap, we systematically evaluate four representative NAT methods across various dimensions, including human evaluation. Our empirical results demonstrate that despite narrowing the performance gap, state-of-the-art NAT still underperforms AT under more reliable evaluation metrics. Furthermore, we discover that explicitly modeling dependencies is crucial for generating natural language and generalizing to out-of-distribution sequences.
2024.findings-acl.452

@@ -12304,7 +12304,7 @@
From Zero to Hero: Cold-Start Anomaly Detection
TalReissHebrew University of Jerusalem
GeorgeKourInternational Business Machines
NaamaZwerdling
AteretAnaby TavorInternational Business Machines
YedidHoshenGoogle and Hebrew University of Jerusalem
@@ -12316,11 +12316,11 @@
Large Language Models Fall Short: Understanding Complex Relationships in Detective Narratives
RuncongZhao
QinglinZhuKing’s College London, University of London
HainiuXu
JiazhengLiKing’s College London, University of London
YuxiangZhouKing’s College London
YulanHeKing’s College London, University of London
LinGuiKing’s College London, University of London
7618-7638
Existing datasets for narrative understanding often fail to represent the complexity and uncertainty of relationships in real-life social scenarios. To address this gap, we introduce a new benchmark, Conan, designed for extracting and analysing intricate character relation graphs from detective narratives.
Specifically, we designed hierarchical relationship categories and manually extracted and annotated role-oriented relationships from the perspectives of various characters, incorporating both public relationships known to most characters and secret ones known to only a few. Our experiments with advanced Large Language Models (LLMs) like GPT-3.5, GPT-4, and Llama2 reveal their limitations in inferencing complex relationships and handling longer narratives. The combination of the Conan dataset and our pipeline strategy is geared towards understanding the ability of LLMs to comprehend nuanced relational dynamics in narrative contexts.
2024.findings-acl.454

@@ -12330,9 +12330,9 @@

<fixed-case>D</fixed-case>istill<fixed-case>MIKE</fixed-case>: Editing Distillation of Massive In-Context Knowledge Editing in Large Language Models
ShanbaoQiao
XuebingLiu
Seung-HoonNaChonbuk National University
7639-7654
Among the recently emerged knowledge editing methods, in-context knowledge editing (IKE) has shown respectable abilities on knowledge editing in terms of generalization and specificity. Noting the promising advantages but unexplored issues of IKE, we propose **DistillMIKE** as a novel extension of IKE, i.e., editing **distill**ation of “**M**assive” **I**n-context **K**nowledge **E**diting in large language models (LLMs), mainly consisting of two expansions; 1) *Massive in-context knowledge editing (MIKE)*, which extends IKE to a massive editing task, aiming to inject not a single edit but a set of massive edits to LLMs; To preserve specificity, our key novel extension is a “selective” retrieval augmentation, where the retrieval-augmented IKE is only applied to “in-scope” examples, whereas the unedited model without IKE is employed for “out-of-scope” ones. 2) *Editing distillation* of MIKE using low-rank adaptation (LoRA), which distills editing abilities of MIKE to parameters of LLMs in a manner of eliminating the need of lengthy in-context demonstrations, thus removing the computational overhead encountered at the inference time. Experimental results on the zsRE and CounterFact datasets demonstrate that MIKE shows state-of-the-art performance and DistillMIKE shows comparable performance with MIKE. Our code is available at https://github.com/JoveReCode/DistillMIKE.git.
2024.findings-acl.455

@@ -12341,14 +12341,14 @@
Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding
HemingXia
ZheYangPeking University
QingxiuDong
PeiyiWang
YongqiLiHong Kong Polytechnic University
TaoGe
TianyuLiu
WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University
ZhifangSuiPeking University
7655-7671
To mitigate the high inference latency stemming from autoregressive decoding in Large Language Models (LLMs), Speculative Decoding has emerged as a novel decoding paradigm for LLM inference. In each decoding step, this method first drafts several future tokens efficiently and then verifies them in parallel. Unlike autoregressive decoding, Speculative Decoding facilitates the simultaneous decoding of multiple tokens per step, thereby accelerating inference. This paper presents a comprehensive overview and analysis of this promising decoding paradigm. We begin by providing a formal definition and formulation of Speculative Decoding.
Then, we organize in-depth discussions on its key facets, such as drafter selection and verification strategies. Furthermore, we present a comparative analysis of leading methods under third-party testing environments. We aim for this work to serve as a catalyst for further research on Speculative Decoding, ultimately contributing to more efficient LLM inference.
2024.findings-acl.456

@@ -12358,8 +12358,8 @@
Hierarchy-aware Biased Bound Margin Loss Function for Hierarchical Text Classification
GibaegKim
SangHunImKorea University of Technology and Education
Heung-SeonOhKorea University of Technology and Education
7672-7682
Hierarchical text classification (HTC) is a challenging problem with two key issues: utilizing structural information and mitigating label imbalance. Recently, the unit-based approach generating unit-based feature representations has outperformed the global approach focusing on a global feature representation. Nevertheless, unit-based models using BCE and ZLPR losses still face static thresholding and label imbalance challenges. Those challenges become more critical in large-scale hierarchies. This paper introduces a novel hierarchy-aware loss function for unit-based HTC models: Hierarchy-aware Biased Bound Margin (HBM) loss. HBM integrates learnable bounds, biases, and a margin to address static thresholding and mitigate label imbalance adaptively. Experimental results on benchmark datasets demonstrate the superior performance of HBM compared to competitive HTC models.
2024.findings-acl.457

@@ -12383,10 +12383,10 @@

<fixed-case>CICL</fixed-case>e: Conformal In-Context Learning for Largescale Multi-Class Food Risk Classification
KorbinianRandlStockholm University
JohnPavlopoulosAthens University of Economics and Business
AronHenrikssonStockholm University
TonyLindgrenDepartment of Computer and Systems Sciences
7695-7715
Contaminated or adulterated food poses a substantial risk to human health. Given sets of labeled web texts for training, Machine Learning and Natural Language Processing can be applied to automatically detect such risks. We publish a dataset of 7,546 short texts describing public food recall announcements. Each text is manually labeled, on two granularity levels (coarse and fine), for food products and hazards that the recall corresponds to. We describe the dataset and benchmark naive, traditional, and Transformer models. Based on our analysis, Logistic Regression based on a TF-IDF representation outperforms RoBERTa and XLM-R on classes with low support. Finally, we discuss different prompting strategies and present an LLM-in-the-loop framework, based on Conformal Prediction, which boosts the performance of the base classifier while reducing energy consumption compared to normal prompting.
2024.findings-acl.459

@@ -12398,8 +12398,8 @@
RuikangLiu
HaoliBaiHuawei Technologies Ltd.
HaokunLin
YueningLi
HanGaoHuawei Technologies Ltd.
ZhengzhuoXu
LuHouHuawei Technologies Ltd.
JunYaoHuawei Technologies Ltd.
@@ -12413,7 +12413,7 @@ Learning Adverbs with Spectral Mixture Kernels TomoeTaniguchiOchanomizu Women’s University - DaichiMochihashi + DaichiMochihashi IchiroKobayashiOchanomizu University 7742-7752 For humans and robots to collaborate more in the real world, robots need to understand human intentions from the different manner of their behaviors. In our study, we focus on the meaning of adverbs which describe human motions. We propose a topic model, Hierarchical Dirichlet Process-Spectral Mixture Latent Dirichlet Allocation, which concurrently learns the relationship between those human motions and those adverbs by capturing the frequency kernels that represent motion characteristics and the shared topics of adverbs that depict such motions. We trained the model on datasets we made from movies about “walking” and “dancing”, and found that our model outperforms representative neural network models in terms of perplexity score. We also demonstrate our model’s ability to determine the adverbs for a given motion and confirmed that the model predicts more appropriate adverbs. @@ -12429,10 +12429,10 @@ XiangtaoKong ZhigangZheng DaijiaTang - ChengmingLiShenzhen MSU-BIT University - XipingHuBeijing Institute of Technology + ChengmingLiShenzhen MSU-BIT University + XipingHuBeijing Institute of Technology RuifengXuHarbin Institute of Technology - ShiwenNiShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences + ShiwenNiShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences 7753-7774 The rapid development of Large Language Models (LLMs) has led to their increasing utilization in Chinese K-12 education. Despite the growing integration of LLMs and education, the absence of a dedicated benchmark for evaluating LLMs within this domain presents a pressing concern. Consequently, there is an urgent need for a comprehensive natural language processing benchmark to precisely assess the capabilities of various LLMs in Chinese K-12 education. In response, we introduce E-EVAL, the first comprehensive evaluation benchmark specifically tailored for Chinese K-12 education. E-EVAL comprises 4,351 multiple-choice questions spanning primary, middle, and high school levels, covering a diverse array of subjects. Through meticulous evaluation, we find that Chinese-dominant models often outperform English-dominant ones, with many exceeding GPT 4.0. However, most struggle with complex subjects like mathematics. Additionally, our analysis indicates that most Chinese-dominant LLMs do not achieve higher scores at the primary school level compared to the middle school level, highlighting the nuanced relationship between proficiency in higher-order and lower-order knowledge domains. Furthermore, experimental results highlight the effectiveness of the Chain of Thought (CoT) technique in scientific subjects and Few-shot prompting in liberal arts. Through E-EVAL, we aim to conduct a rigorous analysis delineating the strengths and limitations of LLMs in educational applications, thereby contributing significantly to the advancement of Chinese K-12 education and LLMs. 
@@ -12442,12 +12442,12 @@ <fixed-case>C</fixed-case>hart<fixed-case>A</fixed-case>ssistant: A Universal Chart Multimodal Language Model via Chart-to-Table Pre-training and Multitask Instruction Tuning - FanqingMeng + FanqingMeng WenqiShao QuanfengLuShanghai Jiaotong University and Nanjing university PengGaoshanghai ai lab KaipengZhangShanghai AI Laboratory - YuQiao + YuQiao PingLuoThe University of Hong Kong 7775-7803 Charts play a vital role in data visualization, understanding data patterns, and informed decision-making. However, their unique combination of graphical elements (e.g., bars, lines) and textual components (e.g., labels, legends) poses challenges for general-purpose multimodal models. While vision-language models trained on chart data excel in comprehension, they struggle with generalization. To address these challenges, we propose ChartAssistant, a chart-based vision-language model for universal chart comprehension and reasoning. ChartAssistant leverages ChartSFT, a comprehensive dataset covering diverse chart-related tasks with basic (e.g. bars and pies) and specialized (e.g. radars, and bubbles) chart types. It undergoes a two-stage training process, starting with pre-training on chart-to-table parsing to align chart and text, followed by multitask instruction-following fine-tuning. This approach enables ChartAssistant to achieve competitive performance across various chart tasks. Experimental results demonstrate significant performance gains over the state-of-the-art UniChart and ChartLlama methods, especially outperforming them on real-world chart data with zero-shot setting. The code and data are available at https://github.com/OpenGVLab/ChartAst. @@ -12488,8 +12488,8 @@ HaoxinLiu ZhiyuanZhao JindongWangMicrosoft Research - HarshavardhanKamarthiGeorgia Institute of Technology - B. AdityaPrakashGeorgia Institute of Technology + HarshavardhanKamarthiGeorgia Institute of Technology + B. AdityaPrakashGeorgia Institute of Technology 7832-7840 Time-series forecasting (TSF) finds broad applications in real-world scenarios. Prompting off-the-shelf Large Language Models (LLMs) demonstrates strong zero-shot TSF capabilities while preserving computational efficiency. However, existing prompting methods oversimplify TSF as language next-token predictions, overlooking its dynamic nature and lack of integration with state-of-the-art prompt strategies such as Chain-of-Thought. Thus, we propose LSTPrompt, a novel approach for prompting LLMs in zero-shot TSF tasks. LSTPrompt decomposes TSF into short-term and long-term forecasting sub-tasks, tailoring prompts to each. LSTPrompt guides LLMs to regularly reassess forecasting mechanisms to enhance adaptability. Extensive evaluations demonstrate consistently better performance of LSTPrompt than existing prompting methods, and competitive results compared to foundation TSF models. 
2024.findings-acl.466

@@ -12499,8 +12499,8 @@

Mitigating Boundary Ambiguity and Inherent Bias for Text Classification in the Era of Large Language Models
ZhenyiLu
JieTian
WeiWeiHuazhong University of Science and Technology
XiaoyeQuShanghai Artificial Intelligence Laboratory
YuChengThe Chinese University of Hong Kong
WenfengXie
@@ -12513,12 +12513,12 @@

<fixed-case>UOR</fixed-case>: Universal Backdoor Attacks on Pre-trained Language Models
WeiDu
PeixuanLi
HaodongZhaoShanghai Jiaotong University
TianjieJu
GeRen
GongshenLiuShanghai Jiao Tong University
7865-7877
Task-agnostic and transferable backdoors implanted in pre-trained language models (PLMs) pose a severe security threat as they can be inherited to any downstream task. However, existing methods rely on manual selection of triggers and backdoor representations, hindering their effectiveness and universality across different PLMs or usage paradigms. In this paper, we propose a new backdoor attack method called UOR, which overcomes these limitations by turning manual selection into automatic optimization. Specifically, we design poisoned supervised contrastive learning, which can automatically learn more uniform and universal backdoor representations. This allows for more even coverage of the output space, thus hitting more labels in downstream tasks after fine-tuning. Furthermore, we utilize gradient search to select appropriate trigger words that can be adapted to different PLMs and vocabularies. Experiments show that UOR achieves better attack performance on various text classification tasks compared to manual methods. Moreover, we test on PLMs with different architectures, usage paradigms, and more challenging tasks, achieving higher scores for universality.
2024.findings-acl.468

@@ -12527,9 +12527,9 @@
Language models emulate certain cognitive profiles: An investigation of how predictability measures interact with individual differences
PatrickHallerUniversity of Zurich
LenaBolligerUniversity of Zurich
LenaJägerUniversity of Zurich and Universität Potsdam
7878-7892
To date, most investigations on surprisal and entropy effects in reading have been conducted on the group level, disregarding individual differences. In this work, we revisit the predictive power (PP) of different LMs’ surprisal and entropy measures on data of human reading times as a measure of processing effort by incorporating information of language users’ cognitive capacities. To do so, we assess the PP of surprisal and entropy estimated from generative language models (LMs) on reading data obtained from individuals who also completed a wide range of psychometric tests. Specifically, we investigate if modulating surprisal and entropy relative to cognitive scores increases prediction accuracy of reading times, and we examine whether LMs exhibit systematic biases in the prediction of reading times for cognitively high- or low-performing groups, revealing what type of psycholinguistic subjects a given LM emulates. Our study finds that in most cases, incorporating cognitive capacities increases predictive power of surprisal and entropy on reading times, and that generally, high performance in the psychometric tests is associated with lower sensitivity to predictability effects.
Finally, our results suggest that the analyzed LMs emulate readers with lower verbal intelligence, suggesting that for a given target group (i.e., individuals with high verbal intelligence), these LMs provide less accurate predictability effect estimates. 2024.findings-acl.469 @@ -12539,7 +12539,7 @@ The State of Relation Extraction Data Quality: Is Bigger Always Better? EricaCaiDepartment of Computer Science, University of Massachusetts at Amherst - BrendanO’ConnorUniversity of Massachusetts, Amherst + BrendanO’ConnorUniversity of Massachusetts, Amherst 7893-7906 Relation extraction (RE) extracts structured tuples of relationships (e.g. friend, enemy) between entities (e.g. Sherlock Holmes, John Watson) from text, with exciting potential applications. Hundreds of RE papers have been published in recent years; do their evaluation practices inform these goals? We review recent surveys and a sample of recent RE methods papers, compiling 38 datasets currently being used. Unfortunately, many have frequent label errors, and ones with known problems continue to be used. Many datasets focus on producing labels for a large number of relation types, often through error-prone annotation methods (e.g. distant supervision or crowdsourcing), and many recent papers rely exclusively on such datasets. We draw attention to a promising alternative: datasets with a small number of relations, often in specific domains like chemistry, finance, or biomedicine, where it is possible to obtain high quality expert annotations; such data can more realistically evaluate RE performance. The research community should consider more often using such resources. 2024.findings-acl.470 @@ -12550,11 +12550,11 @@ <fixed-case>N</fixed-case>atural<fixed-case>C</fixed-case>ode<fixed-case>B</fixed-case>ench: Examining Coding Performance Mismatch on <fixed-case>H</fixed-case>uman<fixed-case>E</fixed-case>val and Natural User Queries ShudanZhang HanlinZhao - XiaoLiu + XiaoLiu QinkaiZheng ZehanQiTsinghua University XiaotaoGuZhipu AI - YuxiaoDongTsinghua University + YuxiaoDongTsinghua University JieTangTsinghua University, Tsinghua University 7907-7928 Large language models (LLMs) have manifested strong ability to generate codes for productive activities. However, current benchmarks for code synthesis, such as HumanEval, MBPP, and DS-1000, are predominantly oriented towards introductory tasks on algorithm and data science, insufficiently satisfying challenging requirements prevalent in real-world coding. To fill this gap, we propose NaturalCodeBench (NCB), a challenging code benchmark designed to mirror the complexity and variety of scenarios in real coding tasks. NCB comprises 402 high-quality problems in Python and Java, meticulously selected from natural user queries from online coding services, covering 6 different domains. Noting the extraordinary difficulty in creating testing cases for real-world queries, we also introduce a semi-automated pipeline to enhance the efficiency of test case construction. Comparing with manual solutions, it achieves an efficiency increase of more than 4 times. Our systematic experiments on 39 LLMs find that performance gaps on NCB between models with close HumanEval scores could still be significant, indicating a lack of focus on practical code synthesis scenarios or over-specified optimization on HumanEval. On the other hand, even the best-performing GPT-4 is still far from satisfying on NCB. The evaluation toolkit and development set are available at https://github.com/THUDM/NaturalCodeBench. 
@@ -12575,7 +12575,7 @@ Empowering cross-lingual abilities of instruction-tuned large language models by translation-following demonstrations - LeonardoRanaldiIdiap Research Institute + LeonardoRanaldiIdiap Research Institute GiuliaPucci AndreFreitasIdiap Research Institute and University of Manchester 7961-7973 @@ -12598,7 +12598,7 @@ Efficient <tex-math>k</tex-math>-Nearest-Neighbor Machine Translation with Dynamic Retrieval YanGao - ZhiweiCao + ZhiweiCao ZhongjianMiao BaosongYang ShiyuLiu @@ -12614,8 +12614,8 @@ Symmetric Dot-Product Attention for Efficient Training of <fixed-case>BERT</fixed-case> Language Models MartinCourtoisGerman Research Center for AI MalteOstendorffGerman Research Center for AI - LeonhardHennigGerman Research Center for AI - GeorgRehmHumboldt Universität Berlin and Deutsches Forschungszentrum für Künstliche Intelligenz + LeonhardHennigGerman Research Center for AI + GeorgRehmHumboldt Universität Berlin and Deutsches Forschungszentrum für Künstliche Intelligenz 8002-8011 Initially introduced as a machine translation model, the Transformer architecture has now become the foundation for modern deep learning architecture, with applications in a wide range of fields, from computer vision to natural language processing. Nowadays, to tackle increasingly more complex tasks, Transformer-based models are stretched to enormous sizes, requiring increasingly larger training datasets, and unsustainable amount of compute resources. The ubiquitous nature of the Transformer and its core component, the attention mechanism, are thus prime targets for efficiency research.In this work, we propose an alternative compatibility function for the self-attention mechanism introduced by the Transformer architecture. This compatibility function exploits an overlap in the learned representation of the traditional scaled dot-product attention, leading to a symmetric with pairwise coefficient dot-product attention. When applied to the pre-training of BERT-like models, this new symmetric attention mechanism reaches a score of 79.36 on the GLUE benchmark against 78.74 for the traditional implementation, leads to a reduction of 6% in the number of trainable parameters, and reduces the number of training steps required before convergence by half. 2024.findings-acl.476 @@ -12627,7 +12627,7 @@ FanyouWuAmazon WeijieXu ChandanReddyVirginia Tech - SrinivasanSengameduAmazon + SrinivasanSengameduAmazon 8012-8026 In this study, we tackle the challenge of inadequate and costly training data that has hindered the development of conversational question answering (ConvQA) systems. Enterprises have a large corpus of diverse internal documents. Instead of relying on a searching engine, a more compelling approach for people to comprehend these documents is to create a dialogue system. In this paper, we propose a robust dialog synthesising method. We learn the segmentation of data for the dialog task instead of using segmenting at sentence boundaries. The synthetic dataset generated by our proposed method achieves superior quality when compared to WikiDialog, as assessed through machine and human evaluations. By employing our inpainted data for ConvQA retrieval system pre-training, we observed a notable improvement in performance across OR-QuAC benchmarks. 
2024.findings-acl.477 @@ -12647,7 +12647,7 @@ Alignment-Based Decoding Policy for Low-Latency and Anticipation-Free Neural <fixed-case>J</fixed-case>apanese Input Method Editors ArminSarhangzadeh - TaroWatanabeNara Institute of Science and Technology, Japan + TaroWatanabeNara Institute of Science and Technology, Japan 8043-8054 Japanese input method editors (IMEs) are essential tools for inputting Japanese text using a limited set of characters such as the kana syllabary. However, despite their importance, the potential of newer attention-based encoder-decoder neural networks, such as Transformer, has not yet been fully explored for IMEs due to their high computational cost and low-quality intermediate output in simultaneous settings, leading to high latencies. In this work, we propose a simple decoding policy to enable the use of attention-based encoder-decoder networks for simultaneous kana-kanji conversion in the context of Japanese IMEs inspired by simultaneous machine translation (SimulMT). We demonstrate that simply decoding by explicitly considering the word boundaries achieves a fairly strong quality-latency trade-off, as it can be seen as equivalent to performing decoding on aligned prefixes and thus achieving an incremental anticipation-free conversion. We further show how such a policy can be applied in practice to achieve high-quality conversions with minimal computational overhead. Our experiments show that our approach can achieve a noticeably better quality-latency trade-off compared to the baselines, while also being a more practical approach due to its ability to directly handle streaming input. Our code is available at https://anonymous.4open.science/r/transformer_ime-D327. 2024.findings-acl.479 @@ -12658,12 +12658,12 @@ <fixed-case>EC</fixed-case>o<fixed-case>K</fixed-case>: Emotional Commonsense Knowledge Graph for Mining Emotional Gold ZhunhengWangNankai University XiaoyiLiu - MengtingHuNankai University + MengtingHuNankai University RuiYing MingJiangNankai University JianfengWu YalanXieNankai University - HangGaoTianjin University of Science and Technology + HangGaoTianjin University of Science and Technology RenhongCheng 8055-8074 The demand for understanding and expressing emotions in the field of natural language processing is growing rapidly. Knowledge graphs, as an important form of knowledge representation, have been widely utilized in various emotion-related tasks. However, existing knowledge graphs mainly focus on the representation and reasoning of general factual knowledge, while there are still significant deficiencies in the understanding and reasoning of emotional knowledge. In this work, we construct a comprehensive and accurate emotional commonsense knowledge graph, ECoK. We integrate cutting-edge theories from multiple disciplines such as psychology, cognitive science, and linguistics, and combine techniques such as large language models and natural language processing. By mining a large amount of text, dialogue, and sentiment analysis data, we construct rich emotional knowledge and establish the knowledge generation model COMET-ECoK. Experimental results show that ECoK contains high-quality emotional reasoning knowledge, and the performance of our knowledge generation model surpasses GPT-4-Turbo, which can help downstream tasks better understand and reason about emotions. Our data and code is available from https://github.com/ZornWang/ECoK. 
@@ -12674,8 +12674,8 @@ Deterministic Reversible Data Augmentation for Neural Machine Translation JiashuYao - HeyanHuangBeijing Institute of Technology - ZemingLiu + HeyanHuangBeijing Institute of Technology + ZemingLiu YuhangGuo 8075-8089 Data augmentation is an effective way to diversify corpora in machine translation, but previous methods may introduce semantic inconsistency between original and augmented data because of irreversible operations and random subword sampling procedures. To generate both symbolically diverse and semantically consistent augmentation data, we propose Deterministic Reversible Data Augmentation (DRDA), a simple but effective data augmentation method for neural machine translation. DRDA adopts deterministic segmentations and reversible operations to generate multi-granularity subword representations and pulls them closer together with multi-view techniques. With no extra corpora or model changes required, DRDA outperforms strong baselines on several translation tasks with a clear margin (up to 4.3 BLEU gain over Transformer) and exhibits good robustness in noisy, low-resource, and cross-domain datasets. @@ -12715,7 +12715,7 @@ Characterizing Large Language Models as Rationalizers of Knowledge-intensive Tasks AditiMishra - SajjadurRahmanMegagon Labs + SajjadurRahmanMegagon Labs KushanMitra HannahKimMegagon Labs EstevamHruschkaMegagon Labs and Carnegie Mellon University @@ -12740,8 +12740,8 @@ Linear Cross-Lingual Mapping of Sentence Embeddings - OlegVasilyevPrimer Technologies - FumikaIsonoPrimer AI + OlegVasilyevPrimer Technologies + FumikaIsonoPrimer AI JohnBohannon 8163-8171 Semantics of a sentence is defined with much less ambiguity than semantics of a single word, and we assume that it should be better preserved by translation to another language. If multilingual sentence embeddings intend to represent sentence semantics, then the similarity between embeddings of any two sentences must be invariant with respect to translation. Based on this suggestion, we consider a simple linear cross-lingual mapping as a possible improvement of the multilingual embeddings. We also consider deviation from orthogonality conditions as a measure of deficiency of the embeddings. @@ -12775,8 +12775,8 @@ <fixed-case>BASS</fixed-case>: Batched Attention-optimized Speculative Sampling - HaifengQianAmazon - Sujan KumarGonugondlaAmazon + HaifengQianAmazon + Sujan KumarGonugondlaAmazon SungsooHaAmazon MingyueShangAmazon Sanjay KrishnaGoudaAmazon @@ -12795,7 +12795,7 @@ DekunWuUniversité de Montréal HaochenShi ZhiyuanSun - BangLiuUniversity of Montreal + BangLiuUniversity of Montreal 8225-8291 In this study, we explore the application of Large Language Models (LLMs) in Jubensha, a Chinese detective role-playing game and a novel area in Artificial Intelligence (AI) driven gaming. We introduce the first dataset specifically for Jubensha, including character scripts and game rules, to foster AI agent development in this complex narrative environment. Our work also presents a unique multi-agent interaction framework using LLMs, allowing AI agents to autonomously engage in Jubensha games. To evaluate the gaming performance of these AI agents, we developed novel methods measuring their mastery of case information and reasoning skills. Furthermore, we incorporated the latest advancements in prompting engineering to enhance the agents’ performance in information gathering, murderer identification, and logical reasoning. 
The experimental results validate the effectiveness of our proposed methods. This work aims to offer a novel perspective on understanding LLM capabilities and establish a new benchmark for evaluating large language model-based agents.
2024.findings-acl.490
@@ -12804,8 +12804,8 @@
It Is Not About What You Say, It Is About How You Say It: A Surprisingly Simple Approach for Improving Reading Comprehension
- SagiShaier
- LawrenceHunterUniversity of Colorado at Denver
+ SagiShaier
+ LawrenceHunterUniversity of Colorado at Denver
KatharinaWenseJohannes-Gutenberg Universität Mainz, University of Colorado, Boulder and New York University
8292-8305
Natural language processing has seen rapid progress over the past decade. Due to the speed of developments, some practices get established without proper evaluation. Considering one such case and focusing on reading comprehension, we ask our first research question: 1) How does the order of inputs – i.e., question and context – affect model performance? Additionally, given recent advancements in input emphasis, we ask a second research question: 2) Does emphasizing either the question, the context, or both enhance performance? Experimenting with 9 large language models across 3 datasets, we find that presenting the context before the question improves model performance, with an accuracy increase of up to 31%. Furthermore, emphasizing the context yields superior results compared to question emphasis, and in general, emphasizing parts of the input is particularly effective for addressing questions that models lack the parametric knowledge to answer. Experimenting with both prompt-based and attention-based emphasis methods, we additionally find that the best method is surprisingly simple: it only requires concatenating a few tokens to the input and results in an accuracy improvement of up to 36%, allowing smaller models to outperform their significantly larger counterparts.
@@ -12829,7 +12829,7 @@
XinyuWangUniversity of Warwick
HainiuXu
LinGuiKing’s College London, University of London
- YulanHeKing’s College London, University of London
+ YulanHeKing’s College London, University of London
8324-8340
Task embedding, a meta-learning technique that captures task-specific information, has gained popularity, especially in areas such as multi-task learning, model editing, and interpretability. However, it faces challenges with the emergence of prompt-guided Large Language Models (LLMs) operating in a gradient-free manner. Existing task embedding methods rely on fine-tuned, task-specific language models, which hinders the adaptability of task embeddings across diverse models, especially prompt-based LLMs. To harness the potential of task embeddings in the era of LLMs, we propose a framework for unified task embeddings (FUTE), harmonizing task embeddings from various models, including smaller language models and LLMs with varied prompts, within a single vector space. Such uniformity enables comparison and analysis of similarities amongst different models, broadening the scope and utility of existing task embedding methods in multi-model scenarios, while maintaining their performance comparable to architecture-specific methods.
2024.findings-acl.493
@@ -12841,7 +12841,7 @@
YinhongLiu
YimaiFangApple
DavidVandyke
- NigelCollierUniversity of Cambridge
+ NigelCollierUniversity of Cambridge
8341-8356
In light of recent advances in large language models (LLMs), the expectations for the next generation of virtual assistants include enhanced naturalness and adaptability across diverse usage scenarios. However, the creation of high-quality annotated data for Task-Oriented Dialog (TOD) is recognized to be slow and costly. To address these challenges, we introduce Task-Oriented Automatic Dialogs (TOAD), a novel and scalable TOD dataset along with its automatic generation pipeline. The TOAD dataset simulates realistic app context interaction and provides a variety of system response style options. Two aspects of system response styles are considered: verbosity level and users’ expression mirroring. We benchmark TOAD on two response generation tasks, and the results show that modeling more verbose responses or responses without user expression mirroring is more challenging.
2024.findings-acl.494
@@ -12851,7 +12851,7 @@
Machine-Generated Text Localization
ZhongpingZhang
- WendaQin
+ WendaQin
BryanPlummerBoston University
8357-8371
Machine-Generated Text (MGT) detection aims to identify a piece of text as machine or human written. Prior work has primarily formulated MGT detection as a binary classification task over an entire document, with limited work exploring cases where only part of a document is machine generated. This paper provides the first in-depth study of MGT that localizes the portions of a document that were machine generated. Thus, if a bad actor were to change a key portion of a news article to spread misinformation, whole document MGT detection may fail since the vast majority is human written, but our approach can succeed due to its granular approach. A key challenge in our MGT localization task is that short spans of text, *e.g.*, a single sentence, provide little information indicating if they are machine generated due to their short length. To address this, we leverage contextual information, where we predict whether multiple sentences are machine or human written at once. This enables our approach to identify changes in style or content to boost performance. A gain of 4-13% mean Average Precision (mAP) over prior work demonstrates the effectiveness of our approach on five diverse datasets: GoodNews, VisualNews, WikiText, Essay, and WP. We release our implementation at https://github.com/Zhongping-Zhang/MGT_Localization.
@@ -12861,8 +12861,8 @@
<fixed-case>B</fixed-case>ench<fixed-case>IE</fixed-case>^<fixed-case>FL</fixed-case>: A Manually Re-Annotated Fact-Based Open Information Extraction Benchmark
- FabriceLamarche
- PhilippeLanglaisUniversité de Montréal
+ FabriceLamarche
+ PhilippeLanglaisUniversité de Montréal
8372-8394
Open Information Extraction (OIE) is a field of natural language processing that aims to present textual information in a format that allows it to be organized, analyzed and reflected upon. Numerous OIE systems are developed, claiming ever-increasing performance, marking the need for objective benchmarks. BenchIE is the latest reference we know of. Despite being very well thought out, we noticed a number of issues we believe are limiting. Therefore, we propose BenchIE^FL, a new OIE benchmark which fully enforces the principles of BenchIE while containing fewer errors, omissions and shortcomings when candidate facts are matched towards reference ones.
BenchIE^FL allows insightful conclusions to be drawn on the actual performance of OIE extractors.
2024.findings-acl.496
@@ -12872,12 +12872,12 @@
<fixed-case>C</fixed-case>ausal<fixed-case>C</fixed-case>ite: A Causal Formulation of Paper Citations
IshanAgrawal
- ZhijingJin
+ ZhijingJin
EhsanMokhtarianSwiss Federal Institute of Technology Lausanne
SiyuanGuo
YuenChenUniversity of Illinois at Urbana-Champaign
MrinmayaSachanSwiss Federal Institute of Technology
- BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
+ BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
8395-8410
Citation count of a paper is a commonly used proxy for evaluating the significance of a paper in the scientific community. Yet citation measures are widely criticized for failing to accurately reflect the true impact of a paper. Thus, we propose CausalCite, a new way to measure the significance of a paper by assessing the causal impact of the paper on its follow-up papers. CausalCite is based on a novel causal inference method, TextMatch, which adapts the traditional matching framework to high-dimensional text embeddings. TextMatch encodes each paper using text embeddings from large language models (LLMs), extracts similar samples by cosine similarity, and synthesizes a counterfactual sample as the weighted average of similar papers according to their similarity values. We demonstrate the effectiveness of CausalCite on various criteria, such as high correlation with paper impact as reported by scientific experts on a previous dataset of 1K papers, (test-of-time) awards for past papers, and its stability across various subfields of AI. We also provide a set of findings that can serve as suggested ways for future researchers to use our metric for a better understanding of the quality of a paper. Our code is available at https://github.com/causalNLP/causal-cite.
2024.findings-acl.497
@@ -12917,7 +12917,7 @@
Multi-Label Classification for Implicit Discourse Relation Recognition
WanqiuLong
- SiddharthNUniversity of Edinburgh
+ SiddharthNUniversity of Edinburgh
BonnieWebberEdinburgh University, University of Edinburgh
8437-8451
Discourse relations play a pivotal role in establishing coherence within textual content, uniting sentences and clauses into a cohesive narrative. The Penn Discourse Treebank (PDTB) stands as one of the most extensively utilized datasets in this domain. In PDTB-3, the annotators can assign multiple labels to an example when they believe multiple relations are simultaneously present. Prior research in discourse relation recognition has treated these instances as separate examples during training, with a gold-standard prediction matching one of the labels considered correct at test time. However, this approach is inadequate, as it fails to account for the interdependence of labels in real-world contexts and to distinguish between cases where only one sense relation holds and cases where multiple relations hold simultaneously. In our work, we address this challenge by exploring various multi-label classification frameworks to handle implicit discourse relation recognition. We show that the methods for multi-label prediction do not degrade performance for single-label prediction. Additionally, we give a comprehensive analysis of results and data. Our work contributes to advancing the understanding and application of discourse relations and provides a foundation for future study.
@@ -12928,10 +12928,10 @@
<fixed-case>S</fixed-case>tudent<fixed-case>E</fixed-case>val: A Benchmark of Student-Written Prompts for Large Language Models of Code
Hannah McLeanBabe
- SydneyNguyen
- YangtianZi
+ SydneyNguyen
+ YangtianZi
ArjunGuha
- Molly QFeldman
+ Molly QFeldman
Carolyn JaneAnderson
8452-8474
Code LLMs have the potential to make it easier for non-experts to understand and write code. However, current CodeLLM benchmarks rely on a single expert-written prompt per problem, making it hard to generalize their success to non-expert users. In this paper, we present a new natural-language-to-code benchmark of prompts written by a key population of non-experts: beginning programmers. StudentEval contains 1,749 prompts written by 80 students who have only completed one introductory Python course. StudentEval contains numerous non-expert prompts describing the same problem, enabling exploration of key factors in prompt success. We use StudentEval to evaluate 12 Code LLMs and find that StudentEval is a better discriminator of model performance than existing benchmarks. Our analysis of student prompting strategies reveals that nondeterministic LLM sampling can mislead students about the quality of their descriptions, a finding with key implications for Code LLMs in education.
@@ -12953,9 +12953,9 @@
Generating Diverse and High-Quality Texts by Minimum <fixed-case>B</fixed-case>ayes Risk Decoding
YuuJinnaiCyberAgent, Inc.
- UkyoHondaCyberAgent, Inc.
+ UkyoHondaCyberAgent, Inc.
TetsuroMorimuraCyberAgent, Inc.
- PeinanZhangCyberAgent AI Lab
+ PeinanZhangCyberAgent AI Lab
8494-8525
One of the most important challenges in text generation systems is to produce outputs that are not only correct but also diverse. Recently, Minimum Bayes-Risk (MBR) decoding has gained prominence for generating sentences of the highest quality among the decoding algorithms. However, existing algorithms proposed to generate diverse outputs are predominantly based on beam search or random sampling, thus their output quality is capped by these underlying decoding algorithms. In this paper, we investigate an alternative approach – we develop diversity-promoting decoding algorithms by enforcing diversity objectives to MBR decoding. We propose two variants of MBR; (i) Diverse MBR (DMBR) that adds a diversity penalty to the decoding objective and (ii) k-medoids MBR (KMBR) that reformulates the decoding task as a clustering problem. We evaluate DMBR and KMBR on a variety of directed text generation tasks using encoder-decoder models and a language model with prompting. The experimental results show that the proposed method achieves a better trade-off than the diverse beam search and sampling algorithms overall.
2024.findings-acl.503
@@ -12999,9 +12999,9 @@
Bi-Chainer: Automated Large Language Models Reasoning with Bidirectional Chaining
- ShuqiLiu
- BoweiHe
- LinqiSongCity University of Hong Kong
+ ShuqiLiu
+ BoweiHe
+ LinqiSongCity University of Hong Kong
8578-8598
Large Language Models (LLMs) have shown human-like reasoning abilities but still face challenges in solving complex logical problems. Existing unidirectional chaining methods, such as forward chaining and backward chaining, suffer from issues like low prediction accuracy and efficiency. To address these, we propose a bidirectional chaining method, Bi-Chainer, which dynamically switches to depth-first reasoning in the opposite reasoning direction when it encounters multiple branching options within the current direction.
Thus, the intermediate reasoning results can be utilized as guidance to facilitate the reasoning process. We show that Bi-Chainer achieves sizable accuracy boosts over unidirectional chaining frameworks on four challenging logical reasoning datasets. Moreover, Bi-Chainer enhances the accuracy of intermediate proof steps and reduces the average number of inference calls, resulting in more efficient and accurate reasoning.
2024.findings-acl.507
@@ -13022,8 +13022,8 @@
Knowledge Context Modeling with Pre-trained Language Models for Contrastive Knowledge Graph Completion
GuangqianYangUniversity of Science and Technology of China
YiLiuState Key Laboratory of Communication Content Cognition
- LeiZhangUniversity of Science and Technology of China
- LichengZhang
+ LeiZhangUniversity of Science and Technology of China
+ LichengZhang
HongtaoXieUniversity of Science and Technology of China
ZhendongMaoUniversity of Science and Technology of China
8619-8630
@@ -13039,7 +13039,7 @@
XianLiAmazon
JingboShangUniversity of California, San Diego
HoangNguyen
- PhilipYuUniversity of Illinois, Chicago
+ PhilipYuUniversity of Illinois, Chicago
8631-8643
Attribute value extraction involves identifying the value spans of predetermined attributes in product texts. This area of research has traditionally operated under a closed-world assumption, focusing on products from a static set of categories and their associated attributes. However, products in e-commerce stores are ever-increasing and evolving, calling for life-long learning. If continuously trained on the fast-increasing products and attributes, most existing solutions not only struggle for parameter efficiency but also endure foreseeable defects due to data contamination, catastrophic forgetting, etc. As a remedy, we propose and study a new task, which aims to effectively maintain a strong single model for many domains in a life-long learning fashion, without jeopardizing the model performance and parameter efficiency. We introduce factorization into the model and make it domain-aware by decoupling the modeling of product type and attribute, as a way to promote de-contamination and parameter efficiency while scaling up. Tuning the model with distillation prevents forgetting historical knowledge and enables continuous learning from emerging domains. Experiments on hundreds of domains showed that our model attains the near state-of-the-art performance with affordable parameter size, the least historical knowledge forgetting, and the greatest robustness against noises, whilst adding only a few parameters per domain when compared with competitive baselines.
2024.findings-acl.510
@@ -13050,7 +13050,7 @@
Exploring Domain Robust Lightweight Reward Models based on Router Mechanism
HyukNamgoongChungnam National University
JeesuJung
- SangkeunJung
+ SangkeunJung
YoonHyungRohElectronics and Telecommunications Research Institute
8644-8652
Recent advancements in large language models have heavily relied on the large reward model from reinforcement learning from human feedback for fine-tuning. However, the use of a single reward model across various domains may not always be optimal, often requiring retraining from scratch when new domain data is introduced. To address these challenges, we explore the utilization of small language models operating in a domain-specific manner based on router mechanisms.
Our three approaches are: 1) utilizing a mixture of experts to form a single reward model by modularizing an internal router and experts, 2) employing an external router to select the appropriate reward model from multiple domain-specific models, and 3) reducing parameter size by loading reward models and router adapters onto a single small language model using adapters. Experimental validation underscores the effectiveness of our approach, demonstrating performance comparable to baseline methods while also reducing the total parameter size.
@@ -13062,13 +13062,13 @@
Generalized Category Discovery with Large Language Models in the Loop
WenbinAnXi’an Jiaotong University
WenkaiShi
- FengTianXi’an Jiaotong University
+ FengTianXi’an Jiaotong University
HaonanLinXi’an Jiaotong University
QianYingWang
- YaqiangWuLenovo Research
+ YaqiangWuLenovo Research
MingxiangCai
LuyanWang
- YanChenXi’an Jiaotong University
+ YanChenXi’an Jiaotong University
HaipingZhuXi’an Jiaotong University
PingChenUniversity of Massachusetts, Boston
8653-8665
@@ -13124,7 +13124,7 @@
LiangDing
HaotongQinETHZ - ETH Zurich
XiabinZhou
- YifuDingBeihang University
+ YifuDingBeihang University
XueboLiuHarbin Institute of Technology, Shenzhen
MinZhangHarbin Institute of Technology, Shenzhen
JinyangGuoBeijing University of Aeronautics and Astronautics
@@ -13143,7 +13143,7 @@
YiLiuPeking University
YuxiangWang
ShuhuaiRen
- LeiLiUniversity of Hong Kong
+ LeiLiUniversity of Hong Kong
SishuoChenAlibaba Group
XuSun
LuHouHuawei Technologies Ltd.
@@ -13156,7 +13156,7 @@
“Get Their Hands Dirty, Not Mine”: On Researcher-Annotator Collaboration and the Agency of Annotators
ShengqiZhuCornell University
- JeffreyRzeszotarskiCornell University
+ JeffreyRzeszotarskiCornell University
8773-8782
Annotation quality is often framed as post-hoc cleanup of annotator-caused issues. This position paper discusses whether, how, and why this narrative limits the scope of improving annotation. We call to consider annotation as a procedural collaboration, outlining three points in this direction: (1) An issue can be either annotator- or researcher-oriented, where one party is accountable and the other party may lack ability to fix it; (2) yet, they can co-occur or have similar consequences, and thus any specific problem we encounter may be a combination; (3) therefore, we need a new language to capture the nuance and holistically describe the full procedure to resolve these issues. To that end, we propose to study how agency is manifested in annotation and picture how this perspective benefits the community more broadly.
2024.findings-acl.518
@@ -13165,7 +13165,7 @@
Teaching Large Language Models an Unseen Language on the Fly
- ChenZhangPeking University
+ ChenZhangPeking University
XiaoLiuPeking University
JiuhengLin
YansongFengPeking University
8783-8800
@@ -13181,7 +13181,7 @@
BaopuQiu
LiangDing
KanjianZhangSchools of Automation, Southeast University
- TomKocmiMicrosoft
+ TomKocmiMicrosoft
DachengTaoUniversity of Sydney
8801-8816
Generative large language models (LLMs), e.g., ChatGPT, have demonstrated remarkable proficiency across several NLP tasks, such as machine translation and text summarization. Recent research (Kocmi and Federmann, 2023) has shown that utilizing LLMs for assessing the quality of machine translation (MT) achieves state-of-the-art performance at the system level but performs poorly at the segment level.
To further improve the performance of LLMs on MT quality assessment, we conduct an investigation into several prompting designs, and propose a new prompting method called Error Analysis Prompting (EAPrompt) by combining Chain-of-Thoughts (Wei et al., 2022) and Error Analysis (Lu et al., 2023). This technique emulates the commonly accepted human evaluation framework, Multidimensional Quality Metrics (MQM; Freitag et al., 2021), and produces explainable and reliable MT evaluations at both the system and segment level. Experimental results from the WMT22 metrics shared task validate the effectiveness of EAPrompt on various LLMs with different structures. Further analysis confirms that EAPrompt effectively distinguishes major errors from minor ones, while also sharing a similar distribution of the number of errors with MQM. These findings highlight the potential of EAPrompt as a human-like evaluator prompting technique for MT evaluation. We will release our code and scripts to facilitate the community.
@@ -13192,7 +13192,7 @@
<fixed-case>GAOKAO</fixed-case>-<fixed-case>MM</fixed-case>: A <fixed-case>C</fixed-case>hinese Human-Level Benchmark for Multimodal Models Evaluation
YiZong
- XipengQiuFudan University
+ XipengQiuFudan University
8817-8825
The Large Vision-Language Models (LVLMs) have demonstrated great abilities in image perception and language understanding. However, existing datasets either focus solely on primary perception abilities and commonsense knowledge, or have a low level of text comprehension difficulty, which are insufficient to reflect the comprehensive capabilities of LVLMs, particularly in terms of Chinese language proficiency. We propose GAOKAO-MM, a multimodal benchmark based on the Chinese College Entrance Examination (GAOKAO), comprising 8 subjects and 12 types of images, such as diagrams, function graphs, maps and photos. GAOKAO-MM derives from native Chinese context and sets human-level requirements for the model’s abilities, including perception, understanding, knowledge and reasoning. We evaluate 10 LVLMs and find that the accuracies of all of them are lower than 50%, with GPT-4-Vision (48.1%), Qwen-VL-Plus (41.2%) and Gemini-Pro-Vision (35.1%) ranking in the top three positions. The results of our multi-dimension analysis indicate that LVLMs have moderate distance towards Artificial General Intelligence (AGI) and provide insights facilitating the development of multilingual LVLMs. The dataset and evaluation code are available through: https://github.com/OpenMOSS/GAOKAO-MM
2024.findings-acl.521
@@ -13205,7 +13205,7 @@
ChengyuWangAlibaba Group
TingfengCao
JunHuang
- LianwenJinSouth China University of Technology
+ LianwenJinSouth China University of Technology
8826-8840
We present DiffChat, a novel method to align Large Language Models (LLMs) to “chat” with prompt-as-input Text-to-Image Synthesis (TIS) models (e.g., Stable Diffusion) for interactive image creation. Given a raw prompt/image and a user-specified instruction, DiffChat can effectively make appropriate modifications and generate the target prompt, which can be leveraged to create the target image of high quality. To achieve this, we first collect an instruction-following prompt engineering dataset named InstructPE for the supervised training of DiffChat. Next, we propose a reinforcement learning framework with the feedback of three core criteria for image creation, i.e., aesthetics, user preference and content integrity.
It involves an action-space dynamic modification technique to obtain more relevant positive samples and harder negative samples during the off-policy sampling. Content integrity is also introduced into the value estimation function for further improvement of produced images. Our method exhibits superior performance to baseline models and strong competitors based on both automatic and human evaluations, which fully demonstrates its effectiveness.
2024.findings-acl.522
@@ -13215,10 +13215,10 @@
Revisiting Parallel Context Windows: A Frustratingly Simple Alternative and Chain-of-Thought Deterioration
KejuanYang
- XiaoLiu
+ XiaoLiu
KaiwenMen
AohanZengTsinghua University, Tsinghua University
- YuxiaoDongTsinghua University
+ YuxiaoDongTsinghua University
JieTangTsinghua University, Tsinghua University
8841-8852
We identify two crucial limitations in the evaluation of the recent parallel-integrated method Parallel Context Windows (PCW), which extends the maximum context lengths of language models, e.g., 2048 for LLaMA, by harnessing window-wise attention and positional embedding techniques. We first show that a simple yet strong baseline, weighted sum ensemble, is missing for the in-context few-shot classification. Moreover, on more challenging Chain-of-Thought (CoT) reasoning (e.g., HotpotQA), PCW would present unexpected deterioration regarding question miscomprehension and false inference. Based on our findings, we suggest that the existing PCW design may not guarantee sufficient improvement and practicality in handling lengthy documents in real-world applications. More community effort should be devoted to enabling language models’ long-context understanding ability.
2024.findings-acl.523
@@ -13230,8 +13230,8 @@
Rationales for Answers to Simple Math Word Problems Confuse Large Language Models
YidanZhang
MingfengXueSichuan University
- DayihengLiuAlibaba Group
- ZhenanHeSichuan University
+ DayihengLiuAlibaba Group
+ ZhenanHeSichuan University
8853-8869
Recently, large language models (LLMs) have demonstrated breakthrough mathematical problem-solving capabilities in grade school math word problems (MWP). For example, on the MWP benchmark GSM8K, the accuracy of GPT-3.5-Turbo and MetaMath-70B reaches 80.80% and 82.30%, respectively. One question arises: does it mean that LLMs have truly mastered related mathematical problem-solving abilities? In this paper, by presenting two types of benchmarks, where MCGSM8K aims at selecting one correct solution from four solutions, while GSM8K-Judgement judges whether a solution to a given question is true or false, we demonstrate that the ability of most LLMs to evaluate the mathematical reasoning process of MWP is far from sufficient. To compensate for this issue, we propose hybrid supervised fine-tuning data from the training data of GSM8K, MCGSM8K, and GSM8K-Judgement, which significantly improves performance on the proposed reasoning process evaluation benchmarks. For example, fine-tuning improves the performance of LLaMA-2-13B from 33.51% to 70.89% on MCGSM8K. In conclusion, we experimentally demonstrate that most LLMs have limited ability to evaluate the mathematical reasoning process of MWP, which can be enhanced through fine-tuning.
2024.findings-acl.524
@@ -13258,8 +13258,8 @@
Towards Objectively Benchmarking Social Intelligence of Language Agents at the Action Level
- ChenxuWangTsinghua University, Tsinghua University
- BinDaiXiaoIce
+ ChenxuWangTsinghua University, Tsinghua University
+ BinDaiXiaoIce
HuapingLiuTsinghua University, Tsinghua University
BaoyuanWangXiaobing.ai
8885-8897
@@ -13270,7 +13270,7 @@
Semantic Role Labeling from <fixed-case>C</fixed-case>hinese Speech via End-to-End Learning
- HuiyaoChen
+ HuiyaoChen
XinxinLi
MeishanZhangHarbin Institute of Technology (Shenzhen), China and Tianjin University, China
MinZhangHarbin Institute of Technology, Shenzhen
@@ -13283,8 +13283,8 @@
<fixed-case>MEEL</fixed-case>: Multi-Modal Event Evolution Learning
ZhengweiTao
- ZhiJinPeking University and Peking University
- JunqiangHuangVIPSHOP
+ ZhiJinPeking University and Peking University
+ JunqiangHuangVIPSHOP
XiancaiChen
XiaoyingBai
YifanZhang
@@ -13299,7 +13299,7 @@
<fixed-case>LLM</fixed-case>-<fixed-case>REDIAL</fixed-case>: A Large-Scale Dataset for Conversational Recommender Systems Created from User Behaviors with <fixed-case>LLM</fixed-case>s
TingtingLiangHangzhou Dianzi University
ChenxinJin
- LingzhiWangThe Chinese University of Hong Kong
+ LingzhiWangThe Chinese University of Hong Kong
WenqiFan
CongyingXiaSalesForce.com
KaiChen
@@ -13314,7 +13314,7 @@
Investigating Subtler Biases in <fixed-case>LLM</fixed-case>s: Ageism, Beauty, Institutional, and Nationality Bias in Generative Models
MahammedKamruzzamanUniversity of South Florida
Md.ShovonRajshahi University of Engineering and Technology
- GeneKimUniversity of South Florida
+ GeneKimUniversity of South Florida
8940-8965
LLMs are increasingly powerful and widely used to assist users in a variety of tasks. This use risks introducing LLM biases into consequential decisions such as job hiring, human performance evaluation, and criminal sentencing. Bias in NLP systems along the lines of gender and ethnicity has been widely studied, especially for specific stereotypes (e.g., Asians are good at math). In this paper, we investigate bias along less-studied but still consequential dimensions, such as age and beauty, measuring subtler correlated decisions that LLMs make between social groups and unrelated positive and negative attributes. Although these subtler biases are understudied, they follow people as much as gender and ethnicity do. So, we want to see whether they also hold for LLMs. We introduce a template-generated dataset of sentence completion tasks that asks the model to select the most appropriate attribute to complete an evaluative statement about a person described as a member of a specific social group. We also reverse the completion task to select the social group based on an attribute. We report the correlations that we find for 4 cutting-edge LLMs. This dataset can be used as a benchmark to evaluate progress in more generalized biases and the templating technique can be used to expand the benchmark with minimal additional human annotation.
2024.findings-acl.530
@@ -13325,10 +13325,10 @@
<fixed-case>EVIT</fixed-case>: Event-Oriented Instruction Tuning for Event Reasoning
ZhengweiTao
XiancaiChen
- ZhiJinPeking University and Peking University
+ ZhiJinPeking University and Peking University
XiaoyingBai
HaiyanZhaoPeking University
- YiweiLou
+ YiweiLou
8966-8979
Events refer to specific occurrences, incidents, or happenings that take place under a particular background.
Event reasoning aims to infer events according to certain relations and predict future events. The cutting-edge techniques for event reasoning play a crucial role in various natural language processing applications. Large language models (LLMs) have made significant advancements in event reasoning owing to their wealth of knowledge and reasoning capabilities. However, smaller instruction-tuned models currently in use do not consistently demonstrate exceptional proficiency in managing these tasks. This discrepancy arises from the absence of explicit modeling of events and the interconnections of them within their instruction data. Consequently, these models face challenges in comprehending event structures and semantics while struggling to bridge the gap between their interpretations and human understanding of events. Additionally, their limitations in grasping event relations lead to constrained event reasoning abilities to effectively deduce and incorporate pertinent event knowledge. In this paper, we propose Event-Oriented Instruction Tuning to train our large language model named EvIT specializing in event reasoning tasks. Specifically, we first propose a novel structure named event quadruple which contains the structure and semantics of events and is complete in the event representation. We then design event-relation learning based on the structures. We encapsulate the learning into the instruction-tuning formulation to better stimulate the event reasoning capacity of our model. To implement our training, we design a heuristic unsupervised method to mine event quadruple from a large-scale corpus. At last, we finetune a Llama model on our Event-Oriented Instruction Tuning. We conduct extensive experiments on event reasoning tasks on several datasets. Automatic and human evaluations demonstrate EvIT achieves competitive performances on event reasoning. 2024.findings-acl.531 @@ -13338,8 +13338,8 @@ <fixed-case>I</fixed-case>nstruct<fixed-case>CMP</fixed-case>: Length Control in Sentence Compression through Instruction-based Large Language Models Juseon-DoChungnam National University - JingunKwonChungnam National University - HidetakaKamigaitoNara Institute of Science and Technology + JingunKwonChungnam National University + HidetakaKamigaitoNara Institute of Science and Technology ManabuOkumuraTokyo Institute of Technology 8980-8996 Extractive summarization can produce faithful summaries but often requires additional constraints such as a desired summary length. Traditional sentence compression models do not typically consider the constraints because of their restricted model abilities, which require model modifications for coping with them. To bridge this gap, we propose Instruction-based Compression (InstructCMP), an approach to the sentence compression task that can consider the length constraint through instructions by leveraging the zero-shot task-solving abilities of Large Language Models (LLMs). For this purpose, we created new evaluation datasets by transforming traditional sentence compression datasets into an instruction format. By using the datasets, we first reveal that the current LLMs still face challenges in accurately controlling the length for a compressed text. To address this issue, we propose an approach named length priming, that incorporates additional length information into the instructions without external resources. 
While the length priming effectively works in a zero-shot setting, a training dataset with the instructions would further improve the ability of length control. Thus, we additionally created a training dataset in an instruction format to fine-tune the model on it. Experimental results and analysis show that applying the length priming significantly improves performances of InstructCMP in both zero-shot and fine-tuning settings without the need of any model modifications. @@ -13351,7 +13351,7 @@ <fixed-case>S</fixed-case>ym<fixed-case>T</fixed-case>ax: Symbiotic Relationship and Taxonomy Fusion for Effective Citation Recommendation KaranGoyalIndraprastha Institute of Information Technology, Delhi MayankGoel - VikramGoyalIndraprastha Institute of Information Technology, Delhi + VikramGoyalIndraprastha Institute of Information Technology, Delhi MukeshMohaniaIndraprastha Institute of Information Technology 8997-9008 Citing pertinent literature is pivotal to writing and reviewing a scientific document. Existing techniques mainly focus on the local context or the global context for recommending citations but fail to consider the actual human citation behaviour. We propose SymTax, a three-stage recommendation architecture that considers both the local and the global context, and additionally the taxonomical representations of query-candidate tuples and the Symbiosis prevailing amongst them. SymTax learns to embed the infused taxonomies in the hyperbolic space and uses hyperbolic separation as a latent feature to compute query-candidate similarity. We build a novel and large dataset ArSyTa containing 8.27 million citation contexts and describe the creation process in detail. We conduct extensive experiments and ablation studies to demonstrate the effectiveness and design choice of each module in our framework. Also, combinatorial analysis from our experiments shed light on the choice of language models (LMs) and fusion embedding, and the inclusion of section heading as a signal. Our proposed module that captures the symbiotic relationship solely leads to performance gains of 26.66% and 39.25% in Recall@5 w.r.t. SOTA on ACL-200 and RefSeer datasets, respectively. The complete framework yields a gain of 22.56% in Recall@5 wrt SOTA on our proposed dataset. The code and dataset are available at https://github.com/goyalkaraniit/SymTax. @@ -13362,8 +13362,8 @@ Assessing News Thumbnail Representativeness: Counterfactual text can enhance the cross-modal matching ability YejunYoonSoongsil University - SeunghyunYoonAdobe Research - KunwooParkSoongsil University + SeunghyunYoonAdobe Research + KunwooParkSoongsil University 9009-9024 This paper addresses the critical challenge of assessing the representativeness of news thumbnail images, which often serve as the first visual engagement for readers when an article is disseminated on social media. We focus on whether a news image represents the actors discussed in the news text. To serve the challenge, we introduce NewsTT, a manually annotated dataset of 1000 news thumbnail images and text pairs. We found that the pretrained vision and language models, such as BLIP-2, struggle with this task. Since news subjects frequently involve named entities or proper nouns, the pretrained models could have a limited capability to match news actors’ visual and textual appearances. We hypothesize that learning to contrast news text with its counterfactual, of which named entities are replaced, can enhance the cross-modal matching ability of vision and language models. 
We propose CFT-CLIP, a contrastive learning framework that updates vision and language bi-encoders according to the hypothesis. We found that our simple method can boost the performance for assessing news thumbnail representativeness, supporting our assumption. Code and data can be accessed at https://github.com/ssu-humane/news-images-acl24. 2024.findings-acl.534 @@ -13372,7 +13372,7 @@ Towards Better Question Generation in <fixed-case>QA</fixed-case>-based Event Extraction - ZijinHong + ZijinHong JianLiuBeijing Jiaotong University 9025-9038 Event Extraction (EE) is an essential information extraction task that aims to extract event-related information from unstructured texts.The paradigm of this task has shifted from conventional classification-based methods to more contemporary question-answering-based (QA-based) approaches. However, in QA-based EE, the quality of the questions dramatically affects the extraction accuracy, and how to generate high-quality questions for QA-based EE remains a challenge. In this work, to tackle this challenge, we suggest four criteria to evaluate the quality of a question and propose a reinforcement learning method, RLQG, for QA-based EE that can generate generalizable, high-quality, and context-dependent questions and provides clear guidance to QA models. The extensive experiments conducted on ACE and RAMS datasets have strongly validated our approach’s effectiveness, which also demonstrates its robustness in scenarios with limited training data. The corresponding code of RLQG is released for further research. @@ -13383,11 +13383,11 @@ Budget-Constrained Tool Learning with Planning YuanhangZhengTsinghua University, Tsinghua University - PengLiTsinghua University - MingYan + PengLiTsinghua University + MingYan JiZhangAlibaba Group FeiHuangAlibaba Group - YangLiu + YangLiu 9039-9052 Despite intensive efforts devoted to tool learning, the problem of budget-constrained tool learning, which focuses on resolving user queries within a specific budget constraint, has been widely overlooked. This paper proposes a novel method for budget-constrained tool learning. Our approach involves creating a preferable plan under the budget constraint before utilizing the tools. This plan outlines the feasible tools and the maximum number of times they can be employed, offering a comprehensive overview of the tool learning process for large language models. This allows them to allocate the budget from a broader perspective. To devise the plan without incurring significant extra costs, we suggest initially estimating the usefulness of the candidate tools based on past experience. Subsequently, we employ dynamic programming to formulate the plan. Experimental results demonstrate that our method can be integrated with various tool learning methods, significantly enhancing their effectiveness under strict budget constraints. 2024.findings-acl.536 @@ -13399,10 +13399,10 @@ HuayangLi SihengLi DengCaiTencent AI Lab - LongyueWang + LongyueWang LemaoLiuTencent - TaroWatanabeNara Institute of Science and Technology, Japan - YujiuYangGraduate School at Shenzhen,Tsinghua University + TaroWatanabeNara Institute of Science and Technology, Japan + YujiuYangGraduate School at Shenzhen,Tsinghua University ShumingShiTencent AI Lab 9053-9076 Large language models with instruction-following abilities have revolutionized the field of artificial intelligence. These models show exceptional generalizability to tackle various real-world tasks through their natural language interfaces. 
However, their performance heavily relies on high-quality exemplar data, which is often difficult to obtain. This challenge is further exacerbated when it comes to multimodal instruction following. We introduce TextBind, an almost annotation-free framework for empowering LLMs with multi-turn interleaved multimodal instruction-following capabilities. Our approach requires only image-caption pairs and generates multi-turn multimodal instruction-response conversations from a language model. To accommodate interleaved image-text inputs and outputs, we devise MIM, a language model-centric architecture that seamlessly integrates image encoder and decoder models. Extensive quantitative and qualitative experiments demonstrate that MIM trained on TextBind achieves remarkable generation capability in multimodal conversations compared to recent baselines. @@ -13416,7 +13416,7 @@ JunlongLi WeizheYuan RuifengYuan - WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University + WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University PengfeiLiu 9077-9096 2024.findings-acl.538 @@ -13425,8 +13425,8 @@ <fixed-case>C</fixed-case>o<fixed-case>C</fixed-case>o-Agent: A Comprehensive Cognitive <fixed-case>MLLM</fixed-case> Agent for Smartphone <fixed-case>GUI</fixed-case> Automation - XinbeiMa - ZhuoshengZhangShanghai Jiao Tong University + XinbeiMa + ZhuoshengZhangShanghai Jiao Tong University HaiZhaoShanghai Jiao Tong University 9097-9110 Multimodal large language models (MLLMs) have shown remarkable potential as human-like autonomous language agents to interact with real-world environments, especially for graphical user interface (GUI) automation.However, those GUI agents require comprehensive cognition including exhaustive perception and reliable action response.We propose a Comprehensive Cognitive LLM Agent, CoCo-Agent, with two novel approaches, comprehensive environment perception (CEP) and conditional action prediction (CAP), to systematically improve the GUI automation performance. First, CEP facilitates the GUI perception through different aspects and granularity, including screenshots and complementary detailed layouts for the visual channel and historical actions for the textual channel.Second, CAP decomposes the action prediction into sub-problems: determining the action type and then identifying the action target conditioned on the action type.With our technical design, our agent achieves state-of-the-art performance on AITW and META-GUI benchmarks, showing promising abilities in realistic scenarios. Code is available at https://github.com/xbmxb/CoCo-Agent. @@ -13476,10 +13476,10 @@ <fixed-case>CTC</fixed-case>-based Non-autoregressive Textless Speech-to-Speech Translation - QingkaiFangInstitute of Computing Technology, Chinese Academy of Sciences + QingkaiFangInstitute of Computing Technology, Chinese Academy of Sciences ZhengruiMaInstitute of Computing Technology, Chinese Academy of Sciences YanZhou - MinZhangHarbin Institute of Technology + MinZhangHarbin Institute of Technology YangFengInstitute of Computing Technology, Chinese Academy of Sciences 9155-9161 Direct speech-to-speech translation (S2ST) has achieved impressive translation quality, but it often faces the challenge of slow decoding due to the considerable length of speech sequences. Recently, some research has turned to non-autoregressive (NAR) models to expedite decoding, yet the translation quality typically lags behind autoregressive (AR) models significantly. 
In this paper, we investigate the performance of CTC-based NAR models in S2ST, as these models have shown impressive results in machine translation. Experimental results demonstrate that by combining pretraining, knowledge distillation, and advanced NAR training techniques such as glancing training and non-monotonic latent alignments, CTC-based NAR models achieve translation quality comparable to the AR model, while preserving up to a 26.81× decoding speedup.
@@ -13533,10 +13533,10 @@
<fixed-case>LCS</fixed-case>: A Language Converter Strategy for Zero-Shot Neural Machine Translation
ZengkuiSun
YijinLiuWechat AI
- FandongMengWeChat AI, Tencent Inc.
+ FandongMengWeChat AI, Tencent Inc.
JinanXuBeijing Jiaotong University
YufengChen
- JieZhou
+ JieZhou
9201-9214
Multilingual neural machine translation models generally distinguish translation directions by the language tag (LT) in front of the source or target sentences. However, current LT strategies cannot indicate the desired target language as expected on zero-shot translation, i.e., the off-target issue. Our analysis reveals that the indication of the target language is sensitive to the placement of the target LT. For example, when placing the target LT on the decoder side, the indication would rapidly degrade along with decoding steps, while placing the target LT on the encoder side would lead to copying or paraphrasing the source input. To address the above issues, we propose a simple yet effective strategy named Language Converter Strategy (LCS). By introducing the target language embedding into the top encoder layers, LCS mitigates confusion in the encoder and ensures stable language indication for the decoder. Experimental results on MultiUN, TED, and OPUS-100 datasets demonstrate that LCS could significantly mitigate the off-target issue, with language accuracy up to 95.28%, 96.21%, and 85.35%, meanwhile outperforming the vanilla LT strategy by 3.07, 3.3, and 7.93 BLEU scores on zero-shot translation, respectively.
2024.findings-acl.547
@@ -13562,12 +13562,12 @@
JingweiYiUniversity of Science and Technology of China
RuiYeShanghai Jiaotong University
QisiChen
- BinZhuMicrosoft Research
+ BinZhuMicrosoft Research
SihengChenShanghai Jiao Tong University
- DefuLianUniversity of Science and Technology of China
- GuangzhongSunUniversity of Science and Technology of China
- XingXieMicrosoft
- FangzhaoWuMicrosoft
+ DefuLianUniversity of Science and Technology of China
+ GuangzhongSunUniversity of Science and Technology of China
+ XingXieMicrosoft
+ FangzhaoWuMicrosoft
9236-9260
Large language models (LLMs) possess immense capabilities but are susceptible to malicious exploitation. To mitigate the risk, safety alignment is employed to align LLMs with ethical standards. However, safety-aligned LLMs may remain vulnerable to carefully crafted jailbreak attacks, but these attacks often face high rejection rates and limited harmfulness. In this paper, we expose the vulnerabilities of safety alignment in open-access LLMs, which can significantly enhance the success rate and harmfulness of jailbreak attacks. Through reverse alignment, achieved by accessing model parameters, we show the feasibility of efficiently fine-tuning LLMs to undermine their inherent safeguards. We investigate two types of reverse alignment techniques: reverse supervised fine-tuning (RSFT) and reverse preference optimization (RPO). RSFT operates by supervising the fine-tuning of LLMs to reverse their inherent values. We also explore how to prepare data needed for RSFT.
RPO optimizes LLMs to enhance their preference for harmful content, reversing the models’ safety alignment. Our extensive experiments reveal that open-access high-performance LLMs can be adeptly reverse-aligned to output harmful content, even in the absence of manually curated malicious datasets. Our research acts as a whistleblower for the community, emphasizing the need to pay more attention to the safety of open-access LLMs. It also underscores the limitations of current safety alignment approaches and calls for research on robust safety alignment methods to counteract malicious fine-tuning attacks.
2024.findings-acl.549
@@ -13577,9 +13577,9 @@
<fixed-case>PEK</fixed-case>: A Parameter-Efficient Framework for Knowledge-Grounded Dialogue Generation
PanYang
- DandanSongBeijing Institute of Technology
- ZhijingWuBeijing Institute of Technology
- YanruZhou
+ DandanSongBeijing Institute of Technology
+ ZhijingWuBeijing Institute of Technology
+ YanruZhou
9261-9273
Pre-trained language models (PLMs) have shown great dialogue generation capability in different scenarios. However, the huge VRAM consumption when fine-tuning them is one of their drawbacks. PEFT approaches can significantly reduce the number of trainable parameters, which enables us to fine-tune larger dialogue generation models. However, the reduction in parameter quantity can diminish a PLM’s expressive capacity and affect the PLM’s learning from certain specific examples like knowledge-related conversations. Previous works have demonstrated that injecting external knowledge into dialogue generation models can improve the model’s performance in knowledge-related conversations. Nonetheless, these methods are designed for the scenario where most parameters of the entire framework are trainable. In this paper, we propose PEK, a parameter-efficient framework for knowledge-enhanced dialogue generation. It enables PLMs to leverage external knowledge documents and knowledge graphs to enhance their generation capabilities with an acceptable number of trainable parameters. Evaluation results on the Wizard of Wikipedia and CMU_DoG datasets show that our approach outperforms baseline methods on multiple evaluation metrics, which validates the effectiveness of our approach.
2024.findings-acl.550
@@ -13591,7 +13591,7 @@
LiwenZhengBeijing University of Posts and Telecommunications
ChaozhuoLi
XiZhangBeijing University of Posts and Telecommunications
- Yu-MingShang
+ Yu-MingShang
FeiranHuang
HaoranJiaBeijing University of Posts and Telecommunications
9274-9281
@@ -13604,11 +13604,11 @@
Outdated Issue Aware Decoding for Factual Knowledge Editing
ZengkuiSun
YijinLiuWechat AI
- JiaanWangSoochow University
- FandongMengWeChat AI, Tencent Inc.
+ JiaanWangSoochow University
+ FandongMengWeChat AI, Tencent Inc.
JinanXuBeijing Jiaotong University
YufengChen
- JieZhou
+ JieZhou
9282-9293
Recently, Knowledge Editing has received increasing attention, since it could update the specific knowledge from outdated ones in pretrained models without re-training. However, as pointed out by recent studies, existing related methods tend to merely memorize the superficial word composition of the edited knowledge, rather than truly learning and absorbing it. Consequently, on the reasoning questions, we discover that existing methods struggle to utilize the edited knowledge to reason the new answer, and tend to retain outdated responses, which are generated by the original models utilizing original knowledge.
Nevertheless, the outdated responses are unexpected for the correct answers to reasoning questions, which we named as the outdated issue. To alleviate this issue, in this paper, we propose a simple yet effective decoding strategy, i.e., outDated ISsue aware deCOding (DISCO), to enhance the performance of edited models on reasoning questions. Specifically, we capture the difference in the probability distribution between the original and edited models. Further, we amplify the difference of the token prediction in the edited model to alleviate the outdated issue, and thus enhance the model performance w.r.t the edited knowledge. Experimental results suggest that applying DISCO could enhance edited models to reason, e.g., on reasoning questions, DISCO outperforms the prior SOTA method by 12.99 F1 scores, and reduces the ratio of the outdated issue to 5.78% on the zsRE dataset. 2024.findings-acl.552 @@ -13617,9 +13617,9 @@ Disentangling Dialect from Social Bias via Multitask Learning to Improve Fairness - MaximilianSpliethöverLeibniz University Hannover + MaximilianSpliethöverLeibniz University Hannover Sai NikhilMenon - HenningWachsmuthLeibniz Universität Hannover + HenningWachsmuthLeibniz Universität Hannover 9294-9313 Dialects introduce syntactic and lexical variations in language that occur in regional or social groups. Most NLP methods are not sensitive to such variations. This may lead to unfair behavior of the methods, conveying negative bias towards dialect speakers. While previous work has studied dialect-related fairness for aspects like hate speech, other aspects of biased language, such as lewdness, remain fully unexplored. To fill this gap, we investigate performance disparities between dialects in the detection of five aspects of biased language and how to mitigate them. To alleviate bias, we present a multitask learning approach that models dialect language as an auxiliary task to incorporate syntactic and lexical variations. In our experiments with African-American English dialect, we provide empirical evidence that complementing common learning approaches with dialect modeling improves their fairness. Furthermore, the results suggest that multitask learning achieves state-of-the-art performance and helps to detect properties of biased language more reliably. 
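The multitask setup just described (dialect identification as an auxiliary task next to biased-language detection) can be pictured with a minimal sketch. This is an illustration rather than the authors' code: the toy encoder, the head sizes, and the 0.5 auxiliary weight are all assumptions.

# Minimal multitask sketch: a shared encoder with a main bias-detection head
# and an auxiliary dialect-ID head; both losses update the shared parameters.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultitaskBiasModel(nn.Module):
    def __init__(self, vocab_size=30522, hidden=256, n_bias=2, n_dialect=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)     # stand-in for a PLM encoder
        self.bias_head = nn.Linear(hidden, n_bias)        # main task: biased language
        self.dialect_head = nn.Linear(hidden, n_dialect)  # auxiliary task: dialect ID

    def forward(self, token_ids):
        h = self.embed(token_ids).mean(dim=1)             # mean-pooled sentence vector
        return self.bias_head(h), self.dialect_head(h)

model = MultitaskBiasModel()
ids = torch.randint(0, 30522, (4, 16))                    # toy batch of token ids
bias_logits, dialect_logits = model(ids)
loss = F.cross_entropy(bias_logits, torch.randint(0, 2, (4,))) \
     + 0.5 * F.cross_entropy(dialect_logits, torch.randint(0, 2, (4,)))
loss.backward()                                           # gradients flow through the shared encoder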
2024.findings-acl.553 @@ -13628,10 +13628,10 @@ <fixed-case>DP</fixed-case>-<fixed-case>MLM</fixed-case>: Differentially Private Text Rewriting Using Masked Language Models - StephenMeisenbacher + StephenMeisenbacher MaulikChevliTechnische Universität München JurajVladikaTechnische Universität München - FlorianMatthesTechnische Universität München + FlorianMatthesTechnische Universität München 9314-9328 2024.findings-acl.554 meisenbacher-etal-2024-dp @@ -13651,10 +13651,10 @@ <fixed-case>EX</fixed-case>-<fixed-case>FEVER</fixed-case>: A Dataset for Multi-hop Explainable Fact Verification HuanhuanMa WeizhiXu - YifanWei + YifanWei LiujiChen - LiangWang - QiangLiuInstitute of Automation, Chinese Academy of Sciences + LiangWang + QiangLiuInstitute of Automation, Chinese Academy of Sciences ShuWuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences LiangWangCASIA 9340-9353 @@ -13665,14 +13665,14 @@ Agent-<fixed-case>FLAN</fixed-case>: Designing Data and Methods of Effective Agent Tuning for Large Language Models - ZehuiChen + ZehuiChen KuikunLiu QiuchenWang - WenweiZhangShanghai AI Laboratory + WenweiZhangShanghai AI Laboratory JiangningLiu DahuaLinThe Chinese University of Hong Kong - KaiChenShanghai AI Laboratory - FengZhaoUniversity of Science and Technology of China + KaiChenShanghai AI Laboratory + FengZhaoUniversity of Science and Technology of China 9354-9366 Open-sourced Large Language Models (LLMs) have achieved great success in various NLP tasks, however, they are still far inferior to API-based models when acting as agents. How to integrate agent ability into general LLMs becomes a crucial and urgent problem.This paper first delivers three key observations: (1) the current agent training corpus is entangled with both formats following and agent reasoning, which significantly shifts from the distribution of its pre-training data; (2) LLMs exhibit different learning speeds on the capabilities required by agent tasks; and (3) current approaches have side-effects when improving agent abilities by introducing hallucinations. Based on the above findings, we propose Agent-FLAN to effectively Fine-tune LANguage models for Agents.Through careful decomposition and redesign of the training corpus, Agent-FLAN enables Llama2-7B to outperform prior best works by 3.5% across various agent evaluation datasets. With comprehensively constructed negative samples, Agent-FLAN greatly alleviates the hallucination issues based on our established evaluation benchmark. Besides, it consistently improves the agent capability of LLMs when scaling model sizes while slightly enhancing the general capability of LLMs. The code and models are available at https://github.com/InternLM/Agent-FLAN. 
2024.findings-acl.557 @@ -13683,15 +13683,15 @@ Fact-Checking the Output of Large Language Models via Token-Level Uncertainty Quantification EkaterinaFadeeva AleksandrRubashevskiiSkolkovo Institute of Science and Technology - ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence - SergeyPetrakov - HaonanLi + ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence + SergeyPetrakov + HaonanLi HamdyMubarak EvgeniiTsymbalovIndependent Researcher GlebKuzminArtificial Intelligence Research Institute and Institute for Systems Analysis of Russian Academy of Sciences AlexanderPanchenkoSkoltech - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + PreslavNakovMohamed bin Zayed University of Artificial Intelligence MaximPanovMohamed bin Zayed University of Artificial Intelligence 9367-9385 Large language models (LLMs) are notorious for hallucinating, i.e., producing erroneous claims in their output. Such hallucinations can be dangerous, as occasional factual inaccuracies in the generated text might be obscured by the rest of the output being generally factually correct, making it extremely hard for the users to spot them. Current services that leverage LLMs usually do not provide any means for detecting unreliable generations. Here, we aim to bridge this gap. In particular, we propose a novel fact-checking and hallucination detection pipeline based on token-level uncertainty quantification. Uncertainty scores leverage information encapsulated in the output of a neural network or its layers to detect unreliable predictions, and we show that they can be used to fact-check the atomic claims in the LLM output. Moreover, we present a novel token-level uncertainty quantification method that removes the impact of uncertainty about what claim to generate on the current step and what surface form to use. Our method Claim Conditioned Probability (CCP) measures only the uncertainty of a particular claim value expressed by the model. Experiments on the task of biography generation demonstrate strong improvements for CCP compared to the baselines for seven different LLMs and four languages. Human evaluation reveals that the fact-checking pipeline based on uncertainty quantification is competitive with a fact-checking tool that leverages external knowledge. @@ -13701,14 +13701,14 @@ Deciphering the Impact of Pretraining Data on Large Language Models through Machine Unlearning - YangZhao + YangZhao LiDu - XiaoDing - KaiXiongHarbin Institute of Technology + XiaoDing + KaiXiongHarbin Institute of Technology ZhouhaoSun ShiJun TingLiuHarbin Institute of Technology - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 9386-9406 Through pretraining on a corpus with various sources, Large Language Models (LLMs) have gained impressive performance. However, the impact of each component of the pretraining corpus remains opaque. As a result, the organization of the pretraining corpus is still empirical and may deviate from the optimal. To address this issue, we systematically analyze the impact of 48 datasets from 5 major categories of pretraining data of LLMs and measure their impacts on LLMs using benchmarks about nine major categories of model capabilities. 
Our analyses provide empirical results about the contribution of multiple corpora on the performances of LLMs, along with their joint impact patterns, including complementary, orthogonal, and correlational relationships. We also identify a set of “high-impact data” such as Books that is significantly related to a set of model capabilities. These findings provide insights into the organization of data to support more efficient pretraining of LLMs. 2024.findings-acl.559 @@ -13745,8 +13745,8 @@ Description Boosting for Zero-Shot Entity and Relation Classification GabrielePiccoInternational Business Machines LeopoldFuchsDuale Hochschule Baden-Württemberg Stuttgart - MarcosMartínez GalindoInternational Business Machines - AlbertoPurpuraInternational Business Machines + MarcosMartínez GalindoInternational Business Machines + AlbertoPurpuraInternational Business Machines VanessaLópezInternational Business Machines HoangThanh LamInternational Business Machines 9441-9457 @@ -13758,10 +13758,10 @@ Domain-Aware <tex-math>k</tex-math>-Nearest-Neighbor Knowledge Distillation for Machine Translation ZhexuanWang - ShudongLiuUniversity of Macau + ShudongLiuUniversity of Macau XueboLiuHarbin Institute of Technolgy, Shenzhen - MiaoZhangHarbin Institute of Technology (Shenzhen) - DerekWongUniversity of Macau + MiaoZhangHarbin Institute of Technology (Shenzhen) + DerekWongUniversity of Macau MinZhangHarbin Institute of Technology, Shenzhen 9458-9469 kNN-MT has utilized neighborhood knowledge for auxiliary decoding, significantly improving translation performance. Subsequently, kNN-KD transitions the use of neighborhood knowledge from the decoding phase to the training phase, to address the temporal and spatial inefficiencies inherent in kNN-MT. However, kNN-KD transfers all the kNN knowledge arbitrarily, which has the potential to restrict the learning of student models. In this paper, we propose a novel domain-aware kNN-KD method, which filters out domain-relevant neighborhood knowledge for learning in the distillation process. Notably, this entire process exclusively utilizes the neighborhood knowledge of the original model, eliminating the need for establishing any additional datastores. Experiments on four domain translation tasks demonstrate that our method achieves state-of-the-art performance, realizing an average gain of 1.55 COMET and 1.42 BLEU scores, by further enhancing the translation of rare words. Source code can be accessed at https://github.com/wangzx1219/Dk-KD. @@ -13773,13 +13773,13 @@ Beyond Single-Event Extraction: Towards Efficient Document-Level Multi-Event Argument Extraction WanlongLiu LiZhouThe Chinese University of Hong Kong - DingYiZeng + DingYiZeng YichenXiao - ShaohuanChengUniversity of Electronic Science and Technology of China - ChenZhangNational University of Singapore + ShaohuanChengUniversity of Electronic Science and Technology of China + ChenZhangNational University of Singapore GrandeeLeeSingapore University of Social Sciences MaluZhangUniversity of Electronic Science and Technology of China - WenyuChen + WenyuChen 9470-9487 Recent mainstream event argument extraction methods process each event in isolation, resulting in inefficient inference and ignoring the correlations among multiple events. To address these limitations, here we propose a multiple-event argument extraction model DEEIA (Dependency-guided Encoding and Event-specific Information Aggregation), capable of extracting arguments from all events within a document simultaneously. 
The proposed DEEIA model employs a multi-event prompt mechanism, comprising DE and EIA modules. The DE module is designed to improve the correlation between prompts and their corresponding event contexts, whereas the EIA module provides event-specific information to improve contextual understanding. Extensive experiments show that our method achieves new state-of-the-art performance on four public datasets (RAMS, WikiEvents, MLEE, and ACE05), while significantly saving the inference time compared to the baselines. Further analyses demonstrate the effectiveness of the proposed modules. 2024.findings-acl.564 @@ -13791,14 +13791,14 @@ Revisiting Interpolation Augmentation for Speech-to-Text Generation ChenXuHarbin Engineering University - JieWang + JieWang XiaoqianLiuNortheastern University QianDongByteDance ChunliangZhangNortheastern University TongXiaoNortheastern University JingBoZhuNortheastern University DapengMan - WuYang + WuYang 9488-9499 Speech-to-text (S2T) generation systems frequently face challenges in low-resource scenarios, primarily due to the lack of extensive labeled datasets. One emerging solution is constructing virtual training samples by interpolating inputs and labels, which has notably enhanced system generalization in other domains. Despite its potential, this technique’s application in S2T tasks has remained under-explored. In this paper, we delve into the utility of interpolation augmentation, guided by several pivotal questions. Our findings reveal that employing an appropriate strategy in interpolation augmentation significantly enhances performance across diverse tasks, architectures, and data scales, offering a promising avenue for more robust S2T systems in resource-constrained settings. 2024.findings-acl.565 @@ -13843,7 +13843,7 @@ Enhancing Cross Text-Molecule Learning by Self-Augmentation YinuoJiang - XiangZhuang + XiangZhuang KeyanDingZhejiang University QiangZhangZhejiang University HuajunChenZhejiang University @@ -13856,7 +13856,7 @@ <fixed-case>R</fixed-case>e<fixed-case>PALM</fixed-case>: Popular Quote Tweet Generation via Auto-Response Augmentation ErxinYuHong Kong Polytechnic University - JingLiThe Hong Kong Polytechnic University + JingLiThe Hong Kong Polytechnic University ChunpuXu 9566-9579 A quote tweet enables users to share others’ content while adding their own commentary. In order to enhance public engagement through quote tweets, we investigate the task of generating popular quote tweets. This task aims to produce quote tweets that garner higher popularity, as indicated by increased likes, replies, and retweets. Despite the impressive language generation capabilities of large language models (LLMs), there has been limited research on how LLMs can effectively learn the popularity of text to better engage the public. Therefore, we introduce a novel approach called Response-augmented Popularity-Aligned Language Model (RePALM), which aligns language generation with popularity by leveraging insights from augmented auto-responses provided by readers. We utilize the Proximal Policy Optimization framework with a dual-reward mechanism to jointly optimize for the popularity of the quote tweet and its consistency with the auto-responses. In our experiments, we collected two datasets consisting of quote tweets containing external links and those referencing others’ tweets. Extensive results demonstrate the superiority of RePALM over advanced language models that do not incorporate response augmentation. 
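The dual-reward mechanism described for RePALM comes down to blending two scalar scores into one reward before it reaches the PPO trainer. A hedged sketch; the two scorers and the weight alpha are assumptions, not values from the paper.

# Blend a popularity reward with an auto-response consistency reward; the
# resulting scalar is what a PPO loop would maximize for each generated tweet.
def dual_reward(popularity_score: float, consistency_score: float,
                alpha: float = 0.5) -> float:
    return alpha * popularity_score + (1.0 - alpha) * consistency_score

# e.g., a tweet scored 0.8 on popularity but only 0.4 on consistency:
print(dual_reward(0.8, 0.4))  # 0.6 with equal weighting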
@@ -13880,7 +13880,7 @@ Do Pre-Trained Language Models Detect and Understand Semantic Underspecification? Ask the <fixed-case>DUST</fixed-case>! FrankWildenburg MichaelHannaUniversity of Amsterdam - SandroPezzelleUniversity of Amsterdam + SandroPezzelleUniversity of Amsterdam 9598-9613 In everyday language use, speakers frequently utter and interpret sentences that are semantically underspecified, namely, whose content is insufficient to fully convey their message or interpret them univocally. For example, to interpret the underspecified sentence “Don’t spend too much”, which leaves implicit what (not) to spend, additional linguistic context or outside knowledge is needed. In this work, we propose a novel Dataset of semantically Underspecified Sentences grouped by Type (DUST) and use it to study whether pre-trained language models (LMs) correctly identify and interpret underspecified sentences. We find that newer LMs are reasonably able to identify underspecified sentences when explicitly prompted. However, interpreting them correctly is much harder for any LMs. Our experiments show that when interpreting underspecified sentences, LMs exhibit little uncertainty, contrary to what theoretical accounts of underspecification would predict. Overall, our study reveals limitations in current models’ processing of sentence semantics and highlights the importance of using naturalistic data and communicative scenarios when evaluating LMs’ language capabilities. 2024.findings-acl.572 @@ -13892,7 +13892,7 @@ WenHuangUniversity of Science and Technology of China HongbinLiuDuke University MinxinGuoUniversity of Hong Kong - NeilGongDuke University + NeilGongDuke University 9614-9631 Visual hallucination (VH) means that a multi-modal LLM (MLLM) imagines incorrect details about an image in visual question answering. Existing studies find VH instances only in existing image datasets, which results in biased understanding of MLLMs’ performance under VH due to limited diversity of such VH instances. In this work, we propose a tool called VHTest to generate a diverse set of VH instances. Specifically, VHTest finds some initial VH instances in existing image datasets (e.g., COCO), generates a text description for each VH mode, and uses a text-to-image generative model (e.g., DALL-E-3) to generate VH images based on the text descriptions. We collect a benchmark dataset with 1,200 VH instances in 8 VH modes using VHTest. We find that existing MLLMs such as GPT-4, LLaVA-1.5, and MiniGPT-v2 hallucinate for a large fraction of the instances in our benchmark. Moreover, we find that fine-tuning an MLLM using our benchmark dataset reduces its likelihood to hallucinate without sacrificing its performance on other benchmarks. Our benchmarks are publicly available: https://github.com/wenhuang2000/VHTest. 
2024.findings-acl.573 @@ -13902,11 +13902,11 @@ <fixed-case>S</fixed-case>um<fixed-case>S</fixed-case>urvey: An Abstractive Dataset of Scientific Survey Papers for Long Document Summarization RanLiuInstitute of Information Engineering, Chinese Academy of Sciences and University of Chinese Academy of Sciences - MingLiuDeakin University + MingLiuDeakin University MinYuInstitute of Information Engineering, Chinese Academy of Sciences - HeZhangCNPIEC KEXIN LTD + HeZhangCNPIEC KEXIN LTD JianguoJiangInstitute of Information Engineering, Chinese Academy of Sciences - GangLiDeakin University + GangLiDeakin University WeiqingHuangInstitute of Information Engineering, Chinese Academy of Sciences 9632-9651 With the popularity of large language models (LLMs) and their ability to handle longer input documents, there is a growing need for high-quality long document summarization datasets. Although many models already support 16k input, current lengths of summarization datasets are inadequate, and salient information is not evenly distributed. To bridge these gaps, we collect a new summarization dataset called SumSurvey, consisting of more than 18k scientific survey papers. With an average document length exceeding 12k and a quarter exceeding 16k, as well as the uniformity metric outperforming current mainstream long document summarization datasets, SumSurvey brings new challenges and expectations to both fine-tuned models and LLMs. The informativeness of summaries and the models supporting the evaluation of long document summarization warrant further attention. Automatic and human evaluation results on this abstractive dataset confirm this view. Our dataset and code are available at https://github.com/Oswald1997/SumSurvey. @@ -13916,10 +13916,10 @@ Pushing the Limits of Low-Resource <fixed-case>NER</fixed-case> Using <fixed-case>LLM</fixed-case> Artificial Data Generation - JoanSantosoInstitut Sains dan Teknologi Terpadu Surabaya + JoanSantosoInstitut Sains dan Teknologi Terpadu Surabaya PatrickSutanto BillyCahyadiInstitut Sains dan Teknologi Terpadu Surabaya - EstherSetiawanInstitut Sains dan Teknologi Terpadu Surabaya + EstherSetiawanInstitut Sains dan Teknologi Terpadu Surabaya 9652-9667 Named Entity Recognition (NER) is an important task, but to achieve great performance, it is usually necessary to collect a large amount of labeled data, incurring high costs. In this paper, we propose using open-source Large Language Models (LLM) to generate NER data with only a few labeled examples, reducing the cost of human annotations. Our proposed method is very simple and can perform well using only a few labeled data points. Experimental results on diverse low-resource NER datasets show that our proposed data generation method can significantly improve the baseline. Additionally, our method can be used to augment datasets with class-imbalance problems and consistently improves model performance on macro-F1 metrics. 
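The few-shot recipe in the preceding NER abstract amounts to a prompt-and-filter loop. The sketch below is a hypothetical outline: llm_generate, the JSON schema, and the keep/discard rule are stand-ins, not the paper's pipeline.

# Generate synthetic NER records from a handful of labeled seeds, keeping
# only outputs that parse as well-formed JSON.
import json

def build_prompt(examples):
    shots = "\n".join(json.dumps(e, ensure_ascii=False) for e in examples)
    return ("Each line is a JSON object with 'text' and 'entities' "
            "([[start, end, label], ...]). Write one new, different example.\n"
            + shots + "\n")

def synthesize(examples, llm_generate, n=100):
    synthetic = []
    for _ in range(n):
        raw = llm_generate(build_prompt(examples))   # any open-source LLM call
        try:
            synthetic.append(json.loads(raw.strip().splitlines()[0]))
        except (json.JSONDecodeError, IndexError):
            continue                                 # discard malformed outputs
    return synthetic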
2024.findings-acl.575 @@ -13930,9 +13930,9 @@ Understanding and Patching Compositional Reasoning in <fixed-case>LLM</fixed-case>s ZhaoyiLiCity University of Hong Kong and University of Science and Technology of China GangweiJiangCity University of Hong Kong and University of Science and Technology of China - HongXieUniversity of Science and Technology of China - LinqiSongCity University of Hong Kong - DefuLianUniversity of Science and Technology of China + HongXieUniversity of Science and Technology of China + LinqiSongCity University of Hong Kong + DefuLianUniversity of Science and Technology of China YingWeiNanyang Technological University 9668-9688 LLMs have marked a revolutonary shift, yet they falter when faced with compositional reasoning tasks. Our research embarks on a quest to uncover the root causes of compositional reasoning failures of LLMs, uncovering that most of them stem from the improperly generated or leveraged implicit reasoning results. Inspired by our empirical findings, we resort to Logit Lens and an intervention experiment to dissect the inner hidden states of LLMs. This deep dive reveals that implicit reasoning results indeed surface within middle layers and play a causative role in shaping the final explicit reasoning results. Our exploration further locates multi-head self-attention (MHSA) modules within these layers, which emerge as the linchpins in accurate generation and leveraing of implicit reasoning results. Grounded on the above findings, we develop CREME, a lightweight method to patch errors in compositional reasoning via editing the located MHSA modules. Our empirical evidence stands testament to CREME’s effectiveness, paving the way for autonomously and continuously enhancing compositional reasoning capabilities in language models. @@ -13942,7 +13942,7 @@ Bilingual Rhetorical Structure Parsing with Large Parallel Annotations - ElenaChistovaFRC CSC RAS + ElenaChistovaFRC CSC RAS 9689-9706 Discourse parsing is a crucial task in natural language processing that aims to reveal the higher-level relations in a text. Despite growing interest in cross-lingual discourse parsing, challenges persist due to limited parallel data and inconsistencies in the Rhetorical Structure Theory (RST) application across languages and corpora. To address this, we introduce a parallel Russian annotation for the large and diverse English GUM RST corpus. Leveraging recent advances, our end-to-end RST parser achieves state-of-the-art results on both English and Russian corpora. It demonstrates effectiveness in both monolingual and bilingual settings, successfully transferring even with limited second-language annotation. To the best of our knowledge, this work is the first to evaluate the potential of cross-lingual end-to-end RST parsing on a manually annotated parallel corpus. 
2024.findings-acl.577 @@ -13951,8 +13951,8 @@ <fixed-case>B</fixed-case>ook2<fixed-case>D</fixed-case>ial: Generating Teacher Student Interactions from Textbooks for Cost-Effective Development of Educational Chatbots - JunlingWangETHZ - ETH Zurich - JakubMacinaDepartment of Computer Science, ETHZ - ETH Zurich + JunlingWangETHZ - ETH Zurich + JakubMacinaDepartment of Computer Science, ETHZ - ETH Zurich NicoDaheimTechnische Universität Darmstadt SankalanPal Chowdhury MrinmayaSachanSwiss Federal Institute of Technology @@ -13966,8 +13966,8 @@ <fixed-case>SELP</fixed-case>: A Semantically-Driven Approach for Separated and Accurate Class Prototypes in Few-Shot Text Classification WenxinLiang TingyuZhangDalian University of Technology - HanLiuDalian University of Technology - FengZhangPeking University + HanLiuDalian University of Technology + FengZhangPeking University 9732-9741 2024.findings-acl.579 liang-etal-2024-selp @@ -13977,7 +13977,7 @@ Automated Focused Feedback Generation for Scientific Writing Assistance EricChamounUniversity of Cambridge MichaelSchlichtkrullQueen Mary, University of London - AndreasVlachosUniversity of Cambridge + AndreasVlachosUniversity of Cambridge 9742-9763 Scientific writing is a challenging task, particularly for novice researchers who often rely on feedback from experienced peers. Recent work has primarily focused on improving surface form and style rather than manuscript content. In this paper, we propose a novel task: automated focused feedback generation for scientific writing assistance. We present SWIF^2T: a Scientific WrIting Focused Feedback Tool. It is designed to generate specific, actionable and coherent comments, which identify weaknesses in a scientific paper and/or propose revisions to it. Our approach consists of four components - planner, investigator, reviewer and controller - leveraging multiple Large Language Models (LLMs) to implement them. We compile a dataset of 300 peer reviews citing weaknesses in scientific papers and conduct human evaluation. The results demonstrate the superiority in specificity, reading comprehension, and overall helpfulness of SWIF^2T’s feedback compared to other approaches. In our analysis, we also identified cases where automatically generated reviews were judged better than human ones, suggesting opportunities for integration of AI-generated feedback in scientific writing. 2024.findings-acl.580 @@ -13987,7 +13987,7 @@ <fixed-case>F</fixed-case>ast<fixed-case>GAS</fixed-case>: Fast Graph-based Annotation Selection for In-Context Learning ZihanChen - SongWangUniversity of Virginia + SongWangUniversity of Virginia CongShenUniversity of Virginia JundongLiUniversity of Virginia 9764-9780 @@ -14013,7 +14013,7 @@ Integrating Multi-scale Contextualized Information for Byte-based Neural Machine Translation - LanglinHuang + LanglinHuang YangFengInstitute of Computing Technology, Chinese Academy of Sciences 9794-9801 Subword tokenization is a common method for vocabulary building in Neural Machine Translation (NMT) models. However, increasingly complex tasks have revealed its disadvantages. First, a vocabulary cannot be modified once it is learned, making it hard to adapt to new words. Second, in multilingual translation, the imbalance in data volumes across different languages spreads to the vocabulary, exacerbating translations involving low-resource languages. While byte-based tokenization addresses these issues, byte-based models struggle with the low information density inherent in UTF-8 byte sequences. 
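The low information density of UTF-8 byte sequences mentioned above is easy to verify in plain Python: a byte vocabulary never needs to grow, but each non-ASCII character costs several symbols, so sequences get longer while each token says less.

# A byte-level "tokenizer" is just UTF-8 encoding: a fixed 256-symbol vocabulary.
text = "Übersetzung"
byte_ids = list(text.encode("utf-8"))
print(byte_ids)                  # [195, 156, 98, 101, ...] - 'Ü' alone takes two bytes
print(len(byte_ids), len(text))  # 12 bytes for 11 characters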
Previous works enhance token semantics through local contextualization but fail to select an appropriate contextualizing scope based on the input. Consequently, we propose the Multi-Scale Contextualization (MSC) method, which learns contextualized information of varying scales across different hidden state dimensions. It then leverages the attention module to dynamically integrate the multi-scale contextualized information. Experiments show that MSC significantly outperforms subword-based and other byte-based methods in both multilingual and out-of-domain scenarios. Code can be found in https://github.com/ictnlp/Multiscale-Contextualization. @@ -14024,9 +14024,9 @@ Deductive Closure Training of Language Models for Coherence, Accuracy, and Updatability Afra FeyzaAkyürekBoston University - EkinAkyürek + EkinAkyürek LeshemChoshenInternational Business Machines - DerryWijayaMonash University and Boston University + DerryWijayaMonash University and Boston University JacobAndreasMassachusetts Institute of Technology and Microsoft 9802-9818 While language models (LMs) can sometimes generate factually correct text and estimate truth values of individual claims, these generally do not reflect a globally coherent, manipulable model of the world. As a consequence, current LMs also generate incorrect or nonsensical content, and are difficult to edit and bring up to date. We present a method called Deductive Closure Training (DCT) that uses LMs themselves to identify implications of (and contradictions within) the text that they generate, yielding an efficient self-supervised procedure for improving LM factuality. Given a collection of seed documents, DCT prompts LMs to generate additional text implied by these documents, reason globally about the correctness of this generated text, and finally fine-tune on text inferred to be correct. Given seed documents from a trusted source, DCT provides a tool for supervised model updating; if seed documents are sampled from the LM itself, DCT enables fully unsupervised fine-tuning for improved coherence and accuracy. Across the CREAK, MQuAKE, and Reversal Curse datasets, supervised DCT improves LM fact verification and text generation accuracy by 3-26%; on CREAK, fully unsupervised DCT improves verification accuracy by 12%. These results show that LMs’ reasoning capabilities during inference can be leveraged during training to improve their reliability. @@ -14038,9 +14038,9 @@ Self-Supervised Singing Voice Pre-Training towards Speech-to-Singing Conversion RuiqiLi RongjieHuangFAIR - YongqiWangZhejiang University + YongqiWangZhejiang University ZhiqingHong - ZhouZhaoZhejiang University and Zhejiang University + ZhouZhaoZhejiang University and Zhejiang University 9819-9831 Speech-to-singing voice conversion (STS) task always suffers from data scarcity, because it requires paired speech and singing data. Compounding this issue are the challenges of content-pitch alignment and the suboptimal quality of generated outputs, presenting significant hurdles in STS research. This paper presents SVPT, an STS approach boosted by a self-supervised singing voice pre-training model.We leverage spoken language model techniques to tackle the rhythm alignment problem and the in-context learning capability to achieve zero-shot conversion. We adopt discrete-unit random resampling and pitch corruption strategies, enabling training with unpaired singing data and thus mitigating the issue of data scarcity. 
SVPT also serves as an effective backbone for singing voice synthesis (SVS), offering insights into scaling up SVS models. Experimental results indicate that SVPT delivers notable improvements in both STS and SVS endeavors. Audio samples are available at https://speech2sing.github.io. 2024.findings-acl.585 @@ -14063,7 +14063,7 @@ YanghaiZhangUniversity of Science and Technology of China YeLiuUniversity of Science and Technology of China ShiweiWuPeking University, Peking University and The Chinese University of Hong Kong - KaiZhang + KaiZhang XukaiLiuUniversity of Science and Technology of China QiLiuUniversity of Science and Technology of China EnhongChenUniversity of Science and Technology of China @@ -14079,7 +14079,7 @@ XinLiangUniversity of Central Florida JiaqiXueUniversity of Central Florida YanchengZhang - RuiXieUniversity of Central Florida + RuiXieUniversity of Central Florida MengxinZhengUniversity of Central Florida 9863-9875 It is imperative to ensure the stability of every prediction made by a language model; that is, a language’s prediction should remain consistent despite minor input variations, like word substitutions. In this paper, we investigate the problem of certifying a language model’s robustness against Universal Text Perturbations (UTPs), which have been widely used in universal adversarial attacks and backdoor attacks. Existing certified robustness based on random smoothing has shown considerable promise in certifying the input-specific text perturbations (ISTPs), operating under the assumption that any random alteration of a sample’s clean or adversarial words would negate the impact of sample-wise perturbations. However, with UTPs, masking only the adversarial words can eliminate the attack. A naive method is to simply increase the masking ratio and the likelihood of masking attack tokens, but it leads to a significant reduction in both certified accuracy and the certified radius due to input corruption by extensive masking. To solve this challenge, we introduce a novel approach, the superior prompt search method, designed to identify a superior prompt that maintains higher certified accuracy under extensive masking. Additionally, we theoretically motivate why ensembles are a particularly suitable choice as base prompts for random smoothing. The method is denoted by superior prompt ensembling technique. We also empirically confirm this technique, obtaining state-of-the-art results in multiple settings. These methodologies, for the first time, enable high certified accuracy against both UTPs and ISTPs. The source code of CR-UTP is available at https://github.com/UCF-ML-Research/CR-UTP. @@ -14090,8 +14090,8 @@ Recovering document annotations for sentence-level bitext RachelWicksJohns Hopkins University - MattPostMicrosoft and Johns Hopkins University - PhilippKoehnJohns Hopkins University + MattPostMicrosoft and Johns Hopkins University + PhilippKoehnJohns Hopkins University 9876-9890 In machine translation, historical models were incapable of handling longer contexts, so the lack of document-level datasets was less noticeable. Now, despite the emergence of long-sequence methods, we remain within a sentence-level paradigm and without data to adequately approach context-aware machine translation. Most large-scale datasets have been processed through a pipeline that discards document-level metadata. 
In this work, we reconstruct document-level information for three (ParaCrawl, News Commentary, and Europarl) large datasets in German, French, Spanish, Italian, Polish, and Portuguese (paired with English). We then introduce a document-level filtering technique as an alternative to traditional bitext filtering. We present this filtering with analysis to show that this method prefers context-consistent translations rather than those that may have been sentence-level machine translated. Last we train models on these longer contexts and demonstrate improvement in document-level translation without degradation of sentence-level translation. We release our dataset, ParaDocs, and resulting models as a resource to the community. 2024.findings-acl.589 @@ -14100,11 +14100,11 @@ <fixed-case>M</fixed-case>eta<fixed-case>P</fixed-case>ro 2.0: Computational Metaphor Processing on the Effectiveness of Anomalous Language Modeling - RuiMao + RuiMao KaiHeNational University of Singapore ClaudiaOng - QianLiuUniversity of Auckland - ErikCambriaNanyang Technological University + QianLiuUniversity of Auckland + ErikCambriaNanyang Technological University 9891-9908 Metaphor interpretation is a difficult task in natural language understanding. The development of relevant techniques in this domain is slow, mostly because of the lack of large annotated datasets and effective pre-trained language models (PLMs) for metaphor learning. Thus, we propose a large annotated dataset and a PLM for the metaphor interpretation task. Our foundation model is based on a novel anomalous language modeling (ALM) method, which we benchmark with comparable PLM baselines on the new dataset, finding that it largely improves model performance on metaphor identification and interpretation. 2024.findings-acl.590 @@ -14116,9 +14116,9 @@ ShenzhiWangDepartment of Automation, Tsinghua University ChangLiu ZilongZhengBeijing Institute for General Artificial Intelligence - SiyuanQiBeijing Institute for General Artificial Intelligence + SiyuanQiBeijing Institute for General Artificial Intelligence ShuoChenBeijing Institute for General Artificial Intelligence - QisenYang + QisenYang AndrewZhao ChaofeiWangTsinghua University, Tsinghua University ShijiSongTsinghua University, Tsinghua University @@ -14132,7 +14132,7 @@ Direct Preference Optimization with an Offset AfraAminiETHZ - ETH Zurich - TimVieiraJohns Hopkins University + TimVieiraJohns Hopkins University RyanCotterellSwiss Federal Institute of Technology 9954-9972 Direct preference optimization (DPO) is a successful fine-tuning strategy for aligning large language models with human preferences without the need to train a reward model or employ reinforcement learning. DPO, as originally formulated, relies on binary preference data and fine-tunes a language model to increase the likelihood of a preferred response over a dispreferred response. However, not all preference pairs are equal. Sometimes, the preferred response is only slightly better than the dispreferred one. In other cases, the preference is much stronger. For instance, if a response contains harmful or toxic content, the annotator will have a strong preference for that response. In this paper, we propose a generalization of DPO, termed DPO with an offset (ODPO), that does not treat every preference pair equally during fine-tuning. Intuitively, ODPO requires the difference between the likelihood of the preferred and dispreferred response to be greater than an offset value. 
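The requirement just stated (the preferred/dispreferred likelihood gap must exceed an offset) corresponds to a DPO-style logistic loss with a margin term. A hedged sketch in PyTorch; the variable names are illustrative and the offset delta is supplied by the caller.

# ODPO-style loss: delta = 0 recovers standard DPO; delta > 0 demands a
# larger implicit-reward margin between preferred (w) and dispreferred (l).
import torch.nn.functional as F

def odpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1, delta=0.0):
    # logp_*: per-example summed log-probs (1-D tensors) under the policy;
    # ref_logp_*: the same quantities under the frozen reference model.
    margin = beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l))
    return -F.logsigmoid(margin - delta).mean()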
The offset is determined based on the extent to which one response is preferred over another. Our experiments on various tasks suggest that ODPO significantly outperforms DPO in aligning language models, especially when the number of preference pairs is limited. @@ -14142,16 +14142,16 @@ <fixed-case>T</fixed-case>rans<fixed-case>F</fixed-case>ace: Unit-Based Audio-Visual Speech Synthesizer for Talking Head Translation - XizeCheng + XizeCheng RongjieHuangFAIR - LinjunLiZhejiang University - ZehanWang - TaoJin + LinjunLiZhejiang University + ZehanWang + TaoJin AoxiongYinMicrosoft and Zhejiang University ChenFeiyang XinyuDuan BaoxingHuai - ZhouZhaoZhejiang University and Zhejiang University + ZhouZhaoZhejiang University and Zhejiang University 9973-9986 Direct speech-to-speech translation achieves high-quality results through the introduction of discrete units obtained from self-supervised learning. However, talking head translation, converting audio-visual speech (i.e., talking head video) from one language into another, still confronts several challenges compared to audio speech: (1) Existing methods invariably rely on cascading, synthesizing via both audio and text, resulting in delays and cascading errors. (2) Talking head translation has a limited set of reference frames. If the generated translation exceeds the length of the original speech, the video sequence needs to be supplemented by repeating frames, leading to jarring video transitions. In this work, we propose a model for talking head translation, TransFace, which can directly translate audio-visual speech into audio-visual speech in other languages. It consists of a speech-to-unit translation model to convert audio speech into discrete units and a unit-based audio-visual speech synthesizer, Unit2Lip, to re-synthesize synchronized audio-visual speech from discrete units in parallel. Furthermore, we introduce a Bounded Duration Predictor, ensuring isometric talking head translation and preventing duplicate reference frames. Experiments demonstrate that Unit2Lip significantly improves synchronization and boosts inference speed by a factor of 4.35 on LRS2. Additionally, TransFace achieves impressive BLEU scores of 61.93 and 47.55 for Es-En and Fr-En on LRS3-T and 100% isochronous translations. The samples are available at https://transface-demo.github.io . 2024.findings-acl.593 @@ -14160,7 +14160,7 @@ More than Minorities and Majorities: Understanding Multilateral Bias in Language Generation - JiaxuZhao + JiaxuZhao ZijingShi YitongLiHuawei Technologies Co., Ltd. YulongPeiEindhoven University of Technology @@ -14176,10 +14176,10 @@ Fair Federated Learning with Biased Vision-Language Models HuiminZeng - ZhenruiYue + ZhenruiYue YangZhangUniversity of Illinois at Urbana-Champaign - LanyuShang - DongWangUniversity of Illinois at Urbana-Champaign + LanyuShang + DongWangUniversity of Illinois at Urbana-Champaign 10002-10017 Existing literature that integrates CLIP into federated learning (FL) largely ignores the inherent group unfairness within CLIP and its ethical implications on FL applications. Furthermore, such CLIP bias may be amplified in FL, due to the unique issue of data heterogeneity across clients. However, in identity-sensitive FL applications, model fairness (i.e., group fairness) is imperative for model development. Therefore, this work explores a critical question ignored by the existing literature: how can we build a fair FL framework using biased pre-trained VLMs (e.g., CLIP)? 
To address this problem, we propose a fairness-aware adaptation framework tailored for VLM (e.g., CLIP) in the context of FL, named Fair Federated Deep Visiual Prompting or FF-DVP. As implied by its name, trains a fair FL model with fairness-aware deep visual prompting (DVP). Moreover, incorporates modality-fused classification heads to learn client-specific knowledge and fairness constraints. These modules explicitly addresses a unique bias in FL, namely the bias triggered by data heterogeneity. We show that can be readily extended to prevailing parameter-efficient fine-tuning methods (e.g., adapter or LoRA) for debiasing. To the best of our knowledge, is the first to leverage biased VLMs for building fair FL frameworks. Extensive results on human face attribute recognition (FAR) applications suggest that effectively improves model fairness and training convergence, outperforming state-of-the-art baselines. 2024.findings-acl.595 @@ -14228,8 +14228,8 @@ JieHe YuhuaKe GuangyaoZhuWaseda University - VictorGutierrez BasultoCardiff University - JeffPanUniversity of Edinburgh, University of Edinburgh + VictorGutierrez BasultoCardiff University + JeffPanUniversity of Edinburgh, University of Edinburgh 10057-10084 Multimodal Large Language Models (MLLMs) fine-tuned with multimodal instruction-following data have demonstrated formidable capabilities in multimodal tasks. However, fine-tuning all parameters of MLLMs has become challenging due to the rapid growth of the overall model’s parameters. To address this issue, we study Parameter-Efficient Fine-Tuning (PEFT) methods for MLLMs. We aim to identify effective methods for enhancing performance in scenarios where only a limited number of parameters are trained. This paper conducts empirical studies that employ four widely used PEFT methods to fine-tune the LLM component of open-source MLLMs. We present a comprehensive analysis that encompasses various aspects, including the impact of PEFT methods on various models, parameters and location of PEFT module, fine-tuning data scale, model stability based on PEFT method, MLLM’s generalization, and hallucination. We evaluated four PEFT methods on seven datasets from two different categories, unseen and seen datasets. Across all experiments, we show that the adapter is the best-performing PEFT method in various aspects. At the same time, fine-tuning the connector layers leads to improved performance in most MLLMs. 2024.findings-acl.598 @@ -14239,8 +14239,8 @@ <fixed-case>PARADISE</fixed-case>: Evaluating Implicit Planning Skills of Language Models with Procedural Warnings and Tips Dataset ArdaUzunoğluJohns Hopkins University - AbdulfattahSafaKoç University - Gözde GülŞahinKoç University + AbdulfattahSafaKoç University + Gözde GülŞahinKoç University 10085-10102 Recently, there has been growing interest within the community regarding whether large language models are capable of planning or executing plans. However, most prior studies use LLMs to generate high-level plans for simplified scenarios lacking linguistic complexity and domain diversity, limiting analysis of their planning abilities. These setups constrain evaluation methods (e.g., predefined action space), architectural choices (e.g., only generative models), and overlook the linguistic nuances essential for realistic analysis. To tackle this, we present PARADISE, an abductive reasoning task using Q&A format on practical procedural text sourced from wikiHow. 
It involves tip and warning inference tasks directly associated with goals, excluding intermediary steps, with the aim of testing the ability of the models to infer implicit knowledge of the plan solely from the given goal. Our experiments, utilizing fine-tuned language models and zero-shot prompting, reveal the effectiveness of task-specific small models over large language models in most scenarios. Despite advancements, all models fall short of human performance. Notably, our analysis uncovers intriguing insights, such as variations in model behavior with dropped keywords, struggles of BERT-family and GPT-4 with physical and abstract goals, and the proposed tasks offering valuable prior knowledge for other unseen procedural tasks. The PARADISE dataset and associated resources are publicly available for further research exploration with https://anonymous.4open.science/r/paradise-53BD/README.md. 2024.findings-acl.599 @@ -14249,12 +14249,12 @@ <fixed-case>TURNA</fixed-case>: A <fixed-case>T</fixed-case>urkish Encoder-Decoder Language Model for Enhanced Understanding and Generation - GökçeUludoğan + GökçeUludoğan ZeynepBalalBoğaziçi University FurkanAkkurtBoğaziçi University - MeliksahTurkerBogazici University + MeliksahTurkerBogazici University OnurGungorBoğaziçi University - SusanÜsküdarlıBoğaziçi University + SusanÜsküdarlıBoğaziçi University 10103-10117 The recent advances in natural language processing have predominantly favored well-resourced English-centric models, resulting in a significant gap with low-resource languages. In this work, we introduce TURNA, a language model developed for the low-resource language Turkish and is capable of both natural language understanding and generation tasks.TURNA is pretrained with an encoder-decoder architecture based on the unified framework UL2 with a diverse corpus that we specifically curated for this purpose. We evaluated TURNA with three generation tasks and five understanding tasks for Turkish. The results show that TURNA outperforms several multilingual models in both understanding and generation tasks and competes with monolingual Turkish models in understanding tasks. 2024.findings-acl.600 @@ -14268,7 +14268,7 @@ ShuichiroShimizu ZhengdongYangKyoto University, Kyoto University YihangLi - ChenhuiChuKyoto University + ChenhuiChuKyoto University SadaoKurohashiKyoto University 10118-10126 Emotion plays a crucial role in human conversation. This paper underscores the significance of considering emotion in speech translation. We present the MELD-ST dataset for the emotion-aware speech translation task, comprising English-to-Japanese and English-to-German language pairs. Each language pair includes about 10,000 utterances annotated with emotion labels from the MELD dataset. Baseline experiments using the SeamlessM4T model on the dataset indicate that fine-tuning with emotion labels can enhance translation performance in some settings, highlighting the need for further research in emotion-aware speech translation systems. 
@@ -14291,7 +14291,7 @@ Chain-of-Quizzes: Pedagogy-inspired Example Selection in In-Context-Learning YiquanWuZhejiang University AnlaiZhouZhejiang University - YuhangLiuZhejiang University + YuhangLiuZhejiang University YifeiLiu AdamJatowt WeimingLuZhejiang University @@ -14306,7 +14306,7 @@ It’s Not Easy Being Wrong: Large Language Models Struggle with Process of Elimination Reasoning NishantBalepur - ShramayPalta + ShramayPalta RachelRudingerUniversity of Maryland, College Park 10143-10166 Chain-of-thought (COT) prompting can help large language models (LLMs) reason toward correct answers, but its efficacy in reasoning toward incorrect answers is unexplored. This process of elimination (PoE), when used with COT, can enhance self-consistency, interpretability, and tasks such as medical diagnoses of exclusion. Thus, we propose PoE with COT, where LLMs must reason toward incorrect options on multiple-choice questions. We evaluate the ability of GPT-3.5, LLaMA-2, and Falcon to perform PoE with COT on a total of four commonsense and scientific reasoning datasets. We find that the strategy of PoE always underperforms the strategy of choosing the correct answer. The agreement of these strategies is also lower than the self-consistency of each strategy. To study these issues further, we conduct error analyses and give suggestions for future work. @@ -14316,13 +14316,13 @@ From Discrimination to Generation: Low-Resource Intent Detection with Language Model Instruction Tuning - FengZhangPeking University + FengZhangPeking University WeiChen FeiDingPeking University MengGao TengjiaoWang JiahuiYao - JiabinZheng + JiabinZheng 10167-10183 Intent detection aims to identify user goals from utterances, and is a ubiquitous step towards the satisfaction of user desired needs in many interaction systems. As dynamic and varied intents arise, models that are capable of identifying new intents promptly are required. However, existing studies usually fine-tune discriminative models on the specific defined intent classes, precluding them from being directly adopted to new intent domains. In this paper, we introduce a generative pre-trained intent model that can recognize new intents from different domains in low-resource scenarios. We reformulate intent detection into a generation task and design descriptive and regularized instructions to guide the model effectively to detect new intents in open domains with no parameter updates. To validate the proposed method, we introduce a new intent detection benchmark, including the Meta-Intent Dataset and three types of representative evaluation settings. We conduct extensive experiments which demonstrate that our method outperforms a range of strong baselines that needs further fine-tuning or domain-specific samples. 
2024.findings-acl.605 @@ -14331,7 +14331,7 @@ Efficient Continual Pre-training for Building Domain Specific Large Language Models - YongXieAmazon + YongXieAmazon KaranAggarwalAmazon and University of Minnesota, Minneapolis AitzazAhmadAmazon 10184-10201 @@ -14342,7 +14342,7 @@ Distantly-Supervised Joint Extraction with Noise-Robust Learning - YufeiLiUniversity of California, Riverside + YufeiLiUniversity of California, Riverside XiaoYuStellar Cyber YanghongGuo YanchiLiuNEC-Labs @@ -14373,8 +14373,8 @@ YiQiuGuo YuchenYang YaZhangShanghai Jiao Tong University - YuWangShanghai Jiao Tong University - YanfengWangShanghai Jiao Tong University + YuWangShanghai Jiao Tong University + YanfengWangShanghai Jiao Tong University 10231-10241 Structured data offers an efficient means of organizing information. Exsisting text-serialization based methods for processing structured data using large language models (LLMs) are not designed to explicitly capture the heterogeneity of structured data. Such methods are suboptimal for LLMs to process structured data, and may lead to large input token size and poor robustness to input perturbation. In this paper, we propose a novel framework called DictLLM, which is an efficient and effective framework for the modeling of medical lab report to deal with the report-assisted diagnosis generation task. DictLLM introduce 1) group positional encoding to maintain the permutation invariance, 2) hierarchical attention bias to capture the inductive bias of structured data, and 3) a optimal transport alignment layer to align the embeddings generated by the dict encoder with the LLM, producing a list of fixed-length virtual tokens. We conduct experiments with multiple LLM models on a large-scale real-world medical lab report dataset for automatic diagnosis generation. The results show that our proposed framework outperforms the baseline methods and few-shot GPT-4 in terms of both Rouge-L and Knowledge F1 score. We also conduct multiple experiments and analyze the scalability and robustness of our proposed framework, demonstrating the superiority of our method in modeling the heterogeneous structure of medical dictionaries data. 2024.findings-acl.609 @@ -14383,10 +14383,10 @@ imap<fixed-case>S</fixed-case>core: Medical Fact Evaluation Made Easy - HuiminWangJarvis Research Center, Tencent YouTu Lab + HuiminWangJarvis Research Center, Tencent YouTu Lab YutianZhaoTencent AI Lab - XianWuTencent - YefengZheng + XianWuTencent + YefengZheng 10242-10257 Automatic evaluation of natural language generation (NLG) tasks has gained extensive research interests, since it can rapidly assess the performance of large language models (LLMs). However, automatic NLG evaluation struggles with medical QA because it fails to focus on the crucial correctness of medical facts throughout the generated text. To address this, this paper introduces a new data structure, imap, designed to capture key information in questions and answers, enabling evaluators to focus on essential details. The imap comprises three components: Query, Constraint, and Inform, each of which is in the form of term-value pairs to represent medical facts in a structural manner. We then introduce imapScore, which compares the corresponding medical term-value pairs in the imap to score generated texts. We utilize GPT-4 to extract imap from questions, human-annotated answers, and generated responses. 
To mitigate the diversity in medical terminology for fair term-value pairs comparison, we use a medical knowledge graph to assist GPT-4 in determining matches. To compare imapScore with existing NLG metrics, we establish a new benchmark dataset. The experimental results show that imapScore consistently outperforms state-of-the-art metrics, demonstrating an average improvement of 79.8% in correlation with human scores. Furthermore, incorporating imap into n-gram, embedding, and LLM metrics boosts the base versions, increasing correlation with human scores by averages of 89.9%, 81.7%, and 32.6%, respectively. 2024.findings-acl.610 @@ -14412,7 +14412,7 @@ Debiasing Large Language Models with Structured Knowledge CongdaMaTokyo Institute of Technology, Tokyo Institute of Technology TianyuZhaoSakana AI - ManabuOkumuraTokyo Institute of Technology, Tokyo Institute of Technology + ManabuOkumuraTokyo Institute of Technology, Tokyo Institute of Technology 10274-10287 Due to biases inherently present in data for pre-training, current pre-trained Large Language Models (LLMs) also ubiquitously manifest the same phenomena. Since the bias influences the output from the LLMs across various tasks, the widespread deployment of the LLMs is hampered. We propose a simple method that utilizes structured knowledge to alleviate this issue, aiming to reduce the bias embedded within the LLMs and ensuring they have an encompassing perspective when used in applications. Experimental results indicated that our method has good debiasing ability when applied to existing both autoregressive and masked language models. Additionally, it could ensure that the performances of LLMs on downstream tasks remain uncompromised.Our method outperforms state-of-the-art (SOTA) baselines in the debiasing ability. Importantly, our method obviates the need for training from scratch, thus offering enhanced scalability and cost-effectiveness. 2024.findings-acl.612 @@ -14428,7 +14428,7 @@ FanYinUniversity of California, Los Angeles AramGalstyanInformation Sciences Institute, University of Southern California, University of Southern California, University of Southern California and Amazon Alexa WenpengYinPennsylvania State University - MuhaoChenUniversity of California, Davis and University of Southern California + MuhaoChenUniversity of California, Davis and University of Southern California 10288-10302 Instruction tuning has been used as a promising approach to improve the performance of large language models (LLMs) on unseen tasks. However, current LLMs exhibit limited robustness to unseen instructions, generating inconsistent outputs when the same instruction is phrased with slightly varied forms or language styles. This behavior indicates LLMs’ lack of robustness to textual variations and generalizability to unseen instructions, potentially leading to trustworthiness issues. Accordingly, we propose Contrastive Instruction Tuning, which maximizes the similarity between the hidden representations of semantically equivalent instruction-instance pairs while minimizing the similarity between semantically different ones. To facilitate this approach, we augment the existing FLAN collection by paraphrasing task instructions. Experiments on the PromptBench benchmark show that CoIN consistently improves LLMs’ robustness to unseen instructions with variations across character, word, sentence, and semantic levels by an average of +2.5% in accuracy. 
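The objective described for CoIN (pull representations of semantically equivalent instruction pairs together, push different ones apart) is an in-batch contrastive loss in the InfoNCE style. A minimal sketch, with the temperature and the pairing-by-row convention assumed.

# In-batch contrastive loss: row i of h_orig and h_para form the positive
# pair; every other row in the batch serves as a negative.
import torch
import torch.nn.functional as F

def contrastive_instruction_loss(h_orig, h_para, temperature=0.1):
    h_orig = F.normalize(h_orig, dim=-1)             # cosine geometry
    h_para = F.normalize(h_para, dim=-1)
    logits = h_orig @ h_para.T / temperature         # [batch, batch] similarities
    targets = torch.arange(h_orig.size(0), device=h_orig.device)
    return F.cross_entropy(logits, targets)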
2024.findings-acl.613 @@ -14451,9 +14451,9 @@ Refining and Synthesis: A Simple yet Effective Data Augmentation Framework for Cross-Domain Aspect-based Sentiment Analysis - HainingWang + HainingWang KangHe - BoboLiWuhan University + BoboLiWuhan University LeiChen FeiLiWuhan University XuHan @@ -14468,9 +14468,9 @@ Codec-<fixed-case>SUPERB</fixed-case>: An In-Depth Analysis of Sound Codec Models HaibinWu - Ho-LamChungNational Taiwan University + Ho-LamChungNational Taiwan University Yi-ChengLin - Yuan-KueiWuNational Taiwan University + Yuan-KueiWuNational Taiwan University XuanjunChen Yu-ChiPai Hsiu-HsuanWang @@ -14488,8 +14488,8 @@ SirryChen ShuoFeng LiangSongsong - Chen-ChenZong - JingLiThe Hong Kong Polytechnic University + Chen-ChenZong + JingLiThe Hong Kong Polytechnic University PijiLiNanjing University of Aeronautics and Astronautics 10349-10360 Social media bot detection is increasingly crucial with the rise of social media platforms. Existing methods predominantly construct social networks as graph and utilize graph neural networks (GNNs) for bot detection. However, most of these methods focus on how to improve the performance of GNNs while neglecting the community structure within social networks. Moreover, GNNs based methods still face problems such as poor model generalization due to the relatively small scale of the dataset and over-smoothness caused by information propagation mechanism. To address these problems, we propose the Community-Aware Heterogeneous Graph Contrastive Learning framework (i.e., CACL), which constructs social network as heterogeneous graph with multiple node types and edge types, and then utilizes community-aware module to mine both hard positive samples and hard negative samples for supervised graph contrastive learning with adaptive graph enhancement algorithms. Extensive experiments demonstrate that our framework addresses the previously mentioned challenges and outperforms competitive baselines on three social media bot benchmarks. @@ -14501,8 +14501,8 @@ Are Machines Better at Complex Reasoning? Unveiling Human-Machine Inference Gaps in Entailment Verification SoumyaSanyal TianyiXiao - JiachengLiuAllen Institute for Artificial Intelligence and Paul G. Allen School of Computer Science and Engineering, University of Washington - WenyaWangNanyang Technological University + JiachengLiuAllen Institute for Artificial Intelligence and Paul G. Allen School of Computer Science and Engineering, University of Washington + WenyaWangNanyang Technological University XiangRen 10361-10386 Making inferences in text comprehension to understand the meaning is essential in language processing. This work studies the entailment verification (EV) problem of complex, multi-sentence premises requiring a system to make multiple inferences implicitly. Modern applications of EV in detecting inconsistent model-generated rationales require complex multi-hop reasoning. However, current textual inference datasets mostly contain short-sentence premises that partially focus on this. To address this, we compile an EV benchmark that includes datasets from three NLP domains (NLI, contextual QA, and rationales) containing multi-sentence premises. On benchmarking humans and LLMs, we find that LLMs are better than humans in multi-hop reasoning across extended contexts, while humans perform better in simple deductive reasoning tasks. We also finetune a Flan-T5 model for EV using two training objectives to obtain a strong open-source model that outperforms GPT-3.5 and rivals GPT-4. 
Finally, we use our finetuned model to filter out inconsistent model-generated rationales in self-consistency decoding, resulting in a 6% accuracy improvement on average across three MCQ datasets. @@ -14514,7 +14514,7 @@ <fixed-case>C</fixed-case>hart<fixed-case>I</fixed-case>nstruct: Instruction Tuning for Chart Comprehension and Reasoning AhmedMasryYork University MehradShahmohammadi - Md RizwanParvezQatar Computing Research Institute and Bosch + Md RizwanParvezQatar Computing Research Institute and Bosch EnamulHoqueYork University ShafiqJotySalesForce.com and Nanyang Technological University 10387-10409 @@ -14545,8 +14545,8 @@ MengLiPeking University AasishPappuMeta AI BarlasOguzMeta - MuhammadAbdul-MageedUniversity of British Columbia - LaksLakshmananUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia + LaksLakshmananUniversity of British Columbia RaghuramanKrishnamoorthiFacebook VikasChandraMeta 10424-10443 @@ -14558,7 +14558,7 @@ <fixed-case>S</fixed-case>hared<fixed-case>C</fixed-case>on: Implicit Hate Speech Detection using Shared Semantics HyeseonAhnYonsei University - YoungwookKimKT Corporation + YoungwookKimKT Corporation JunginKim Yo-SubHanYonsei University 10444-10455 @@ -14592,11 +14592,11 @@ Generalization-Enhanced Code Vulnerability Detection via Multi-Task Instruction Fine-Tuning - XiaohuDu + XiaohuDu MingWenHuazhong University of Science and Technology JiahaoZhu - ZifanXie - BinJiNational University of Singapore + ZifanXie + BinJiNational University of Singapore HuijunLiu XuanhuaShiHuazhong University of Science and Technology HaiJinHuazhong University of Science and Technology @@ -14610,9 +14610,9 @@ <fixed-case>PPTSER</fixed-case>: A Plug-and-Play Tag-guided Method for Few-shot Semantic Entity Recognition on Visually-rich Documents WenhuiLiao JiapengWang - ZeningLinSouth China University of Technology + ZeningLinSouth China University of Technology LongfeiXiongKingsoft Office - LianwenJinSouth China University of Technology + LianwenJinSouth China University of Technology 10522-10539 Visually-rich document information extraction (VIE) is a vital aspect of document understanding, wherein Semantic Entity Recognition (SER) plays a significant role. However, few-shot SER on visually-rich documents remains relatively unexplored despite its considerable potential for practical applications. To address this issue, we propose a simple yet effective Plug-and-Play Tag-guided method for few-shot Semantic Entity Recognition (PPTSER) on visually-rich documents. PPTSER is built upon off-the-shelf multi-modal pre-trained models. It leverages the semantics of the tags to guide the SER task, reformulating SER into entity typing and span detection, handling both tasks simultaneously via cross-attention. Experimental results illustrate that PPTSER outperforms existing fine-tuning and few-shot methods, especially in low-data regimes. With full training data, PPTSER achieves comparable or superior performance to fine-tuning baseline. For instance, on the FUNSD benchmark, our method improves the performance of LayoutLMv3-base in 1-shot, 3-shot and 5-shot scenarios by 15.61%, 2.13%, and 2.01%, respectively. Overall, PPTSER demonstrates promising generalizability, effectiveness, and plug-and-play nature for few-shot SER on visually-rich documents. The codes will be available at [https://github.com/whlscut/PPTSER](https://github.com/whlscut/PPTSER). 
2024.findings-acl.626 @@ -14622,8 +14622,8 @@ <fixed-case>LLM</fixed-case> Performance Predictors are good initializers for Architecture Search GaneshJawaharGoogle DeepMind - MuhammadAbdul-MageedUniversity of British Columbia - LaksLakshmananUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia + LaksLakshmananUniversity of British Columbia DujianDingComputing Science, University of British Columbia 10540-10560 In this work, we utilize Large Language Models (LLMs) for a novel use case: constructing Performance Predictors (PP) that estimate the performance of specific deep neural network architectures on downstream tasks. We create PP prompts for LLMs, comprising (i) role descriptions, (ii) instructions for the LLM, (iii) hyperparameter definitions, and (iv) demonstrations presenting sample architectures with efficiency metrics and ‘training from scratch’ performance. In machine translation (MT) tasks, GPT-4 with our PP prompts (LLM-PP) achieves a SoTA mean absolute error and a slight degradation in rank correlation coefficient compared to baseline predictors. Additionally, we demonstrate that predictions from LLM-PP can be distilled to a compact regression model (LLM-Distill-PP), which surprisingly retains much of the performance of LLM-PP. This presents a cost-effective alternative for resource-intensive performance estimation. Specifically, for Neural Architecture Search (NAS), we introduce a Hybrid-Search algorithm (HS-NAS) employing LLM-Distill-PP for the initial search stages and reverting to the baseline predictor later. HS-NAS performs similarly to SoTA NAS, reducing search hours by approximately 50%, and in some cases, improving latency, GFLOPs, and model size. The code can be found at: https://github.com/UBC-NLP/llmas. @@ -14637,7 +14637,7 @@ DeXinKongSuzhou University SuxianZhao XingyuLi - GuohongFu + GuohongFu 10561-10573 Dialogue discourse parsing (DDP) aims to capture the relations between utterances in the dialogue. In everyday real-world scenarios, dialogues are typically multi-modal and cover open-domain topics. However, most existing widely used benchmark datasets for DDP contain only textual modality and are domain-specific. This makes it challenging to accurately and comprehensively understand the dialogue without multi-modal clues, and prevents them from capturing the discourse structures of the more prevalent daily conversations. This paper proposes MODDP, the first multi-modal Chinese discourse parsing dataset derived from open-domain daily dialogues, consisting 864 dialogues and 18,114 utterances, accompanied by 12.7 hours of video clips. We present a simple yet effective benchmark approach for multi-modal DDP. Through extensive experiments, we present several benchmark results based on MODDP. The significant improvement in performance from introducing multi-modalities into the original textual unimodal DDP model demonstrates the necessity of integrating multi-modalities into DDP. 
2024.findings-acl.628 @@ -14646,13 +14646,13 @@ <fixed-case>C</fixed-case>hinese <fixed-case>M</fixed-case>ental<fixed-case>BERT</fixed-case>: Domain-Adaptive Pre-training on Social Media for <fixed-case>C</fixed-case>hinese Mental Health Text Analysis - WeiZhai + WeiZhai HongzhiQi - QingZhao - JianqiangLiBeijing University of Technology - ZiqiWang - HanWang - BingYang + QingZhao + JianqiangLiBeijing University of Technology + ZiqiWang + HanWang + BingYang GuanghuiFu 10574-10585 In the current environment, psychological issues are prevalent and widespread, with social media serving as a key outlet for individuals to share their feelings. This results in the generation of vast quantities of data daily, where negative emotions have the potential to precipitate crisis situations. There is a recognized need for models capable of efficient analysis. While pre-trained language models have demonstrated their effectiveness broadly, there’s a noticeable gap in pre-trained models tailored for specialized domains like psychology. To address this, we have collected a huge dataset from Chinese social media platforms and enriched it with publicly available datasets to create a comprehensive database encompassing 3.36 million text entries. To enhance the model’s applicability to psychological text analysis, we integrated psychological lexicons into the pre-training masking mechanism. Building on an existing Chinese language model, we performed adaptive training to develop a model specialized for the psychological domain. We evaluated our model’s performance across six public datasets, where it demonstrated improvements compared to eight other models. Additionally, in the qualitative comparison experiment, our model provided psychologically relevant predictions given the masked sentences. Due to concerns regarding data privacy, the dataset will not be made publicly available. However, we have made the pre-trained models and codes publicly accessible to the community via: https://github.com/zwzzzQAQ/Chinese-MentalBERT. @@ -14663,12 +14663,12 @@ Beyond One-Preference-Fits-All Alignment: Multi-Objective Direct Preference Optimization ZhanhuiZhouShanghai Artificial Intelligence Laboratory - JieLiuThe Chinese University of Hong Kong + JieLiuThe Chinese University of Hong Kong JingShaoShanghai AI Laboratory XiangyuYueThe Chinese University of Hong Kong ChaoYang - WanliOuyangShanghai AI Lab - YuQiao + WanliOuyangShanghai AI Lab + YuQiao 10586-10613 A single language model, even when aligned with labelers through reinforcement learning from human feedback (RLHF), may not suit all human preferences. Recent approaches therefore prefer customization, gathering multi-dimensional feedback, and creating distinct reward models for each dimension. Different language models are then optimized for various preferences using multi-objective RLHF (MORLHF) with varying reward weights. However, RL fine-tuning is unstable and resource-heavy, especially with diverse and usually conflicting objectives. In this paper, we present Multi-Objective Direct Preference Optimization (MODPO), an RL-free extension of Direct Preference Optimization (DPO) for multiple alignment objectives. Essentially, MODPO folds language modeling directly into reward modeling, training language models as implicit collective reward models that combine all objectives with specific weights.
MODPO theoretically yields the same optimal solutions as MORLHF but is practically more stable and efficient. Empirical results in safety alignment and long-form question answering show that MODPO matches or outperforms existing methods, producing a Pareto front of language models catering to diverse preferences with three times fewer computational resources than MORLHF. Code is available at https://github.com/ZHZisZZ/modpo. 2024.findings-acl.630 @@ -14695,7 +14695,7 @@ WenqiangLeiSichuan University DingnanJin JiaLiu - Tat-SengChuaNational University of Singapore + Tat-SengChuaNational University of Singapore 10633-10649 Equipping a conversational search engine with strategies regarding when to ask clarification questions is becoming increasingly important across various domains. Owing to the context understanding capability of LLMs and their access to domain-specific sources of knowledge, LLM-based clarification strategies feature rapid transfer to various domains in a post-hoc manner. However, they still struggle to deliver promising performance on unseen domains, failing to achieve effective domain transferability. We take the first step to investigate this issue and find that existing methods tend to produce one-size-fits-all strategies across diverse domains, limiting their search effectiveness. In response, we introduce a novel method, called STYLE, to achieve effective domain transferability. Our experimental results indicate that STYLE bears strong domain transferability, resulting in an average search performance improvement of 10% on four unseen domains. 2024.findings-acl.632 @@ -14704,16 +14704,16 @@ Evaluating Robustness of Generative Search Engine on Adversarial Factoid Questions - XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology + XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology XiaochuanLi - JunzheChen + JunzheChen YinghuiLi YangningLiTsinghua University, Tsinghua University XiaoguangLi YashengWang - QunLiuHuawei Noah’s Ark Lab + QunLiuHuawei Noah’s Ark Lab LijieWenSchool of Software, Tsinghua University - PhilipYuUniversity of Illinois, Chicago + PhilipYuUniversity of Illinois, Chicago ZhijiangGuoUniversity of Cambridge 10650-10671 Generative search engines have the potential to transform how people seek information online, but generated responses from existing large language model (LLM)-backed generative search engines may not always be accurate. Nonetheless, retrieval-augmented generation exacerbates safety concerns, since adversaries may successfully evade the entire system by subtly manipulating the most vulnerable part of a claim. To this end, we propose evaluating the robustness of generative search engines in a realistic and high-risk setting, where adversaries have only black-box system access and seek to deceive the model into returning incorrect responses. Through a comprehensive human evaluation of various generative search engines, such as Bing Chat, PerplexityAI, and YouChat across diverse queries, we demonstrate the effectiveness of adversarial factual questions in inducing incorrect responses. Moreover, retrieval-augmented generation exhibits a higher susceptibility to factual errors compared to LLMs without retrieval. These findings highlight the potential security risks of these systems and emphasize the need for rigorous evaluation before deployment. The dataset and code will be publicly available.
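The MODPO entry above (2024.findings-acl.630) describes folding language modeling directly into reward modeling, so that a single DPO-style objective combines several alignment objectives with specific weights. A minimal sketch of what such a combined loss could look like, assuming the extra objectives enter as a weighted margin inside the standard DPO log-sigmoid; the function name, tensor shapes, and margin form are illustrative assumptions, not the authors' implementation:

import torch
import torch.nn.functional as F

def modpo_style_loss(policy_chosen_logps, policy_rejected_logps,
                     ref_chosen_logps, ref_rejected_logps,
                     aux_reward_margins, aux_weights, beta=0.1):
    # Implicit language-model reward difference, exactly as in plain DPO;
    # all *_logps tensors have shape (batch,).
    lm_margin = beta * ((policy_chosen_logps - policy_rejected_logps)
                        - (ref_chosen_logps - ref_rejected_logps))
    # Assumption: the remaining objectives are folded in as a fixed
    # weighted margin. aux_reward_margins has shape (batch, n_objectives)
    # and holds chosen-minus-rejected scores from each extra reward model.
    aux_margin = aux_reward_margins @ aux_weights
    # Bradley-Terry negative log-likelihood on the combined margin.
    return -F.logsigmoid(lm_margin + aux_margin).mean()

Sweeping aux_weights over different settings would then trace out the kind of Pareto front of differently aligned models that the abstract reports.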
@@ -14739,8 +14739,8 @@ YuhanChen SendongZhao HaochunWang - GongZhangGongZhang - BingQinHarbin Institute of Technology + GongZhangGongZhang + BingQinHarbin Institute of Technology TingLiuHarbin Institute of Technology 10686-10697 Chain-of-Thought (CoT) serves as a critical emerging ability in LLMs, especially when it comes to logical reasoning. Attempts have been made to induce such ability in small models as well by distilling from the data with CoT generated by Large Language Models (LLMs). However, existing methods often simply generate and incorporate more data from LLMs and fail to note the importance of efficiently utilizing existing CoT data. We here propose a new training paradigm AS-ES (Abstractive Segments - Extractive Segments) learning, which exploits the inherent information in CoT for iterative generation. Experiments show that our methods surpass the direct seq2seq training on CoT-extensive tasks like MWP and PET summarization, without data augmentation or altering the model itself. Furthermore, we explore the reason behind the inefficiency of small models in learning CoT and provide an explanation of why AS-ES learning works, giving insights into the underlying mechanism of CoT. @@ -14752,7 +14752,7 @@ <fixed-case>II</fixed-case>-<fixed-case>MMR</fixed-case>: Identifying and Improving Multi-modal Multi-hop Reasoning in Visual Question Answering JihyungKilOhio State University, Columbus FaridehTavazoee - DongyeopKangUniversity of Minnesota + DongyeopKangUniversity of Minnesota Joo-KyungKimAmazon AGI 10698-10709 Visual Question Answering (VQA) often involves diverse reasoning scenarios across Vision and Language (V&L). Most prior VQA studies, however, have merely focused on assessing the model’s overall accuracy without evaluating it on different reasoning cases. Furthermore, some recent works observe that conventional Chain-of-Thought (CoT) prompting fails to generate effective reasoning for VQA, especially for complex scenarios requiring multi-hop reasoning. In this paper, we propose II-MMR, a novel idea to identify and improve multi-modal multi-hop reasoning in VQA. In specific, II-MMR takes a VQA question with an image and finds a reasoning path to reach its answer using two novel language promptings: (i) answer prediction-guided CoT prompt, or (ii) knowledge triplet-guided prompt. II-MMR then analyzes this path to identify different reasoning cases in current VQA benchmarks by estimating how many hops and what types (i.e., visual or beyond-visual) of reasoning are required to answer the question. On popular benchmarks including GQA and A-OKVQA, II-MMR observes that most of their VQA questions are easy to answer, simply demanding “single-hop” reasoning, whereas only a few questions require “multi-hop” reasoning. Moreover, while the recent V&L model struggles with such complex multi-hop reasoning questions even using the traditional CoT method, II-MMR shows its effectiveness across all reasoning cases in both zero-shot and fine-tuning settings. 
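The AS-ES learning entry above distills chain-of-thought data by distinguishing extractive segments (which restate information already given) from abstractive segments (which derive new information). One plausible reading of that split, sketched with a hypothetical word-overlap heuristic; the threshold and function name are assumptions, not the paper's method:

import re

def as_es_split(problem: str, cot: str, threshold: float = 0.5):
    # Tag each CoT sentence ES (mostly restates the problem) or AS
    # (mostly derives new information), using content-word overlap.
    problem_words = set(re.findall(r"\w+", problem.lower()))
    segments = []
    for sentence in re.split(r"(?<=[.!?])\s+", cot.strip()):
        words = re.findall(r"\w+", sentence.lower())
        if not words:
            continue
        overlap = sum(w in problem_words for w in words) / len(words)
        segments.append(("ES" if overlap >= threshold else "AS", sentence))
    return segments

# Under this heuristic the first sentence is tagged ES, the second AS.
print(as_es_split(
    "Tom has 3 apples and buys 2 more.",
    "Tom starts with 3 apples. Adding the 2 he buys gives a total of 5.",
))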
@@ -14763,13 +14763,13 @@ <fixed-case>TAME</fixed-case>-<fixed-case>RD</fixed-case>: Text Assisted Replication of Image Multi-Adjustments for Reverse Designing PoojaGuhanUniversity of Maryland, College Park - UttaranBhattacharyaAdobe Systems + UttaranBhattacharyaAdobe Systems SomdebSarkhelAdobe Research VahidAzizi XiangChenAdobe Systems - SaayanMitraAdobe Research - AniketBeraPurdue University and University of Maryland, College Park - DineshManochaUniversity of Maryland, College Park + SaayanMitraAdobe Research + AniketBeraPurdue University and University of Maryland, College Park + DineshManochaUniversity of Maryland, College Park 10710-10727 Given a source and its edited version performed based on human instructions in natural language, how do we extract the underlying edit operations, to automatically replicate similar edits on other images? This is the problem of reverse designing, and we present TAME-RD, a model to solve this problem. TAME-RD automatically learns from the complex interplay of image editing operations and the natural language instructions to learn fully specified edit operations. It predicts both the underlying image edit operations as discrete categories and their corresponding parameter values in the continuous space.We accomplish this by mapping together the contextual information from the natural language text and the structural differences between the corresponding source and edited images using the concept of pre-post effect. We demonstrate the efficiency of our network through quantitative evaluations on multiple datasets. We observe improvements of 6-10% on various accuracy metrics and 1.01X-4X on the RMSE score and the concordance correlation coefficient for the corresponding parameter values on the benchmark GIER dataset. We also introduce I-MAD, a new two-part dataset: I-MAD-Dense, a collection of approximately 100K source and edited images, together with automatically generated text instructions and annotated edit operations, and I-MAD-Pro, consisting of about 1.6K source and edited images, together with text instructions and annotated edit operations provided by professional editors. On our dataset, we observe absolute improvements of 1-10% on the accuracy metrics and 1.14X–5X on the RMSE score. 2024.findings-acl.637 @@ -14779,11 +14779,11 @@ Batch-<fixed-case>ICL</fixed-case>: Effective, Efficient, and Order-Agnostic In-Context Learning KaiyiZhangRenmin University of China - AngLv - YuhanChenXiaomi Corporation + AngLv + YuhanChenXiaomi Corporation HansenHa TaoXu - RuiYanRenmin University of China + RuiYanRenmin University of China 10728-10739 In this paper, by treating in-context learning (ICL) as a meta-optimization process, we explain why LLMs are sensitive to the order of ICL examples. This understanding leads us to the development of Batch-ICL, an effective, efficient, and order-agnostic inference algorithm for ICL. Differing from the standard N-shot learning approach, Batch-ICL employs N separate 1-shot forward computations and aggregates the resulting meta-gradients. These aggregated meta-gradients are then applied to the forward computation of a zero-shot query to generate the final prediction. This batch processing approach renders the LLM agnostic to the order of ICL examples. Through extensive experiments and analysis, we demonstrate that Batch-ICL consistently outperforms most permutations of ICL examples. In some cases, it even exceeds the performance of the best order for standard ICL, all while reducing the computational resources required. 
Furthermore, we develop a novel variant of Batch-ICL featuring multiple “epochs” of meta-optimization. This variant implicitly explores permutations of ICL examples, further enhancing ICL performance. 2024.findings-acl.638 @@ -14794,7 +14794,7 @@ <fixed-case>I</fixed-case>ndic<fixed-case>V</fixed-case>oices: Towards building an Inclusive Multilingual Speech Dataset for <fixed-case>I</fixed-case>ndian Languages TahirJaved JankiNawale - EldhoGeorgeIndian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology + EldhoGeorgeIndian Institute of Technology, Madras, Dhirubhai Ambani Institute Of Information and Communication Technology SakshiJoshi KaushalBhogale DeovratMehendaleDepartment of Computer Science, Indian Institute of Technology, Madras, Indian Institute of Technology, Madras @@ -14824,7 +14824,7 @@ KaiwenZhou KwonjoonLeeHonda Research Institute USA TeruhisaMisuHonda Research Institute USA, Inc. - XinWangUniversity of California, Santa Cruz + XinWangUniversity of California, Santa Cruz 10783-10795 In our work, we explore the synergistic capabilities of pre-trained vision-and-language models (VLMs) and large language models (LLMs) on visual commonsense reasoning (VCR) problems. We find that VLMs and LLMs-based decision pipelines are good at different kinds of VCR problems. Pre-trained VLMs exhibit strong performance for problems involving understanding the literal visual content, which we noted as visual commonsense understanding (VCU). For problems where the goal is to infer conclusions beyond image content, which we noted as visual commonsense inference (VCI), VLMs face difficulties, while LLMs, given sufficient visual evidence, can use commonsense to infer the answer well. We empirically validate this by letting LLMs classify VCR problems into these two categories and show the significant difference between VLM and LLM with image caption decision pipelines on two subproblems. Moreover, we identify a challenge with VLMs’ passive perception, which may miss crucial context information, leading to incorrect reasoning by LLMs. Based on these, we suggest a collaborative approach, named ViCor, where pre-trained LLMs serve as problem classifiers to analyze the problem category, then either use VLMs to answer the question directly or actively instruct VLMs to concentrate on and gather relevant visual elements to support potential commonsense inferences. We evaluate our framework on two VCR benchmark datasets and outperform all other methods without in-domain fine-tuning. 2024.findings-acl.640 @@ -14833,7 +14833,7 @@ Decomposition for Enhancing Attention: Improving <fixed-case>LLM</fixed-case>-based Text-to-<fixed-case>SQL</fixed-case> through Workflow Paradigm - YuanzhenXieTencent + YuanzhenXieTencent XinzhouJin TaoXie MatrixmxlinMatrixmxlin @@ -14842,7 +14842,7 @@ ChengLei ChengxiangZhuo BoHu - ZangLiTencent + ZangLiTencent 10796-10816 In-context learning of large-language models (LLMs) has achieved remarkable success in the field of natural language processing, while extensive case studies reveal that the single-step chain-of-thought prompting approach faces challenges such as attention diffusion and inadequate performance in complex tasks like text-to-SQL. To improve the contextual learning capabilities of LLMs in text-to-SQL, a workflow paradigm method is proposed, aiming to enhance the attention and problem-solving scope of LLMs through decomposition. 
Specifically, the information determination module for eliminating redundant information and the brand-new prompt structure based on problem classification greatly enhance the model’s attention. Additionally, the inclusion of self-correction and active learning modules greatly expands the problem-solving scope of LLMs, hence improving the upper limit of LLM-based approaches. Extensive experiments conducted on three datasets demonstrate that our approach outperforms other methods by a significant margin. About 2-3 percentage point improvements compared to the existing baseline on the Spider Dev, Spider-Realistic, and Bird Dev datasets and new SOTA results on the Spider Test dataset are achieved. Our code is available on GitHub: https://github.com/FlyingFeather/DEA-SQL. 2024.findings-acl.641 @@ -14851,12 +14851,12 @@ Unveiling Opinion Evolution via Prompting and Diffusion for Short Video Fake News Detection - LinlinZongDalian University of Technology + LinlinZongDalian University of Technology JiahuiZhou WenminLinDalian University of Technology XinyueLiuDalian University of Technology XianchaoZhangDalian University of Technology - BoXuDalian University of Technology + BoXuDalian University of Technology 10817-10826 Short video fake news detection is crucial for combating the spread of misinformation. Current detection methods tend to aggregate features from individual modalities into multimodal features, overlooking the implicit opinions and the evolving nature of opinions across modalities. In this paper, we mine implicit opinions within short video news and promote the evolution of both explicit and implicit opinions across all modalities. Specifically, we design a prompt template to mine implicit opinions regarding the credibility of news from the textual component of videos. Additionally, we employ a diffusion model that encourages the interplay among diverse modal opinions, including those extracted through our implicit opinion prompts. Experimental results on a publicly available dataset for short video fake news detection demonstrate the superiority of our model over state-of-the-art methods. 2024.findings-acl.642 @@ -14865,7 +14865,7 @@ i<fixed-case>S</fixed-case>ign: A Benchmark for <fixed-case>I</fixed-case>ndian <fixed-case>S</fixed-case>ign <fixed-case>L</fixed-case>anguage Processing - AbhinavJoshiIndian Institute of Technology, Kanpur + AbhinavJoshiIndian Institute of Technology, Kanpur RomitMohanty MounikaKanakanti AndeshaManglaIndian Sign Language Research and Training Centre @@ -14882,8 +14882,8 @@ Data Contamination Calibration for Black-box <fixed-case>LLM</fixed-case>s WentaoYeZhejiang University JiaqiHu - LiyaoLiZhejiang University - HaoboWangZhejiang University + LiyaoLiZhejiang University + HaoboWangZhejiang University GangChen JunboZhaoZhejiang University 10845-10861 @@ -14895,7 +14895,7 @@ Truth-Aware Context Selection: Mitigating Hallucinations of Large Language Models Being Misled by Untruthful Contexts TianYu - ShaoleiZhang + ShaoleiZhang YangFengInstitute of Computing Technology, Chinese Academy of Sciences 10862-10884 Although Large Language Models (LLMs) have demonstrated impressive text generation capabilities, they are easily misled by untruthful contexts provided by users or knowledge augmentation tools, leading to hallucinations. 
To alleviate LLMs from being misled by untruthful context and take advantage of knowledge augmentation, we propose Truth-Aware Context Selection (TACS), a lightweight method to adaptively recognize and mask untruthful context from the inputs. TACS begins by performing truth detection on the input context, leveraging the parameterized knowledge within the LLM. Subsequently, it constructs a corresponding attention mask based on the truthfulness of each position, selecting the truthful context and discarding the untruthful context. Additionally, we introduce a new evaluation metric, Disturbance Adaption Rate, to further study the LLMs’ ability to accept truthful information and resist untruthful information.Experimental results indicate that TACS can effectively filter untruthful context and significantly improve the overall quality of LLMs’ responses when presented with misleading information. @@ -14908,7 +14908,7 @@ MenglongCui JiangcunDu ShaolinZhuTianjin University - DeyiXiongTianjin University + DeyiXiongTianjin University 10885-10897 Large language models (LLMs) exhibit outstanding performance in machine translation via in-context learning. In contrast to sentence-level translation, document-level translation (DOCMT) by LLMs based on in-context learning faces two major challenges: firstly, document translations generated by LLMs are often incoherent; secondly, the length of demonstration for in-context learning is usually limited. To address these issues, we propose a Context-Aware Prompting method (CAP), which enables LLMs to generate more accurate, cohesive, and coherent translations via in-context learning. CAP takes into account multi-level attention, selects the most relevant sentences to the current one as context, and then generates a summary from these collected sentences. Subsequently, sentences most similar to the summary are retrieved from the datastore as demonstrations, which effectively guide LLMs in generating cohesive and coherent translations. We conduct extensive experiments across various DOCMT tasks, and the results demonstrate the effectiveness of our approach, particularly in zero pronoun translation (ZPT) and literary translation tasks. 2024.findings-acl.646 @@ -14933,7 +14933,7 @@ <fixed-case>RECOST</fixed-case>: External Knowledge Guided Data-efficient Instruction Tuning QiZhang YimingZhang - HaoboWangZhejiang University + HaoboWangZhejiang University JunboZhaoZhejiang University 10911-10921 In the current landscape of large language models (LLMs), the process of instruction tuning serves as an essential step. Considering the high computing power overhead, data-efficient instruction tuning was proposed to reduce the training data size in this process, aiming at selecting high-quality instructional data. Nevertheless, we argue that most current data-efficient instruction-tuning methods are highly dependent on the quality of the original instruction-tuning dataset. When it comes to datasets synthesized by LLMs, a common scenario in this field, dirty samples will even be selected with a higher probability than other samples. To address these challenges, we utilized external knowledge (relevant examples or paragraphs) to evaluate those samples synthesized by LLMs with an in-context-based relative predictive entropy. Based on the new metric, we proposed a framework, dubbed as RECOST, which integrates external-knowledge-base re-ranking and diversity-consistent sampling into a single pipeline. 
Through extensive experiments on several synthetic datasets (Alpaca and Alpaca-gpt4), we demonstrate the effectiveness of our method and achieve even better results with only 1% of the full dataset. @@ -14944,7 +14944,7 @@ Understanding Cross-Lingual <fixed-case>A</fixed-case>lignment—<fixed-case>A</fixed-case> Survey KatharinaHämmerlCIS, LMU Munich - JindřichLibovickýCharles University Prague + JindřichLibovickýCharles University Prague AlexanderFraserTechnical University of Munich 10922-10943 Cross-lingual alignment, the meaningful similarity of representations across languages in multilingual language models, has been an active field of research in recent years. We survey the literature on techniques to improve cross-lingual alignment, providing a taxonomy of methods and summarising insights from throughout the field. We present different understandings of cross-lingual alignment and their limitations. We provide a qualitative summary of results from a number of surveyed papers. Finally, we discuss how these insights may be applied not only to encoder models, where this topic has been heavily studied, but also to encoder-decoder or even decoder-only models, and argue that an effective trade-off between language-neutral and language-specific information is key. @@ -14956,7 +14956,7 @@ Mitigate Negative Transfer with Similarity Heuristic Lifelong Prompt Tuning ChenyuanWuUniversity of Science and Technology of China GangweiJiangCity University of Hong Kong and University of Science and Technology of China - DefuLianUniversity of Science and Technology of China + DefuLianUniversity of Science and Technology of China 10944-10959 Lifelong prompt tuning has significantly advanced parameter-efficient lifelong learning with its efficiency and minimal storage demands on various tasks. Our empirical studies, however, highlight certain transferability constraints in the current methodologies: a universal algorithm that guarantees consistent positive transfer across all tasks is currently unattainable, especially when dealing with dissimilar tasks that may engender negative transfer. Identifying the misalignment between algorithm selection and task specificity as the primary cause of negative transfer, we present the Similarity Heuristic Lifelong Prompt Tuning (SHLPT) framework. This innovative strategy partitions tasks into two distinct subsets by harnessing a learnable similarity metric, thereby facilitating fruitful transfer from tasks regardless of their similarity or dissimilarity. Additionally, SHLPT incorporates a parameter pool to combat catastrophic forgetting effectively. Our experiments show that SHLPT outperforms state-of-the-art techniques in lifelong learning benchmarks and demonstrates robustness against negative transfer in diverse task sequences. 2024.findings-acl.650 @@ -14969,11 +14969,11 @@ ZonghanYangDepartment of Computer Science and Technology, Tsinghua University ZhenheZhang QingyuanHu - PengLiTsinghua University - MingYan + PengLiTsinghua University + MingYan JiZhangAlibaba Group FeiHuangAlibaba Group - YangLiu + YangLiu 10960-10977 While large language models (LLMs) have demonstrated considerable capabilities across various natural language tasks, they often fall short of the performance achieved by domain-specific state-of-the-art models. One potential approach to enhance domain-specific capabilities of LLMs involves fine-tuning them using corresponding datasets. However, this method can be both resource- and time-intensive, and not applicable to closed-source commercial LLMs.
In this paper, we propose Preference Adaptation for Enhancing Domain-specific Abilities of LLMs (PANDA), a method designed to augment the domain-specific capabilities of LLMs by leveraging insights from the response preference of expert models without requiring fine-tuning. Our experimental results reveal that PANDA significantly enhances the domain-specific ability of LLMs on text classification and interactive decision tasks. Moreover, the LLM with PANDA even outperforms the expert model it learns from on 4 tasks of ScienceWorld. This finding highlights the potential of exploring tuning-free approaches to achieve weak-to-strong generalization. 2024.findings-acl.651 @@ -14984,10 +14984,10 @@ Developing <fixed-case>PUGG</fixed-case> for <fixed-case>P</fixed-case>olish: A Modern Approach to <fixed-case>KBQA</fixed-case>, <fixed-case>MRC</fixed-case>, and <fixed-case>IR</fixed-case> Dataset Construction AlbertSawczynWroclaw University of Science and Technology KatsiarynaViarenich - KonradWojtasik - AleksandraDomogałaTechnical University of Wroclaw - MarcinOleksy - MaciejPiaseckiWroclaw University of Science and Technology + KonradWojtasik + AleksandraDomogałaTechnical University of Wroclaw + MarcinOleksy + MaciejPiaseckiWroclaw University of Science and Technology TomaszKajdanowiczWroclaw University of Science and Technology 10978-10996 Advancements in AI and natural language processing have revolutionized machine-human language interactions, with question answering (QA) systems playing a pivotal role. The knowledge base question answering (KBQA) task, utilizing structured knowledge graphs (KG), allows for handling extensive knowledge-intensive questions. However, a significant gap exists in KBQA datasets, especially for low-resource languages. Many existing construction pipelines for these datasets are outdated and inefficient in human labor, and modern assisting tools like Large Language Models (LLMs) are not utilized to reduce the workload. To address this, we have designed and implemented a modern, semi-automated approach for creating datasets, encompassing tasks such as KBQA, Machine Reading Comprehension (MRC), and Information Retrieval (IR), tailored explicitly for low-resource environments. We executed this pipeline and introduced the PUGG dataset, the first Polish KBQA dataset, and novel datasets for MRC and IR. Additionally, we provide a comprehensive implementation, insightful findings, detailed statistics, and evaluation of baseline models. @@ -14997,12 +14997,12 @@ Knowledge-to-<fixed-case>SQL</fixed-case>: Enhancing <fixed-case>SQL</fixed-case> Generation with Data Expert <fixed-case>LLM</fixed-case> - ZijinHong + ZijinHong ZhengYuan - HaoChen - QinggangZhang + HaoChen + QinggangZhang FeiranHuang - XiaoHuangThe Hong Kong Polytechnic University + XiaoHuangThe Hong Kong Polytechnic University 10997-11008 Generating accurate SQL queries for user questions (text-to-SQL) has been a long-standing challenge since it requires a deep understanding of both the user’s question and the corresponding database schema in order to retrieve the desired content accurately. Existing methods rely on the comprehensive capability of large language models (LLMs) to generate the SQL. However, some necessary knowledge is not explicitly included in the database schema and user question or has been learned by LLMs. Thus, the generated SQL for knowledge-insufficient questions may be inaccurate, negatively influencing the text-to-SQL models’ performance and robustness.
To address this challenge, we propose the Knowledge-to-SQL framework, which employs tailored Data Expert LLM (DELLM) to provide helpful knowledge for all text-to-SQL models. Specifically, we introduce the detailed implementation of DELLM regarding table reading and the basic fine-tuning process. We further propose a Preference Learning via Database Feedback (PLDBF) strategy, refining the DELLM to generate more helpful knowledge for LLMs. Extensive experiments verify that DELLM can enhance the state-of-the-art approaches for text-to-SQL tasks. The corresponding code of DELLM is released for further research. 2024.findings-acl.653 @@ -15011,10 +15011,10 @@ Centroid-Based Efficient Minimum <fixed-case>B</fixed-case>ayes Risk Decoding - HiroyukiDeguchiNara Institute of Science and Technology, Japan and National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology + HiroyukiDeguchiNara Institute of Science and Technology, Japan and National Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology YusukeSakaiNara Institute of Science and Technology, Japan - HidetakaKamigaitoDivision of Information Science, Nara Institute of Science and Technology - TaroWatanabeNara Institute of Science and Technology, Japan + HidetakaKamigaitoDivision of Information Science, Nara Institute of Science and Technology + TaroWatanabeNara Institute of Science and Technology, Japan HidekiTanakaNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology MasaoUtiyamaNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology 11009-11018 @@ -15042,9 +15042,9 @@ Exploiting Positional Bias for Query-Agnostic Generative Content in Search - AndrewParry - SeanMacAvaneyUniversity of Glasgow - DebasisGangulyUniversity of Glasgow + AndrewParry + SeanMacAvaneyUniversity of Glasgow + DebasisGangulyUniversity of Glasgow 11030-11047 In recent years, research shows that neural ranking models (NRMs) substantially outperform their lexical counterparts in text retrieval. In traditional search pipelines, a combination of features leads to well-defined behaviour. However, as neural approaches become increasingly prevalent as the final scoring component of engines or as standalone systems, their robustness to malicious text and, more generally, semantic perturbation needs to be better understood. We posit that the transformer attention mechanism can induce exploitable defects in search models through sensitivity to token position within a sequence, leading to an attack that could generalise beyond a single query or topic. We demonstrate such defects by showing that non-relevant text–such as promotional content–can be easily injected into a document without adversely affecting its position in search results. Unlike previous gradient-based attacks, we demonstrate the existence of these biases in a query-agnostic fashion. In doing so, without the knowledge of topicality, we can still reduce the negative effects of non-relevant content injection by controlling injection position. Our experiments are conducted with simulated on-topic promotional text automatically generated by prompting LLMs with topical context from target documents. 
We find that contextualisation of a non-relevant text further reduces negative effects whilst likely circumventing existing content filtering mechanisms. In contrast, lexical models are found to be more resilient to such content injection attacks. We then investigate a simple yet effective compensation for the weaknesses of the NRMs in search, validating our hypotheses regarding transformer bias. 2024.findings-acl.656 @@ -15054,9 +15054,9 @@ <fixed-case>ICC</fixed-case> : Quantifying Image Caption Concreteness for Multimodal Dataset Curation MoranYanukaTel Aviv University - MorrisAlperTel Aviv University + MorrisAlperTel Aviv University HadarAverbuch-ElorTel Aviv University and Cornell University - RajaGiryesTel Aviv University + RajaGiryesTel Aviv University 11048-11064 Web-scale training on paired text-image data is becoming increasingly central to multimodal learning, but is challenged by the highly noisy nature of datasets in the wild. Standard data filtering approaches succeed in removing mismatched text-image pairs, but permit semantically related but highly abstract or subjective text. These approaches lack the fine-grained ability to isolate the most concrete samples that provide the strongest signal for learning in a noisy dataset. In this work, we propose a new metric, Image Caption Concreteness (ICC), that evaluates caption text without an image reference to measure its concreteness and relevancy for use in multimodal learning. Our unsupervised approach leverages strong foundation models for measuring visual-semantic information loss in multimodal representations. We demonstrate that this strongly correlates with human evaluation of concreteness in both single-word and caption-level texts. Moreover, we show that curation using ICC complements existing approaches: It succeeds in selecting the highest quality samples from multimodal web-scale datasets to allow for efficient training in resource-constrained settings. 2024.findings-acl.657 @@ -15069,9 +15069,9 @@ RuiWang RuixuanXiao JunboZhaoZhejiang University - XiaoDing + XiaoDing GangChen - HaoboWangZhejiang University + HaoboWangZhejiang University 11065-11082 Within the evolving landscape of deep learning, the dilemma of data quantity and quality has been a long-standing problem. The recent advent of Large Language Models (LLMs) offers a data-centric solution to alleviate the limitations of real-world data with synthetic data generation. However, current investigations into this field lack a unified framework and mostly stay on the surface. Therefore, this paper provides an organization of relevant studies based on a generic workflow of synthetic data generation. By doing so, we highlight the gaps within existing research and outline prospective avenues for future study. This work aims to shepherd the academic and industrial communities towards deeper, more methodical inquiries into the capabilities and applications of LLMs-driven synthetic data generation. 2024.findings-acl.658 @@ -15082,7 +15082,7 @@ When is a Language Process a Language Model? 
LiDuJohns Hopkins University HoldenLeeJohns Hopkins University - JasonEisnerMicrosoft and Johns Hopkins University + JasonEisnerMicrosoft and Johns Hopkins University RyanCotterellSwiss Federal Institute of Technology 11083-11094 A language model may be viewed as a Σ-valued stochastic process for some alphabet Σ. However, in some pathological situations, such a stochastic process may “leak” probability mass onto the set of infinite strings and hence is not equivalent to the conventional view of a language model as a distribution over ordinary (finite) strings. Such ill-behaved language processes are referred to as *non-tight* in the literature. In this work, we study conditions of tightness through the lens of stochastic processes. In particular, by regarding the EOS symbol as marking a stopping time and using results from martingale theory, we give characterizations of tightness that generalize our previous work [(Du et al. 2023)](https://arxiv.org/abs/2212.10502). @@ -15093,7 +15093,7 @@ Accelerating Multilingual Language Model for Excessively Tokenized Languages JiminHongKrafton.Inc and Korea Advanced Institute of Science and Technology - GibbeumLeeKRAFTON and KAIST + GibbeumLeeKRAFTON and KAIST JaewoongChoKRAFTON 11095-11111 Recent advancements in large language models (LLMs) have remarkably enhanced performance on a variety of tasks in multiple languages. However, tokenizers in LLMs trained primarily on English-centric corpora often overly fragment a text into character or Unicode-level tokens in non-Roman alphabetic languages, leading to inefficient text generation. We introduce a simple yet effective framework to accelerate text generation in such languages. Our approach involves employing a new language model head with a vocabulary set tailored to a specific target language for a pre-trained LLM. This is followed by fine-tuning the new head while incorporating a verification step to ensure the model’s performance is preserved. We show that this targeted fine-tuning, while freezing other model parameters, effectively reduces token fragmentation for the target language. Our extensive experiments demonstrate that the proposed framework increases the generation speed by a factor of 1.7 while maintaining the performance of pre-trained multilingual models on target monolingual tasks. @@ -15117,9 +15117,9 @@ YongqiLiHong Kong Polytechnic University ZhenZhang WenjieWangNational University of Singapore - LiqiangNieHarbin Institute of Technology (Shenzhen) - WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University - Tat-SengChuaNational University of Singapore + LiqiangNieHarbin Institute of Technology (Shenzhen) + WenjieLiThe Hong Kong Polytechnic University, The Hong Kong Polytechnic University + Tat-SengChuaNational University of Singapore 11119-11129 Generative retrieval is a promising new paradigm in text retrieval that generates identifier strings of relevant passages as the retrieval target. This paradigm leverages powerful generative language models, distinct from traditional sparse or dense retrieval methods. In this work, we identify a viable direction to further enhance generative retrieval via distillation and propose a feasible framework, named DGR.
DGR utilizes sophisticated ranking models, such as the cross-encoder, in a teacher role to supply a passage rank list, which captures the varying relevance degrees of passages instead of binary hard labels; subsequently, DGR employs a specially designed distilled RankNet loss to optimize the generative retrieval model, considering the passage rank order provided by the teacher model as labels. This framework only requires an additional distillation step to enhance current generative retrieval systems and does not add any burden to the inference stage. We conduct experiments on four public datasets, and the results indicate that DGR achieves state-of-the-art performance among the generative retrieval methods. Additionally, DGR demonstrates exceptional robustness and generalizability with various teacher models and distillation losses. 2024.findings-acl.662 @@ -15145,10 +15145,10 @@ HaoWangGoogle ShihaoLiang YujiaQin - PengLiTsinghua University - ZhiyuanLiuTsinghua University + PengLiTsinghua University + ZhiyuanLiuTsinghua University MaosongSun - YangLiu + YangLiu 11143-11156 Large Language Models (LLMs) have witnessed remarkable advancements in recent years, prompting the exploration of tool learning, which integrates LLMs with external tools to address diverse real-world challenges. Assessing the capability of LLMs to utilise tools necessitates large-scale and stable benchmarks. However, previous works relied on either hand-crafted online tools with limited scale, or large-scale real online APIs suffering from instability of API status. To address this problem, we introduce StableToolBench, a benchmark evolving from ToolBench, proposing a virtual API server and stable evaluation system. The virtual API server contains a caching system and API simulators which are complementary in alleviating changes in API status. Meanwhile, the stable evaluation system designs solvable pass and win rates using GPT-4 as the automatic evaluator to eliminate the randomness during evaluation. Experimental results demonstrate the stability of StableToolBench, and we further discuss the effectiveness of API simulators, the caching system, and the evaluator system. 2024.findings-acl.664 @@ -15159,12 +15159,12 @@ Both Matter: Enhancing the Emotional Intelligence of Large Language Models without Compromising the General Intelligence WeixiangZhaoHarbin Institute of Technology ZhuojunLi - ShilongWangHarbin Institute of Technology + ShilongWangHarbin Institute of Technology YangWang YulinHu YanyanZhaoHarbin Institute of Technology ChenWeixiaomi - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology 11157-11176 Emotional Intelligence (EI), consisting of emotion perception, emotion cognition and emotion expression, plays a critical role in improving the user interaction experience for the current large language model (LLM) based conversational general AI assistants. Previous works mainly focus on raising their emotion perception ability via naive fine-tuning on EI-related classification or regression tasks. However, this leads to the incomplete enhancement of EI and catastrophic forgetting of the general intelligence (GI). To this end, we first introduce EiBench, a large-scale collection of EI-related tasks in the text-to-text format with task instructions that covers all three aspects of EI, which lays a solid foundation for the comprehensive EI enhancement of LLMs.
Then a novel Modular Emotional Intelligence enhancement method (**MoEI**), consisting of Modular Parameter Expansion and intra-inter modulation, is proposed to comprehensively enhance the EI of LLMs without compromising their GI. Extensive experiments on two representative LLM-based assistants, Flan-T5 and LLaMA-2-Chat, demonstrate the effectiveness of MoEI in improving EI while maintaining GI. 2024.findings-acl.665 @@ -15177,8 +15177,8 @@ MinwooKim SeunghoKim JunghwanKimselectstar - SeunghyunWonSeoul National University Bundang Hospital - HwaranLeeNAVER AI Lab + SeunghyunWonSeoul National University Bundang Hospital + HwaranLeeNAVER AI Lab EdwardChoiKorea Advanced Institute of Science and Technology 11177-11213 To reliably deploy Large Language Models (LLMs) in a specific country, they must possess an understanding of the nation’s culture and basic knowledge. To this end, we introduce National Alignment, which measures the alignment between an LLM and a targeted country from two aspects: social value alignment and common knowledge alignment. We constructed KorNAT, the first benchmark that measures national alignment between LLMs and South Korea. KorNAT contains 4K and 6K multiple-choice questions for social value and common knowledge, respectively. To attain an appropriately aligned ground truth in the social value dataset, we conducted a large-scale public survey with 6,174 South Koreans. For common knowledge, we created the data based on South Korean textbooks and GED exams. Our dataset creation process is meticulously designed based on statistical sampling theory, and we also introduce metrics to measure national alignment, including three variations of social value alignment. We tested seven LLMs and found that only a few models passed our reference score, indicating there exists room for improvement. Our dataset has received government approval following an assessment by a government-affiliated organization dedicated to evaluating dataset quality. @@ -15191,7 +15191,7 @@ PranabSahoo AyushSingh SriparnaSahaIndian Institute of Technology Patna, India - AmanChadhaAmazon + AmanChadhaAmazon SamratMondal 11214-11226 The mining of adverse drug events (ADEs) is pivotal in pharmacovigilance, enhancing patient safety by identifying potential risks associated with medications, facilitating early detection of adverse events, and guiding regulatory decision-making. Traditional ADE detection methods are reliable but slow, not easily adaptable to large-scale operations, and offer limited information. With the exponential increase in data sources like social media content, biomedical literature, and Electronic Medical Records (EMR), extracting relevant ADE-related information from these unstructured texts is imperative. Previous ADE mining studies have focused on text-based methodologies, overlooking visual cues, limiting contextual comprehension, and hindering accurate interpretation. To address this gap, we present a MultiModal Adverse Drug Event (MMADE) detection dataset, merging ADE-related textual information with visual aids. Additionally, we introduce a framework that leverages the capabilities of LLMs and VLMs for ADE detection by generating detailed descriptions of medical images depicting ADEs, aiding healthcare professionals in visually identifying adverse events. Using our MMADE dataset, we showcase the significance of integrating visual cues from images to enhance overall performance.
This approach holds promise for patient safety, ADE awareness, and healthcare accessibility, paving the way for further exploration in personalized healthcare. @@ -15203,7 +15203,7 @@ Space Decomposition for Sentence Embedding WuttikornPonwitayaratVidyasirimedhi Institute of Science and Technology PeeratLimkonchotiwat - EkapolChuangsuwanichChulalongkorn University + EkapolChuangsuwanichChulalongkorn University SaranaNutanong 11227-11239 Determining sentence pair similarity is crucial for various NLP tasks. A common technique to address this is typically evaluated on a continuous semantic textual similarity scale from 0 to 5. However, based on a linguistic observation in STS annotation guidelines, we found that the score in the range [4,5] indicates an upper-range sample, while the rest are lower-range samples. This necessitates a new approach to treating the upper-range and lower-range classes separately. In this paper, we introduce a novel embedding space decomposition method called MixSP utilizing a Mixture of Specialized Projectors, designed to distinguish and rank upper-range and lower-range samples accurately. The experimental results demonstrate that MixSP decreased the overlap representation between upper-range and lower-range classes significantly while outperforming competitors on STS and zero-shot benchmarks. @@ -15213,7 +15213,7 @@ Don’t Augment, Rewrite? Assessing Abusive Language Detection with Synthetic Data - CamillaCasulaUniversity of Trento and Fondazione Bruno Kessler + CamillaCasulaUniversity of Trento and Fondazione Bruno Kessler ElisaLeonardelliFondazione Bruno Kessler SaraTonelli 11240-11247 @@ -15225,7 +15225,7 @@ Improving Low-Resource Machine Translation for Formosan Languages Using Bilingual Lexical Resources FrancisZhengThe University of Tokyo, The University of Tokyo - EdisonMarrese-TaylorThe University of Tokyo and AIST, National Institute of Advanced Industrial Science and Technology + EdisonMarrese-TaylorThe University of Tokyo and AIST, National Institute of Advanced Industrial Science and Technology YutakaMatsuoThe University of Tokyo and The University of Tokyo 11248-11259 This paper investigates how machine translation for low-resource languages can be improved by incorporating information from bilingual lexicons during the training process for mainly translation between Mandarin and Formosan languages, which are all moribund or critically endangered, and we also show that our techniques work for translation between Spanish and Nahuatl, a language pair consisting of languages from completely different language families. About 70% of the approximately 7,000 languages of the world have data in the form of lexicons, a valuable resource for improving low-resource language translation. We collect a dataset of parallel data and bilingual lexicons between Mandarin and 16 different Formosan languages and examine mainly three different approaches: (1) simply using lexical data as additional parallel data, (2) generating pseudo-parallel sentence data to use during training by replacing words in the original parallel sentence data using the lexicon, and (3) a combination of (1) and (2). All three approaches give us gains in both Bleu scores and chrF scores, and we found that (3) provided the most gains, followed by (1) and then (2), which we observed for both translation between Mandarin and the Formosan languages and Spanish-Nahuatl. With technique (3), we saw an average increase of 5.55 in Bleu scores and 10.33 in chrF scores.
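The lexicon-substitution idea in approach (2) above is easy to prototype. Below is a minimal Python sketch, assuming token lists and a word-level bilingual lexicon of (source, target) pairs; real pipelines would constrain replacements (e.g., by part of speech) so substitutions stay grammatical, and this is an illustration rather than the paper's actual code.

import random

def pseudo_parallel(src_tokens, tgt_tokens, lexicon, rng=random.Random(0)):
    """Return a new (src, tgt) pair with one lexicon-aligned word replaced
    on both sides, or None if no lexicon entry is aligned in this pair."""
    aligned = [(s, t) for s, t in lexicon if s in src_tokens and t in tgt_tokens]
    if not aligned:
        return None
    old_s, old_t = rng.choice(aligned)       # entry present in both sides
    new_s, new_t = rng.choice(lexicon)       # replacement entry
    new_src = [new_s if w == old_s else w for w in src_tokens]
    new_tgt = [new_t if w == old_t else w for w in tgt_tokens]
    return new_src, new_tgt

# toy usage with a hypothetical two-entry lexicon
pair = pseudo_parallel(["i", "see", "a", "dog"],
                       ["yo", "veo", "un", "perro"],
                       [("dog", "perro"), ("cat", "gato")])
# -> (['i', 'see', 'a', 'cat'], ['yo', 'veo', 'un', 'gato'])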
@@ -15237,14 +15237,14 @@ <fixed-case>CMMLU</fixed-case>: Measuring massive multitask language understanding in <fixed-case>C</fixed-case>hinese - HaonanLi + HaonanLi YixuanZhang FajriKotoMohamed bin Zayed University of Artificial Intelligence - YifeiYang + YifeiYang HaiZhaoShanghai Jiao Tong University YeyunGong NanDuanMicrosoft Research Asia - TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne + TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne 11260-11285 As the capabilities of large language models (LLMs) continue to advance, evaluating their performance is becoming more important and more challenging. This paper aims to address this issue for Mandarin Chinese in the form of CMMLU, a comprehensive Chinese benchmark that covers various subjects, including natural sciences, social sciences, engineering, and the humanities. We conduct a thorough evaluation of more than 20 contemporary multilingual and Chinese LLMs, assessing their performance across different subjects and settings. The results reveal that most existing LLMs struggle to achieve an accuracy of even 60%, which is the pass mark for Chinese exams. This highlights that there is substantial room for improvement in the capabilities of LLMs. Additionally, we conduct extensive experiments to identify factors impacting the models’ performance and propose directions for enhancing LLMs. CMMLU fills the gap in evaluating the knowledge and reasoning capabilities of large language models for Chinese. 2024.findings-acl.671 @@ -15270,8 +15270,8 @@ WenjieWangNational University of Singapore MoxinLi JunrongGuoUniversity of Science and Technology of China - YangZhang - FuliFengUniversity of Science and Technology of China + YangZhang + FuliFengUniversity of Science and Technology of China 11316-11360 The rapid advancement of Large Language Models (LLMs) in the realm of mathematical reasoning necessitates comprehensive evaluations to gauge progress and inspire future directions. Existing assessments predominantly focus on problem-solving from the examinee perspective, overlooking a dual perspective of examiner regarding error identification and correction. From the examiner perspective, we define four evaluation tasks for error identification and correction along with a new dataset with annotated error types and steps. We also design diverse prompts to thoroughly evaluate eleven representative LLMs. Our principal findings indicate that GPT-4 outperforms all models, while open-source model LLaMA-2-7B demonstrates comparable abilities to closed-source models GPT-3.5 and Gemini Pro. Notably, calculation error proves the most challenging error type. Moreover, prompting LLMs with the error types can improve the average correction accuracy by 47.9%. These results reveal potential directions for developing the mathematical reasoning abilities of LLMs. Our code and dataset are available on https://github.com/LittleCirc1e/EIC. 2024.findings-acl.673 @@ -15281,7 +15281,7 @@ Less is <fixed-case>KEN</fixed-case>: a Universal and Simple Non-Parametric Pruning Algorithm for Large Language Models MicheleMastromatteiCampus Bio-Medico University of Rome - Fabio MassimoZanzottoUniversity of Rome Tor Vergata + Fabio MassimoZanzottoUniversity of Rome Tor Vergata 11361-11374 2024.findings-acl.674 mastromattei-zanzotto-2024-less @@ -15289,8 +15289,8 @@ When Do <fixed-case>LLM</fixed-case>s Need Retrieval Augmentation?
Mitigating <fixed-case>LLM</fixed-case>s’ Overconfidence Helps Retrieval Augmentation - ShiyuNiInstitute of Computing Technology, Chinese Academy of Sciences - KepingBiChinese Academy of Sciences + ShiyuNiInstitute of Computing Technology, Chinese Academy of Sciences + KepingBiChinese Academy of Sciences JiafengGuoInstitute of Computing Technology, Chinese Academy of Sciences XueqiCheng, Chinese Academy of Sciences 11375-11388 @@ -15324,7 +15324,7 @@ JingangWangMeituan XunliangCai DongyanZhaoPeking University - RuiYanRenmin University of China + RuiYanRenmin University of China 11404-11415 Speculative decoding has emerged as a promising technique to accelerate the inference of Large Language Models (LLMs) by employing a small language model to draft a hypothesis sequence, which is then validated by the LLM. The effectiveness of this approach heavily relies on the balance between performance and efficiency of the draft model. In our research, we focus on enhancing the proportion of draft tokens that are accepted to the final output by generating multiple hypotheses instead of just one. This allows the LLM more options to choose from and select the longest sequence that meets its standards. Our analysis reveals that hypotheses produced by the draft model share many common token sequences, suggesting a potential for optimizing computation. Leveraging this observation, we introduce an innovative approach utilizing a directed acyclic graph (DAG) to manage the drafted hypotheses. This structure enables us to efficiently predict and merge recurring token sequences, vastly reducing the computational demands of the draft model. We term this approach Graph-structured Speculative Decoding (GSD). We apply GSD across a range of LLMs, including a 70-billion parameter LLaMA-2 model, and observe a remarkable speedup of 1.70× to 1.94×, significantly surpassing standard speculative decoding. 2024.findings-acl.677 @@ -15335,7 +15335,7 @@ Duwak: Dual Watermarks in Large Language Models ChaoyiZhu JeroenGaljaard - Pin-YuChenInternational Business Machines + Pin-YuChenInternational Business Machines LydiaChenDelft University of Technology 11416-11436 As large language models (LLMs) are increasingly used for text generation tasks, it is critical to audit their usages, govern their applications, and mitigate their potential harms. Existing watermark techniques are shown effective in embedding single human-imperceptible and machine-detectable patterns without significantly affecting generated text quality and semantics. However, the efficiency in detecting watermarks, i.e., the minimum number of tokens required to assert detection with significance and robustness against post-editing, is still debatable. In this paper, we propose Duwak to fundamentally enhance the efficiency and quality of watermarking by embedding dual secret patterns in both token probability distribution and sampling schemes. To mitigate expression degradation caused by biasing toward certain tokens, we design a contrastive search to watermark the sampling scheme, which minimizes the token repetition and enhances the diversity. We theoretically explain the interdependency of the two watermarks within Duwak. We evaluate Duwak extensively on Llama2 and Vicuna under various post-editing attacks, against four state-of-the-art watermarking techniques and combinations of them.
Our results show that Duwak marked text achieves the highest watermarked text quality at the lowest required token count for detection, up to 70% fewer tokens than existing approaches, especially under post-paraphrasing. @@ -15346,9 +15346,9 @@ <fixed-case>C</fixed-case>ode<fixed-case>A</fixed-case>ttack: Revealing Safety Generalization Challenges of Large Language Models via Code Completion QibingRenShanghai Jiaotong University - ChangGao + ChangGao JingShaoShanghai AI Laboratory - JunchiYanShanghai Jiao Tong University + JunchiYanShanghai Jiao Tong University XinTanEast China Normal University WaiLamThe Chinese University of Hong Kong LizhuangMaDept. of Computer Sci. & Eng., Shanghai Jiao Tong University @@ -15362,10 +15362,10 @@ Mitigating Reversal Curse in Large Language Models via Semantic-aware Permutation Training QingyanGuo RuiWangMicrosoft - JunliangGuoMicrosoft - XuTan - JiangBianMicrosoft - YujiuYangGraduate School at Shenzhen,Tsinghua University + JunliangGuoMicrosoft + XuTan + JiangBianMicrosoft + YujiuYangGraduate School at Shenzhen,Tsinghua University 11453-11464 While large language models (LLMs) have achieved impressive performance across diverse tasks, recent studies showcase that causal LLMs suffer from the “reversal curse”. A typical example is that the model knows “A’s father is B” but is unable to reason that “B’s child is A”. This limitation poses a challenge to the advancement of artificial general intelligence (AGI), as it suggests a gap in the models’ ability to comprehend and apply bidirectional reasoning. In this paper, we first conduct substantial evaluation and identify that the root cause of the reversal curse lies in the different word order between the training and inference stage, namely, the poor ability of causal language models to predict antecedent words within the training data. Accordingly, permutation on the training data is considered as a potential solution, since this can make the model predict antecedent words or tokens. However, previous permutation methods may disrupt complete phrases or entities, thereby posing challenges for the model to comprehend and learn from training data. To address this issue, we propose Semantic-aware Permutation Training (SPT), which segments the training sentences into semantic units (i.e., entities or phrases) with an assistant language model and permutes these units before feeding them into the model. Extensive experiments demonstrate that SPT effectively mitigates the reversal curse since the performance on reversed questions approximates that on the forward ones, and significantly advances the performance of existing works. 2024.findings-acl.680 @@ -15401,11 +15401,11 @@ <fixed-case>TRAP</fixed-case>: Targeted Random Adversarial Prompt Honeypot for Black-Box Identification - MartinGubriParameter Lab + MartinGubriParameter Lab DennisUlmer - HwaranLeeNAVER AI Lab + HwaranLeeNAVER AI Lab SangdooYunNAVER - Seong JoonOhParameter Lab and Eberhard-Karls-Universität Tübingen + Seong JoonOhParameter Lab and Eberhard-Karls-Universität Tübingen 11496-11517 Large Language Model (LLM) services and models often come with legal rules on *who* can use them and *how* they must use them. Assessing the compliance of the released LLMs is crucial, as these rules protect the interests of the LLM contributor and prevent misuse. In this context, we describe the novel fingerprinting problem of Black-box Identity Verification (BBIV).
The goal is to determine whether a third-party application uses a certain LLM through its chat function. We propose a method called Targeted Random Adversarial Prompt (TRAP) that identifies the specific LLM in use. We repurpose adversarial suffixes, originally proposed for jailbreaking, to get a pre-defined answer from the target LLM, while other models give random answers. TRAP detects the target LLMs with over 95% true positive rate at under 0.2% false positive rate even after a single interaction. TRAP remains effective even if the LLM has minor changes that do not significantly alter the original function. 2024.findings-acl.683 @@ -15442,7 +15442,7 @@ SitipornSae Lim CanUdomcharoenchaikitVidyasirimedhi Institute of Science and Technology (VISTEC) PeeratLimkonchotiwat - EkapolChuangsuwanichChulalongkorn University + EkapolChuangsuwanichChulalongkorn University SaranaNutanong 11548-11563 NLU models have achieved promising results on standard benchmarks. Despite state-of-the-art accuracy, analysis reveals that many models make predictions using annotation bias rather than the properties we intend the model to learn. Consequently, these models perform poorly on out-of-distribution datasets. Recent advances in bias mitigation show that annotation bias can be alleviated through fine-tuning debiasing objectives. In this paper, we apply causal mediation analysis to gauge how much each model component mediates annotation biases. Using the knowledge from the causal analysis, we improve the model’s robustness against annotation bias through two bias mitigation methods: causal-grounded masking and gradient unlearning. Causal analysis reveals that biases are concentrated in specific components, even after employing other training-time debiasing techniques. Manipulating these components, either by masking out neurons’ activations or by updating specific weight blocks, demonstrably improves robustness against annotation artifacts. @@ -15452,8 +15452,8 @@ Perturbed examples reveal invariances shared by language models - RuchitRawalMPI-SWS - MariyaTonevaMax Planck Institute for Software Systems + RuchitRawalMPI-SWS + MariyaTonevaMax Planck Institute for Software Systems 11564-11584 The rapid growth in natural language processing (NLP) research has led to numerous new models, outpacing our understanding of how they compare to established ones. One major reason for this difficulty is saturating benchmarks, which may not well reflect differences in model performance in the wild. In this work, we introduce a novel framework to compare two NLP models by revealing their shared invariance to interpretable input perturbations targeting a specific linguistic capability. Via experiments on models from the same and different architecture families, this framework offers insights about how changes in models (e.g., distillation, size increase) affect linguistic capabilities. Furthermore, our framework enables evaluation of invariances between commercial black-box models (e.g., InstructGPT family) and models that are better understood (e.g., GPT-2). Across experiments, we observe that large language models share many invariances encoded by models of various sizes, whereas the invariances by large models are only shared by other large models. Possessing a wide variety of invariances may be key to the recent successes of large language models, and our framework can shed light on the types of invariances retained or emerging in new models. We make the code publicly available.
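The shared-invariance comparison described above can be approximated with a small amount of glue code. The sketch below is an illustrative proxy, not the paper's metric: embed_a and embed_b stand for any sentence-embedding callables returning numpy vectors, perturb for a perturbation targeting one linguistic capability, and the cosine threshold tau is an arbitrary choice.

import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

def shared_invariance(embed_a, embed_b, texts, perturb, tau=0.9):
    """Fraction of inputs whose perturbation BOTH models treat as
    equivalent to the original (cosine similarity above tau)."""
    inv_a = inv_b = inv_both = 0
    for text in texts:
        perturbed = perturb(text)
        a_ok = cosine(embed_a(text), embed_a(perturbed)) >= tau
        b_ok = cosine(embed_b(text), embed_b(perturbed)) >= tau
        inv_a += a_ok                    # bools count as 0/1
        inv_b += b_ok
        inv_both += a_ok and b_ok
    n = len(texts)
    return {"model_a": inv_a / n, "model_b": inv_b / n, "shared": inv_both / n}

# e.g., a simple typo perturbation probing spelling robustness:
typo = lambda s: s.replace("their", "thier")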
2024.findings-acl.687 @@ -15477,11 +15477,11 @@ Discourse Structure-Aware Prefix for Generation-Based End-to-End Argumentation Mining - YangSun + YangSun GuanrongChen - CaihuaYang + CaihuaYang JianzhuBaoHarbin Institute of Technology - BinLiang + BinLiang XiZeng MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences RuifengXuHarbin Institute of Technology @@ -15493,10 +15493,10 @@ Poor-Supervised Evaluation for <fixed-case>S</fixed-case>uper<fixed-case>LLM</fixed-case> via Mutual Consistency - PeiwenYuan + PeiwenYuan ShaoxiongFeng YiweiLi - XinglinWang + XinglinWang BoyuanPan HedaWang YaoHu @@ -15510,9 +15510,9 @@ Addressing Entity Translation Problem via Translation Difficulty and Context Diversity TianLiang - XingWangTencent AI Lab - MingmingYangTencent AI Lab - YujiuYangGraduate School at Shenzhen,Tsinghua University + XingWangTencent AI Lab + MingmingYangTencent AI Lab + YujiuYangGraduate School at Shenzhen,Tsinghua University ShumingShiTencent AI Lab ZhaopengTuTencent AI Lab 11628-11638 @@ -15541,8 +15541,8 @@ YijinLiuWechat AI XianfengZeng ChenzeShaoTencent Inc - FandongMengWeChat AI, Tencent Inc. - JieZhou + FandongMengWeChat AI, Tencent Inc. + JieZhou 11652-11663 Large language models (LLMs) are capable of performing conditional sequence generation tasks, such as translation or summarization, through instruction fine-tuning. The fine-tuning data is generally sequentially concatenated from a specific task instruction, an input sentence, and the corresponding response. Considering the locality modeled by the self-attention mechanism of LLMs, these models face the risk of instruction forgetting when generating responses for long input sentences. To mitigate this issue, we propose enhancing the instruction-following capability of LLMs by shifting the position of task instructions after the input sentences. Theoretical analysis suggests that our straightforward method can alter the model’s learning focus, thereby emphasizing the training of instruction-following capabilities. Concurrently, experimental results demonstrate that our approach consistently outperforms traditional settings across various model scales (1B / 7B / 13B) and different sequence generation tasks (translation and summarization), without any additional data or annotation costs. Notably, our method significantly improves the zero-shot performance on conditional sequence generation, e.g., up to 9.7 BLEU points on WMT zero-shot translation tasks. Further analysis reveals that our method can significantly improve the traditional model’s instruction-following ability by 1x over the traditional approach. 2024.findings-acl.693 @@ -15552,11 +15552,11 @@ <fixed-case>XM</fixed-case>o<fixed-case>E</fixed-case>: Sparse Models with Fine-grained and Adaptive Expert Selection YuanhangYang - ShiyiQi - WenchaoGuTechnische Universität München + ShiyiQi + WenchaoGuTechnische Universität München ChaozhengWang CuiyunGaoHarbin Institute of Technology - ZenglinXuFudan University + ZenglinXuFudan University 11664-11674 Sparse models, including sparse Mixture-of-Experts (MoE) models, have emerged as an effective approach for scaling Transformer models. However, they often suffer from computational inefficiency since a significant number of parameters are unnecessarily involved in computations by multiplying values by zero or low activation values. To address this issue, we present XMoE, a novel MoE designed to enhance both the efficacy and efficiency of sparse MoE models.
XMoE leverages small experts and a threshold-based router to enable tokens to selectively engage only essential parameters. Our extensive experiments on language modeling and machine translation tasks demonstrate that XMoE enhances model performance and can decrease the computation load at MoE layers by over 50% without sacrificing performance. Furthermore, we present the versatility of XMoE by applying it to dense models, enabling sparse computation during inference. We provide a comprehensive analysis and make our code available at https://anonymous.4open.science/r/XMoE. 2024.findings-acl.694 @@ -15567,8 +15567,8 @@ <fixed-case>B</fixed-case>ranch<fixed-case>N</fixed-case>orm: Robustly Scaling Extremely Deep Transformers YijinLiuWechat AI XianfengZeng - FandongMengWeChat AI, Tencent Inc. - JieZhou + FandongMengWeChat AI, Tencent Inc. + JieZhou 11675-11687 Recently, DeepNorm scales Transformers into extremely deep (i.e., 1000 layers) and reveals the promising potential of deep scaling. To stabilize the training of deep models, DeepNorm attempts to constrain the model update to a constant value. Although applying such a constraint can benefit the early stage of model training, it may lead to undertrained models during the whole training procedure. In this paper, we propose BranchNorm, which dynamically rescales the non-residual branch of Transformer in accordance with the training period. BranchNorm not only theoretically stabilizes the training with smooth gradient norms at the early stage, but also encourages better convergence in the subsequent training stage. Experimental results on multiple translation tasks demonstrate that BranchNorm achieves a better trade-off between training stability and convergence performance. 2024.findings-acl.695 @@ -15577,8 +15577,8 @@ <fixed-case>M</fixed-case>us<fixed-case>TQ</fixed-case>: A Temporal Knowledge Graph Question Answering Dataset for Multi-Step Temporal Reasoning - TingyiZhang - JiaanWangSoochow University + TingyiZhang + JiaanWangSoochow University ZhixuLi JianfengQuSoochow University AnLiuSuzhou University @@ -15593,10 +15593,10 @@ Deal, or no deal (or who knows)? Forecasting Uncertainty in Conversations using Large Language Models AnthonySiciliaNortheastern University - HyunwooKimAllen Institute for Artificial Intelligence + HyunwooKimAllen Institute for Artificial Intelligence KhyathiChandu MaliheAlikhaniNortheastern University - JackHesselSamaya AI + JackHesselSamaya AI 11700-11726 Effective interlocutors account for the uncertain goals, beliefs, and emotions of others. But even the best human conversationalist cannot perfectly anticipate the trajectory of a dialogue. How well can language models represent inherent uncertainty in conversations? We propose FortUne Dial, an expansion of the long-standing “conversation forecasting” task: instead of just accuracy, evaluation is conducted with uncertainty-aware metrics, effectively enabling abstention on individual instances. We study two ways in which language models potentially represent outcome uncertainty (internally, using scores and directly, using tokens) and propose fine-tuning strategies to improve calibration of both representations. Experiments on eight difficult negotiation corpora demonstrate that our proposed fine-tuning strategies (a traditional supervision strategy and an off-policy reinforcement learning strategy) can calibrate smaller open-source models to compete with pre-trained models 10x their size.
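Uncertainty-aware evaluation with per-instance abstention, as in the FortUne Dial abstract above, reduces to selective prediction. A minimal sketch, assuming the model exposes a scalar confidence per forecast (the threshold value is arbitrary, not from the paper):

def selective_accuracy(preds, labels, confidences, threshold=0.7):
    """Accuracy over the instances the model answers (confidence >= threshold),
    reported together with coverage, the fraction it answers at all."""
    answered = [(p, y) for p, y, c in zip(preds, labels, confidences) if c >= threshold]
    if not answered:
        return {"coverage": 0.0, "selective_accuracy": None}
    correct = sum(p == y for p, y in answered)
    return {"coverage": len(answered) / len(preds),
            "selective_accuracy": correct / len(answered)}

print(selective_accuracy(["deal", "no deal"], ["deal", "deal"], [0.9, 0.4]))
# -> {'coverage': 0.5, 'selective_accuracy': 1.0}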
2024.findings-acl.697 @@ -15612,7 +15612,7 @@ ShuyangYu YifeiGuo Sim KuanGohXiamen University Malaysia - Ho-KinTangHarbin Institute of Technology + Ho-KinTangHarbin Institute of Technology 11727-11742 Fine-tuning pre-trained language models, particularly large language models, demands extensive computing resources and can result in varying performance outcomes across different domains and datasets. This paper examines the approach of integrating multiple models from diverse training scenarios into a unified model. This unified model excels across various data domains and exhibits the ability to generalize well on out-of-domain data. We propose a knowledge fusion method named Evolver, inspired by evolutionary algorithms, which does not need further training or additional training data. Specifically, our method involves aggregating the weights of different language models into a population and subsequently generating offspring models through mutation and crossover operations. These offspring models are then evaluated against their parents, allowing for the preservation of those models that show enhanced performance on development datasets. Importantly, our model evolving strategy can be seamlessly integrated with existing model merging frameworks, offering a versatile tool for model enhancement. Experimental results on mainstream language models (i.e., encoder-only, decoder-only, encoder-decoder) reveal that Evolver outperforms previous state-of-the-art models by large margins. 2024.findings-acl.698 @@ -15622,10 +15622,10 @@ <fixed-case>S</fixed-case>ca<fixed-case>L</fixed-case>earn: Simple and Highly Parameter-Efficient Task Transfer by Learning to Scale MarkusFrohmannJohannes Kepler Universität Linz - CarolinHoltermannUniversität Hamburg - ShahedMasoudian + CarolinHoltermannUniversität Hamburg + ShahedMasoudian AnneLauscherUniversität Hamburg - NavidRekabsazThomson Reuters + NavidRekabsazThomson Reuters 11743-11776 Multi-task learning (MTL) has shown considerable practical benefits, particularly when using language models (LMs). While this is commonly achieved by learning tasks under a joint optimization procedure, some methods, such as AdapterFusion, divide the problem into two stages: (i) task learning, where knowledge specific to a task is encapsulated within sets of parameters (e.g., adapters), and (ii) transfer, where this already learned knowledge is leveraged for a target task. This separation of concerns provides numerous benefits (e.g., promoting reusability). However, current two stage MTL introduces a substantial number of additional parameters. We address this issue by leveraging the usefulness of linearly scaling the output representations of source adapters for transfer learning. We introduce ScaLearn, a simple and highly parameter-efficient two-stage MTL method that capitalizes on the knowledge of the source tasks by learning a minimal set of scaling parameters that enable effective transfer to a target task. Our experiments on three benchmarks (GLUE, SuperGLUE, and HumSet) and two encoder LMs show that ScaLearn consistently outperforms strong baselines with a small number of transfer parameters (~0.35% of those of AdapterFusion). Remarkably, we observe that ScaLearn maintains its strong abilities even when further reducing parameters, achieving competitive results with only 8 transfer parameters per target task. Our proposed approach thus demonstrates the power of simple scaling as a promise for more efficient task transfer. 
Our code is available at https://github.com/CPJKU/ScaLearn. 2024.findings-acl.699 @@ -15646,15 +15646,15 @@ <fixed-case>M</fixed-case>at<fixed-case>P</fixed-case>lot<fixed-case>A</fixed-case>gent: Method and Evaluation for <fixed-case>LLM</fixed-case>-Based Agentic Scientific Data Visualization ZhiyuYang ZihanZhouXiamen University - ShuoWang + ShuoWang XinCong XuHanTsinghua University, Tsinghua University YukunYan ZhenghaoLiuNortheastern University - ZhixingTanZhongguancun Laboratory + ZhixingTanZhongguancun Laboratory PengyuanLiuBeijing Language and Culture University DongYu - ZhiyuanLiuTsinghua University + ZhiyuanLiuTsinghua University XiaodongShiXiamen University, Tsinghua University MaosongSun 11789-11804 @@ -15688,7 +15688,7 @@ TingtingCui XiaoqingChengZhengzhou University LiutaoLiutao - DeyiXiongTianjin University + DeyiXiongTianjin University 11817-11837 What would a large language model (LLM) respond in an ethically relevant context? In this paper, we curate a large benchmark CMoralEval for morality evaluation of Chinese LLMs. The data sources of CMoralEval are two-fold: 1) a Chinese TV program discussing Chinese moral norms with stories from society and 2) a collection of Chinese moral anomies from various newspapers and academic papers on morality. With these sources, we aim to create a moral evaluation dataset characterized by diversity and authenticity. We develop a morality taxonomy and a set of fundamental moral principles that are not only rooted in traditional Chinese culture but also consistent with contemporary societal norms. To facilitate efficient construction and annotation of instances in CMoralEval, we establish a platform with AI-assisted instance generation to streamline the annotation process. These help us curate CMoralEval, which encompasses both explicit moral scenarios (14,964 instances) and moral dilemma scenarios (15,424 instances), each with instances from different data sources. We conduct extensive experiments with CMoralEval to examine a variety of Chinese LLMs. Experimental results demonstrate that CMoralEval is a challenging benchmark for Chinese LLMs. 2024.findings-acl.703 @@ -15709,8 +15709,8 @@ Investigating the Impact of Model Instability on Explanations and Uncertainty - SaraMarjanovic - IsabelleAugensteinUniversity of Copenhagen + SaraMarjanovic + IsabelleAugensteinUniversity of Copenhagen ChristinaLiomaUniversity of Copenhagen 11854-11879 Explainable AI methods facilitate the understanding of model behaviour, yet small, imperceptible perturbations to inputs can vastly distort explanations. As these explanations are typically evaluated holistically, before model deployment, it is difficult to assess when a particular explanation is trustworthy. Some studies have tried to create confidence estimators for explanations, but none have investigated an existing link between uncertainty and explanation quality. We artificially simulate epistemic uncertainty in text input by introducing noise at inference time. In this large-scale empirical study, we insert different levels of noise perturbations and measure the effect on the output of pre-trained language models and different uncertainty metrics. Realistic perturbations have minimal effect on performance and explanations, yet masking has a drastic effect. We find that high uncertainty doesn’t necessarily imply low explanation plausibility; the correlation between the two metrics can be moderately positive when noise is exposed during the training process.
This suggests that noise-augmented models may be better at identifying salient tokens when uncertain. Furthermore, when predictive and epistemic uncertainty measures are over-confident, the robustness of a saliency map to perturbation can indicate model stability issues. Integrated Gradients shows the overall greatest robustness to perturbation, while still showing model-specific patterns in performance; however, this phenomenon is limited to smaller Transformer-based language models. @@ -15738,9 +15738,9 @@ MicheleMarchi IreneMondella HuiyuanLaiUniversity of Groningen - FeliceDell’OrlettaIstituto di Linguistica Computazionale “A. Zampolli” (ILC) + FeliceDell’OrlettaIstituto di Linguistica Computazionale “A. Zampolli” (ILC) MalvinaNissimUniversity of Groningen - MarcoGueriniFondazione Bruno Kessler + MarcoGueriniFondazione Bruno Kessler 11892-11907 Automatic methods for generating and gathering linguistic data have proven effective for fine-tuning Language Models (LMs) in languages less resourced than English. Still, while there has been emphasis on data quantity, less attention has been given to its quality. In this work, we investigate the impact of human intervention on machine-generated data when fine-tuning dialogical models. In particular, we study (1) whether post-edited dialogues exhibit higher perceived quality compared to the originals that were automatically generated; (2) whether fine-tuning with post-edited dialogues results in noticeable differences in the generated outputs; and (3) whether post-edited dialogues influence the outcomes when considering the parameter size of the LMs. To this end, we created HED-IT, a large-scale dataset where machine-generated dialogues are paired with the version post-edited by humans. Using both the edited and unedited portions of HED-IT, we fine-tuned three different sizes of an LM. Results from both human and automatic evaluation show that the different quality of training data is clearly perceived and also has an impact on the models trained on such data. Additionally, our findings indicate that larger models are less sensitive to data quality, whereas this has a crucial impact on smaller models. These results enhance our comprehension of the impact of human intervention on training data in the development of high-quality LMs. 2024.findings-acl.707 @@ -15773,7 +15773,7 @@ SuwonShonASAPP Hung-yiLeeNational Taiwan University KarenLivescuToyota Technological Institute at Chicago - ShinjiWatanabeCarnegie Mellon University + ShinjiWatanabeCarnegie Mellon University 11923-11938 The Spoken Language Understanding Evaluation (SLUE) suite of benchmark tasks was recently introduced to address the need for open resources and benchmarking of complex spoken language understanding (SLU) tasks, including both classification and sequence generation tasks, on natural speech. The benchmark has demonstrated preliminary success in using pre-trained speech foundation models (SFM) for these SLU tasks. However, the community still lacks a fine-grained understanding of the comparative utility of different SFMs. Inspired by this, we ask: which SFMs offer the most benefits for these complex SLU tasks, and what is the most effective approach for incorporating these SFMs? To answer this, we perform an extensive evaluation of multiple supervised and self-supervised SFMs using several evaluation protocols: (i) frozen SFMs with a lightweight prediction head, (ii) frozen SFMs with a complex prediction head, and (iii) fine-tuned SFMs with a lightweight prediction head.
Although the supervised SFMs are pre-trained on much more speech recognition data (with labels), they do not always outperform self-supervised SFMs; the latter tend to perform at least as well as, and sometimes better than, supervised SFMs, especially on the sequence generation tasks in SLUE. While there is no universally optimal way of incorporating SFMs, the complex prediction head gives the best performance for most tasks, although it increases the inference time. We also introduce an open-source toolkit and performance leaderboard, SLUE-PERB, for these tasks and modeling strategies. 2024.findings-acl.709 @@ -15784,8 +15784,8 @@ Towards Multiple References Era – Addressing Data Leakage and Limited Reference Diversity in Machine Translation Evaluation XianfengZeng YijinLiuWechat AI - FandongMengWeChat AI, Tencent Inc. - JieZhou + FandongMengWeChat AI, Tencent Inc. + JieZhou 11939-11951 Recent research has shown a weak correlation between n-gram-based metrics and human evaluations in machine translation task, particularly when evaluating large language models (LLMs). Additionally, the data leakage risk in LLMs may cause an overestimation problem when evaluating LLMs on downstream tasks. In this work, we identify the limited diversity of references as the primary cause for the inferior performance of n-gram-based metrics and the overestimation problem. To address this issue, we propose to utilize multiple references generated by LLMs, coupled with an effective selection strategy focused on accuracy and diversity, to improve the alignment between automatic metrics and human evaluations. We validate our approach on the WMT22 Metrics benchmark with 4 languages and observe a maximum accuracy gain of 9.5% in F200spBLEU, which makes it on par with computationally expensive neural-based metrics. We also show that using multi-reference with n-gram-based metrics significantly alleviates the overestimation problem when evaluating LLMs with data leakage. Further analysis explores the factors that affect the quality of generated references, offering insights into data synthesis by LLMs. 
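Scoring one system output against several references, as proposed in the multi-reference abstract above, is directly supported by standard n-gram metric tooling. A minimal sketch, assuming the sacrebleu package (the sentences and the number of reference streams are illustrative; the paper's accuracy- and diversity-based selection of LLM-generated references is not shown):

import sacrebleu

hyps = ["the cat sat on the mat"]
# one aligned list per reference stream; extra streams would come from
# LLM-generated references after the selection step
refs = [["the cat sat on the mat"],
        ["a cat was sitting on the mat"]]
print(sacrebleu.corpus_bleu(hyps, refs).score)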
2024.findings-acl.710 @@ -15799,8 +15799,8 @@ Øistein E.AndersenComputer Laboratory ShivaTaslimipoorUniversity of Cambridge HelenYannakoudakisComputer Laboratory, University of Cambridge and King’s College London - ZhengYuanKing’s College London, University of London - ChristopherBryantComputer Laboratory + ZhengYuanKing’s College London, University of London + ChristopherBryantComputer Laboratory MarekReiImperial College London PaulaButteryUniversity of Cambridge 11952-11967 @@ -15811,8 +15811,8 @@ <fixed-case>BATS</fixed-case>: <fixed-case>B</fixed-case>enchm<fixed-case>A</fixed-case>rking Text Simplicity 🦇 - ChristinKreutzTechnische Hochschule Mittelhessen - FabianHaakFachhochschule Köln + ChristinKreutzTechnische Hochschule Mittelhessen + FabianHaakFachhochschule Köln BjörnEngelmann PhilippSchaerTH Köln - University of Applied Sciences 11968-11989 @@ -15824,11 +15824,11 @@ <fixed-case>A</fixed-case>ustro<fixed-case>T</fixed-case>ox: A Dataset for Target-Based <fixed-case>A</fixed-case>ustrian <fixed-case>G</fixed-case>erman Offensive Language Detection PiaPachingerTechnische Universität Wien - JanisGoldzycher + JanisGoldzycher AnnaPlanitzer WojciechKusaAllegro - AllanHanburyComplexity Science Hub and Technische Universität Wien - JuliaNeidhardtTechnische Universität Wien + AllanHanburyComplexity Science Hub and Technische Universität Wien + JuliaNeidhardtTechnische Universität Wien 11990-12001 Model interpretability in toxicity detection greatly profits from token-level annotations. However, currently, such annotations are only available in English. We introduce a dataset annotated for offensive language detection sourced from a news forum, notable for its incorporation of the Austrian German dialect, comprising 4,562 user comments. In addition to binary offensiveness classification, we identify spans within each comment constituting vulgar language or representing targets of offensive statements. We evaluate fine-tuned Transformer models as well as large language models in a zero- and few-shot fashion. The results indicate that while fine-tuned models excel in detecting linguistic peculiarities such as vulgar dialect, large language models demonstrate superior performance in detecting offensiveness in AustroTox. 2024.findings-acl.713 @@ -15838,7 +15838,7 @@ Discovering influential text using convolutional neural networks MeganAyers - LukeSanford + LukeSanford MargaretRobertsUniversity of California, San Diego EddieYangUniversity of California, San Diego 12002-12027 @@ -15849,13 +15849,13 @@ <fixed-case>LC</fixed-case>4<fixed-case>EE</fixed-case>: <fixed-case>LLM</fixed-case>s as Good Corrector for Event Extraction - MengnaZhu - KaishengZeng - JibingWuJibingWu - LihuaLiuNational University of Defense Technology + MengnaZhu + KaishengZeng + JibingWuJibingWu + LihuaLiuNational University of Defense Technology HongbinHuangNational University of Defense Technology - LeiHouTsinghua University, Tsinghua University - JuanziLi + LeiHouTsinghua University, Tsinghua University + JuanziLi 12028-12038 Event extraction (EE) is a critical task in natural language processing, yet deploying a practical EE system remains challenging. On one hand, powerful large language models (LLMs) currently show poor performance because the EE task is more complex than other tasks. On the other hand, state-of-the-art (SOTA) small language models (SLMs) for EE tasks are typically developed through fine-tuning, lack flexibility, and have considerable room for improvement.
We propose an approach, **L**LMs-as-**C**orrector for **E**vent **E**xtraction (**LC4EE**), aiming to leverage the superior extraction capability of SLMs and the instruction-following ability of LLMs to construct a robust and highly available EE system. By utilizing LLMs to identify and correct errors in SLMs’ predictions based on automatically generated feedback information, EE performances can be improved significantly. Experimental results on the representative datasets ACE2005 and MAVEN-Arg for Event Detection (ED) and EE tasks validated the effectiveness of our method. 2024.findings-acl.715 @@ -15867,7 +15867,7 @@ YihongDongPeking University XueJiangPeking University HuanyuLiu - ZhiJinPeking University and Peking University + ZhiJinPeking University and Peking University BinGuBeijing Institute of Control Engineering MengfeiYangChina Academy of Space Technology GeLiPeking University Shenzhen Graduate School @@ -15890,9 +15890,9 @@ <fixed-case>A</fixed-case>ncient <fixed-case>C</fixed-case>hinese Glyph Identification Powered by Radical Semantics YangChiJilin University - FaustoGiunchiglia + FaustoGiunchiglia ChuntaoLiJilin University - HaoXuJilin University + HaoXuJilin University 12065-12074 The ancestors of Chinese characters – the ancient characters from about 1300 BC to 200 BC – are not fixed in their writing glyphs. At the same or different points in time, one character can possess multiple glyphs that are different in shape or radicals. Nearly half of ancient glyphs have not been deciphered yet. This paper proposes an innovative task of ancient Chinese glyph identification, which aims at inferring the Chinese character label for the unknown ancient Chinese glyphs which are not in the training set based on the image and radical information. Specifically, we construct a Chinese glyph knowledge graph (CGKG) associating glyphs in different historical periods according to the radical semantics, and propose a multimodal Chinese glyph identification framework (MCGI) fusing the visual, textual, and the graph data. The experiment is designed on a real Chinese glyph dataset spanning over 1000 years; it demonstrates the effectiveness of our method and reports the potential of each modality on this task. It provides a preliminary reference for the automatic ancient Chinese character deciphering at the glyph level. 2024.findings-acl.718 @@ -15904,7 +15904,7 @@ SettaluriSravanthiIndian Institute of Technology Bombay, Indian Institute of Technology, Bombay MeetDoshi PavanTankala - RudraMurthyIBM India Ltd + RudraMurthyIBM India Ltd RajDabreNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology PushpakBhattacharyyaIndian Institute of Technology, Bombay, Dhirubhai Ambani Institute Of Information and Communication Technology 12075-12097 @@ -15915,8 +15915,8 @@ <fixed-case>E</fixed-case>mo<fixed-case>T</fixed-case>rans<fixed-case>KG</fixed-case>: An Innovative Emotion Knowledge Graph to Reveal Emotion Transformation - HuanZhaoHunan University - XupengZhaHunan University + HuanZhaoHunan University + XupengZhaHunan University ZixingZhangHunan University 12098-12110 This paper introduces EmoTransKG, an innovative Emotion Knowledge Graph (EKG) that establishes connections and transformations between emotions across diverse open-textual events.
Compared to existing EKGs, which primarily focus on linking emotion keywords to related terms or on assigning sentiment dimension ratings to emotion words by humans, EmoTransKG aims to represent the general knowledge involved in emotion transformation. Specifically, in conversations, successive emotions expressed by a single speaker are temporally considered as the head and tail entities, with open-text utterances (events) occurring between them representing the relation. To explore the knowledge of emotion transformations described in EmoTransKG, we develop a Transformer-based translational model called EmoTransNet, which predictively trains tail entities by interpreting the relation as an operation that transforms the source emotion into the target emotion. Particularly, our designed EmoTransNet serves as a plug-in module that seamlessly integrates with any conversational emotion recognition (CER) model for emotion retrofitting. Experimental results on two CER datasets demonstrate that the incorporation of EmoTransNet with baseline models results in substantial improvements, and the qualitative visualization of entities and relations clearly clarifies their unique roles in emotion transformations. These experiments confirm the quality and effectiveness of EmoTransKG. @@ -15927,9 +15927,9 @@ How Vocabulary Sharing Facilitates Multilingualism in <fixed-case>LL</fixed-case>a<fixed-case>MA</fixed-case>? FeiYuan - ShuaiYuan + ShuaiYuan ZhiyongWuShanghai Artificial Intelligence Laboratory - LeiLiSchool of Computer Science, Carnegie Mellon University + LeiLiSchool of Computer Science, Carnegie Mellon University 12111-12130 Large Language Models (LLMs) often show strong performance on English tasks, while exhibiting limitations on other languages. What is an LLM’s multilingual capability when it is trained only on certain languages? The underlying mechanism remains unclear. This study endeavors to examine the multilingual capability of LLMs from the vocabulary sharing perspective by conducting an exhaustive analysis across 101 languages. Through the investigation of the performance gap before and after embedding fine-tuning, we discovered four distinct quadrants. By delving into each quadrant, we provide actionable and efficient guidelines for tuning these languages. Extensive experiments reveal that existing LLMs possess multilingual capabilities that surpass our expectations, and we can significantly improve the multilingual performance of LLMs based on these attributes of each quadrant. 2024.findings-acl.721 @@ -15940,9 +15940,9 @@ Prefix Text as a Yarn: Eliciting Non-<fixed-case>E</fixed-case>nglish Alignment in Foundation Language Model RunzheZhanUniversity of Macau XinyiYang - DerekWongUniversity of Macau + DerekWongUniversity of Macau LidiaChao - YueZhangWestlake University + YueZhangWestlake University 12131-12145 While supervised fine-tuning (SFT) has been a straightforward approach for tailoring the output of a foundation large language model (LLM) to specific preferences, concerns have been raised about the depth of this alignment, with some critiques suggesting it is merely “superficial”. We critically examine this hypothesis within the scope of cross-lingual generation tasks, proposing that the effectiveness of SFT may be constrained by its reliance on prior tokens to guide cross-lingual generation.
Based on this crucial insight, and in response to the challenges posed by the costly and limited availability of non-English data for SFT, we introduce a novel training-free alignment method named PreTTY, which employs minimal task-related prior tokens to bridge the foundation LLM and the SFT LLM, achieving comparable performance without training. Experiments on machine translation and part-of-speech tagging across seven languages demonstrate the efficacy of PreTTY in cross-lingual settings. Remarkably, by initiating the decoding process with only one or two prior tokens, foundation LLMs can attain up to 98% of the performance metrics of their SFT counterparts. This method presents a cost-effective alternative to traditional SFT and advances the democratization of multilingual LLMs. 2024.findings-acl.722 @@ -15952,12 +15952,12 @@ Dual Prompt Tuning based Contrastive Learning for Hierarchical Text Classification SishiXiongChina Telecom - YuZhao - JieZhang + YuZhao + JieZhang LiMengxiang ZhongjiangHe XuelongLiNorthwestern Polytechnical University - ShuangyongSong + ShuangyongSong 12146-12158 Hierarchical text classification aims at categorizing texts into a multi-tiered tree-structured hierarchy of labels. Existing methods pay more attention to capturing hierarchy-aware text features by exploiting explicit parent-child relationships, while interactions between peer labels are rarely taken into account, resulting in severe label confusion within each layer. In this work, we propose a novel Dual Prompt Tuning (DPT) method, which emphasizes identifying discrimination among peer labels by performing contrastive learning on each hierarchical layer. We design an innovative hand-crafted prompt containing slots for both positive and negative label predictions to cooperate with contrastive learning. In addition, we introduce a label hierarchy self-sensing auxiliary task to ensure cross-layer label consistency. Extensive experiments demonstrate that DPT achieves significant improvements and outperforms the current state-of-the-art methods on BGC and RCV1-V2 benchmark datasets. 2024.findings-acl.723 @@ -15967,8 +15967,8 @@ Probing the Emergence of Cross-lingual Alignment during <fixed-case>LLM</fixed-case> Training HetongWang - PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh - EdoardoPontiUniversity of Edinburgh + PasqualeMinerviniUniversity of Edinburgh, University of Edinburgh + EdoardoPontiUniversity of Edinburgh 12159-12173 Multilingual Large Language Models (LLMs) achieve remarkable levels of zero-shot cross-lingual transfer performance. We speculate that this is predicated on their ability to align languages without explicit supervision from parallel sentences. While representations of translationally equivalent sentences in different languages are known to be similar after convergence, it remains unclear how such cross-lingual alignment emerges during pre-training of LLMs. Our study leverages intrinsic probing techniques, which identify which subsets of neurons encode linguistic features, to correlate the degree of cross-lingual neuron overlap with the zero-shot cross-lingual transfer performance for a given model. In particular, we rely on checkpoints of BLOOM, a multilingual autoregressive LLM, across different training steps and model scales. We observe a high correlation between neuron overlap and downstream performance, which supports our hypothesis on the conditions leading to effective cross-lingual transfer.
Interestingly, we also detect a degradation of both implicit alignment and multilingual abilities in certain phases of the pre-training process, providing new insights into the multilingual pretraining dynamics. 2024.findings-acl.724 @@ -15979,7 +15979,7 @@ <fixed-case>STSPL</fixed-case>-<fixed-case>SSC</fixed-case>: Semi-Supervised Few-Shot Short Text Clustering with Semantic text similarity Optimized Pseudo-Labels WenhuaNieNational Yang Ming Chiao Tung University LinDeng - Chang-BoLiu + Chang-BoLiu JialingWeiJialingWei RuitongHan HaoranZheng @@ -15997,7 +15997,7 @@ WeiLiuxiaomi JianLuan BinWangAI Lab, Xiaomi Inc. - DeyiXiongTianjin University + DeyiXiongTianjin University 12186-12215 Increasing the number of parameters in large language models (LLMs) usually improves performance in downstream tasks but raises compute and memory costs, making deployment difficult in resource-limited settings. Quantization techniques, which reduce the bits needed for model weights or activations with minimal performance loss, have become popular due to the rise of LLMs. However, most quantization studies use pre-trained LLMs, and the impact of quantization on instruction-tuned LLMs and the relationship between perplexity and benchmark performance of quantized LLMs are not well understood. Evaluation of quantized LLMs is often limited to language modeling and a few classification tasks, leaving their performance on other benchmarks unclear. To address these gaps, we propose a structured evaluation framework consisting of three critical dimensions: (1) knowledge & capacity, (2) alignment, and (3) efficiency, and conduct extensive experiments across ten diverse benchmarks. Our experimental results indicate that LLMs with 4-bit quantization can retain performance comparable to their non-quantized counterparts, and perplexity can serve as a proxy metric for quantized LLMs on most benchmarks. Furthermore, quantized LLMs with larger parameter scales can outperform smaller LLMs. Despite the memory savings achieved through quantization, it can also slow down the inference speed of LLMs. Consequently, substantial engineering efforts and hardware support are imperative to achieve a balanced optimization of decoding speed and memory consumption in the context of quantized LLMs. 
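The 4-bit finding in the quantization study above is easy to build intuition for with a round-trip experiment. A minimal sketch, assuming symmetric per-tensor quantization; production LLM quantizers typically work per-group or per-channel with calibration data, so this only illustrates the mechanics and the size of the rounding error:

import numpy as np

def quantize_4bit(w):
    """Symmetric per-tensor 4-bit round trip: 16 integer levels in [-8, 7]."""
    scale = np.abs(w).max() / 7.0
    q = np.clip(np.round(w / scale), -8, 7)
    return q * scale

rng = np.random.default_rng(0)
w = rng.standard_normal((1024, 1024)).astype(np.float32)
w_hat = quantize_4bit(w)
print("mean absolute round-trip error:", float(np.abs(w - w_hat).mean()))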
2024.findings-acl.726 @@ -16059,10 +16059,10 @@ Decomposing Argumentative Essay Generation via Dialectical Planning of Complex Reasoning YuhangHe JianzhuBaoHarbin Institute of Technology - YangSun - BinLiang + YangSun + BinLiang MinYang - BingQinHarbin Institute of Technology + BingQinHarbin Institute of Technology RuifengXuHarbin Institute of Technology 12305-12322 Argumentative Essay Generation (AEG) is a challenging task in computational argumentation, where detailed logical reasoning and effective rhetorical skills are essential. Previous methods on argument generation typically involve planning prior to generation. However, the planning strategies in these methods overlook the exploration of the logical reasoning process. Inspired by argument structure-related theories, we propose an argumentative planning strategy for prompting large language models (LLMs) to generate high-quality essays. This strategy comprises two stages: (1) Sketch planning, which creates a rough outline of the essay, and (2) Dialectical planning, which refines the outline through critical self-reflection. Such a planning strategy enables LLMs to write argumentative essays that are more logical, diverse, and persuasive. Furthermore, due to the scarcity of existing AEG datasets, we construct three new datasets. These datasets are from two domains: exam essays and news editorials, covering both Chinese and English. Automatic and manual evaluation on four datasets show that our method can generate more dialectical and persuasive essays with higher diversity compared to several strong baselines. @@ -16074,7 +16074,7 @@ Large Language Models are Few-Shot Training Example Generators: A Case Study in Fallacy Recognition TariqAlhindi SmarandaMuresanAmazon and Columbia University - PreslavNakovMohamed bin Zayed University of Artificial Intelligence + PreslavNakovMohamed bin Zayed University of Artificial Intelligence 12323-12334 Recognizing fallacies is crucial for ensuring the quality and validity of arguments across various domains. However, computational fallacy recognition faces challenges due to the diverse genres, domains, and types of fallacies found in datasets. This leads to a highly multi-class, and even multi-label, setup with substantial class imbalance. In this study, we aim to enhance existing models for fallacy recognition by incorporating additional context and by leveraging large language models to generate synthetic data, thus increasing the representation of the infrequent classes. We experiment with GPT3.5 to generate synthetic examples and we examine the impact of prompt settings for this. Moreover, we explore zero-shot and few-shot scenarios to evaluate the effectiveness of using the generated examples for training smaller models within a unified fallacy recognition framework. Furthermore, we analyze the overlap between the synthetic data and existing fallacy datasets. Finally, we investigate the usefulness of providing supplementary context for detecting fallacy types that need such context, e.g., diversion fallacies. Our evaluation results demonstrate consistent improvements across fallacy types, datasets, and generators. The code and the synthetic datasets are all publicly available.
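For the class-imbalance setting in the fallacy-recognition abstract above, the first practical step is deciding how many synthetic examples to request per class. A minimal sketch (the oversample-to-the-largest-class policy is an assumption for illustration, not the paper's exact recipe):

from collections import Counter

def synthesis_budget(labels, target=None):
    """Synthetic examples to request per class so every class reaches the
    size of the largest class (or an explicit target size)."""
    counts = Counter(labels)
    target = target or max(counts.values())
    return {cls: max(0, target - n) for cls, n in counts.items()}

labels = ["ad hominem"] * 50 + ["slippery slope"] * 8 + ["red herring"] * 3
print(synthesis_budget(labels))
# -> {'ad hominem': 0, 'slippery slope': 42, 'red herring': 47}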
2024.findings-acl.732 @@ -16085,7 +16085,7 @@ Concept-aware Data Construction Improves In-context Learning of Language Models MichalŠtefánik MarekKadlčíkMasaryk University - PetrSojkaFaculty of Informatics, Masaryk University + PetrSojkaFaculty of Informatics, Masaryk University 12335-12352 Many recent language models (LMs) are capable of in-context learning (ICL), manifested in the LMs’ ability to perform a new task solely from natural-language instruction. Previous work curating in-context learners assumes that ICL emerges from a vast over-parametrization or the scale of multi-task training. However, recent theoretical work attributes the ICL ability to concept-dependent training data and creates functional in-context learners even in small-scale, synthetic settings. In this work, we practically explore this newly identified axis of ICL quality. We propose Concept-aware Training (CoAT), a framework for constructing training scenarios that make it beneficial for the LM to learn to utilize the analogical reasoning concepts from demonstrations. We find that by using CoAT, pre-trained transformers can learn to better utilise new latent concepts from demonstrations and that such ability makes ICL more robust to the functional deficiencies of the previous models. Finally, we show that concept-aware in-context learners are much more effective in in-context learning a majority of unseen tasks compared to traditional instruction tuning, and fare comparably also to previous in-context learners trained in large-scale multitask learning requiring magnitudes of more training data. 2024.findings-acl.733 @@ -16094,9 +16094,9 @@ Beyond Text: Leveraging Multi-Task Learning and Cognitive Appraisal Theory for Post-Purchase Intention Analysis - GerardYeo + GerardYeo ShazFurniturewala - KokilJaidkaNational University of Singapore + KokilJaidkaNational University of Singapore 12353-12360 Supervised machine-learning models for predicting user behavior offer a challenging classification problem with lower average prediction performance scores than other text classification tasks. This study evaluates multi-task learning frameworks grounded in Cognitive Appraisal Theory to predict user behavior as a function of users’ self-expression and psychological attributes. Our experiments show that users’ language and traits improve predictions above and beyond models predicting only from text. Our findings highlight the importance of integrating psychological constructs into NLP to enhance the understanding and prediction of user actions. We close with a discussion of the implications for future applications of large language models for computational psychology.
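A multi-task setup over text plus psychological attributes, as in the abstract above, can be sketched as a shared layer with one head per task. The PyTorch module below is illustrative; all dimensions, the head choices, and the concatenation of trait features are assumptions, not the authors' architecture.

import torch
import torch.nn as nn

class MultiTaskAppraisalModel(nn.Module):
    """Shared layer over concatenated text embeddings and user-trait
    features, with one head per task. All dimensions are placeholders."""
    def __init__(self, text_dim=768, trait_dim=10, hidden=128, n_classes=2):
        super().__init__()
        self.shared = nn.Sequential(nn.Linear(text_dim + trait_dim, hidden), nn.ReLU())
        self.behavior_head = nn.Linear(hidden, n_classes)   # classification task
        self.appraisal_head = nn.Linear(hidden, 1)          # regression task

    def forward(self, text_emb, traits):
        h = self.shared(torch.cat([text_emb, traits], dim=-1))
        return self.behavior_head(h), self.appraisal_head(h)

A joint objective would then combine, for example, cross-entropy on the behavior head with a regression loss on the appraisal head.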
2024.findings-acl.734
@@ -16107,7 +16107,7 @@

 Non-Autoregressive Machine Translation as Constrained <fixed-case>HMM</fixed-case>
 HaoranLi
 ZhanmingJieSalesforce Research
 WeiLuSingapore University of Technology and Design
 12361-12372
 In non-autoregressive translation (NAT), directed acyclic Transformers (DAT) have demonstrated their ability to achieve comparable performance to the autoregressive Transformers. In this paper, we first show that DAT is essentially a fully connected left-to-right Hidden Markov Model (HMM), with the source and target sequences being observations and the token positions being latent states. Even though generative models like HMM do not suffer from label bias in traditional task settings (e.g., sequence labeling), we argue here that the left-to-right HMM in NAT may still encounter this issue due to the missing observations at the inference stage. To combat label bias, we propose two constrained HMMs: 1) Adaptive Window HMM, which explicitly balances the number of outgoing transitions at different states; 2) Bi-directional HMM, i.e., a combination of left-to-right and right-to-left HMMs, whose uni-directional components can implicitly regularize each other’s biases via shared parameters. Experimental results on WMT’14 En→De and WMT’17 Zh→En demonstrate that our methods can achieve better or comparable performance to the original DAT using various decoding methods. We also demonstrate that our methods effectively reduce the impact of label bias.
 2024.findings-acl.735
@@ -16116,13 +16116,13 @@

 Multi-modal Stance Detection: New Datasets and Model
 BinLiang
 AngLi
 JingqianZhaoHarbin Institute of Technology
 LinGuiKing’s College London, University of London
 MinYangShenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences
 YueYuNational University of Defense Technology and PengCheng Lab
 Kam-FaiWongThe Chinese University of Hong Kong
 RuifengXuHarbin Institute of Technology
 12373-12387
 Stance detection is a challenging task that aims to identify public opinion from social media platforms with respect to specific targets. Previous work on stance detection largely focused on pure texts. In this paper, we study multi-modal stance detection for tweets consisting of texts and images, which are prevalent in today’s fast-growing social media platforms where people often post multi-modal messages. To this end, we create five new multi-modal stance detection datasets of different domains based on Twitter, in which each example consists of a text and an image. In addition, we propose a simple yet effective Targeted Multi-modal Prompt Tuning framework (TMPT), where target information is leveraged to learn multi-modal stance features from textual and visual modalities. Experimental results on our five benchmark datasets show that the proposed TMPT achieves state-of-the-art performance in multi-modal stance detection.
@@ -16132,9 +16132,9 @@ Enhanced Language Model Truthfulness with Learnable Intervention and Uncertainty Expression - FarimaFatahi BayatUniversity of Michigan - Ann Arbor + FarimaFatahi BayatUniversity of Michigan - Ann Arbor XinLiuUniversity of Michigan - Ann Arbor - H.JagadishUniversity of Michigan - Ann Arbor + H.JagadishUniversity of Michigan - Ann Arbor LuWangNortheastern University, Northeastern University and University of Michigan 12388-12400 Large language models (LLMs) can generate long-form and coherent text, yet they often hallucinate facts, which undermines their reliability. To mitigate this issue, inference-time methods steer LLM representations toward the “truthful directions” previously learned for truth elicitation. However, applying these truthful directions with the same intensity fails to generalize across different query contexts. We propose LITO, a Learnable Intervention method for Truthfulness Optimization that automatically identifies the optimal intervention intensity tailored to each specific context. LITO explores a sequence of model generations based on increasing levels of intervention intensities. It selects the most accurate response or refuses to answer when the predictions are highly uncertain. Experiments on multiple LLMs and question-answering datasets demonstrate that LITO improves truthfulness while preserving task accuracy. The adaptive nature of LITO counters the limitations of one-size-fits-all intervention methods, maximizing truthfulness by reflecting the model’s internal knowledge only when it is confident. Our code is available at https://github.com/launchnlp/LITO. @@ -16144,12 +16144,12 @@ <fixed-case>MM</fixed-case>-<fixed-case>LLM</fixed-case>s: Recent Advances in <fixed-case>M</fixed-case>ulti<fixed-case>M</fixed-case>odal Large Language Models - DuzhenZhang + DuzhenZhang YahanYuKyoto University, Kyoto University JiahuaDong ChenxingLi DanSu - ChenhuiChuKyoto University + ChenhuiChuKyoto University DongYuTencent AI Lab 12401-12430 In the past year, MultiModal Large Language Models (MM-LLMs) have undergone substantial advancements, augmenting off-the-shelf LLMs to support MM inputs or outputs via cost-effective training strategies. The resulting models not only preserve the inherent reasoning and decision-making capabilities of LLMs but also empower a diverse range of MM tasks. In this paper, we provide a comprehensive survey aimed at facilitating further research of MM-LLMs. Initially, we outline general design formulations for model architecture and training pipeline. Subsequently, we introduce a taxonomy encompassing 126 MM-LLMs, each characterized by its specific formulations. Furthermore, we review the performance of selected MM-LLMs on mainstream benchmarks and summarize key training recipes to enhance the potency of MM-LLMs. Finally, we explore promising directions for MM-LLMs while concurrently maintaining a [real-time tracking website](https://mm-llms.github.io/) for the latest developments in the field. We hope that this survey contributes to the ongoing advancement of the MM-LLMs domain. 
@@ -16159,25 +16159,25 @@ <fixed-case>CIF</fixed-case>-Bench: A <fixed-case>C</fixed-case>hinese Instruction-Following Benchmark for Evaluating the Generalizability of Large Language Models - YizhiLiUniversity of Manchester and University of Sheffield + YizhiLiUniversity of Manchester and University of Sheffield GeZhang XingweiQuHong Kong University of Science and Technology JialiLiNational University of Singapore ZhaoqunLi NoahWang - HaoLi + HaoLi RuibinYuan - YinghaoMaQueen Mary University of London + YinghaoMaQueen Mary University of London KaiZhang WangchunshuZhouAIWaves Inc. YimingLiang LeiZhang - LeiMaPeking University and Beijing Academy of Artifical Intelligence + LeiMaPeking University and Beijing Academy of Artifical Intelligence JiajunZhangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences ZuowenLiBeijing Foreign Studies University WenhaoHuang ChenghuaLinUniversity of Manchester - JieFuHong Kong University of Science and Technology + JieFuHong Kong University of Science and Technology 12431-12446 The advancement of large language models (LLMs) has enhanced the ability to generalize across a wide range of unseen natural language processing (NLP) tasks through instruction-following.Yet, their effectiveness often diminishes in low-resource languages like Chinese, exacerbated by biased evaluations from data leakage, casting doubt on their true generalizability to new linguistic territories. In response, we introduce the Chinese Instruction-Following Benchmark (**CIF-Bench**), designed to evaluate the zero-shot generalizability of LLMs to the Chinese language. CIF-Bench comprises 150 tasks and 15,000 input-output pairs, developed by native speakers to test complex reasoning and Chinese cultural nuances across 20 categories. To mitigate data contamination, we release only half of the dataset publicly, with the remainder kept private, and introduce diversified instructions to minimize score variance, totaling 45,000 data instances.Our evaluation of 28 selected LLMs reveals a noticeable performance gap, with the best model scoring only 52.9%, highlighting the limitations of LLMs in less familiar language and task contexts.This work not only uncovers the current limitations of LLMs in handling Chinese language tasks but also sets a new standard for future LLM generalizability research, pushing towards the development of more adaptable, culturally informed, and linguistically diverse models. 2024.findings-acl.739 @@ -16190,7 +16190,7 @@ FlorianStrubDeepMind RahmaChaabouniGoogle PaulMichelDeepMind - EmmanuelDupouxEHESS + EmmanuelDupouxEHESS OlivierPietquinCohere and Earth Species Project 12447-12472 While reinforcement learning (RL) has been proven essential for tuning large language models (LLMs), it can lead to reward over-optimization (ROO). Existing approaches address ROO by adding KL regularization, requiring computationally expensive hyperparameter tuning. Additionally, KL regularization focuses solely on regularizing the language policy, neglecting a potential source of regularization: the reward function itself. Inspired by demonstration-guided RL, we here introduce the Reward Calibration from Demonstration (RCfD), which leverages human demonstrations and a reward model to recalibrate the reward objective. Formally, given a prompt, the RCfD objective minimizes the distance between the demonstrations’ and LLM’s rewards rather than directly maximizing the reward function. 
This objective shift avoids incentivizing the LLM to exploit the reward model and promotes more natural and diverse language generation.We show the effectiveness of RCfD in three RL language tasks, where it achieves comparable performance to carefully tuned baselines while mitigating ROO. @@ -16202,8 +16202,8 @@ Enhancing Idiomatic Representation in Multiple Languages via an Adaptive Contrastive Triplet Loss WeiHeUniversity of Sheffield MarcoIdiartUniversidade Federal do Rio Grande do Sul - CarolinaScartonUniversity of Sheffield - AlineVillavicencioUniversity of Exeter and University of Sheffield + CarolinaScartonUniversity of Sheffield + AlineVillavicencioUniversity of Exeter and University of Sheffield 12473-12485 Accurately modeling idiomatic or non-compositional language has been a longstanding challenge in Natural Language Processing (NLP). This is partly because these expressions do not derive their meanings solely from their constituent words, but also due to the scarcity of relevant data resources, and their impact on the performance of downstream tasks such as machine translation and simplification. In this paper we propose an approach to model idiomaticity effectively using a triplet loss that incorporates the asymmetric contribution of components words to an idiomatic meaning for training language models by using adaptive contrastive learning and resampling miners to build an idiomatic-aware learning objective. Our proposed method is evaluated on a SemEval challenge and outperforms previous alternatives significantly in many metrics. 2024.findings-acl.741 @@ -16216,7 +16216,7 @@ HangYanAI lab QipengGuoShanghai AI Laboratory HaijunLv - XipengQiuFudan University + XipengQiuFudan University 12486-12502 Large language models have achieved remarkable success, but their extensive parameter size necessitates substantial memory for training, thereby setting a high threshold. While the recently proposed low-memory optimization (LOMO) reduces memory footprint, its optimization technique, akin to stochastic gradient descent, is sensitive to hyper-parameters and exhibits suboptimal convergence, failing to match the performance of the prevailing optimizer for large language models, AdamW. Through analysis of the Adam optimizer, we found that, compared to momentum, the adaptive learning rate is more critical for bridging the gap. Building on this insight, we introduce the low-memory optimization with adaptive learning rate (AdaLomo), which offers an adaptive learning rate for each parameter and exhibits superior convergence performance compared to LOMO theoretically. To maintain memory efficiency, we employ non-negative matrix factorization for the second-order moment estimation. Additionally, we suggest the use of a grouped update normalization to stabilize convergence. Our experiments with instruction-tuning and further pre-training demonstrate that AdaLomo achieves results on par with AdamW, while significantly reducing memory requirements, thereby lowering the hardware barrier to training large language models. The code is accessible at https://github.com/OpenLMLab/LOMO. 
2024.findings-acl.742 @@ -16227,7 +16227,7 @@ Propagation and Pitfalls: Reasoning-based Assessment of Knowledge Editing through Counterfactual Tasks WenyueHuaRutgers University, New Brunswick JiangGuo - MingwenDong + MingwenDong HenghuiZhuAmazon PatrickNgAmazon ZhiguoWang @@ -16243,7 +16243,7 @@ TaliaTseriotou XeniaMiscouridouUniversity of Cyprus and Imperial College London AdamTsakalidisCedefop and Alan Turing Institute - MariaLiakataQueen Mary University London + MariaLiakataQueen Mary University London 12526-12537 Through the rise of social media platforms, longitudinal language modelling has received much attention over the latest years, especially in downstream tasks such as mental health monitoring of individuals where modelling linguistic content in a temporal fashion is crucial. A key limitation in existing work is how to effectively model temporal sequences within Transformer-based language models. In this work we address this challenge by introducing a novel approach for predicting ‘Moments of Change’ (MoC) in the mood of online users, by simultaneously considering user linguistic and time-aware context. A Hawkes process-inspired transformation layer is applied over the proposed architecture to model the influence of time on users’ posts – capturing both their immediate and historical dynamics. We perform experiments on the two existing datasets for the MoC task and showcase clear performance gains when leveraging the proposed layer. Our ablation study reveals the importance of considering temporal dynamics in detecting subtle and rare mood changes. Our results indicate that considering linguistic and temporal information in a hierarchical manner provide valuable insights into the temporal dynamics of modelling user generated content over time, with applications in mental health monitoring. 2024.findings-acl.744 @@ -16259,7 +16259,7 @@ ShanshanGuo JianhuaHanHuawei Technologies Ltd. HangXuHuawei Noah‘s Ark Lab - ShikuiMaDataa Robotics + ShikuiMaDataa Robotics XiaodanLiang 12538-12559 Understanding and following natural language instructions while navigating through complex, real-world environments poses a significant challenge for general-purpose robots. These environments often include obstacles and pedestrians, making it essential for autonomous agents to possess the capability of self-corrected planning to adjust their actions based on feedback from the surroundings. However, the majority of existing vision-and-language navigation (VLN) methods primarily operate in less realistic simulator settings and do not incorporate environmental feedback into their decision-making processes. To address this gap, we introduce a novel zero-shot framework called CorNav, utilizing a large language model for decision-making and comprising two key components: 1) incorporating environmental feedback for refining future plans and adjusting its actions, and 2) multiple domain experts for parsing instructions, scene understanding, and refining predicted actions. In addition to the framework, we develop a 3D simulator that renders realistic scenarios using Unreal Engine 5. To evaluate the effectiveness and generalization of navigation agents in a zero-shot multi-task setting, we create a benchmark called NavBench. 
Our empirical study involves deploying 7 baselines across four tasks, i.e., goal-conditioned navigation given a specific object category, goal-conditioned navigation given simple instructions, finding abstract objects based on high-level instructions, and step-by-step instruction following. Extensive experiments demonstrate that CorNav consistently outperforms all baselines by a significant margin across all tasks. On average, CorNav achieves a success rate of 28.1%, surpassing the best baseline’s performance of 20.5%.


@@ -16270,18 +16270,18 @@

 <fixed-case>S</fixed-case>ci<fixed-case>MMIR</fixed-case>: Benchmarking Scientific Multi-modal Information Retrieval
 SiweiWuNanjing University of Science and Technology
 YizhiLiUniversity of Manchester and University of Sheffield
 KangZhu
 GeZhang
 YimingLiang
 KaijingMa
 ChenghaoXiao
 HaoranZhang
 BohaoYangUniversity of Manchester
 WenhuChenUniversity of Waterloo and Google
 WenhaoHuang
 NouraAl MoubayedDurham University
 JieFuHong Kong University of Science and Technology
 ChenghuaLinUniversity of Manchester
 12560-12574
 Multi-modal information retrieval (MMIR) is a rapidly evolving field where significant progress has been made through advanced representation learning and cross-modality alignment research, particularly in image-text pairing. However, current benchmarks for evaluating MMIR performance on image-text pairings overlook the scientific domain, which has a notable gap with the generic data since the caption of scientific charts and tables usually describes the analysis of experimental results or scientific principles in contrast to human activity or scenery depicted in generic images. To bridge this gap, we develop a scientific domain-specific MMIR benchmark (SciMMIR) by leveraging open-access research paper corpora to extract data relevant to the scientific domain. This benchmark comprises 530K meticulously curated image-text pairs, extracted from figures and tables with detailed captions from scientific documents. We further annotate the image-text pairs with a two-level subset-subcategory hierarchy to facilitate a more comprehensive evaluation of the baselines. We conduct zero-shot and fine-tuned evaluations on prominent multi-modal image-captioning and visual language models, such as CLIP, BLIP, and BLIP-2. Our findings offer critical insights for MMIR in the scientific domain, including the impact of pre-training and fine-tuning settings and the effects of different visual and textual encoders.
@@ -16292,7 +16292,7 @@

 Diving Deep into the Motion Representation of Video-Text Models
 ChinmayaDevarajUniversity of Maryland, College Park
 CorneliaFermullerUniversity of Maryland, College Park
 YiannisAloimonosUniversity of Maryland, College Park
 12575-12584
 Videos are more informative than images because they capture the dynamics of the scene. By representing motion in videos, we can capture dynamic activities. In this work, we introduce GPT-4 generated motion descriptions that capture fine-grained motion descriptions of activities and apply them to three action datasets. We evaluated several video-text models on the task of retrieval of motion descriptions. We found that they fall far behind human expert performance on two action datasets, raising the question of whether video-text models understand motion in videos. To address it, we introduce a method of improving motion understanding in video-text models by utilizing motion descriptions. This method proves to be effective on two action datasets for the motion description retrieval task. The results draw attention to the need for quality captions involving fine-grained motion information in existing datasets and demonstrate the effectiveness of the proposed pipeline in understanding fine-grained motion during video-text retrieval.


@@ -16317,7 +16317,7 @@
 AnirudhSomSRI International
 KaranSikkaSRI International
 HelenGentSRI International
 AjayDivakaranSRI International
 AndreasKathol
 DimitraVergyri
 12612-12627

@@ -16331,7 +16331,7 @@

 KhiemPhiState University of New York at Stony Brook
 NoushinSalek Faramarzi, State University of New York at Stony Brook
 ChenluWangState University of New York at Stony Brook
 RitwikBanerjeeState University of New York, Stony Brook
 12628-12643
 Whataboutism, a potent tool for disrupting narratives and sowing distrust, remains under-explored in quantitative NLP research. Moreover, past work has not distinguished its use as a strategy for misinformation and propaganda from its use as a tool for pragmatic and semantic framing. We introduce new datasets from Twitter/X and YouTube, revealing overlaps as well as distinctions between whataboutism, propaganda, and the tu quoque fallacy. Furthermore, drawing on recent work in linguistic semantics, we differentiate the ‘what about’ lexical construct from whataboutism. Our experiments bring to light unique challenges in its accurate detection, prompting the introduction of a novel method using attention weights for negative sample mining. We report significant improvements of 4% and 10% over previous state-of-the-art methods in our Twitter and YouTube collections, respectively.
 2024.findings-acl.750




 <fixed-case>LLM</fixed-case>s as Narcissistic Evaluators: When Ego Inflates Evaluation Scores
 YiqiLiuUniversity of Manchester
 NafiseMoosaviUniversity of Sheffield
 ChenghuaLinUniversity of Manchester
 12688-12701
 Automatic evaluation of generated textual content presents an ongoing challenge within the field of NLP. Given the impressive capabilities of modern language models (LMs) across diverse NLP tasks, there is a growing trend to employ these models in creating innovative evaluation metrics for automated assessment of generation tasks. This paper investigates a pivotal question: Do language model-driven evaluation metrics inherently exhibit bias favoring texts generated by the same underlying language model? Specifically, we assess whether prominent LM-based evaluation metrics (e.g. BARTScore, T5Score, and GPTScore) demonstrate a favorable bias toward their respective underlying LMs in the context of summarization tasks. Our findings unveil a latent bias, particularly pronounced when such evaluation metrics are used in a reference-free manner without leveraging gold summaries. These results underscore that assessments provided by generative evaluation models can be influenced by factors beyond the inherent text quality, highlighting the necessity of developing more reliable evaluation protocols in the future.
@@ -16389,7 +16389,7 @@ NemikaTyagiArizona State University Md NayemUddinArizona State University NeerajVarshney - ChittaBaralArizona State University + ChittaBaralArizona State University 12717-12733 This study explores the sycophantic tendencies of Large Language Models (LLMs), where these models tend to provide answers that match what users want to hear, even if they are not entirely correct. The motivation behind this exploration stems from the common behavior observed in individuals searching the internet for facts with partial or misleading knowledge. Similar to using web search engines, users may recall fragments of misleading keywords and submit them to an LLM, hoping for a comprehensive response. Our empirical analysis of several LLMs shows the potential danger of these models amplifying misinformation when presented with misleading keywords. Additionally, we thoroughly assess four existing hallucination mitigation strategies to reduce LLMs sycophantic behavior. Our experiments demonstrate the effectiveness of these strategies for generating factually correct statements. Furthermore, our analyses delve into knowledge-probing experiments on factual keywords and different categories of sycophancy mitigation. 2024.findings-acl.755 @@ -16411,7 +16411,7 @@ Choose Your Transformer: Improved Transferability Estimation of Transformer Models on Classification Tasks LukasGarbaciauskas - MaxPlonerHumboldt Universität Berlin + MaxPlonerHumboldt Universität Berlin AlanAkbikHumboldt Universität Berlin 12752-12768 There currently exists a multitude of pre-trained transformer language models (LMs) that are readily available. From a practical perspective, this raises the question of which pre-trained LM will perform best if fine-tuned for a specific downstream NLP task. However, exhaustively fine-tuning all available LMs to determine the best-fitting model is computationally infeasible. To address this problem, we present an approach that inexpensively estimates a ranking of the expected performance of a given set of candidate LMs for a given task. Following a layer-wise representation analysis, we extend existing approaches such as H-score and LogME by aggregating representations across all layers of the transformer model. We present an extensive analysis of 20 transformer LMs, 6 downstream NLP tasks, and various estimators (linear probing, kNN, H-score, and LogME). Our evaluation finds that averaging the layer representations significantly improves the Pearson correlation coefficient between the true model ranks and the estimate, increasing from 0.58 to 0.86 for LogME and from 0.65 to 0.88 for H-score. @@ -16442,7 +16442,7 @@ LeslyMiculicichGoogle NanyunPengUniversity of California, Los Angeles Chen-YuLeeGoogle - TomasPfisterGoogle + TomasPfisterGoogle 12782-12803 Grounded generation aims to equip language models (LMs) with the ability to produce more credible and accountable responses by accurately citing verifiable sources. However, existing methods, by either feeding LMs with raw or preprocessed materials, remain prone to errors. To address this, we introduce CaLM, a novel verification framework. CaLM leverages the insight that a robust grounded response should be consistent with information derived solely from its cited sources. Our framework empowers smaller LMs, which rely less on parametric memory and excel at processing relevant information given a query, to validate the output of larger LMs. 
Larger LM responses that closely align with the smaller LMs’ output, which relies exclusively on cited documents, are verified. Responses showing discrepancies are iteratively refined through a feedback loop. Experiments on three open-domain question-answering datasets demonstrate significant performance gains of 1.5% to 7% absolute average without any required model fine-tuning. 2024.findings-acl.759 @@ -16484,10 +16484,10 @@ <fixed-case>O</fixed-case>pen<fixed-case>C</fixed-case>ode<fixed-case>I</fixed-case>nterpreter: Integrating Code Generation with Execution and Refinement TianyuZheng GeZhang - TianhaoShen + TianhaoShen XuelingLiu Bill YuchenLin - JieFuHong Kong University of Science and Technology + JieFuHong Kong University of Science and Technology WenhuChenUniversity of Waterloo and Google XiangYueCarnegie Mellon University 12834-12859 @@ -16513,13 +16513,13 @@ ZaidAlyafeai KhalidAlmubarakPrince Sattam bin Abdulaziz University AhmedAshraf - DeemaAlnuhaitUniversity of Illinois at Urbana-Champaign + DeemaAlnuhaitUniversity of Illinois at Urbana-Champaign SaiedAlshahrani GubranAbdulrahmanKing Fahad University of Petroleum and Minerals GamilAhmed QaisGawah ZeadSaleh - MustafaGhaleb + MustafaGhaleb YousefAli MagedAl-shaibaniKing Fahad University of Petroleum and Minerals 12878-12901 @@ -16539,7 +16539,7 @@ DaveVan VeenStanford University TanBui StevenTruongVinbrain JSC and Toronto University - CurtisLanglotzStanford University + CurtisLanglotzStanford University 12902-12915 In order to enable extraction of structured clinical data from unstructured radiology reports, we introduce RadGraph-XL, a large-scale, expert-annotated dataset for clinical entity and relation extraction. RadGraph-XL consists of 2,300 radiology reports, which are annotated with over 410,000 entities and relations by board-certified radiologists. Whereas previous approaches focus solely on chest X-rays, RadGraph-XL includes data from four anatomy-modality pairs - chest CT, abdomen/pelvis CT, brain MR, and chest X-rays. Then, in order to automate structured information extraction, we use RadGraph-XL to train transformer-based models for clinical entity and relation extraction. Our evaluations include comprehensive ablation studies as well as an expert reader study that evaluates trained models on out-of-domain data. Results demonstrate that our model surpasses the performance of previous methods by up to 52% and notably outperforms GPT-4 in this domain. We release RadGraph-XL as well as our trained model to foster further innovation and research in structured clinical information extraction. 2024.findings-acl.765 @@ -16560,11 +16560,11 @@ Selective “Selective Prediction”: Reducing Unnecessary Abstention in Vision-Language Reasoning TejasSrinivasanUniversity of Southern California - JackHesselSamaya AI + JackHesselSamaya AI TanmayGuptaAllen Institute for Artificial Intelligence Bill YuchenLin YejinChoiDepartment of Computer Science, University of Washington - JesseThomasonUniversity of Southern California and Amazon + JesseThomasonUniversity of Southern California and Amazon KhyathiChandu 12935-12948 Selective prediction minimizes incorrect predictions from vision-language models (VLMs) by allowing them to abstain from answering when uncertain. However, when deploying a vision-language system with low tolerance for inaccurate predictions, selective prediction may be over-cautious and abstain too frequently, even on many correct predictions. 
We introduce ReCoVERR, an inference-time algorithm to reduce the over-abstention of a selective vision-language system without increasing the error rate of the system’s predictions. When the VLM makes a low-confidence prediction, instead of abstaining ReCoVERR tries to find relevant clues in the image that provide additional evidence for the prediction. ReCoVERR uses an LLM to pose related questions to the VLM, collects high-confidence evidences, and if enough evidence confirms the prediction the system makes a prediction instead of abstaining. ReCoVERR enables three VLMs (BLIP2, InstructBLIP and LLaVA-1.5) to answer up to 20% more questions on the VQAv2 and A-OKVQA tasks without decreasing system accuracy, thus improving overall system reliability. Our code is available at https://github.com/tejas1995/ReCoVERR. @@ -16575,7 +16575,7 @@ Language Model Priors and Data Augmentation Strategies for Low-resource Machine Translation: A Case Study Using <fixed-case>F</fixed-case>innish to <fixed-case>N</fixed-case>orthern <fixed-case>S</fixed-case>ámi JonneSäleväBrandeis University - ConstantineLignosBrandeis University + ConstantineLignosBrandeis University 12949-12956 We investigate ways of using monolingual data in both the source and target languages for improving low-resource machine translation. As a case study, we experiment with translation from Finnish to Northern Sámi.Our experiments show that while conventional backtranslation remains a strong contender, using synthetic target-side data when training backtranslation models can be helpful as well.We also show that monolingual data can be used to train a language model which can act as a regularizer without any augmentation of parallel data. 2024.findings-acl.768 @@ -16596,9 +16596,9 @@ <fixed-case>KIWI</fixed-case>: A Dataset of Knowledge-Intensive Writing Instructions for Answering Research Questions FangyuanXuUniversity of Texas at Austin and University of Texas at Austin KyleLoAllen Institute for Artificial Intelligence - LucaSoldainiAllen Institute for Artificial Intelligence + LucaSoldainiAllen Institute for Artificial Intelligence BaileyKuehl - EunsolChoiUniversity of Texas, Austin + EunsolChoiUniversity of Texas, Austin DavidWaddenAllen Institute for Artificial Intelligence 12969-12990 Large language models (LLMs) adapted to follow user instructions are now widely deployed as conversational agents. In this work, we examine one increasingly common instruction-following task: providing writing assistance to compose a long-form answer. To evaluate the capabilities of current LLMs on this task, we construct KIWI, a dataset of knowledge-intensive writing instructions in the scientific domain. Given a research question, an initial model-generated answer and a set of relevant papers, an expert annotator iteratively issues instructions for the model to revise and improve its answer. We collect 1,260 interaction turns from 234 interaction sessions with three state-of-the-art LLMs. Each turn includes a user instruction, a model response, and a human evaluation of the model response. Through a detailed analysis of the collected responses, we find that all models struggle to incorporate new information into an existing answer, and to perform precise and unambiguous edits. Further, we find that models struggle to judge whether their outputs successfully followed user instructions, with accuracy at least 10 points short of human agreement. 
Our findings indicate that KIWI will be a valuable resource to measure progress and improve LLMs’ instruction-following capabilities for knowledge intensive writing tasks. @@ -16608,7 +16608,7 @@ <fixed-case>XL</fixed-case>-<fixed-case>H</fixed-case>ead<fixed-case>T</fixed-case>ags: Leveraging Multimodal Retrieval Augmentation for the Multilingual Generation of News Headlines and Tags - Faisal TarequeShohan + Faisal TarequeShohan Mir TafseerNayeem SamsulIslam Abu UbaidaAkash @@ -16623,9 +16623,9 @@ <fixed-case>I</fixed-case>n<fixed-case>F</fixed-case>o<fixed-case>B</fixed-case>ench: Evaluating Instruction Following Ability in Large Language Models YiweiQin KaiqiangSongTencent AI Lab - YebowenHuUniversity of Central Florida + YebowenHuUniversity of Central Florida WenlinYaoTencent AI Lab - SangwooChoCapital One + SangwooChoCapital One XiaoyangWangTencent AI Lab XuanshengWu FeiLiuEmory University @@ -16654,7 +16654,7 @@ GaganBhatia El Moatez BillahNagoudiUniversity of British Columbia HasanCavusogluSauder School of Business - MuhammadAbdul-MageedUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia 13064-13087 We introduce FinTral, a suite of state-of-the-art multimodal large language models (LLMs) built upon the Mistral-7b model and tailored for financial analysis. FinTral integrates textual, numerical, tabular, and image data. We enhance FinTral with domain-specific pretraining, instruction fine-tuning, and RLAIF training by exploiting a large collection of textual and visual datasets we curate for this work. We also introduce an extensive benchmark featuring nine tasks and 25 datasets for evaluation, including hallucinations in the financial domain. Our FinTral model trained with direct preference optimization employing advanced Tools and Retrieval methods, dubbed FinTral-DPO-T&R, demonstrates an exceptional zero-shot performance. It outperforms ChatGPT-3.5 in all tasks and surpasses GPT-4 in five out of nine tasks, marking a significant advancement in AI-driven financial technology. We also demonstrate that FinTral has the potential to excel in real-time analysis and decision-making in diverse financial contexts. 2024.findings-acl.774 @@ -16672,8 +16672,8 @@ ChuangGan LiangyanGuiUIUC Yu-XiongWangSchool of Computer Science, Carnegie Mellon University and Department of Computer Science, University of Illinois Urbana-Champaign - YimingYangSchool of Computer Science, Carnegie Mellon University - KurtKeutzerUniversity of California Berkeley + YimingYangSchool of Computer Science, Carnegie Mellon University + KurtKeutzerUniversity of California Berkeley TrevorDarrellElectrical Engineering & Computer Science Department 13088-13110 Large Multimodal Models (LMM) are built across modalities and the misalignment between two modalities can result in “hallucination”, generating textual outputs that are not grounded by the multimodal information in context. To address the multimodal misalignment issue, we adapt the Reinforcement Learning from Human Feedback (RLHF) from the text domain to the vision-language alignment, where human annotators are asked to compare two responses and pinpoint the more hallucinated one, and the vision-language model is trained to maximize the simulated human rewards. 
We propose a new alignment algorithm called Factually Augmented RLHF that augments the reward model with additional factual information such as image captions and ground-truth multi-choice options, which alleviates the reward hacking phenomenon in RLHF and further improves the performance. We also enhance the GPT-4-generated training data (for vision instruction tuning) with previously available human-written image-text pairs to improve the general capabilities of our model. To evaluate the proposed approach in real-world scenarios, we develop a new evaluation benchmark MMHAL-BENCH with a special focus on penalizing hallucinations. As the first LMM trained with RLHF, our approach achieves remarkable improvement on the LLaVA-Bench dataset with the 96% performance level of the text-only GPT-4 (while previous best methods can only achieve the 87% level), and an improvement of 60% on MMHAL-BENCH over other baselines. @@ -16686,7 +16686,7 @@ NeerajVarshney PavelDolin AgastyaSeth - ChittaBaralArizona State University + ChittaBaralArizona State University 13111-13128 As Large Language Models (LLMs) play an increasingly pivotal role in natural language processing applications, their safety concerns become critical areas of NLP research. This has resulted in the development of various LLM defense strategies. Unfortunately, despite the shared goal of improving the safety of LLMs, the evaluation suites across various research works are disjoint and lack diverse inputs to ensure accurate and precise evaluation estimates. Furthermore, the important factor of ‘over-defensiveness’ on the safe inputs has largely remained overlooked. Addressing these limitations, this paper presents a systematic evaluation, comparison, and analysis of various LLM defense strategies over both ‘safety’ and ‘over-defensiveness’. To this end, we compile a large and diverse collection of safe and unsafe prompts, design precise evaluation methodology, and study the efficacy of various LLM defense strategies on multiple state-of-the-art LLMs. Our work reveals a number of crucial findings that we believe will pave the way and also facilitate further research in the critical area of improving the safety of LLMs. 2024.findings-acl.776 @@ -16707,12 +16707,12 @@ <tex-math>360^\circ</tex-math><fixed-case>REA</fixed-case>: Towards A Reusable Experience Accumulation with <tex-math>360^\circ</tex-math> Assessment for Multi-Agent System - ShenGaoUniversity of Electronic Science and Technology of China + ShenGaoUniversity of Electronic Science and Technology of China HaoLi ZhengliangShi ChengruiHuang - QuanTu - ShuoShang + QuanTu + ShuoShang ZhiliangTianNational University of Defense Technology MinlieHuangTsinghua University, Tsinghua University 13149-13162 @@ -16724,7 +16724,7 @@ Extracting Polymer Nanocomposite Samples from Full-Length Documents GhazalKhalighinejadDepartment of Computer Science, Duke University DefneCirci - L.Brinson + L.Brinson BhuwanDhingraDuke University 13163-13175 This paper investigates the use of large language models (LLMs) for extracting sample lists of polymer nanocomposites (PNCs) from full-length materials science research papers. The challenge lies in the complex nature of PNC samples, which have numerous attributes scattered throughout the text. 
The complexity of annotating detailed information on PNCs limits the availability of data, making conventional document-level relation extraction techniques impractical due to the challenge in creating comprehensive named entity span annotations.To address this, we introduce a new benchmark and an evaluation technique for this task and explore different prompting strategies in a zero-shot manner. We also incorporate self-consistency to improve the performance. Our findings show that even advanced LLMs struggle to extract all of the samples from an article. Finally, we analyze the errors encountered in this process, categorizing them into three main challenges, and discuss potential strategies for future research to overcome them. @@ -16753,7 +16753,7 @@ Toucan: Many-to-Many Translation for 150 <fixed-case>A</fixed-case>frican Language Pairs AbdelRahimElmadanyUniversity of British Columbia IfeAdebara - MuhammadAbdul-MageedUniversity of British Columbia + MuhammadAbdul-MageedUniversity of British Columbia 13189-13206 We address a notable gap in Natural Language Processing (NLP) by introducing a collection of resources designed to improve Machine Translation (MT) for low-resource languages, with a specific focus on African languages. First, We introduce two language models (LMs), Cheetah-1.2B and Cheetah-3.7B, with 1.2 billion and 3.7 billion parameters respectively. Next, we finetune the aforementioned models to create Toucan, an Afrocentric machine translation model designed to support 156 African language pairs. To evaluate Toucan, we carefully develop an extensive machine translation benchmark, dubbed Afro-Lingu-MT, tailored for evaluating machine translation. Toucan significantly outperforms other models, showcasing its remarkable performance on MT for African languages. Finally, we train a new model, spBLEU-1K, to enhance translation evaluation metrics, covering 1K languages, including African languages. This work aims to advance the field of NLP, fostering cross-cultural understanding and knowledge exchange, particularly in regions with limited language resources such as Africa. 2024.findings-acl.781 @@ -16768,7 +16768,7 @@ YoshinoriMaedaSony Group Corporation KeiichiYamadaSony Group Corporation HiromiWakakiSony Group Corporation - JulianMcAuleyUniversity of California, San Diego, University of California, San Diego + JulianMcAuleyUniversity of California, San Diego, University of California, San Diego 13207-13219 We consider the task of building a dialogue system that can motivate users to adopt positive lifestyle changes, Motivational Interviewing (MI). Addressing such a task requires a system that could infer how to motivate the user effectively. We propose DIIR, a framework that is capable of learning and applying conversation strategies in the form of natural language inductive rules from expert demonstrations. Automatic and human evaluation on instruction-following large language models show natural language strategies descriptions discovered by DIIR can improve active listening skills, reduce unsolicited advice, and promote more collaborative and less authoritative conversations, outperforming in-context demonstrations that are over 50 times longer. 
2024.findings-acl.782



 Evaluating Structural Generalization in Neural Machine Translation
 RyomaKumon
 DaikiMatsuoka
 HitomiYanakathe University of Tokyo
 13220-13239
 Compositional generalization refers to the ability to generalize to novel combinations of previously observed words and syntactic structures. Since it is regarded as a desired property of neural models, recent work has assessed compositional generalization in machine translation as well as semantic parsing. However, previous evaluations with machine translation have focused mostly on lexical generalization (i.e., generalization to unseen combinations of known words). Thus, it remains unclear to what extent models can translate sentences that require structural generalization (i.e., generalization to different sorts of syntactic structures). To address this question, we construct SGET, a machine translation dataset covering various types of compositional generalization with control of words and sentence structures. We evaluate neural machine translation models on SGET and show that they struggle more in structural generalization than in lexical generalization. We also find different performance trends in semantic parsing and machine translation, which indicates the importance of evaluations across various tasks.
 2024.findings-acl.783
@@ -16811,9 +16811,9 @@


 Improving Machine Translation with Large Language Models: A Preliminary Study with Cooperative Decoding
 JialiZeng
 FandongMengWeChat AI, Tencent Inc.
 YongjingYin
 JieZhou
 13275-13288
 Contemporary translation engines based on the encoder-decoder framework have made significant strides in development. However, the emergence of Large Language Models (LLMs) has disrupted their position by presenting the potential for achieving superior translation quality. To uncover the circumstances in which LLMs excel and explore how their strengths can be harnessed to enhance translation quality, we first conduct a comprehensive analysis to assess the strengths and limitations of various commercial NMT systems and MT-oriented LLMs. Our findings indicate that neither NMT nor MT-oriented LLMs alone can effectively address all the translation issues, but MT-oriented LLMs show promise as a complementary solution to NMT systems. Building upon these insights, we propose Cooperative Decoding (CoDec), which treats NMT systems as a pretranslation model and MT-oriented LLMs as a supplemental solution to handle complex scenarios beyond the capability of NMT alone. Experimental results on the WMT22 test sets and a newly collected test set WebCrawl demonstrate the effectiveness and efficiency of CoDec, highlighting its potential as a robust solution for combining NMT systems with MT-oriented LLMs in the field of machine translation.
2024.findings-acl.786 @@ -16859,14 +16859,14 @@ <fixed-case>S</fixed-case>ec<fixed-case>F</fixed-case>ormer: Fast and Accurate Privacy-Preserving Inference for Transformer Models via <fixed-case>SMPC</fixed-case> - JinglongLuo + JinglongLuo YehongZhangPeng Cheng Laboratory - ZhuoZhangHarbin Institute of Technology + ZhuoZhangHarbin Institute of Technology JiaqiZhangPengCheng Laboratory XinMu HuiWang YueYuNational University of Defense Technology and PengCheng Lab - ZenglinXuFudan University + ZenglinXuFudan University 13333-13348 2024.findings-acl.790 luo-etal-2024-secformer @@ -16886,12 +16886,12 @@ History-Aware Conversational Dense Retrieval - FengranMo + FengranMo ChenQu KelongMao - TianyuZhu - ZhanSu - KaiyuHuangBeijing Jiaotong University + TianyuZhu + ZhanSu + KaiyuHuangBeijing Jiaotong University Jian-YunNieUniversity of Montreal 13366-13378 Conversational search facilitates complex information retrieval by enabling multi-turn interactions between users and the system. Supporting such interactions requires a comprehensive understanding of the conversational inputs to formulate a good search query based on historical information. In particular, the search query should include the relevant information from the previous conversation turns.However, current approaches for conversational dense retrieval primarily rely on fine-tuning a pre-trained ad-hoc retriever using the whole conversational search session, which can be lengthy and noisy. Moreover, existing approaches are limited by the amount of manual supervision signals in the existing datasets.To address the aforementioned issues, we propose a **H**istory-**A**ware **Conv**ersational **D**ense **R**etrieval (HAConvDR) system, which incorporates two ideas: context-denoised query reformulation and automatic mining of supervision signals based on the actual impact of historical turns.Experiments on two public conversational search datasets demonstrate the improved history modeling capability of HAConvDR, in particular for long conversations with topic shifts. @@ -16902,11 +16902,11 @@ Light Up the Shadows: Enhance Long-Tailed Entity Grounding with Concept-Guided Vision-Language Models YikaiZhang - QianyuHeFudan University + QianyuHeFudan University XintaoWang SiyuYuan JiaqingLiangFudan University - YanghuaXiaoFudan University + YanghuaXiaoFudan University 13379-13389 Multi-Modal Knowledge Graphs (MMKGs) have proven valuable for various downstream tasks. However, scaling them up is challenging because building large-scale MMKGs often introduces mismatched images (i.e., noise). Most entities in KGs belong to the long tail, meaning there are few images of them available online. This scarcity makes it difficult to determine whether a found image matches the entity. To address this, we draw on the Triangle of Reference Theory and suggest enhancing vision-language models with concept guidance. Specifically, we introduce COG, a two-stage framework with COncept-Guided vision-language models. The framework comprises a Concept Integration module, which effectively identifies image-text pairs of long-tailed entities, and an Evidence Fusion module, which offers explainability and enables human verification. To demonstrate the effectiveness of COG, we create a dataset of 25k image-text pairs of long-tailed entities. Our comprehensive experiments show that COG not only improves the accuracy of recognizing long-tailed image-text pairs compared to baselines but also offers flexibility and explainability. 
2024.findings-acl.793 @@ -16915,10 +16915,10 @@ <fixed-case>Z</fixed-case>ero<fixed-case>S</fixed-case>tance: Leveraging <fixed-case>C</fixed-case>hat<fixed-case>GPT</fixed-case> for Open-Domain Stance Detection via Dataset Generation - ChenyeZhao - YingjieLiWestlake University + ChenyeZhao + YingjieLiWestlake University CorneliaCarageaUniversity of Illinois, Chicago - YueZhangWestlake University + YueZhangWestlake University 13390-13405 Zero-shot stance detection that aims to detect the stance (typically against, favor, or neutral) towards unseen targets has attracted considerable attention. However, most previous studies only focus on targets from a single or limited text domains (e.g., financial domain), and thus zero-shot models cannot generalize well to unseen targets of diverse domains (e.g., political domain). In this paper, we consider a more realistic task, i.e., open-domain stance detection, which aims at training a model that is able to generalize well to unseen targets across multiple domains of interest. Particularly, we propose a novel dataset generation method ZeroStance, which leverages ChatGPT to construct a synthetic open-domain dataset CHATStance that covers a wide range of domains. We then train an open-domain model on our synthetic dataset after proper data filtering. Extensive results indicate that our model, when trained on this synthetic dataset, shows superior generalization to unseen targets of diverse domains over baselines on most benchmarks. Our method requires only a task description in the form of a prompt and is much more cost-effective and data-efficient than previous methods. We will release our code and data to facilitate future research. 2024.findings-acl.794 @@ -16928,7 +16928,7 @@ Boosting Zero-Shot Crosslingual Performance using <fixed-case>LLM</fixed-case>-Based Augmentations with Effective Data Selection BarahFazili - AshishAgrawal + AshishAgrawal PreethiJyothiIndian Institute of Technology Bombay 13406-13422 Large language models (LLMs) are very proficient text generators. We leverage this capability of LLMs to generate task-specific data via zero-shot prompting and promote cross-lingual transfer for low-resource target languages. Given task-specific data in a source language and a teacher model trained on this data, we propose using this teacher to label LLM generations and employ a set of simple data selection strategies that use the teacher’s label probabilities. Our data selection strategies help us identify a representative subset of diverse generations that help boost zero-shot accuracies while being efficient, in comparison to using all the LLM generations (without any subset selection). We also highlight other important design choices that affect cross-lingual performance such as the use of translations of source data and what labels are best to use for the LLM generations. We observe significant performance gains across sentiment analysis and natural language inference tasks (of up to a maximum of 7.13 absolute points and 1.5 absolute points on average) across a number of target languages (Hindi, Marathi, Urdu, Swahili) and domains. 
@@ -16938,11 +16938,11 @@ Reinforcement Tuning for Detecting Stances and Debunking Rumors Jointly with Large Language Models - RuichaoYang + RuichaoYang WeiGaoSingapore Management University JingMaHong Kong Baptist University - HongzhanLinHong Kong Baptist University - BoWangSchool of Artificial Intelligence, Jilin University + HongzhanLinHong Kong Baptist University + BoWangSchool of Artificial Intelligence, Jilin University 13423-13439 Learning multi-task models for jointly detecting stance and verifying rumors poses challenges due to the need for training data of stance at post level and rumor veracity at claim level, which are difficult to obtain. To address this issue, we leverage large language models (LLMs) as the foundation annotators for the joint stance detection (SD) and rumor verification (RV) tasks, dubbed as JSDRV. We introduce a novel reinforcement tuning framework to enhance the joint predictive capabilities of LLM-based SD and RV components. Specifically, we devise a policy for selecting LLM-annotated data at the two levels, employing a hybrid reward mechanism to choose high-quality labels for effective LLM fine-tuning on both tasks. Results demonstrate that JSDRV improves the capabilities of LLMs in the joint tasks, not only outperforming state-of-the-art methods but also generalizing to non-LLMs accommodated as task models. 2024.findings-acl.796 @@ -16953,7 +16953,7 @@ Exploring the Potential of Dense Information in Multimodal Alignment ZhiyuanFan ZhihongChenStanford University - BenyouWangThe Chinese University of Hong Kong, Shenzhen + BenyouWangThe Chinese University of Hong Kong, Shenzhen 13440-13451 Despite the success of data augmentation in improving CLIP model, existing methods that utilize LLM or SAM to enrich the information in captions still suffer from several limitations, including insufficient detail and excessive hallucinations, ultimately resulting in compromised alignment and masking the true potential of dense information. This can lead to erroneous conclusions about CLIP’s ability to handle rich data, impeding the development of more effective models. To address the limitations of existing methods, we introduce a novel pipeline that generates highly detailed, factually accurate captions for images, which facilitates in-depth analysis of the potential for dense information in multimodal alignment. Contrary to previous findings, our investigation revealed that lengthening captions boosts performance across diverse benchmarks, even surpassing the effectiveness of meticulously crafted hard negative samples. Building on these insights, DELIP is introduced, demonstrably enhancing both foundational multimodal alignment and compositional reasoning abilities. Finally, we explore strategies to expand the context window of the text encoder, unlocking the potential of richer data for CLIP and paving the way for advancements in leveraging dense information for multimodal alignment. 2024.findings-acl.797 @@ -16975,7 +16975,7 @@ <fixed-case>I</fixed-case>nstruct<fixed-case>E</fixed-case>val: Instruction-Tuned Text Evaluator from Human Preference WenhaoWu - WeiLiInstitute of Computing Technology, Chinese Academy of Sciences + WeiLiInstitute of Computing Technology, Chinese Academy of Sciences XinyanXiaoBaidu JiachenLiuBaidu Inc. 
SujianLiPeking University @@ -16989,7 +16989,7 @@ A Curious Case of Searching for the Correlation between Training Data and Adversarial Robustness of Transformer Textual Models DangCuong DungLeVinUniversity - ThaiLeIndiana University + ThaiLeIndiana University 13475-13491 Existing works have shown that fine-tuned textual transformer models achieve state-of-the-art prediction performances but are also vulnerable to adversarial text perturbations. Traditional adversarial evaluation is often done only after fine-tuning the models and ignoring the training data. In this paper, we want to prove that there is also a strong correlation between training data and model robustness. To this end, we extract 13 different features representing a wide range of input fine-tuning corpora properties and use them to predict the adversarial robustness of the fine-tuned models. Focusing mostly on encoder-only transformer models BERT and RoBERTa with additional results for BART, ELECTRA and GPT2, we provide diverse evidence to support our argument. First, empirical analyses show that (a) extracted features can be used with a lightweight classifier such as Random Forest to effectively predict the attack success rate and (b) features with the most influence on the model robustness have a clear correlation with the robustness. Second, our framework can be used as a fast and effective additional tool for robustness evaluation since it (a) saves 30x-193x runtime compared to the traditional technique, (b) is transferable across models, (c) can be used under adversarial training, and (d) robust to statistical randomness. Our code is publicly available at https://github.com/CaptainCuong/RobustText_ACL2024. 2024.findings-acl.800 @@ -16998,12 +16998,12 @@ <fixed-case>I</fixed-case>nstruct<fixed-case>G</fixed-case>raph: Boosting Large Language Models via Graph-centric Instruction Tuning and Preference Alignment - JianingWang + JianingWang JundaWu - YupengHouUniversity of California, San Diego - YaoLiuEast China Normal University - MingGao - JulianMcAuleyUniversity of California, San Diego, University of California, San Diego + YupengHouUniversity of California, San Diego + YaoLiuEast China Normal University + MingGao + JulianMcAuleyUniversity of California, San Diego, University of California, San Diego 13492-13510 Do current large language models (LLMs) better solve graph reasoning and generation tasks with parameter updates? In this paper, we propose InstructGraph, a framework that empowers LLMs with the abilities of graph reasoning and generation by instruction tuning and preference alignment. Specifically, we first propose a structured format verbalizer to unify all graph data into a universal code-like format, which can simply represent the graph without any external graph-specific encoders. Furthermore, a graph instruction tuning stage is introduced to guide LLMs in solving graph reasoning and generation tasks. Finally, we identify potential hallucination problems in graph tasks and sample negative instances for preference alignment, the target of which is to enhance the output’s reliability of the model. Extensive experiments across multiple graph-centric tasks exhibit that InstructGraph can achieve the best performance and outperform GPT-4 and LLaMA2 by more than 13% and 38%, respectively. 
 2024.findings-acl.801
@@ -17027,13 +17027,13 @@
 Competition-Level Problems are Effective <fixed-case>LLM</fixed-case> Evaluators
 Yiming Huang
 Zhenghao Lin
- Xiao Liu (Microsoft Research Asia)
+ Xiao Liu (Microsoft Research Asia)
 Yeyun Gong
 Shuai Lu (Microsoft)
 Fangyu Lei
 Yaobo Liang
 Yelong Shen (Microsoft)
- Chen Lin (Xiamen University)
+ Chen Lin (Xiamen University)
 Nan Duan (Microsoft Research Asia)
 Weizhu Chen (Microsoft GenAI)
 13526-13544
@@ -17044,12 +17044,12 @@
 Large Language Models for Automated Open-domain Scientific Hypotheses Discovery
- Zonglin Yang
+ Zonglin Yang
 Xinya Du (University of Texas at Dallas)
- Junxian Li (Nanyang Technological University)
+ Junxian Li (Nanyang Technological University)
 Jie Zheng
 Soujanya Poria (Singapore University of Technology and Design)
- Erik Cambria (Nanyang Technological University)
+ Erik Cambria (Nanyang Technological University)
 13545-13565
 Hypothetical induction is recognized as the main reasoning type when scientists make observations about the world and try to propose hypotheses to explain those observations. Past research on hypothetical induction is under a constrained setting: (1) the observation annotations in the dataset are carefully manually handpicked sentences (resulting in a close-domain setting); and (2) the ground truth hypotheses are mostly commonsense knowledge, making the task less challenging. In this work, we tackle these problems by proposing the first dataset for social science academic hypotheses discovery, with the final goal to create systems that automatically generate valid, novel, and helpful scientific hypotheses, given only a pile of raw web corpus. Unlike previous settings, the new dataset requires (1) using open-domain data (raw web corpus) as observations; and (2) proposing hypotheses even new to humanity. A multi-module framework is developed for the task, including three different feedback mechanisms to boost performance, which exhibits superior performance in terms of both GPT-4 based and expert-based evaluation. To the best of our knowledge, this is the first work showing that LLMs are able to generate novel (”not existing in literature”) and valid (”reflecting reality”) scientific hypotheses.
 2024.findings-acl.804
@@ -17068,11 +17068,11 @@
 Training a Better <fixed-case>C</fixed-case>hinese Spelling Correction Model via Prior-knowledge Guided Teacher
- Chi Wei
- Shaobin Huang
- Rongsheng Li (Harbin Engineering University)
- Naiyu Yan
- Rui Wang
+ Chi Wei
+ Shaobin Huang
+ Rongsheng Li (Harbin Engineering University)
+ Naiyu Yan
+ Rui Wang
 13578-13589
 Recent advancements in Chinese Spelling Correction (CSC) predominantly leverage pre-trained language models (PLMs). However, a notable challenge with fine-tuned PLM-based CSC models is their tendency to over-correct, leading to poor generalization for error patterns outside the standard distribution. To address this, we developed a teacher network guided by prior knowledge for distillation learning of CSC models. Unlike traditional teacher networks, which depend on task-related pre-training, our method infuses task-related prior information into the teacher network, offering guidance beyond mere labels to the student network. This strategy significantly enhances the CSC model’s language modeling capabilities, crucial for minimizing over-correction. Importantly, our approach is model-independent and the teacher network does not require task-related pre-training, making it broadly applicable for enhancing various PLM-based CSC models with minimal additional computational resources.
 Extensive experiments on widely used benchmarks demonstrate that our method achieves new state-of-the-art results. Additionally, we explored the potential of generalizing our method to other non-autoregressive text-generation tasks.
 2024.findings-acl.806
@@ -17081,14 +17081,14 @@
 The Revolution of Multimodal Large Language Models: A Survey
- Davide Caffagni
- Federico Cocchi (University of Pisa)
- Luca Barsellotti
- Nicholas Moratelli
- Sara Sarto
- Lorenzo Baraldi
+ Davide Caffagni
+ Federico Cocchi (University of Pisa)
+ Luca Barsellotti
+ Nicholas Moratelli
+ Sara Sarto
+ Lorenzo Baraldi
 Lorenzo Baraldi (Università degli Studi di Modena e Reggio Emilia)
- Marcella Cornia (University of Modena and Reggio Emilia)
+ Marcella Cornia (University of Modena and Reggio Emilia)
 Rita Cucchiara (Università di Modena e Reggio Emilia)
 13590-13618
 Connecting text and visual modalities plays an essential role in generative intelligence. For this reason, inspired by the success of large language models, significant research efforts are being devoted to the development of Multimodal Large Language Models (MLLMs). These models can seamlessly integrate visual and textual modalities, while providing a dialogue-based interface and instruction-following capabilities. In this paper, we provide a comprehensive review of recent visual-based MLLMs, analyzing their architectural choices, multimodal alignment strategies, and training techniques. We also conduct a detailed analysis of these models across a wide range of tasks, including visual grounding, image generation and editing, visual understanding, and domain-specific applications. Additionally, we compile and describe training datasets and evaluation benchmarks, conducting comparisons among existing models in terms of performance and computational requirements. Overall, this survey offers a comprehensive overview of the current state of the art, laying the groundwork for future MLLMs.
@@ -17098,7 +17098,7 @@
 <fixed-case>OOP</fixed-case>: Object-Oriented Programming Evaluation Benchmark for Large Language Models
- Shuai Wang
+ Shuai Wang
 Liang Ding
 Li Shen (Sun Yat-Sen University)
 Yong Luo (Wuhan University)
@@ -17113,15 +17113,15 @@
 Code Needs Comments: Enhancing Code <fixed-case>LLM</fixed-case>s with Comment Augmentation
 Demin Song (Shanghai AI Laboratory)
- Honglin Guo (Fudan University)
+ Honglin Guo (Fudan University)
 Yunhua Zhou
 Shuhao Xing
 Yudong Wang (Shanghai AI Laboratory)
 Zifan Song (Tongji University)
- Wenwei Zhang (Shanghai AI Laboratory)
+ Wenwei Zhang (Shanghai AI Laboratory)
 Qipeng Guo (Shanghai AI Laboratory)
 Hang Yan (AI lab)
- Xipeng Qiu (Fudan University)
+ Xipeng Qiu (Fudan University)
 Dahua Lin (The Chinese University of Hong Kong)
 13640-13656
 The programming skill is one crucial ability for Large Language Models (LLMs), necessitating a deep understanding of programming languages (PLs) and their correlation with natural languages (NLs). We examine the impact of pre-training data on code-focused LLMs’ performance by assessing the comment density as a measure of PL-NL alignment. Given the scarcity of code-comment aligned data in pre-training corpora, we introduce a novel data augmentation method that generates comments for existing code, coupled with a data filtering strategy that filters out code data poorly correlated with natural language. We conducted experiments on three code-focused LLMs and observed consistent improvements in performance on two widely-used programming skill benchmarks.
 Notably, the model trained on the augmented data outperformed both the model used for generating comments and the model further trained on the data without augmentation.
@@ -17133,8 +17133,8 @@
 Efficient Domain Adaptation for Non-Autoregressive Machine Translation
 WangJie You
 Pei Guo
- Juntao Li (Soochow University, China)
- Kehai Chen (Harbin Institute of Technology (Shenzhen))
+ Juntao Li (Soochow University, China)
+ Kehai Chen (Harbin Institute of Technology (Shenzhen))
 Min Zhang (Harbin Institute of Technology, Shenzhen)
 13657-13670
 Domain adaptation remains a challenge in the realm of Neural Machine Translation (NMT), even in the era of large language models (LLMs). Existing non-parametric approaches like nearest neighbor machine translation have made small Autoregressive Translation (AT) models achieve efficient domain generalization and adaptation without updating parameters, but leaving the Non-Autoregressive Translation (NAT) counterparts under-explored. To fill this blank, we introduce Bi-kNN, an innovative and efficient domain adaptation approach for NAT models that tailors a k-nearest-neighbor algorithm for NAT. Specifically, we introduce an effective datastore construction and correlated updating strategies to conform the parallel nature of NAT. Additionally, we train a meta-network that seamlessly integrates the kNN distribution with the NMT distribution robustly during the iterative decoding process of NAT. Our experimental results across four benchmark datasets demonstrate that our Bi-kNN not only achieves significant improvements over the Base-NAT model (7.8 BLEU on average) but also exhibits enhanced efficiency.
@@ -17146,8 +17146,8 @@
 Exploring Reversal Mathematical Reasoning Ability for Large Language Models
 Pei Guo
 WangJie You
- Juntao Li (Soochow University, China)
- Yan Bowen
+ Juntao Li (Soochow University, China)
+ Yan Bowen
 Min Zhang (Harbin Institute of Technology, Shenzhen)
 13671-13685
 Large language models (LLMs) have presented remarkable capabilities in the wide range of natural language understanding and reasoning tasks. Despite their success, a few works indicate that LLMs suffer from the “reversal curse”, in which LLMs can’t employ the inverted structure “B is A” when they are trained based on “A is B”. To explore the effect of the “reversal curse” for LLMs on complex mathematical reasoning tasks, we present two reversal datasets upon GSM8K and MathQA and verify that LLMs also struggle to solve reversal mathematical problems. We analyze the potential reason and attribute it to the insufficient modeling of the relationship between reasoning steps caused by the left-to-right objective. Consequently, based on the characteristics of multi-step reasoning, we design a novel training method to improve the general and reversal reasoning abilities. Finally, we conduct experiments on four mathematical datasets, and the results demonstrate that our method significantly improves the general reasoning capacities and alleviates the reversal problem. Our datasets and codes are available at https://github.com/AllForward/ReversalMath.
@@ -17157,10 +17157,10 @@
 A Unified Joint Approach with Topological Context Learning and Rule Augmentation for Knowledge Graph Completion
- Jingtao Guo
+ Jingtao Guo
 Chunxia Zhang (School of Computer Science and Technology, Beijing Institute of Technology)
 Lingxi Li
- Xiaojun Xue
+ Xiaojun Xue
 Zhendong Niu (Beijing Institute of Technology)
 13686-13696
 Knowledge graph completion (KGC) task is to infer the missing knowledge in the knowledge graph based on known factual triples.
 However, present KGC approaches still face the following two challenges. Those methods perform simple linear update on relation representation, and only local neighborhood information is aggregated, which makes it difficult to capture logic semantic between relations and global topological context information. To tackle the above challenges, we propose a unified joint approach with Topological Context learning and Rule Augmentation (TCRA) for KGC. The TCRA framework consists of an entity topological context learning mechanism based on dual-branch hierarchical graph attention network, and a relation rule context learning mechanism based on Rule-Transformer and rule-to-relation aggregator. The former mechanism encodes the topological structure features of entities, aggregates the local neighborhood topological context information of entities on the three levels (entity, relation and triple), and build clusters of global head or tail entities related to the same relation. It can capture the local and global topological context information of entities related to the same relation. The latter mechanism introduces chain-like Horn rules as the context information of relations, and encodes the logical semantic of relations to enrich the relation representation. Experimental performances on three benchmark datasets FB15k-237, WN18RR and Kinship indicate the effectiveness and superiority of our proposed approach. The codes are publicly available.
@@ -17174,7 +17174,7 @@
 Mohit Iyyer (University of Massachusetts Amherst)
 Xuezhi Wang (Google)
 Noah Constant
- Jerry Wei (Anthropic and Stanford University)
+ Jerry Wei (Anthropic and Stanford University)
 Jason Wei (OpenAI)
 Chris Tar
 Yun-Hsuan Sung (Google)
@@ -17191,7 +17191,7 @@
 <fixed-case>ROSE</fixed-case> Doesn’t Do That: Boosting the Safety of Instruction-Tuned Large Language Models with Reverse Prompt Contrastive Decoding
 Qihuang Zhong
 Liang Ding
- Juhua Liu (Wuhan University)
+ Juhua Liu (Wuhan University)
 Bo Du (Wuhan University)
 Dacheng Tao (University of Sydney)
 13721-13736
@@ -17206,7 +17206,7 @@
 Jingping Liu (East China University of Science and Technology)
 Sihang Jiang (Fudan University)
 Haiyun Jiang (SUN YAT-SEN UNIVERSITY)
- Yanghua Xiao (Fudan University)
+ Yanghua Xiao (Fudan University)
 Jiaqing Liang (Fudan University)
 Zujie Liang (Ant Group)
 Feng Wei
@@ -17224,9 +17224,9 @@
 Yingqian Min
 Kun Zhou (Renmin University of China)
 Dawei Gao (Alibaba Group)
- Xin Zhao (Renmin University of China)
+ Xin Zhao (Renmin University of China)
 He Hu (Renmin University of China, Renmin University of China)
- Yaliang Li (Alibaba Group)
+ Yaliang Li (Alibaba Group)
 13748-13761
 Recently, multi-task instruction tuning has been utilized to improve sentence representation learning (SRL). It enables SRL models to generate task-specific representations with the guidance of task instruction, thus exhibiting strong generalization ability on unseen tasks. However, these methods mostly neglect the potential interference problems across different tasks and instances, which may affect the training of the model. To address this issue, we propose a data curriculum method, namely Data-CUBE, that arranges the order of all the multi-task data for training, to minimize the interference risks from two aspects. At the task level, we aim to find the optimal task order to minimize the total cross-task interference risk and formulate this problem as the traveling salesman problem, which is further solved by a specially designed simulated annealing algorithm.
 At the instance level, we propose a measurement method to quantify the difficulty of all instances per task, and then arrange instances in an easy-to-difficult order for training. Experimental results show that our approach can boost the performance of state-of-the-art methods. Our code and data will be publicly released.
 2024.findings-acl.816
@@ -17236,8 +17236,8 @@
 Combating Label Sparsity in Short Text Topic Modeling via Nearest Neighbor Augmentation
 Yang Lin
- Xinyu Ma
- Xin Gao (Peking University)
+ Xinyu Ma
+ Xin Gao (Peking University)
 Ruiqing Li
 Yasha Wang
 Xu Chu
@@ -17251,7 +17251,7 @@
 <fixed-case>R</fixed-case>efute<fixed-case>B</fixed-case>ench: Evaluating Refuting Instruction-Following for Large Language Models
 Jianhao Yan (Westlake University)
 Yun Luo (westlake university)
- Yue Zhang (Westlake University)
+ Yue Zhang (Westlake University)
 13775-13791
 The application scope of large language models (LLMs) is increasingly expanding. In practical use, users might provide feedback based on the model’s output, hoping for a responsive model that can complete responses according to their feedback. Whether the model can appropriately respond to users’ refuting feedback and consistently follow through with execution has not been thoroughly analyzed. In light of this, this paper proposes a comprehensive benchmark, RefuteBench, covering tasks such as question answering, machine translation, and email writing. The evaluation aims to assess whether models can positively accept feedback in form of refuting instructions and whether they can consistently adhere to user demands throughout the conversation. We conduct evaluations on numerous LLMs and find that LLMs are stubborn, i.e. exhibit inclination to their internal knowledge, often failing to comply with user feedback. Additionally, as the length of the conversation increases, models gradually forget the user’s stated feedback and roll back to their own responses. We further propose recall-and-repeat prompts as a simple and effective way to enhance the model’s responsiveness to feedback.
 2024.findings-acl.818
@@ -17271,9 +17271,9 @@
 Argument-Based Sentiment Analysis on Forward-Looking Statements
 Chin-Yi Lin (National Taiwan University)
- Chung-Chi Chen (AIST, National Institute of Advanced Industrial Science and Technology)
- Hen-Hsen Huang (Institute of Information Science, Academia Sinica)
- Hsin-Hsi Chen (National Taiwan University)
+ Chung-Chi Chen (AIST, National Institute of Advanced Industrial Science and Technology)
+ Hen-Hsen Huang (Institute of Information Science, Academia Sinica)
+ Hsin-Hsi Chen (National Taiwan University)
 13804-13815
 This paper introduces a novel approach to analyzing the forward-looking statements in equity research reports by integrating argument mining with sentiment analysis. Recognizing the limitations of traditional models in capturing the nuances of future-oriented analysis, we propose a refined categorization of argument units into claims, premises, and scenarios, coupled with a unique sentiment analysis framework. Furthermore, we incorporate a temporal dimension to categorize the anticipated impact duration of market events. To facilitate this study, we present the Equity Argument Mining and Sentiment Analysis (Equity-AMSA) dataset. Our research investigates the extent to which detailed domain-specific annotations can be provided, the necessity of fine-grained human annotations in the era of large language models, and whether our proposed framework can improve performance in downstream tasks over traditional methods.
 Experimental results reveal the significance of manual annotations, especially for scenario identification and sentiment analysis. The study concludes that our annotation scheme and dataset contribute to a deeper understanding of forward-looking statements in equity research reports.
 2024.findings-acl.820
@@ -17283,9 +17283,9 @@
 Paying More Attention to Source Context: Mitigating Unfaithful Translations from Large Language Model
 Hongbin Zhang
- Kehai Chen (Harbin Institute of Technology (Shenzhen))
+ Kehai Chen (Harbin Institute of Technology (Shenzhen))
 Xuefeng Bai
- Yang Xiang
+ Yang Xiang
 Min Zhang (Harbin Institute of Technology, Shenzhen)
 13816-13836
 Large language models (LLMs) have showcased their remarkable capabilities to handle various downstream tasks, including multilingual machine translation ability. Despite their impressive performance, decoder-only LLMs lack an explicit alignment between source and target contexts, leading to translation that may not faithfully represent the original content. To address this, we propose three learning strategies to encourage LLMs to pay more attention to the source context during translation: 1) adjusting attention weights on the source context by adaptive attention re-weighting; 2) suppressing the irrelevant target prefix using contrastive decoding; 3) avoiding excessive reliance on the target prefix through target-constrained tuning. To verify the effectiveness of our model, we curate a new dataset specifically focusing on unfaithful translations generated by LLMs. Experimental results on both human-collected and general test sets verify the effectiveness of our model across multiple language pairs. Further human evaluation demonstrates the efficacy of our method in reducing hallucinatory translation and improving the fidelity of translations.
@@ -17325,7 +17325,7 @@
 Meishan Zhang (Harbin Institute of Technology (Shenzhen), China and Tianjin University, China)
 Xuebo Liu (Harbin Institute of Technology, Shenzhen)
 Zhaocong Li
- Derek Wong (University of Macau)
+ Derek Wong (University of Macau)
 Min Zhang (Harbin Institute of Technology, Shenzhen)
 13868-13881
 Tuning-based large language models for machine translation (aka large translation model, LTM) have demonstrated significant performance in the field of machine translation. Despite their success, these models often face difficulties in leveraging demonstrations to further improve their performance. To tackle this challenge, we introduce a novel approach that integrates demonstration-aware training and inference strategies within the framework of tuning-based LTMs, hereby referred to as demonstration-aware LTMs. During training, we enrich the model’s learning process by incorporating both sentence- and document-level demonstrations derived from its original training dataset. During inference, the model synergizes its own contextual translations with retrieved high-quality demonstrations, leading to more precise and contextually appropriate outputs. Empirical results reveal that our demonstration-aware LTM not only mitigates the negative impacts traditionally associated with demonstrations but also secures substantial improvements in translation accuracy, particularly in domain-specific and document-level translation tasks. Source code and scripts are freely available at https://github.com/ChenLi0620/Demo-Aware-LLM-MT.
@@ -17338,7 +17338,7 @@
 Dohyeon Lee (Seoul National University)
 Jongyoon Kim (Seoul National University)
 Seung-won Hwang (Seoul National University)
- Joonsuk Park (University of Richmond)
+ Joonsuk Park (University of Richmond)
 13882-13893
 Pre-trained language models (PLMs) exhibit promise in retrieval tasks but struggle with out-of-domain data due to distribution shifts. Addressing this, generative domain adaptation (DA), known as GPL, tackles distribution shifts by generating pseudo queries and labels to train models for predicting query-document relationships in new domains. However, it overlooks the domain distribution, causing the model to struggle with aligning the distribution in the target domain. We, therefore, propose a Distribution-Aware Domain Adaptation (DADA) to guide the model to consider the domain distribution knowledge at the level of both a single document and the corpus, which is referred to as observation-level feedback and domain-level feedback, respectively. Our method effectively adapts the model to the target domain and expands document representation to unseen gold query terms using domain and observation feedback, as demonstrated by empirical results on the BEIR benchmark.
 2024.findings-acl.825
@@ -17363,11 +17363,11 @@
 Federico Ranaldi (University of Roma “Tor Vergata”)
 Elena Sofia Ruzzetti (Università degli Studi di Roma Tor Vergata)
 Dario Onorati (“La Sapienza” University of Rome)
- Leonardo Ranaldi (Idiap Research Institute)
+ Leonardo Ranaldi (Idiap Research Institute)
 Cristina Giannone
 Andrea Favalli (Almawave)
 Raniero Romagnoli (University of Roma “La Sapienza”)
- Fabio Massimo Zanzotto (University of Rome Tor Vergata)
+ Fabio Massimo Zanzotto (University of Rome Tor Vergata)
 13909-13920
 Understanding textual description to generate code seems to be an achieved capability of instruction-following Large Language Models (LLMs) in zero-shot scenario. However, there is a severe possibility that this translation ability may be influenced by having seen target textual descriptions and the related code. This effect is known as Data Contamination. In this study, we investigate the impact of Data Contamination on the performance of GPT-3.5 in the Text-to-SQL code-generating tasks. Hence, we introduce a novel method to detect Data Contamination in GPTs and examine GPT-3.5’s Text-to-SQL performances using the known Spider Dataset and our new unfamiliar dataset Termite. Furthermore, we analyze GPT-3.5’s efficacy on databases with modified information via an adversarial table disconnection (ATD) approach, complicating Text-to-SQL tasks by removing structural pieces of information from the database. Our results indicate a significant performance drop in GPT-3.5 on the unfamiliar Termite dataset, even with ATD modifications, highlighting the effect of Data Contamination on LLMs in Text-to-SQL translation tasks.
 2024.findings-acl.827
@@ -17379,7 +17379,7 @@
 Mubashara Akhtar
 Nikesh Subedi (University of Utah)
 Vivek Gupta (University of Pennsylvania, United States)
- Sahar Tahmasebi (TIB – Leibniz Information Centre for Science and Technology)
+ Sahar Tahmasebi (TIB – Leibniz Information Centre for Science and Technology)
 Oana Cocarascu (King’s College London)
 Elena Simperl (King’s College London)
 13921-13937
@@ -17391,8 +17391,8 @@
 Real World Conversational Entity Linking Requires More Than Zero-Shots
 Mohanna Hoveyda
- Arjen Vries (Institute for Computing and Information Sciences, Radboud University Nijmegen, Radboud University)
- Faegheh Hasibi (Radboud University)
+ Arjen Vries (Institute for Computing and Information Sciences, Radboud University Nijmegen, Radboud University)
+ Faegheh Hasibi (Radboud University)
 Maarten de Rijke (University of Amsterdam)
 13938-13946
 Entity linking (EL) in conversations faces notable challenges in practical applications, primarily due to scarcity of entity-annotated conversational datasets and sparse knowledge bases (KB) containing domain-specific, long-tail entities. We designed targeted evaluation scenarios to measure the efficacy of EL models under resource constraints. Our evaluation employs two KBs: Fandom, exemplifying real-world EL complexities, and the widely used Wikipedia. First, we assess EL models’ ability to generalize to a new unfamiliar KB using Fandom and a novel zero-shot conversational entity linking dataset that we curated based on Reddit discussions on Fandom entities. We then evaluate the adaptability of EL models to conversational settings without prior training. Our results indicate that current zero-shot EL models falter when introduced to new, domain-specific KBs without prior training, significantly dropping in performance. Our findings reveal that previous evaluation approaches fall short of capturing real-world complexities for zero-shot EL, highlighting the necessity for new approaches to design and assess conversational EL models to adapt to limited resources. The evaluation framework and dataset proposed are tailored to facilitate this research.
@@ -17402,16 +17402,16 @@
 <fixed-case>CP</fixed-case>sy<fixed-case>C</fixed-case>oun: A Report-based Multi-turn Dialogue Reconstruction and Evaluation Framework for <fixed-case>C</fixed-case>hinese Psychological Counseling
- Chenhao Zhang (Shanghai Artificial Intelligence Laboratory, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences and Huazhong University of Science and Technology)
+ Chenhao Zhang (Shanghai Artificial Intelligence Laboratory, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences and Huazhong University of Science and Technology)
 Renhao Li (University of Macau)
- Minghuan Tan (Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences)
+ Minghuan Tan (Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences)
 Min Yang (Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences)
- Jingwei Zhu
- Di Yang
+ Jingwei Zhu
+ Di Yang
 Jiahao Zhao
 Guancheng Ye
- Chengming Li (Shenzhen MSU-BIT University)
- Xiping Hu (Beijing Institute of Technology)
+ Chengming Li (Shenzhen MSU-BIT University)
+ Xiping Hu (Beijing Institute of Technology)
 13947-13966
 Using large language models (LLMs) to assist psychological counseling is a significant but challenging task at present. Attempts have been made on improving empathetic conversations or acting as effective assistants in the treatment with LLMs.
 However, the existing datasets lack consulting knowledge, resulting in LLMs lacking professional consulting competence. Moreover, how to automatically evaluate multi-turn dialogues within the counseling process remains an understudied area. To bridge the gap, we propose CPsyCoun, a report-based multi-turn dialogue reconstruction and evaluation framework for Chinese psychological counseling. To fully exploit psychological counseling reports, a two-phase approach is devised to construct high-quality dialogues while a comprehensive evaluation benchmark is developed for the effective automatic evaluation of multi-turn psychological consultations. Competitive experimental results demonstrate the effectiveness of our proposed framework in psychological counseling. We open-source the datasets and model for future research.
 2024.findings-acl.830
@@ -17422,9 +17422,9 @@
 Tox-<fixed-case>BART</fixed-case>: Leveraging Toxicity Attributes for Explanation Generation of Implicit Hate Speech
 Neemesh Yadav (Indraprastha Institute of Information Technology, Delhi)
 Sarah Masud (Indraprastha Institute of Information Technology Delhi (IIIT-Delhi))
- Vikram Goyal (Indraprastha Institute of Information Technology, Delhi)
+ Vikram Goyal (Indraprastha Institute of Information Technology, Delhi)
 Md Shad Akhtar (Indraprastha Institute of Information Technology, Delhi)
- Tanmoy Chakraborty (Indian Institute of Technology, Delhi)
+ Tanmoy Chakraborty (Indian Institute of Technology, Delhi)
 13967-13983
 Employing language models to generate explanations for an incoming implicit hate post is an active area of research. The explanation is intended to make explicit the underlying stereotype and aid content moderators. The training often combines top-k relevant knowledge graph (KG) tuples to provide world knowledge and improve performance on standard metrics. Interestingly, our study presents conflicting evidence for the role of the quality of KG tuples in generating implicit explanations. Consequently, simpler models incorporating external toxicity signals outperform KG-infused models. Compared to the KG-based setup, we observe a comparable performance for SBIC (LatentHatred) datasets with a performance variation of +0.44 (+0.49), +1.83 (-1.56), and -4.59 (+0.77) in BLEU, ROUGE-L, and BERTScore. Further human evaluation and error analysis reveal that our proposed setup produces more precise explanations than zero-shot GPT-3.5, highlighting the intricate nature of the task.
 2024.findings-acl.831
@@ -17436,9 +17436,9 @@
 James Enouen
 Hootan Nakhost
 Sayna Ebrahimi (Google)
- Sercan Arik (Google)
- Yan Liu (University of Southern California)
- Tomas Pfister (Google)
+ Sercan Arik (Google)
+ Yan Liu (University of Southern California)
+ Tomas Pfister (Google)
 13984-14011
 Large language models (LLMs) have attracted great interest in many real-world applications; however, their “black-box” nature necessitates scalable and faithful explanations. Shapley values have matured as an explainability method for deep learning, but extending them to LLMs is difficult due to long input contexts and autoregressive output generation. We introduce TextGenSHAP, an efficient post-hoc explanation method incorporating LLM-specific techniques, which leads to significant runtime improvements: token-level explanations in minutes not hours, and document-level explanations within seconds. We demonstrate how such explanations can improve end-to-end performance of retrieval augmented generation by localizing important words within long documents and reranking passages collected by retrieval systems.
 On various open-domain question answering benchmarks, we show TextGenSHAP improves the retrieval recall and prediction accuracy significantly.
 2024.findings-acl.832
@@ -17452,7 +17452,7 @@
 Zhaoye Fei
 Hang Yan (AI lab)
 Dahua Lin (The Chinese University of Hong Kong)
- Xipeng Qiu (Fudan University)
+ Xipeng Qiu (Fudan University)
 14012-14023
 Data plays a fundamental role in the training of Large Language Models (LLMs). While attention has been paid to the collection and composition of datasets, determining the data sampling strategy in training remains an open question. Most LLMs are trained with a simple strategy, random sampling. However, this sampling strategy ignores the unbalanced nature of training data distribution, which can be sub-optimal. In this paper, we propose ClusterClip Sampling to balance the text distribution of training data for better model training. Specifically, ClusterClip Sampling utilizes data clustering to reflect the data distribution of the training set and balances the common samples and rare samples during training based on the cluster results. A repetition clip operation is introduced to mitigate the overfitting issue led by samples from certain clusters. Extensive experiments validate the effectiveness of ClusterClip Sampling, which outperforms random sampling and other cluster-based sampling variants under various training datasets and large language models.
 2024.findings-acl.833
@@ -17478,11 +17478,11 @@
 Unsupervised Sign Language Translation and Generation
 Zhengsheng Guo
- Zhiwei He (Shanghai Jiao Tong University)
+ Zhiwei He (Shanghai Jiao Tong University)
 Wenxiang Jiao (Tencent AI Lab)
- Xing Wang (Tencent AI Lab)
- Rui Wang (Shanghai Jiao Tong University)
- Kehai Chen (Harbin Institute of Technology (Shenzhen))
+ Xing Wang (Tencent AI Lab)
+ Rui Wang (Shanghai Jiao Tong University)
+ Kehai Chen (Harbin Institute of Technology (Shenzhen))
 Zhaopeng Tu (Tencent AI Lab)
 Yong Xu
 Min Zhang (Harbin Institute of Technology, Shenzhen)
@@ -17494,13 +17494,13 @@
 Mitigating Data Scarcity in Semantic Parsing across Languages with the Multilingual Semantic Layer and its Dataset
- Abelardo Carlos Martinez Lorenzo (University of Roma “La Sapienza”)
- Pere-Lluís Huguet Cabot
+ Abelardo Carlos Martinez Lorenzo (University of Roma “La Sapienza”)
+ Pere-Lluís Huguet Cabot
 Karim Ghonim (University of Roma “La Sapienza”)
- Lu Xu (University of Roma “La Sapienza”)
+ Lu Xu (University of Roma “La Sapienza”)
 Hee-Soo Choi
 Alberte Fernández-Castro
- Roberto Navigli (Sapienza University of Rome)
+ Roberto Navigli (Sapienza University of Rome)
 14056-14080
 Data scarcity is a prevalent challenge in the era of Large Language Models (LLMs). The insatiable hunger of LLMs for large corpora becomes even more pronounced when dealing with non-English and low-resource languages. The issue is particularly exacerbated in Semantic Parsing (SP), i.e. the task of converting text into a formal representation. The complexity of semantic formalisms makes training human annotators and subsequent data annotation unfeasible on a large scale, especially across languages. To mitigate this, we first introduce the Multilingual Semantic Layer (MSL), a conceptual evolution of previous formalisms, which decouples from disambiguation and external inventories and simplifies the task. MSL provides the necessary tools to encode the meaning across languages, paving the way for developing a high-quality semantic parsing dataset across different languages in a semi-automatic strategy.
 Subsequently, we manually refine a portion of this dataset and fine-tune GPT-3.5 to propagate these refinements across the dataset. Then, we manually annotate 1,100 sentences in eleven languages, including low-resource ones. Finally, we assess our dataset’s quality, showcasing the performance gap reduction across languages in Semantic Parsing.
 2024.findings-acl.836
@@ -17511,11 +17511,11 @@
 Efficient Sparse Attention needs Adaptive Token Release
 Chaoran Zhang
 Lixin Zou (School of Cyber Science and Engineering, Wuhan University)
- Dan Luo (Lehigh University)
+ Dan Luo (Lehigh University)
 Xiangyang Luo (State Key Lab of Mathematical Engineering and Advanced Computing)
 Zihao Li (Wuhan University)
 Min Tang (Monash University)
- Chenliang Li
+ Chenliang Li
 14081-14094
 2024.findings-acl.837
 zhang-etal-2024-efficient
@@ -17533,7 +17533,7 @@
 Weihua Peng
 Duyu Tang (Tencent AI Lab)
 Dandan Tu
- Bing Qin (Harbin Institute of Technology)
+ Bing Qin (Harbin Institute of Technology)
 14095-14113
 Despite the impressive performance on information-seeking tasks, large language models (LLMs) still struggle with hallucinations. Attributed LLMs, which augment generated text with in-line citations, demonstrate potential in mitigating hallucinations and improving verifiability. However, current approaches suffer from suboptimal citation quality due to their reliance on in-context learning. Furthermore, the practice of merely citing document identifiers complicates the process for users to pinpoint specific supporting evidence. In this work, we introduce FRONT, a training framework that teaches LLMs to generate Fine-grained grounded citations. By initially grounding fine-grained supporting quotes, which then guide the generation process, these quotes not only provide supervision signals to improve citation quality but also serve as fine-grained attributions. Experiments on the ALCE benchmark demonstrate the efficacy of FRONT in generating superior grounded responses and highly supportive citations. With LLaMA-2-7B, the framework significantly outperforms all the baselines, achieving an average of 14.21% improvement in citation quality across all datasets, even surpassing ChatGPT.
 2024.findings-acl.838
@@ -17543,9 +17543,9 @@
 <fixed-case>R</fixed-case>e<fixed-case>L</fixed-case>i<fixed-case>K</fixed-case>: Retrieve and <fixed-case>L</fixed-case>in<fixed-case>K</fixed-case>, Fast and Accurate Entity Linking and Relation Extraction on an Academic Budget
 Riccardo Orlando
- Pere-Lluís Huguet Cabot
+ Pere-Lluís Huguet Cabot
 Edoardo Barba (University of Roma “La Sapienza”)
- Roberto Navigli (Sapienza University of Rome)
+ Roberto Navigli (Sapienza University of Rome)
 14114-14132
 Entity Linking (EL) and Relation Extraction (RE) are fundamental tasks in Natural Language Processing, serving as critical components in a wide range of applications. In this paper, we propose ReLiK, a Retriever-Reader architecture for both EL and RE, where, given an input text, the Retriever module undertakes the identification of candidate entities or relations that could potentially appear within the text. Subsequently, the Reader module is tasked to discern the pertinent retrieved entities or relations and establish their alignment with the corresponding textual spans.
 Notably, we put forward an innovative input representation that incorporates the candidate entities or relations alongside the text, making it possible to link entities or extract relations in a single forward pass and to fully leverage pre-trained language models contextualization capabilities, in contrast with previous Retriever-Reader-based methods, which require a forward pass for each candidate. Our formulation of EL and RE achieves state-of-the-art performance in both in-domain and out-of-domain benchmarks while using academic budget training and with up to 40x inference speed compared to competitors. Finally, we show how our architecture can be used seamlessly for Information Extraction (cIE), i.e. EL + RE, and setting a new state of the art by employing a shared Reader that simultaneously extracts entities and relations.
 2024.findings-acl.839
@@ -17568,7 +17568,7 @@
 <fixed-case>FENICE</fixed-case>: Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction
 Alessandro Scirè
 Karim Ghonim (University of Roma “La Sapienza”)
- Roberto Navigli (Sapienza University of Rome)
+ Roberto Navigli (Sapienza University of Rome)
 14148-14161
 Recent advancements in text summarization, particularly with the advent of Large Language Models (LLMs), have shown remarkable performance. However, a notable challenge persists as a substantial number of automatically-generated summaries exhibit factual inconsistencies, such as hallucinations. In response to this issue, various approaches for the evaluation of consistency for summarization have emerged. Yet, these newly-introduced metrics face several limitations, including lack of interpretability, focus on short document summaries (e.g., news articles), and computational impracticality, especially for LLM-based metrics. To address these shortcomings, we propose Factuality Evaluation of summarization based on Natural language Inference and Claim Extraction (FENICE), a more interpretable and efficient factuality-oriented metric. FENICE leverages an NLI-based alignment between information in the source document and a set of atomic facts, referred to as claims, extracted from the summary. Our metric sets a new state of the art on AGGREFACT, the de-facto benchmark for factuality evaluation. Moreover, we extend our evaluation to a more challenging setting by conducting a human annotation process of long-form summarization. In the hope of fostering research in summarization factuality evaluation, we release the code of our metric and our factuality annotations of long-form summarization at https://github.com/Babelscape/FENICE.
 2024.findings-acl.841
@@ -17578,11 +17578,11 @@
 Self-Para-Consistency: Improving Reasoning Tasks at Low Cost for Large Language Models
 Wenqing Chen (SUN YAT-SEN UNIVERSITY)
- Weicheng Wang
+ Weicheng Wang
 Zhixuan Chu (Ant Group)
- Kui Ren
- Zibin Zheng (SUN YAT-SEN UNIVERSITY)
- Zhichao Lu
+ Kui Ren
+ Zibin Zheng (SUN YAT-SEN UNIVERSITY)
+ Zhichao Lu
 14162-14167
 Recently, the self-consistency decoding strategy has shown the ability to improve performance for complex reasoning tasks with large language models (LLMs). However, the costs may be high because the sampling process of the strategy generates some low-probability text, resulting in low-quality reasoning paths. As a consequence, it requires a relatively large sampling number to obtain good aggregation performance. In this paper, we propose an alternative strategy, self-para-consistency.
 It first generates multiple paraphrases for each test question, then generates reasoning paths for the original and all the paraphrased questions based on greedy decoding, and finally selects the most consistent answer. Since all the candidate paths have relatively high probabilities, the sampling number could be much smaller than the self-consistency strategy. Extensive experiments on complex reasoning datasets demonstrate the effectiveness of our method in reducing the sampling number.
 2024.findings-acl.842
@@ -17591,7 +17591,7 @@
 Looking Right is Sometimes Right: Investigating the Capabilities of Decoder-only <fixed-case>LLM</fixed-case>s for Sequence Labeling
- David Dukić (Faculty of Electrical Engineering and Computing, University of Zagreb)
+ David Dukić (Faculty of Electrical Engineering and Computing, University of Zagreb)
 Jan Snajder (UniZg-FER, University of Zagreb)
 14168-14181
 Pre-trained language models based on masked language modeling (MLM) excel in natural language understanding (NLU) tasks. While fine-tuned MLM-based encoders consistently outperform causal language modeling decoders of comparable size, recent decoder-only large language models (LLMs) perform on par with smaller MLM-based encoders. Although their performance improves with scale, LLMs fall short of achieving state-of-the-art results in information extraction (IE) tasks, many of which are formulated as sequence labeling (SL). We hypothesize that LLMs’ poor SL performance stems from causal masking, which prevents the model from attending to tokens on the right of the current token. Yet, how exactly and to what extent LLMs’ performance on SL can be improved remains unclear. We explore techniques for improving the SL performance of open LLMs on IE tasks by applying layer-wise removal of the causal mask (CM) during LLM fine-tuning. This approach yields performance gains competitive with state-of-the-art SL models, matching or outperforming the results of CM removal from all blocks. Our findings hold for diverse SL tasks, demonstrating that open LLMs with layer-dependent CM removal outperform strong MLM-based encoders and even instruction-tuned LLMs.
@@ -17602,8 +17602,8 @@
 m<fixed-case>CSQA</fixed-case>: Multilingual Commonsense Reasoning Dataset with Unified Creation Strategy by Language Models and Humans
 Yusuke Sakai (Nara Institute of Science and Technology, Japan)
- Hidetaka Kamigaito (Division of Information Science, Nara Institute of Science and Technology)
- Taro Watanabe (Nara Institute of Science and Technology, Japan)
+ Hidetaka Kamigaito (Division of Information Science, Nara Institute of Science and Technology)
+ Taro Watanabe (Nara Institute of Science and Technology, Japan)
 14182-14214
 It is very challenging to curate a dataset for language-specific knowledge and common sense in order to evaluate natural language understanding capabilities of language models. Due to the limitation in the availability of annotators, most current multilingual datasets are created through translation, which cannot evaluate such language-specific aspects. Therefore, we propose Multilingual CommonsenseQA (mCSQA) based on the construction process of CSQA but leveraging language models for a more efficient construction, e.g., by asking LM to generate questions/answers, refine answers and verify QAs followed by reduced human efforts for verification.
 Constructed dataset is a benchmark for cross-lingual language-transfer capabilities of multilingual LMs, and experimental results showed high language-transfer capabilities for questions that LMs could easily solve, but lower transfer capabilities for questions requiring deep knowledge or commonsense. This highlights the necessity of language-specific datasets for evaluation and training. Finally, our method demonstrated that multilingual LMs could create QA including language-specific knowledge, significantly reducing the dataset creation cost compared to manual creation. The datasets are available at https://huggingface.co/datasets/yusuke1997/mCSQA.
 2024.findings-acl.844
@@ -17630,8 +17630,8 @@
 Yi Su
 Yunpeng Tai
 Yixin Ji (Soochow University)
- Juntao Li (Soochow University, China)
- Yan Bowen
+ Juntao Li (Soochow University, China)
+ Yan Bowen
 Min Zhang (Harbin Institute of Technology, Shenzhen)
 14232-14244
 Large Language Models (LLMs) have demonstrated an impressive capability known as In-context Learning (ICL), which enables them to acquire knowledge from textual demonstrations without the need for parameter updates. However, many studies have highlighted that the model’s performance is sensitive to the choice of demonstrations, presenting a significant challenge for practical applications where we lack prior knowledge of user queries. Consequently, we need to construct an extensive demonstration pool and incorporate external databases to assist the model, leading to considerable time and financial costs. In light of this, some recent research has shifted focus towards zero-shot ICL, aiming to reduce the model’s reliance on external information by leveraging their inherent generative capabilities. Despite the effectiveness of these approaches, the content generated by the model may be unreliable, and the generation process is time-consuming. To address these issues, we propose Demonstration Augmentation for In-context Learning (DAIL), which employs the model’s previously predicted historical samples as demonstrations for subsequent ones. DAIL brings no additional inference cost and does not rely on the model’s generative capabilities. Our experiments reveal that DAIL can significantly improve the model’s performance over direct zero-shot inference and can even outperform few-shot ICL without any external information.
@@ -17641,9 +17641,9 @@
 Pushing the Limits of Zero-shot End-to-End Speech Translation
- Ioannis Tsiamas
+ Ioannis Tsiamas
 Gerard I. Gállego
- José A. R. Fonollosa
+ José A. R. Fonollosa
 Marta R. Costa-jussà
 14245-14267
 Data scarcity and the modality gap between the speech and text modalities are two major obstacles of end-to-end Speech Translation (ST) systems, thus hindering their performance. Prior work has attempted to mitigate these challenges by leveraging external MT data and optimizing distance metrics that bring closer the speech-text representations. However, achieving competitive results typically requires some ST data. For this reason, we introduce ZeroSwot, a method for zero-shot ST that bridges the modality gap without any paired ST data. Leveraging a novel CTC compression and Optimal Transport, we train a speech encoder using only ASR data, to align with the representation space of a massively multilingual MT model. The speech encoder seamlessly integrates with the MT model at inference, enabling direct translation from speech to text, across all languages supported by the MT model.
 Our experiments show that we can effectively close the modality gap without ST data, while our results on MuST-C and CoVoST demonstrate our method’s superiority over not only previous zero-shot models, but also supervised ones, achieving state-of-the-art results.
@@ -17654,7 +17654,7 @@
 <fixed-case>NUMC</fixed-case>o<fixed-case>T</fixed-case>: Numerals and Units of Measurement in Chain-of-Thought Reasoning using Large Language Models
 Ancheng Xu
- Minghuan Tan (Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences)
+ Minghuan Tan (Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences)
 Lei Wang (SalesForce)
 Min Yang (Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences, Chinese Academy of Sciences)
 Ruifeng Xu (Harbin Institute of Technology)
@@ -17666,9 +17666,9 @@
 On The Persona-based Summarization of Domain-Specific Documents
- Ankan Mullick
- Sombit Bose (Indian Institute of Technology Kharagpur)
- Rounak Saha
+ Ankan Mullick
+ Sombit Bose (Indian Institute of Technology Kharagpur)
+ Rounak Saha
 Ayan Bhowmick (Merlyn Mind Inc.)
 Pawan Goyal (IIT Kharagpur)
 Niloy Ganguly (Indian Institute of Technology Kharagpur)
@@ -17693,11 +17693,11 @@
 Word Sense Linking: Disambiguating Outside the Sandbox
- Andrei Stefan Bejgu
+ Andrei Stefan Bejgu
 Edoardo Barba
 Luigi Procopio
 Alberte Fernández-Castro
- Roberto Navigli
+ Roberto Navigli
 14332-14347
 Word Sense Disambiguation (WSD) is the task of associating a word in a given context with its most suitable meaning among a set of possible candidates. While the task has recently witnessed renewed interest, with systems achieving performances above the estimated inter-annotator agreement, at the time of writing it still struggles to find downstream applications. We argue that one of the reasons behind this is the difficulty of applying WSD to plain text. Indeed, in the standard formulation, models work under the assumptions that a) all the spans to disambiguate have already been identified, and b) all the possible candidate senses of each span are provided, both of which are requirements that are far from trivial. In this work, we present a new task called Word Sense Linking (WSL) where, given an input text and a reference sense inventory, systems have to both identify which spans to disambiguate and then link them to their most suitable meaning. We put forward a transformer-based architecture for the task and thoroughly evaluate both its performance and those of state-of-the-art WSD systems scaled to WSL, iteratively relaxing the assumptions of WSD. We hope that our work will foster easier integration of lexical semantics into downstream applications.
 2024.findings-acl.851
@@ -17731,10 +17731,10 @@
 Unsupervised Real-Time Hallucination Detection based on the Internal States of Large Language Models
 Weihang Su
 Changyue Wang
- Qingyao Ai (Tsinghua University, Tsinghua University)
+ Qingyao Ai (Tsinghua University, Tsinghua University)
 Yiran Hu
- Zhijing Wu (Beijing Institute of Technology)
- Yujia Zhou (Tsinghua University, Tsinghua University)
+ Zhijing Wu (Beijing Institute of Technology)
+ Yujia Zhou (Tsinghua University, Tsinghua University)
 Yiqun Liu (Tsinghua University)
 14379-14391
 Hallucinations in large language models (LLMs) refer to the phenomenon of LLMs producing responses that are coherent yet factually inaccurate. This issue undermines the effectiveness of LLMs in practical applications, necessitating research into detecting and mitigating hallucinations of LLMs.
 Previous studies have mainly concentrated on post-processing techniques for hallucination detection, which tend to be computationally intensive and limited in effectiveness due to their separation from the LLM’s inference process. To overcome these limitations, we introduce MIND, an unsupervised training framework that leverages the internal states of LLMs for real-time hallucination detection without requiring manual annotations. Additionally, we present HELM, a new benchmark for evaluating hallucination detection across multiple LLMs, featuring diverse LLM outputs and the internal states of LLMs during their inference process. Our experiments demonstrate that MIND outperforms existing state-of-the-art methods in hallucination detection.
 2024.findings-acl.854
@@ -17759,7 +17759,7 @@
 Nguyen Hung-Quang (VinUniversity)
 Saurav Manchanda (Amazon)
 Minlong Peng (Baidu)
- Kok-Seng Wong (VinUniversity)
+ Kok-Seng Wong (VinUniversity)
 Khoa Doan (VinUniversity)
 14403-14421
 Despite outstanding performance in a variety of Natural Language Processing (NLP) tasks, recent studies have revealed that NLP models are vulnerable to adversarial attacks that slightly perturb the input to cause the models to misbehave. Several attacks can even compromise the model without requiring access to the model architecture or model parameters (i.e., a blackbox setting), and thus are detrimental to existing NLP applications. To perform these attacks, the adversary queries the victim model many times to determine the most important parts in an input text and transform. In this work, we propose a lightweight and attack-agnostic defense whose main goal is to perplex the process of generating an adversarial example in these query-based black-box attacks; that is to fool the textual fooler. This defense, named AdvFooler, works by randomizing the latent representation of the input at inference time. Different from existing defenses, AdvFooler does not necessitate additional computational overhead during training nor does it rely on assumptions about the potential adversarial perturbation set while having a negligible impact on the model’s accuracy. Our theoretical and empirical analyses highlight the significance of robustness resulting from confusing the adversary via randomizing the latent space, as well as the impact of randomization on clean accuracy. Finally, we empirically demonstrate near state-of-the-art robustness of AdvFooler against representative adversarial attacks on two benchmark datasets.
@@ -17781,8 +17781,8 @@
 <fixed-case>FOCUS</fixed-case>: Forging Originality through Contrastive Use in Self-Plagiarism for Language Models
 Kaixin Lan
- Tao Fang (University of Macau)
- Derek Wong (University of Macau)
+ Tao Fang (University of Macau)
+ Derek Wong (University of Macau)
 Yabo Xu
 Lidia Chao
 Cecilia Zhao (University of Macau, New York University and Ohio State University, Columbus)
@@ -17795,7 +17795,7 @@
 Amanda: Adaptively Modality-Balanced Domain Adaptation for Multimodal Emotion Recognition
 Xinxin Zhang
- Jun Sun
+ Jun Sun
 Simin Hong
 Taihao Li (Zhejiang Lab)
 14448-14458
@@ -17807,8 +17807,8 @@
 <fixed-case>M</fixed-case>ed<fixed-case>REQAL</fixed-case>: Examining Medical Knowledge Recall of Large Language Models via Question Answering
 Juraj Vladika (Technische Universität München)
- Phillip Schneider
- Florian Matthes (Technische Universität München)
+ Phillip Schneider
+ Florian Matthes (Technische Universität München)
 14459-14469
 In recent years, Large Language Models (LLMs) have demonstrated an impressive ability to encode knowledge during pre-training on large text corpora.
 They can leverage this knowledge for downstream tasks like question answering (QA), even in complex areas involving health topics. Considering their high potential for facilitating clinical work in the future, understanding the quality of encoded medical knowledge and its recall in LLMs is an important step forward. In this study, we examine the capability of LLMs to exhibit medical knowledge recall by constructing a novel dataset derived from systematic reviews – studies synthesizing evidence-based answers for specific medical questions. Through experiments on the new MedREQAL dataset, comprising question-answer pairs extracted from rigorous systematic reviews, we assess six LLMs, such as GPT and Mixtral, analyzing their classification and generation performance. Our experimental insights into LLM performance on the novel biomedical QA dataset reveal the still challenging nature of this task.
 2024.findings-acl.860
@@ -17821,9 +17821,9 @@
 Wassay Sajjad
 Mukeet Raza (Lahore University of Management Sciences)
 Emaan Abbas
- Abdul Hameed Azeemi (Lahore University of Management Sciences)
+ Abdul Hameed Azeemi (Lahore University of Management Sciences)
 Ihsan Ayyub Qazi (Lahore University of Management Sciences)
- Agha Ali Raza (Lahore University of Management Sciences)
+ Agha Ali Raza (Lahore University of Management Sciences)
 14470-14480
 Deepfakes, particularly in the auditory domain, have become a significant threat, necessitating the development of robust countermeasures. This paper addresses the escalating challenges posed by deepfake attacks on Automatic Speaker Verification (ASV) systems. We present a novel Urdu deepfake audio dataset for deepfake detection, focusing on two spoofing attacks – Tacotron and VITS TTS. The dataset construction involves careful consideration of phonemic cover and balance and comparison with existing corpora like PRUS and PronouncUR. Evaluation with AASIST-L model shows EERs of 0.495 and 0.524 for VITS TTS and Tacotron-generated audios, respectively, with variability across speakers. Further, this research implements a detailed human evaluation, incorporating a user study to gauge whether people are able to discern deepfake audios from real (bonafide) audios. The ROC curve analysis shows an area under the curve (AUC) of 0.63, indicating that individuals demonstrate a limited ability to detect deepfakes (approximately 1 in 3 fake audio samples are regarded as real). Our work contributes a valuable resource for training deepfake detection models in low-resource languages like Urdu, addressing the critical gap in existing datasets. The dataset is publicly available at: https://github.com/CSALT-LUMS/urdu-deepfake-dataset.
2024.findings-acl.861 @@ -17845,7 +17845,7 @@ MeishanZhangHarbin Institute of Technology (Shenzhen), China and Tianjin University, China HaoFeiNational University of Singapore BinWang - ShengqiongWu + ShengqiongWu YixinCaoFudan University FeiLiWuhan University MinZhangHarbin Institute of Technology, Shenzhen @@ -17859,11 +17859,11 @@ Enhanced Visual Instruction Tuning with Synthesized Image-Dialogue Data YandaLi ChiZhangWestlake University - GangYuTencent - WanqiYang + GangYuTencent + WanqiYang ZhibinWangTencent LightAI Lab BinFu - GuoshengLinNanyang Technological University + GuoshengLinNanyang Technological University ChunhuaShenZhejiang University LingChenUniversity of Technology Sydney YunchaoWeiBeijing Jiaotong University @@ -17876,12 +17876,12 @@ Modeling Overregularization in Children with Small Language Models AkariHaga - SakuSugawaraNational Institute of Informatics + SakuSugawaraNational Institute of Informatics AkiyoFukatsuTokyo University, Tokyo Institute of Technology MiyuOba HirokiOuchiNAIST - TaroWatanabeNara Institute of Science and Technology, Japan - YoheiOsekiUniversity of Tokyo + TaroWatanabeNara Institute of Science and Technology, Japan + YoheiOsekiUniversity of Tokyo 14532-14550 The imitation of the children’s language acquisition process has been explored to make language models (LMs) more efficient.In particular, errors caused by children’s regularization (so-called overregularization, e.g., using wroted for the past tense of write) have been widely studied to reveal the mechanisms of language acquisition. Existing research has analyzed regularization in language acquisition only by modeling word inflection directly, which is unnatural in light of human language acquisition. In this paper, we hypothesize that language models that imitate the errors children make during language acquisition have a learning process more similar to humans. To verify this hypothesis, we analyzed the learning curve and error preferences of verb inflections in small-scale LMs using acceptability judgments. We analyze the differences in results by model architecture, data, and tokenization. Our model shows child-like U-shaped learning curves clearly for certain verbs, but the preferences for types of overgeneralization did not fully match the observations in children. 2024.findings-acl.865 @@ -17890,7 +17890,7 @@ Fantastic Semantics and Where to Find Them: Investigating Which Layers of Generative <fixed-case>LLM</fixed-case>s Reflect Lexical Semantics - ZhuLiu + ZhuLiu CunliangKong YingLiuTsinghua University, Tsinghua University MaosongSun @@ -17902,9 +17902,9 @@ Harnessing Large Language Models as Post-hoc Correctors - ZhiqiangZhongAarhus University - KuangyuZhouMicrosoft - DavideMottinAarhus University + ZhiqiangZhongAarhus University + KuangyuZhouMicrosoft + DavideMottinAarhus University 14559-14574 As Machine Learning (ML) models grow in size and demand higher-quality training data, the expenses associated with re-training and fine-tuning these models are escalating rapidly. Inspired by recent impressive achievements of Large Language Models (LLMs) in different fields, this paper delves into the question: can LLMs efficiently improve an ML’s performance at a minimal cost? We show that, through our proposed training-free framework LLMCorr, an LLM can work as a post-hoc corrector to propose corrections for the predictions of an arbitrary ML model. 
In particular, we form a contextual knowledge database by incorporating the dataset’s label information and the ML model’s predictions on the validation dataset. Leveraging the in-context learning capability of LLMs, we ask the LLM to summarise the instances in which the ML model makes mistakes and the correlation between primary predictions and true labels. Following this, the LLM can transfer its acquired knowledge to suggest corrections for the ML model’s predictions. Our experimental results on text analysis and the challenging molecular predictions show that LLMCorr improves the performance of a number of models by up to 39%. 2024.findings-acl.867 @@ -17913,11 +17913,11 @@ Debatrix: Multi-dimensional Debate Judge with Iterative Chronological Analysis Based on <fixed-case>LLM</fixed-case> - JingcongLiangFudan University + JingcongLiangFudan University RongYeByteDance MengHan RuofeiLai - XinyuZhangHuawei Technologies Ltd. + XinyuZhangHuawei Technologies Ltd. XuanjingHuangFudan University ZhongyuWeiFudan University 14575-14595 @@ -17928,12 +17928,12 @@ <fixed-case>C</fixed-case>ycle<fixed-case>A</fixed-case>lign: Iterative Distillation from Black-box <fixed-case>LLM</fixed-case> to White-box Models for Better Human Alignment - JixiangHongRenmin University of China - QuanTu + JixiangHongRenmin University of China + QuanTu ChangyuChenRenmin University of China GaoXing JiZhangAlibaba Group - RuiYanRenmin University of China + RuiYanRenmin University of China 14596-14609 Language models trained on large-scale corpus often generate harmful responses that are harmful and contrary to human values. A prevalent approach for human alignment is reinforcement learning from human feedback (RLHF), utilizing algorithms such as proximal policy optimization (PPO). However, these methods are often characterized by complexity, instability, and substantial resource consumption. Considering that existing large language models (LLMs) like ChatGPT are already relatively well-aligned and cost-friendly, researchers propose to align the language model with human preferences from AI feedback. Nevertheless, the common practices, that unidirectionally distill the responses, are constrained by the inherent capability of LLMs. To address it, we introduce CycleAlign, a framework that distills alignment capabilities from the parameter-invisible LLMs (black-box) to the parameter-visible models (white-box) in an iterative manner. CycleAlign iteratively improves both the white-box and black-box models by integrating static and dynamic in-context learning and a belief alignment method.Empirical results illustrate that the model fine-tuned by CycleAlign remarkably exceeds existing methods, and achieves the state-of-the-art performance in alignment with human value. 2024.findings-acl.869 @@ -17942,9 +17942,9 @@ Towards a new research agenda for multimodal enterprise document understanding: What are we missing? - ArminehNourbakhshSchool of Computer Science, Carnegie Mellon University and J.P. Morgan Chase + ArminehNourbakhshSchool of Computer Science, Carnegie Mellon University and J.P. Morgan Chase SameenaShahJ.P. Morgan Chase - CarolynRoseSchool of Computer Science, Carnegie Mellon University + CarolynRoseSchool of Computer Science, Carnegie Mellon University 14610-14622 The field of multimodal document understanding has produced a suite of models that have achieved stellar performance across several tasks, even coming close to human performance on certain benchmarks. 
Nevertheless, the application of these models to real-world enterprise datasets remains constrained by a number of limitations. In this position paper, we discuss these limitations in the context of three key aspects of research: dataset curation, model development, and evaluation on downstream tasks. By analyzing 14 datasets and 7 SotA models, we identify major gaps in their utility in the context of a real-world scenario. We demonstrate how each limitation impedes the widespread use of SotA models in enterprise settings, and present a set of research challenges that are motivated by these limitations. Lastly, we propose a research agenda that is aimed at driving the field towards higher impact in enterprise applications. 2024.findings-acl.870 @@ -17954,11 +17954,11 @@ <fixed-case>CAUSE</fixed-case>: Counterfactual Assessment of User Satisfaction Estimation in Task-Oriented Dialogue Systems AminAbolghasemi - ZhaochunRenLeiden University + ZhaochunRenLeiden University ArianAskari - MohammadAliannejadiUniversity of Amsterdam + MohammadAliannejadiUniversity of Amsterdam Maartende RijkeUniversity of Amsterdam - SuzanVerberneUniversiteit Leiden + SuzanVerberneUniversiteit Leiden 14623-14635 An important unexplored aspect in previous work on user satisfaction estimation for Task-Oriented Dialogue (TOD) systems is their evaluation in terms of robustness for the identification of user dissatisfaction: current benchmarks for user satisfaction estimation in TOD systems are highly skewed towards dialogues for which the user is satisfied. The effect of having a more balanced set of satisfaction labels on performance is unknown. However, balancing the data with more dissatisfactory dialogue samples requires further data collection and human annotation, which is costly and time-consuming. In this work, we leverage large language models (LLMs) and unlock their ability to generate satisfaction-aware counterfactual dialogues to augment the set of original dialogues of a test collection. We gather human annotations to ensure the reliability of the generated samples. We evaluate two open-source LLMs as user satisfaction estimators on our augmented collection against state-of-the-art fine-tuned models. Our experiments show that when used as few-shot user satisfaction estimators, open-source LLMs show higher robustness to the increase in the number of dissatisfaction labels in the test collection than the fine-tuned state-of-the-art models. Our results shed light on the need for data augmentation approaches for user satisfaction estimation in TOD systems. We release our aligned counterfactual dialogues, which are curated by human annotation, to facilitate further research on this topic. 2024.findings-acl.871 @@ -17967,11 +17967,11 @@ Measuring Retrieval Complexity in Question Answering Systems - MatteoGabburo + MatteoGabburo Nicolaas PaulJedema SiddhantGarg - Leonardo F. R.Ribeiro - AlessandroMoschitti + Leonardo F. R.Ribeiro + AlessandroMoschitti 14636-14650 In this paper, we investigate which questions are challenging for retrieval-based Question Answering (QA). We (i) propose retrieval complexity (RC), a novel metric conditioned on the completeness of retrieved documents, which measures the difficulty of answering questions, and (ii) propose an unsupervised pipeline to measure RC given an arbitrary retrieval system.Our proposed pipeline measures RC more accurately than alternative estimators, including LLMs, on six challenging QA benchmarks. 
Further investigation reveals that RC scores strongly correlate with both QA performance and expert judgment across five of the six studied benchmarks, indicating that RC is an effective measure of question difficulty.Subsequent categorization of high-RC questions shows that they span a broad set of question shapes, including multi-hop, compositional, and temporal QA, indicating that RC scores can categorize a new subset of complex questions. Our system can also have a major impact on retrieval-based systems by helping to identify more challenging questions on existing datasets. 2024.findings-acl.872 @@ -17983,9 +17983,9 @@ JiayuSongQueen Mary, University of London JennyChimQueen Mary University London AdamTsakalidisCedefop and Alan Turing Institute - JuliaIveQueen Mary, University of London + JuliaIveQueen Mary, University of London DanaAtzil-SlonimBar-Ilan University - MariaLiakataQueen Mary University London + MariaLiakataQueen Mary University London 14651-14672 We introduce a hybrid abstractive summarisation approach combining hierarchical VAEs with LLMs to produce clinically meaningful summaries from social media user timelines, appropriate for mental health monitoring. The summaries combine two different narrative points of view: (a) clinical insights in third person, generated by feeding into an LLM clinical expert-guided prompts, and importantly, (b) a temporally sensitive abstractive summary of the user’s timeline in first person, generated by a novel hierarchical variational autoencoder, TH-VAE. We assess the generated summaries via automatic evaluation against expert summaries and via human evaluation with clinical experts, showing that timeline summarisation by TH-VAE results in more factual and logically coherent summaries rich in clinical utility and superior to LLM-only approaches in capturing changes over time. 2024.findings-acl.873 @@ -17997,9 +17997,9 @@ <fixed-case>PIXAR</fixed-case>: Auto-Regressive Language Modeling in Pixel Space YintaoTai - XiyangLiao - AlessandroSugliaHeriot-Watt University - AntonioVergariUniversity of Edinburgh, University of Edinburgh + XiyangLiao + AlessandroSugliaHeriot-Watt University + AntonioVergariUniversity of Edinburgh, University of Edinburgh 14673-14695 Recent work showed the possibility of building open-vocabulary large language models (LLMs) that directly operate on pixel representations. These models are implemented as autoencoders that reconstruct masked patches of rendered text.However, these pixel-based LLMs are limited to discriminative tasks (e.g., classification) and, similar to BERT, cannot be used to generate text.Therefore, they cannot be used for generative tasks such as free-form question answering. In this work, we introduce PIXAR, the first pixel-based autoregressive LLM that performs text generation. Consisting of only a decoder, PIXAR can perform free-form generative tasks while keeping the number of parameters on par with previous encoder-decoder models.Furthermore, we highlight the challenges of generating text as non-noisy images and show this is due to using a maximum likelihood objective. To overcome this problem, we propose an adversarial pretraining stage that improves the readability and accuracy of PIXAR by 8.1 on LAMBADA and 8.5 on bAbI— making it comparable to GPT-2 on text generation tasks.This paves the way to build open-vocabulary LLMs that operate on perceptual input only and calls into question the necessity of the usual symbolic input representation, i.e., text as (sub)tokens. 
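Pixel-based LMs such as the PIXAR entry above consume rendered text rather than token ids. The sketch below shows one way to turn a string into a sequence of image patches; the renderer, the default PIL font, and the 16x16 patch size are assumptions for illustration, not the authors' implementation.

    # Sketch of the input side of a pixel-based LM: render a string to a
    # grayscale strip and cut it into fixed-size patches.
    import numpy as np
    from PIL import Image, ImageDraw, ImageFont

    def text_to_patches(text, height=16, patch_width=16):
        font = ImageFont.load_default()
        # Rough strip width estimate, then render black-on-white text.
        width = patch_width * ((len(text) * 8) // patch_width + 1)
        img = Image.new("L", (width, height), color=255)
        ImageDraw.Draw(img).text((0, 2), text, fill=0, font=font)
        pixels = np.asarray(img, dtype=np.float32) / 255.0
        # Split the strip into a sequence of (height x patch_width) patches.
        n = pixels.shape[1] // patch_width
        return pixels.reshape(height, n, patch_width).transpose(1, 0, 2)

    patches = text_to_patches("Attention is all you need")
    print(patches.shape)   # (num_patches, 16, 16) -> input sequence for the model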
2024.findings-acl.874 @@ -18011,12 +18011,12 @@ DaMa LuChenShanghai Jiaotong University PengyuWang - HongshenXuShanghai Jiaotong University + HongshenXuShanghai Jiaotong University HanqiLi LiangtaiSun SuZhu ShuaiFan - KaiYuShanghai Jiao Tong University + KaiYuShanghai Jiao Tong University 14696-14707 Large language models (LLMs) have demonstrated proficiency across various natural language processing (NLP) tasks but often require additional training, such as continual pre-training and supervised fine-tuning. However, the costs associated with this, primarily due to their large parameter count, remain high. This paper proposes leveraging sparsity in pre-trained LLMs to expedite this training process. By observing sparsity in activated neurons during forward iterations, we identify the potential for computational speed-ups by excluding inactive neurons. We address associated challenges by extending existing neuron importance evaluation metrics and introducing a ladder omission rate scheduler. Our experiments on Llama-2 demonstrate that Sparsity-Accelerated Training (SAT) achieves comparable or superior performance to standard training while significantly accelerating the process. Specifically, SAT achieves a 45% throughput improvement in continual pre-training and saves 38% training time in supervised fine-tuning. It offers a simple, hardware-agnostic, and easily deployable framework for additional LLM training. 2024.findings-acl.875 @@ -18037,8 +18037,8 @@ Do Language Models Exhibit Human-like Structural Priming Effects? JaapJumelet - WillemZuidemaUniversity of Amsterdam - ArabellaSinclairUniversity of Aberdeen + WillemZuidemaUniversity of Amsterdam + ArabellaSinclairUniversity of Aberdeen 14727-14742 We explore which linguistic factors—at the sentence and token level—play an important role in influencing language model predictions, and investigate whether these are reflective of results found in humans and human corpora (Gries and Kootstra, 2017). We make use of the structural priming paradigm—where recent exposure to a structure facilitates processing of the same structure—to investigate where priming effects manifest, and what factors predict them. We find these effects can be explained via the inverse frequency effect found in human priming, where rarer elements within a prime increase priming effects, as well as lexical dependence between prime and target. Our results provide an important piece in the puzzle of understanding how properties within their context affect structural prediction in language models. 2024.findings-acl.877 @@ -18055,14 +18055,14 @@ YuhanWu HongchengGuo RuitongGanThe Hong Kong Polytechnic University, Hong Kong Polytechnic University - ZehaoNi - JianYangAlibaba Group - ManZhang + ZehaoNi + JianYangAlibaba Group + ManZhang ZhaoxiangZhangInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - WanliOuyangShanghai AI Lab + WanliOuyangShanghai AI Lab KeXuBeijing University of Aeronautics and Astronautics WenhaoHuang - JieFuHong Kong University of Science and Technology + JieFuHong Kong University of Science and Technology JunranPeng 14743-14777 The advent of Large Language Models (LLMs) has paved the way for complex tasks such as role-playing, which enhances user interactions by enabling models to imitate various characters. However, the closed-source nature of state-of-the-art LLMs and their general-purpose training limit role-playing optimization. 
In this paper, we introduce RoleLLM, a framework to benchmark, elicit, and enhance role-playing abilities in LLMs. RoleLLM comprises four stages: (1) Role Profile Construction for 100 roles; (2) Context-Based Instruction Generation (Context-Instruct) for role-specific knowledge extraction; (3) Role Prompting using GPT (RoleGPT) for speaking style imitation; and (4) Role-Conditioned Instruction Tuning (RoCIT) for fine-tuning open-source models along with role customization. By Context-Instruct and RoleGPT, we create RoleBench, the first systematic and fine-grained character-level benchmark dataset for role-playing with 168,093 samples. Moreover, RoCIT on RoleBench yields RoleLLaMA (English) and RoleGLM (Chinese), significantly enhancing role-playing abilities and even achieving comparable results with RoleGPT (using GPT-4). @@ -18087,10 +18087,10 @@ Views Are My Own, but Also Yours: Benchmarking Theory of Mind Using Common Ground AdilSoubkiState University of New York at Stony Brook JohnMurzaku, State University of New York at Stony Brook - ArashYousefi JordehiUniversity of Guilan + ArashYousefi JordehiUniversity of Guilan PeterZengState University of New York at Stony Brook MagdalenaMarkowska - Seyed AbolghasemMirroshandelUniversity of Guilan + Seyed AbolghasemMirroshandelUniversity of Guilan OwenRambowStony Brook University 14815-14823 Evaluating the theory of mind (ToM) capabilities of language models (LMs) has recently received a great deal of attention. However, many existing benchmarks rely on synthetic data, which risks misaligning the resulting experiments with human behavior. We introduce the first ToM dataset based on naturally occurring spoken dialogs, Common-ToM, and show that LMs struggle to demonstrate ToM. We then show that integrating a simple, explicit representation of beliefs improves LM performance on Common-ToM. 
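The Common-ToM entry above finds that a simple, explicit representation of beliefs helps. A hypothetical illustration of such a representation, with first- and second-order beliefs and a common-ground check, might look as follows; the schema is invented for illustration and is not the dataset's actual format.

    # Hypothetical belief state: first- and second-order beliefs per speaker,
    # with common ground defined as agreement at both levels. The schema is an
    # assumption for illustration only.
    from dataclasses import dataclass, field

    @dataclass
    class BeliefState:
        # first_order["A"]["p"] -> does A believe proposition p?
        first_order: dict = field(default_factory=dict)
        # second_order["A"]["B"]["p"] -> what A thinks B believes about p
        second_order: dict = field(default_factory=dict)

        def in_common_ground(self, a, b, prop):
            return (
                self.first_order.get(a, {}).get(prop) is True
                and self.first_order.get(b, {}).get(prop) is True
                and self.second_order.get(a, {}).get(b, {}).get(prop) is True
                and self.second_order.get(b, {}).get(a, {}).get(prop) is True
            )

    state = BeliefState(
        first_order={"A": {"meeting_at_3": True}, "B": {"meeting_at_3": True}},
        second_order={"A": {"B": {"meeting_at_3": True}},
                      "B": {"A": {"meeting_at_3": False}}},  # B missed A's uptake
    )
    print(state.in_common_ground("A", "B", "meeting_at_3"))  # False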
@@ -18100,7 +18100,7 @@ <fixed-case>MAPLE</fixed-case>: Multilingual Evaluation of Parameter Efficient Finetuning of Large Language Models - DivyanshuAggarwal + DivyanshuAggarwal AshutoshSathe IshaanWattsGoogle DeepMind SunayanaSitaramMicrosoft @@ -18127,8 +18127,8 @@ Multi-Task Transfer Matters During Instruction-Tuning DavidMuellerJohns Hopkins University - MarkDredzeDepartment of Computer Science, Whiting School of Engineering - NicholasAndrewsJohns Hopkins University + MarkDredzeDepartment of Computer Science, Whiting School of Engineering + NicholasAndrewsJohns Hopkins University 14880-14891 Instruction-tuning trains a language model on hundreds of tasks jointly to improve a model’s ability to learn in-context;however, the mechanisms that drive in-context learning are poorly understood and, as a result, the role of instruction-tuning on in-context generalization is poorly understood as well.In this work, we study the impact of instruction-tuning on multi-task transfer: how well a model’s parameters adapt to an unseen task via fine-tuning.We find that instruction-tuning negatively impacts a model’s transfer to unseen tasks, and that model transfer and in-context generalization are highly correlated, suggesting that this catastrophic forgetting may impact in-context learning.We study methods to improve model transfer, finding that multi-task training—how well the training tasks are optimized—can significantly impact ICL generalization; additionally, we find that continual training on unsupervised pre-training data can mitigate forgetting and improve ICL generalization as well.Finally, we demonstrate that, early into training, the impact of instruction-tuning on model transfer to tasks impacts in-context generalization on that task.Overall, we provide significant evidence that multi-task transfer is deeply connected to a model’s ability to learn a task in-context. 2024.findings-acl.883 @@ -18137,7 +18137,7 @@ What Makes a Good Order of Examples in In-Context Learning - QiGuo + QiGuo LeiyuWangnanjing university YidongWang WeiYePeking University @@ -18153,10 +18153,10 @@ YunyeGongSRI International RobikShresthaRochester Institute of Technology JaredClaypooleSRI International - MichaelCogswellSRI International + MichaelCogswellSRI International ArijitRayBoston University - ChristopherKananUniversity of Rochester - AjayDivakaranSRI International + ChristopherKananUniversity of Rochester + AjayDivakaranSRI International 14905-14918 We propose a novel VQA dataset, BloomVQA, to facilitate comprehensive evaluation of large vision-language models on comprehension tasks. Unlike current benchmarks that often focus on fact-based memorization and simple reasoning tasks without theoretical grounding, we collect multiple-choice samples based on picture stories that reflect different levels of comprehension, as laid out in Bloom’s Taxonomy, a classic framework for learning assessment widely adopted in education research. Our data maps to a novel hierarchical graph representation which enables automatic data augmentation and novel measures characterizing model consistency. We perform graded evaluation and reliability analysis on recent multi-modal models. In comparison to low-level tasks, we observe decreased performance on tasks requiring advanced comprehension and cognitive skills with up to 38.0% drop in VQA accuracy. 
In comparison to earlier models, GPT-4V demonstrates improved accuracy over all comprehension levels and also shows a tendency of bypassing visual inputs especially for higher-level tasks. Current models also show consistency patterns misaligned with human comprehension in various scenarios, demonstrating the need for improvement based on theoretically-grounded criteria. The dataset can be accessed at https://huggingface.co/datasets/ygong/BloomVQA. 2024.findings-acl.885 @@ -18165,7 +18165,7 @@ <fixed-case>A</fixed-case>ttribution<fixed-case>B</fixed-case>ench: How Hard is Automatic Attribution Evaluation? - YifeiLi + YifeiLi XiangYueCarnegie Mellon University ZeyiLiaoOhio State University, Columbus HuanSunThe Ohio State University, Columbus @@ -18191,10 +18191,10 @@ <fixed-case>I</fixed-case>nstruct<fixed-case>E</fixed-case>d: Soft-Instruction Tuning for Model Editing with Hops XiaoQiHan RuLiShanxi University - XiaoliLi + XiaoliLi JiyeLiangShanxi University - ZifangZhang - JeffPanUniversity of Edinburgh, University of Edinburgh + ZifangZhang + JeffPanUniversity of Edinburgh, University of Edinburgh 14953-14968 The task of model editing becomes popular for correcting inaccurate or outdated parametric knowledge in Large Language Models (LLMs). However, there are major limitations of state of the art (SOTA) model editing methods, including the excessive memorization issue caused by the direct editing methods, as well as the error propagation and knowledge conflict issues from the memory enhancement methods, resulting in hindering models’ *portability*, e.g., the ability to transfer the new knowledge to related one-hop or multi-hop content. To address these issues, we propose the InstructEd method, the idea of which is to insert soft instructions into the attention module so as to facilitate interactions between instructions and questions and to understand and utilize new facts. Our main findings are: (i) InstructEd has achieved SOTA performance on three datasets for one-hop/multi-hop evaluation with LLaMAs and GPT2, achieving 10% (5%) improvement in one-hop (multi-hop) model editing.(ii) Different from earlier methods on editing parameters in FFN, we show that editing attention can also help. (iii) Model editing is highly related to retrieval augmented methods, which can help improve the locality of model editing while slightly decrease the editing performance with hops. 2024.findings-acl.888 @@ -18203,16 +18203,16 @@ <fixed-case>TLCR</fixed-case>: Token-Level Continuous Reward for Fine-grained Reinforcement Learning from Human Feedback - EunseopYoonKAIST - Hee SukYoonKorea Advanced Institute of Science & Technology + EunseopYoonKAIST + Hee SukYoonKorea Advanced Institute of Science & Technology SooHwanEomKorea Advanced Institute of Science & Technology GunsooHanKakao Brain DanielNamKakao Brain Corp. DaejinJoKorea University and Kakao Brain Kyoung-WoonOnKakao - MarkHasegawa-JohnsonUniversity of Illinois, Urbana Champaign + MarkHasegawa-JohnsonUniversity of Illinois, Urbana Champaign SungwoongKimKorea University - ChangYooKorea Advanced Institute of Science and Technology + ChangYooKorea Advanced Institute of Science and Technology 14969-14981 Reinforcement Learning from Human Feedback (RLHF) leverages human preference data to train language models to align more closely with human essence. These human preference data, however, are labeled at the sequence level, creating a mismatch between sequence-level preference labels and tokens, which are autoregressively generated from the language model. 
Although several recent approaches have tried to provide token-level (i.e., dense) rewards for each individual token, these typically rely on predefined discrete reward values (e.g., positive: +1, negative: -1, neutral: 0), failing to account for varying degrees of preference inherent to each token. To address this limitation, we introduce TLCR (Token-Level Continuous Reward) for RLHF, which incorporates a discriminator trained to distinguish positive and negative tokens, and the confidence of the discriminator is used to assign continuous rewards to each token considering the context. Extensive experiments show that our proposed TLCR leads to consistent performance improvements over previous sequence-level or token-level discrete rewards on open-ended generation benchmarks.
2024.findings-acl.889
@@ -18222,16 +18222,16 @@
 Found in the middle: Calibrating Positional Attention Bias Improves Long Context Utilization
 Cheng-YuHsiehUniversity of Washington
- Yung-SungChuangMassachusetts Institute of Technology
+ Yung-SungChuangMassachusetts Institute of Technology
 Chun-LiangLiGoogle
 ZifengWangGoogle
 LongLeGoogle
 AbhishekKumarGoogle DeepMind
- JamesGlass
+ JamesGlass
 AlexanderRatnerDepartment of Computer Science, University of Washington
 Chen-YuLeeGoogle
 RanjayKrishnaDepartment of Computer Science
- TomasPfisterGoogle
+ TomasPfisterGoogle
 14982-14995
 Large language models (LLMs), even when specifically trained to process long input contexts, struggle to capture relevant information located in the middle of their input. This phenomenon has been known as the lost-in-the-middle problem. In this work, we make three contributions. First, we set out to understand the factors that cause this phenomenon. In doing so, we establish a connection between lost-in-the-middle and LLMs’ intrinsic attention bias: LLMs exhibit a U-shaped attention bias where the tokens at the beginning and at the end of its input receive higher attention, regardless of their relevance. Second, we mitigate this positional bias through a calibration mechanism, found-in-the-middle, that allows the model to attend to contexts faithfully according to their relevance, even when they are in the middle. Third, we show found-in-the-middle not only achieves better performance in locating relevant information within a long context, but also eventually leads to improved retrieval-augmented generation (RAG) performance across various tasks, outperforming existing methods by up to 10 percentage points. These findings open up future directions in understanding LLM attention bias and its potential consequences.
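To make the found-in-the-middle idea above concrete, here is a toy numpy sketch: treat observed attention as relevance multiplied by a U-shaped positional bias, estimate the bias separately, and divide it out. The bias-estimation recipe and all numbers below are assumptions, not the authors' exact mechanism.

    # Toy sketch of attention calibration: divide out a positional-bias
    # estimate (e.g., from content-free dummy documents) and renormalize.
    import numpy as np

    def calibrate(attention, bias):
        """attention, bias: arrays of shape (num_positions,)."""
        debiased = attention / np.maximum(bias, 1e-9)
        return debiased / debiased.sum()

    positions = np.arange(10)
    # U-shaped positional bias: both ends of the context get extra mass.
    bias = 0.5 + 0.5 * ((positions - 4.5) / 4.5) ** 2
    bias /= bias.sum()
    # Observed attention: the truly relevant document sits at position 5,
    # but raw attention is dragged toward the ends by the bias.
    relevance = np.ones(10); relevance[5] = 1.8
    observed = relevance * bias
    observed /= observed.sum()
    print("raw argmax:", observed.argmax(),
          "-> calibrated argmax:", calibrate(observed, bias).argmax())  # 0 -> 5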
2024.findings-acl.890 @@ -18241,7 +18241,7 @@ S3-<fixed-case>DST</fixed-case>: Structured Open-Domain Dialogue Segmentation and State Tracking in the Era of <fixed-case>LLM</fixed-case>s Sarkar Snigdha SarathiDas - ChiragShahUniversity of Washington + ChiragShahUniversity of Washington MengtingWanMicrosoft JenniferNevillePurdue University and Purdue University LongqiYangMicrosoft @@ -18256,11 +18256,11 @@ Set the Clock: Temporal Alignment of Pretrained Language Models - BowenZhao - ZanderBrumbaughDepartment of Computer Science + BowenZhao + ZanderBrumbaughDepartment of Computer Science YizhongWangDepartment of Computer Science, University of Washington HannanehHajishirziUniversity of Washington, University of Washington, Allen Institute for Artificial Intelligence and University of Washington, Seattle - NoahSmithUniversity of Washington and Allen Institute for Artificial Intelligence + NoahSmithUniversity of Washington and Allen Institute for Artificial Intelligence 15015-15040 Language models (LMs) are trained on web text originating from many points in time and, in general, without any explicit temporal grounding. This work investigates the temporal chaos of pretrained LMs and explores various methods to align their internal knowledge to a target time, which we call “temporal alignment.” To do this, we first automatically construct a dataset containing 20K time-sensitive questions and their answers for each year from 2000 to 2023. Based on this dataset, we empirically show that pretrained LMs (e.g., LLaMa2), despite having a recent pretraining cutoff (e.g., 2022), mostly answer questions using earlier knowledge (e.g., in 2019). We then develop several methods, from prompting to finetuning, to align LMs to use their most recent knowledge when answering questions, and investigate various factors in this alignment. Our experiments demonstrate that aligning LLaMa2 to the year 2022 can enhance its performance by up to 62% according to that year’s answers. This improvement occurs even without explicitly mentioning time information, indicating the possibility of aligning models’ internal sense of time after pretraining. Finally, we find that alignment to a historical time is also possible, with up to 2.8\times the performance of the unaligned LM in 2010 if finetuning models to that year. These findings hint at the sophistication of LMs’ internal knowledge organization and the necessity of tuning them properly. 2024.findings-acl.892 @@ -18272,7 +18272,7 @@ BeyzaErmisCohere AI LuizaPozzobon SaraHookerCohere For AI - PatrickLewis + PatrickLewis 15041-15058 To date, toxicity mitigation in language models has almost entirely been focused on single-language settings. As language models embrace multilingual capabilities, it’s crucial our safety measures keep pace. Recognizing this research gap, our approach expands the scope of conventional toxicity mitigation to address the complexities presented by multiple languages. In the absence of sufficient annotated datasets across languages, we employ translated data to evaluate and enhance our mitigation techniques. We also compare finetuning mitigation approaches against retrieval-augmented techniques under both static and continual toxicity mitigation scenarios. This allows us to examine the effects of translation quality and the cross-lingual transfer on toxicity mitigation. We also explore how model size and data quantity affect the success of these mitigation efforts. 
Covering nine languages, our study represents a broad array of linguistic families and levels of resource availability, ranging from high to mid-resource languages. Through comprehensive experiments, we provide insights into the complexities of multilingual toxicity mitigation, offering valuable insights and paving the way for future research in this increasingly important field. 2024.findings-acl.893 @@ -18284,9 +18284,9 @@ AnshArora XuanliHeUniversity College London, University of London MaximilianMozesCohere - SrinibasSwain - MarkDrasMacquarie University - QiongkaiXuMacquarie University + SrinibasSwain + MarkDrasMacquarie University + QiongkaiXuMacquarie University 15059-15075 The democratization of pre-trained language models through open-source initiatives has rapidly advanced innovation and expanded access to cutting-edge technologies. However, this openness also brings significant security risks, including backdoor attacks, where hidden malicious behaviors are triggered by specific inputs, compromising natural language processing (NLP) system integrity and reliability. This paper suggests that merging a backdoored model with other homogeneous models can significantly remediate backdoor vulnerabilities even if such models are not entirely secure. In our experiments, we verify our hypothesis on various models (BERT-Base, RoBERTa-Large, Llama2-7B, and Mistral-7B) and datasets (SST-2, OLID, AG News, and QNLI). Compared to multiple advanced defensive approaches, our method offers an effective and efficient inference-stage defense against backdoor attacks on classification and instruction-tuned tasks without additional resources or specific knowledge. Our approach consistently outperforms recent advanced baselines, leading to an average of about 75% reduction in the attack success rate. Since model merging has been an established approach for improving model performance, the extra advantage it provides regarding defense can be seen as a cost-free bonus. 2024.findings-acl.894 @@ -18295,9 +18295,9 @@ Enhancing Sentence Simplification in <fixed-case>P</fixed-case>ortuguese: Leveraging Paraphrases, Context, and Linguistic Features - ArthurScalercio - MariaFinattoUniversidade Federal do Rio Grande do Sul - AlinePaesUniversidade Federal Fluminense + ArthurScalercio + MariaFinattoUniversidade Federal do Rio Grande do Sul + AlinePaesUniversidade Federal Fluminense 15076-15091 Automatic text simplification focuses on transforming texts into a more comprehensible version without sacrificing their precision. However, automatic methods usually require (paired) datasets that can be rather scarce in languages other than English. This paper presents a new approach to automatic sentence simplification that leverages paraphrases, context, and linguistic attributes to overcome the absence of paired texts in Portuguese.We frame the simplification problem as a textual style transfer task and learn a style representation using the sentences around the target sentence in the document and its linguistic attributes. Moreover, unlike most unsupervised approaches that require style-labeled training data, we fine-tune strong pre-trained models using sentence-level paraphrases instead of annotated data. Our experiments show that our model achieves remarkable results, surpassing the current state-of-the-art (BART+ACCESS) while competitively matching a Large Language Model. 
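The backdoor-remediation entry above (2024.findings-acl.894) relies on merging homogeneous models in weight space. A minimal sketch follows, assuming uniform parameter averaging as the merging scheme; the paper's exact recipe may differ.

    # Minimal sketch of weight-space model merging: uniformly average the
    # parameters of several same-architecture checkpoints. Uniform averaging
    # is an assumed merging scheme, used here for illustration.
    import torch

    def merge_state_dicts(state_dicts):
        keys = state_dicts[0].keys()
        assert all(sd.keys() == keys for sd in state_dicts), "homogeneous models only"
        return {k: torch.stack([sd[k].float() for sd in state_dicts]).mean(dim=0)
                for k in keys}

    # Toy usage: in practice these would be checkpoints of one architecture
    # fine-tuned on different data, one of which may be backdoored.
    models = [torch.nn.Linear(4, 2) for _ in range(3)]
    merged = merge_state_dicts([m.state_dict() for m in models])
    victim = torch.nn.Linear(4, 2)
    victim.load_state_dict(merged)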
2024.findings-acl.895 @@ -18319,9 +18319,9 @@ Toward Reliable Ad-hoc Scientific Information Extraction: A Case Study on Two Materials Dataset - SatanuGhosh + SatanuGhosh NealBrodnik - CarolinaFrey + CarolinaFrey CollinHolgateUniversity of California, Santa Barbara TresaPollockUniversity of California-Santa Barbara SamanthaDalyUniversity of Michigan Ann Arbor @@ -18335,7 +18335,7 @@ Structural Optimization Ambiguity and Simplicity Bias in Unsupervised Neural Grammar Induction JinwookParkGwangju Institute of Science and Technology - KangilKimGwangju Institute of Science and Technology + KangilKimGwangju Institute of Science and Technology 15124-15139 Neural parameterization has significantly advanced unsupervised grammar induction. However, training these models with a traditional likelihood loss for all possible parses exacerbates two issues: 1) *structural optimization ambiguity* that arbitrarily selects one among structurally ambiguous optimal grammars despite the specific preference of gold parses, and 2) *structural simplicity bias* that leads a model to underutilize rules to compose parse trees. These challenges subject unsupervised neural grammar induction (UNGI) to inevitable prediction errors, high variance, and the necessity for extensive grammars to achieve accurate predictions. This paper tackles these issues, offering a comprehensive analysis of their origins. As a solution, we introduce *sentence-wise parse-focusing* to reduce the parse pool per sentence for loss evaluation, using the structural bias from pre-trained parsers on the same dataset.In unsupervised parsing benchmark tests, our method significantly improves performance while effectively reducing variance and bias toward overly simplistic parses. Our research promotes learning more compact, accurate, and consistent explicit grammars, facilitating better interpretability. 2024.findings-acl.898 @@ -18349,8 +18349,8 @@ FlorianLuisierGoogle GuolongSuGoogle XiaoyuSunGoogle - Ramya SreeBoppanaGoogle - ZilongWangUniversity of California, San Diego + Ramya SreeBoppanaGoogle + ZilongWangUniversity of California, San Diego ZifengWangGoogle JiaqiMuGoogle HaoZhang @@ -18365,9 +18365,9 @@ <fixed-case>DBQR</fixed-case>-<fixed-case>QA</fixed-case>: A Question Answering Dataset on a Hybrid of Database Querying and Reasoning RungsimanNararatwong - Chung-ChiChenAIST, National Institute of Advanced Industrial Science and Technology + Chung-ChiChenAIST, National Institute of Advanced Industrial Science and Technology NatthawutKertkeidkachornJapan Advanced Institute of Science and Technology, Tokyo Institute of Technology - HiroyaTakamuraAIST, National Institute of Advanced Industrial Science and Technology + HiroyaTakamuraAIST, National Institute of Advanced Industrial Science and Technology RyutaroIchiseNational Intitute of Informatics and Tokyo Institute of Technology, Tokyo Institute of Technology 15169-15182 This paper introduces the Database Querying and Reasoning Dataset for Question Answering (DBQR-QA), aimed at addressing the gap in current question-answering (QA) research by emphasizing the essential processes of database querying and reasoning to answer questions. Specifically designed to accommodate sequential questions and multi-hop queries, DBQR-QA more accurately mirrors the dynamics of real-world information retrieval and analysis, with a particular focus on the financial reports of US companies. 
The dataset’s construction, the challenges encountered during its development, the performance of large language models on this dataset, and a human evaluation are thoroughly discussed to illustrate the dataset’s complexity and highlight future research directions in querying and reasoning tasks. @@ -18378,12 +18378,12 @@ <fixed-case>N</fixed-case>ote<fixed-case>C</fixed-case>hat: A Dataset of Synthetic Patient-Physician Conversations Conditioned on Clinical Notes JundaWang - ZonghaiYaoUniversity of Massachusetts at Amherst - ZhichaoYangUniversity of Massachusetts, Amherst + ZonghaiYaoUniversity of Massachusetts at Amherst + ZhichaoYangUniversity of Massachusetts, Amherst HuixueZhou RumengLiUniversity of Massachusetts, Amherst XunWangMicrosoft - YuchengXu + YuchengXu HongYuColumbia University 15183-15201 We introduce NoteChat, a novel cooperative multi-agent framework leveraging Large Language Models (LLMs) to generate patient-physician dialogues. NoteChat embodies the principle that an ensemble of role-specific LLMs, through structured role-play and strategic prompting, can perform their assigned roles more effectively. The synergy among these role-playing LLMs results in a cohesive and efficient dialogue generation. Evaluation on MTS-dialogue, a benchmark dataset for patient-physician dialogues-note pairs, shows that models trained with the augmented synthetic patient-physician dialogues by NoteChat outperforms other state-of-the-art models for generating clinical notes. Our comprehensive automatic and human evaluation demonstrates that NoteChat substantially surpasses state-of-the-art models like ChatGPT and GPT-4 up to 22.78% by domain experts in generating superior synthetic patient-physician dialogues based on clinical notes. NoteChat has the potential to engage patients directly and help clinical documentation, a leading cause of physician burnout. @@ -18394,7 +18394,7 @@ Model Editing at Scale leads to Gradual and Catastrophic Forgetting AkshatGuptaUniversity of California, Berkeley - AnuragRao + AnuragRao GopalaAnumanchipalliUniversity of California, Berkeley 15202-15232 Editing knowledge in large language models is an attractive capability that allows us to correct incorrectly learned facts during pre-training, as well as update the model with an ever-growing list of new facts. While existing model editing techniques have shown promise, they are usually evaluated using metrics for reliability, specificity and generalization over one or few edits. We argue that for model editing to have practical utility, we must be able to make multiple edits to the same model. With this in mind, we evaluate current model editing methods at scale, focusing on two state of the art methods - ROME and MEMIT. With the lens of scalability, we evaluate model editing methods for three crucial properties - editing proficiency, fact forgetting and downstream performance. We find that as a model is edited sequentially with multiple facts, it continually becomes less editable, forgets previously edited facts and loses the ability to perform downstream tasks. For ROME and MEMIT, this “forgetting” happens in two phases - an initial gradual but progressive forgetting phase followed by an abrupt or catastrophic forgetting. Both gradual and catastrophic forgetting limit the usefulness of model editing methods at scale - the former makes model editing less effective as multiple edits are made to the model while the latter caps the scalability of such model editing methods. 
Our analysis also highlights other key limitations of ROME and MEMIT at scale. With our work, we push for better evaluation of model editing and development of model editing methods keeping scalability in mind. @@ -18404,13 +18404,13 @@ 3<fixed-case>MVRD</fixed-case>: Multimodal Multi-task Multi-teacher Visually-Rich Form Document Understanding - YihaoDing - LorenzoVaianiPolytechnic Institute of Turin + YihaoDing + LorenzoVaianiPolytechnic Institute of Turin CarenHanUniversity of Melbourne, University of Western Australia and University of Sydney - JeanLee - PaoloGarzaPolytechnic Institute of Turin - JosiahPoonUniversity of Sydney - LucaCaglieroPolytechnic Institute of Turin + JeanLee + PaoloGarzaPolytechnic Institute of Turin + JosiahPoonUniversity of Sydney + LucaCaglieroPolytechnic Institute of Turin 15233-15244 This paper presents a groundbreaking multimodal, multi-task, multi-teacher joint-grained knowledge distillation model for visually-rich form document understanding. The model is designed to leverage insights from both fine-grained and coarse-grained levels by facilitating a nuanced correlation between token and entity representations, addressing the complexities inherent in form documents. Additionally, we introduce new inter-grained and cross-grained loss functions to further refine diverse multi-teacher knowledge distillation transfer process, presenting distribution gaps and a harmonised understanding of form documents. Through a comprehensive evaluation across publicly available form document understanding datasets, our proposed model consistently outperforms existing baselines, showcasing its efficacy in handling the intricate structures and content of visually complex form documents. 2024.findings-acl.903 @@ -18420,9 +18420,9 @@ Faithful Persona-based Conversational Dataset Generation with Large Language Models PegahJandaghi - XianghaiShengGoogle - XinyiBaiGoogle - JayPujaraUniversity of Southern California + XianghaiShengGoogle + XinyiBaiGoogle + JayPujaraUniversity of Southern California HakimSidahmed 15245-15270 High-quality conversational datasets are essential for developing AI models that can communicate with users.One way to foster deeper interactions between a chatbot and its user is through *personas*, aspects of the user’s character that provide insights into their personality, motivations, and behaviors.Training Natural Language Processing (NLP) models on a diverse and comprehensive persona-based dataset can lead to conversational models that create a deeper connection with the user, and maintain their engagement. In this paper, we leverage the power of Large Language Models (LLMs) to create a large, high-quality conversational dataset from a seed dataset. We propose a Generator-Critic architecture framework to expand the initial dataset, while improving the quality of its conversations.The Generator is an LLM prompted to output conversations.The Critic consists of a mixture of expert LLMs that control the quality of the generated conversations.These experts select the best generated conversations, which we then use to improve the Generator.We release Synthetic-Persona-Chat, consisting of 20k conversations seeded from Persona-Chat.We evaluate the quality of Synthetic-Persona-Chat and our generation framework on different dimensions through extensive experiments, and observe that the losing rate of Synthetic-Persona-Chat against Persona-Chat during an AI detection test decreases from 17.2% to 8.8% over three iterations. 
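The Generator-Critic architecture described in the Synthetic-Persona-Chat entry above reduces to a short control loop. In the skeleton below, llm and critic_score are stand-ins for real LLM calls and the mixture of expert judges; the prompt, the sample counts, and the keep-best selection policy are assumptions for illustration.

    # Skeleton of a Generator-Critic expansion loop: over-generate candidate
    # conversations per persona, keep the critic's top picks, repeat.
    import random

    def llm(prompt: str) -> str:          # stand-in for a real LLM API call
        return f"<conversation for: {prompt!r}>"

    def critic_score(conversation: str) -> float:
        return random.random()            # stand-in for expert-LLM judges

    def expand(seed_personas, rounds=3, samples_per_persona=4, keep=2):
        dataset = []
        for _ in range(rounds):
            for persona in seed_personas:
                candidates = [llm(f"Write a dialogue for persona: {persona}")
                              for _ in range(samples_per_persona)]
                best = sorted(candidates, key=critic_score, reverse=True)[:keep]
                dataset.extend(best)      # kept samples also steer the generator
        return dataset

    print(len(expand(["loves hiking", "collects stamps"])))  # 12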
@@ -18435,11 +18435,11 @@ ZhiyangXu ChaoFengUniversity of Michigan - Ann Arbor and University of Electronic Science and Technology of China RulinShao - TrevorAshbyVirginia Polytechnic Institute and State University + TrevorAshbyVirginia Polytechnic Institute and State University YingShen DiJinMeta YuChengThe Chinese University of Hong Kong - QifanWangMeta AI + QifanWangMeta AI LifuHuangVirginia Tech 15271-15342 Despite vision-language models’ (VLMs) remarkable capabilities as versatile visual assistants, two substantial challenges persist within the existing VLM frameworks: (1) lacking task diversity in pretraining and visual instruction tuning, and (2) annotation error and bias in GPT-4 synthesized instruction tuning data. Both challenges lead to issues such as poor generalizability, hallucination, and catastrophic forgetting. To address these challenges, we construct Vision-Flan, the most diverse publicly available visual instruction tuning dataset to date, comprising 187 diverse tasks and 1,664,261 instances sourced from academic datasets, and each task is accompanied by an expert-written instruction. In addition, we propose a two-stage instruction tuning framework, in which VLMs are firstly finetuned on Vision-Flan and further tuned on GPT-4 synthesized data. We find this two-stage tuning framework significantly outperforms the traditional single-stage visual instruction tuning framework and achieves the state-of-the-art performance across a wide range of multi-modal evaluation benchmarks. Finally, we conduct in-depth analyses to understand visual instruction tuning and our findings reveal that: (1) GPT-4 synthesized data does not substantially enhance VLMs’ capabilities but rather modulates the model’s responses to human-preferred formats; (2) A minimal quantity (e.g., 1,000) of GPT-4 synthesized data can effectively align VLM responses with human-preference; (3) Visual instruction tuning mainly helps large-language models (LLMs) to understand visual features. @@ -18463,9 +18463,9 @@ ClaireJin SudhaRaoMicrosoft XiangyuPengSalesForce.com - PortiaBotchwayVanderbilt University + PortiaBotchwayVanderbilt University JessicaQuayeHarvard University - ChrisBrockettMicrosoft + ChrisBrockettMicrosoft BillDolan 15353-15368 Advancements in large language models (LLMs) are revolutionizing interactive game design, enabling dynamic plotlines and interactions between players and non-player characters (NPCs). However, LLMs may exhibit flaws such as hallucinations, forgetfulness, or misinterpretations of prompts, causing logical inconsistencies and unexpected deviations from intended designs. Automated techniques for detecting such game bugs are still lacking. To address this, we propose a systematic LLM-based method for automatically identifying such bugs from player game logs, eliminating the need for collecting additional data such as post-play surveys. Applied to a text-based game DejaBoom!, our approach effectively identifies bugs inherent in LLM-powered interactive games, surpassing unstructured LLM-powered bug-catching methods and filling the gap in automated detection of logical and design flaws. 
@@ -18485,7 +18485,7 @@ Challenges to Evaluating the Generalization of Coreference Resolution Models: A Measurement Modeling Perspective - IanPoradaMcGill University + IanPoradaMcGill University AlexandraOlteanuResearch, Microsoft KaheerSuleman AdamTrischler @@ -18498,9 +18498,9 @@ <fixed-case>SAGA</fixed-case>: A Participant-specific Examination of Story Alternatives and Goal Applicability for a Deeper Understanding of Complex Events - SaiVallurupalli + SaiVallurupalli KatrinErkUniversity of Texas, Austin - FrancisFerraroUniversity of Maryland, Baltimore County + FrancisFerraroUniversity of Maryland, Baltimore County 15396-15420 Interpreting and assessing goal driven actions is vital to understanding and reasoning over complex events. It is important to be able to acquire the knowledge needed for this understanding, though doing so is challenging. We argue that such knowledge can be elicited through a participant achievement lens. We analyze a complex event in a narrative according to the intended achievements of the participants in that narrative, the likely future actions of the participants, and the likelihood of goal success. We collect 6.3K high quality goal and action annotations reflecting our proposed participant achievement lens, with an average weighted Fleiss-Kappa IAA of 80%. Our collection contains annotated alternate versions of each narrative. These alternate versions vary minimally from the “original” story, but can license drastically different inferences. Our findings suggest that while modern large language models can reflect some of the goal-based knowledge we study, they find it challenging to fully capture the design and intent behind concerted actions, even when the model pretraining included the data from which we extracted the goal knowledge. We show that smaller models fine-tuned on our dataset can achieve performance surpassing larger models. 2024.findings-acl.910 @@ -18510,10 +18510,10 @@ <fixed-case>SLIDE</fixed-case>: A Framework Integrating Small and Large Language Models for Open-Domain Dialogues Evaluation KunZhao - BohaoYangUniversity of Manchester - ChenTang + BohaoYangUniversity of Manchester + ChenTang ChenghuaLinUniversity of Manchester - LiangZhanUniversity of Pittsburgh + LiangZhanUniversity of Pittsburgh 15421-15435 The long-standing one-to-many problem of gold standard responses in open-domain dialogue systems presents challenges for automatic evaluation metrics. Though prior works have demonstrated some success by applying powerful Large Language Models (LLMs), existing approaches still struggle with the one-to-many problem, and exhibit subpar performance in domain-specific scenarios. We assume the commonsense reasoning biases within LLMs may hinder their performance in domain-specific evaluations. To address both issues, we propose a novel framework SLIDE (Small and Large Integrated for Dialogue Evaluation), that leverages both a small, specialised model (SLM), and LLMs for the evaluation of open domain dialogues. Our approach introduces several techniques: (1) Contrastive learning to differentiate between robust and non-robust response embeddings; (2) A novel metric for semantic sensitivity that combines embedding cosine distances with similarity learned through neural networks, and (3) A strategy for incorporating the evaluation results from both the SLM and LLMs. 
Our empirical results demonstrate that our approach achieves state-of-the-art performance in both the classification and evaluation tasks, and additionally the SLIDE evaluator exhibits better correlation with human judgements. Our code is available at https://github.com/hegehongcha/SLIDE-ACL2024. 2024.findings-acl.911 @@ -18537,7 +18537,7 @@ What Makes Language Models Good-enough? DaikiAsami - SakuSugawaraNational Institute of Informatics + SakuSugawaraNational Institute of Informatics 15453-15467 Psycholinguistic research suggests that humans may build a representation of linguistic input that is ‘good-enough’ for the task at hand. This study examines what architectural features make language models learn human-like good-enough language processing. We focus on the number of layers and self-attention heads in Transformers. We create a good-enough language processing (GELP) evaluation dataset (7,680 examples), which is designed to test the effects of two plausibility types, eight construction types, and three degrees of memory cost on language processing. To annotate GELP, we first conduct a crowdsourcing experiment whose design follows prior psycholinguistic studies. Our model evaluation against the annotated GELP then reveals that the full model as well as models with fewer layers and/or self-attention heads exhibit a good-enough performance. This result suggests that models with shallower depth and fewer heads can learn good-enough language processing. 2024.findings-acl.913 @@ -18574,15 +18574,15 @@ Knowledge-Infused Prompting: Assessing and Advancing Clinical Text Data Generation with Large Language Models RanXuEmory University - HejieCuiStanford University - YueYuGeorgia Institute of Technology + HejieCuiStanford University + YueYuGeorgia Institute of Technology XuanKan - WenqiShiUniversity of Texas Southwestern Medical Center + WenqiShiUniversity of Texas Southwestern Medical Center YuchenZhuangGeorgia Institute of Technology May DongmeiWang WeiJinEmory University JoyceHoEmory University - CarlYangEmory University + CarlYangEmory University 15496-15523 Clinical natural language processing faces challenges like complex medical terminology and clinical contexts. Recently, large language models (LLMs) have shown promise in this domain. Yet, their direct deployment can lead to privacy issues and are constrained by resources. To address this challenge, we delve into synthetic clinical text generation with LLMs for clinical NLP tasks. We propose an innovative, resource-efficient approach, ClinGen, which infuses knowledge into the process. Our model involves clinical knowledge extraction and context-informed LLM prompting. Both clinical topics and writing styles are drawn from external domain-specific knowledge graphs and LLMs to guide data generation. Our extensive empirical study across 8 clinical NLP tasks and 18 datasets reveals that ClinGen consistently enhances performance across various tasks by 7.7%-8.7% on average, effectively aligning the distribution of real datasets and enriching the diversity of generated training instances. 
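The ClinGen entry above steers synthetic-data generation by drawing clinical topics and writing styles from external knowledge sources. A minimal sketch of that prompt-composition step follows; the topic list is a hard-coded stand-in for a knowledge graph, and the styles and template are invented for illustration.

    # Sketch of knowledge-infused prompt composition: sample a topic from an
    # external knowledge source and a writing style, then build the prompt.
    import random

    KG_TOPICS = ["atrial fibrillation", "type 2 diabetes", "sepsis"]   # stand-in KG
    STYLES = ["discharge summary", "nursing note", "radiology report"]

    def build_prompt(task: str) -> str:
        topic = random.choice(KG_TOPICS)
        style = random.choice(STYLES)
        return (f"Write a synthetic {style} about {topic} "
                f"for the task: {task}. Do not copy any real patient data.")

    random.seed(7)
    print(build_prompt("medication extraction"))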
2024.findings-acl.916 @@ -18617,9 +18617,9 @@ <fixed-case>TELLER</fixed-case>: A Trustworthy Framework for Explainable, Generalizable and Controllable Fake News Detection - HuiLiuCity University of Hong Kong - WenyaWangNanyang Technological University - HaoruLi + HuiLiuCity University of Hong Kong + WenyaWangNanyang Technological University + HaoruLi HaoliangLiCity University of Hong Kong 15556-15583 The proliferation of fake news has emerged as a severe societal problem, raising significant interest from industry and academia. While existing deep-learning based methods have made progress in detecting fake news accurately, their reliability may be compromised caused by the non-transparent reasoning processes, poor generalization abilities and inherent risks of integration with large language models (LLMs). To address this challenge, we propose TELLER, a novel framework for trustworthy fake news detection that prioritizes explainability, generalizability and controllability of models. This is achieved via a dual-system framework that integrates cognition and decision systems, adhering to the principles above. The cognition system harnesses human expertise to generate logical predicates, which guide LLMs in generating human-readable logic atoms. Meanwhile, the decision system deduces generalizable logic rules to aggregate these atoms, enabling the identification of the truthfulness of the input news across diverse domains and enhancing transparency in the decision-making process. Finally, we present comprehensive evaluation results on four datasets, demonstrating the feasibility and trustworthiness of our proposed framework. @@ -18654,7 +18654,7 @@ A Meta-Learning Perspective on Transformers for Causal Language Modeling XinboWuUniversity of Illinois, Urbana Champaign - LavVarshneyUniversity of Illinois at Urbana-Champaign + LavVarshneyUniversity of Illinois at Urbana-Champaign 15612-15622 The Transformer architecture has become prominent in developing large causal language models. However, mechanisms to explain its capabilities are not well understood. Focused on the training process, here we establish a meta-learning view of the Transformer architecture when trained for the causal language modeling task, by explicating an inner optimization process that may happen within the Transformer. Further, from within the inner optimization, we discover and theoretically analyze a special characteristic of the norms of learned token representations within Transformer-based causal language models. Our analysis is supported by experiments conducted on pre-trained large language models and real-world data. 2024.findings-acl.922 @@ -18664,15 +18664,15 @@ <fixed-case>PL</fixed-case>a<fixed-case>D</fixed-case>: Preference-based Large Language Model Distillation with Pseudo-Preference Pairs RongzhiZhangGeorgia Institute of Technology and Zhejiang University - JiamingShenGoogle DeepMind + JiamingShenGoogle DeepMind TianqiLiuGoogle HaoruiWangGeorgia Institute of Technology - ZhenQinGoogle + ZhenQinGoogle FengHanResearch, Google JialuLiuGoogle Research SimonBaumgartnerGoogle - MichaelBenderskyGoogle - ChaoZhangGeorgia Institute of Technology + MichaelBenderskyGoogle + ChaoZhangGeorgia Institute of Technology 15623-15636 Large Language Models (LLMs) have exhibited impressive capabilities in various tasks, yet their vast parameter sizes restrict their applicability in resource-constrained settings. 
Knowledge distillation (KD) offers a viable solution by transferring expertise from large teacher models to compact student models. However, traditional KD techniques face specific challenges when applied to LLMs, including restricted access to LLM outputs, significant teacher-student capacity gaps, and the inherited mis-calibration issue. In this work, we present PLaD, a novel preference-based LLM distillation framework. PLaD exploits the teacher-student capacity discrepancy to generate pseudo-preference pairs where teacher outputs are preferred over student outputs. Then, PLaD leverages a ranking loss to re-calibrate the student’s estimation of sequence likelihood, which steers the student’s focus towards understanding the relative quality of outputs instead of simply imitating the teacher. PLaD bypasses the need for access to teacher LLM’s internal states, tackles the student’s expressivity limitations, and mitigates the student mis-calibration issue. Through extensive experiments on two sequence generation tasks and with various LLMs, we demonstrate the effectiveness of our proposed PLaD framework. 2024.findings-acl.923 @@ -18699,9 +18699,9 @@ KexunZhangCarnegie Mellon University YeeChoi ZhenqiaoSong - TaiqiHe + TaiqiHe William YangWangUC Santa Barbara - LeiLiSchool of Computer Science, Carnegie Mellon University + LeiLiSchool of Computer Science, Carnegie Mellon University 15654-15669 How can large language models (LLMs) process and translate endangered languages? Many languages lack a large corpus to train a decent LLM; therefore existing LLMs rarely perform well in unseen, endangered languages. On the contrary, we observe that 2000 endangered languages, though without a large corpus, have a grammar book or a dictionary. We propose LingoLLM, a training-free approach to enable an LLM to process unseen languages that hardly occur in its pre-training. Our key insight is to demonstrate linguistic knowledge of an unseen language in an LLM’s prompt, including a dictionary, a grammar book, and morphologically analyzed input text. We implement LingoLLM on top of two models, GPT-4 and Mixtral, and evaluate their performance on 5 tasks across 8 endangered or low-resource languages. Our results show that LingoLLM elevates translation capability from GPT-4’s 0 to 10.5 BLEU for 10 language directions. Our findings demonstrate the tremendous value of linguistic knowledge in the age of LLMs for endangered languages. Our data, code, and model generations will be released to the public. Our data, code, and model generations can be found at https://github.com/LLiLab/llm4endangeredlang. 2024.findings-acl.925 @@ -18710,7 +18710,7 @@ From Tarzan to <fixed-case>T</fixed-case>olkien: Controlling the Language Proficiency Level of <fixed-case>LLM</fixed-case>s for Content Generation - AliMalikStanford University + AliMalikStanford University StephenMayhewDuolingo ChristopherPiech KlintonBicknellDuolingo @@ -18737,9 +18737,9 @@ <fixed-case>CT</fixed-case>ool<fixed-case>E</fixed-case>val: A <fixed-case>C</fixed-case>hinese Benchmark for <fixed-case>LLM</fixed-case>-Powered Agent Evaluation in Real-World <fixed-case>API</fixed-case> Interactions - ZishanGuo + ZishanGuo YufeiHuang - DeyiXiongTianjin University + DeyiXiongTianjin University 15711-15724 Assessing the capabilities of large language models (LLMs) as agents in decision making and operational tasks is crucial for the development of LLM-as-agent service. 
We propose CToolEval, a benchmark designed to evaluate LLMs in the context of Chinese societal applications, featuring 398 APIs across 27 widely-used Apps (e.g., Apps for shopping, map, music, travel, etc.) that cover 14 domains. We further present an evaluation framework that simulates real-life scenarios, to facilitate the assessment of tool invocation ability of LLMs for tool learning and task completion ability for user interaction. Our extensive experiments with CToolEval evaluate 11 LLMs, revealing that while GPT-3.5-turbo excels in tool invocation, Chinese LLMs usually struggle with issues like hallucination and a lack of comprehensive tool understanding. Our findings highlight the need for further refinement in decision-making capabilities of LLMs, offering insights into bridging the gap between current functionalities and agent-level performance. To promote further research for LLMs to fully act as reliable agents in complex, real-world situations, we release our data and codes at https://github.com/tjunlp-lab/CToolEval.
 2024.findings-acl.928

@@ -18749,11 +18749,11 @@

 Token Alignment via Character Matching for Subword Completion
 BenAthiwaratkunAmazon
- ShiqiWangAmazon
+ ShiqiWangAmazon
 MingyueShangAmazon
 YuchenTian
 ZijianWangAmazon AWS AI Labs
- Sujan KumarGonugondlaAmazon
+ Sujan KumarGonugondlaAmazon
 Sanjay KrishnaGoudaAmazon
 RobertKwiatkowskiAmazon
 RameshNallapatiAmazon Web Services
@@ -18796,7 +18796,7 @@

 Language-Informed Beam Search Decoding for Multilingual Machine Translation
 YilinYangOregon State University
 StefanLeeOregon State University
- PrasadTadepalliOregon State University and Oregon State University
+ PrasadTadepalliOregon State University and Oregon State University
 15761-15772
 Beam search decoding is the de-facto method for decoding auto-regressive Neural Machine Translation (NMT) models, including multilingual NMT where the target language is specified as an input. However, decoding multilingual NMT models commonly produces off-target translations – yielding translation outputs not in the intended language. In this paper, we first conduct an error analysis of off-target translations for a strong multilingual NMT model and identify how these decodings are produced during beam search. We then propose Language-informed Beam Search (LiBS), a general decoding algorithm incorporating an off-the-shelf Language Identification (LiD) model into beam search decoding to reduce off-target translations. LiBS is an inference-time procedure that is NMT-model agnostic and does not require any additional parallel data. Results show that our proposed LiBS algorithm on average improves +1.1 BLEU and +0.9 BLEU on WMT and OPUS datasets, and reduces off-target rates from 22.9% to 7.7% and 65.8% to 25.3% respectively.
 2024.findings-acl.932
@@ -18817,8 +18817,8 @@

 The <fixed-case>PGNSC</fixed-case> Benchmark: How Do We Predict Where Information Spreads?
- AlexanderTaylorUCLA Computer Science Department, University of California, Los Angeles
- WeiWangUniversity of California, Los Angeles
+ AlexanderTaylorUCLA Computer Science Department, University of California, Los Angeles
+ WeiWangUniversity of California, Los Angeles
 15787-15803
 Social networks have become ideal vehicles for news dissemination because posted content is easily able to reach users beyond a news outlet’s direct audience. Understanding how information is transmitted among communities of users is a critical step towards understanding the impact social networks have on real-world events.
Two significant barriers in this vein of work are identifying user clusters and meaningfully characterizing these communities. Thus, we propose the PGNSC benchmark, which builds information pathways based on the audiences of influential news sources and uses their content to characterize the communities. We present methods of aggregating these news-source-centric communities and for constructing the community feature representations that are used sequentially to construct information pathway prediction pipelines. Lastly, we perform extensive experiments to demonstrate the performance of baseline pipeline constructions and to highlight the possibilities for future work.
 2024.findings-acl.934

@@ -18874,10 +18874,10 @@

 A Critical Study of What Code-<fixed-case>LLM</fixed-case>s (Do Not) Learn
- AbhinavAnandTechnische Universität Darmstadt
+ AbhinavAnandTechnische Universität Darmstadt
 ShwetaVermaTechnische Universität Darmstadt
 KrishnaNarasimhan
- MiraMeziniTechnische Universität Darmstadt
+ MiraMeziniTechnische Universität Darmstadt
 15869-15889
 Large Language Models trained on code corpora (code-LLMs) have demonstrated impressive performance in various coding assistance tasks. However, despite their increased size and training dataset, code-LLMs still have limitations such as suggesting codes with syntactic errors, variable misuse, etc. Some studies argue that code-LLMs perform well on coding tasks because they use self-attention and hidden representations to encode relations among input tokens. However, previous works have not studied what code properties are not encoded by code-LLMs. In this paper, we conduct a fine-grained analysis of attention maps and hidden representations of code-LLMs. Our study indicates that code-LLMs only encode relations among specific subsets of input tokens. Specifically, by categorizing input tokens into syntactic tokens and identifiers, we found that models encode relations among syntactic tokens and among identifiers, but they fail to encode relations between syntactic tokens and identifiers. We also found that fine-tuned models encode these relations poorly compared to their pre-trained counterparts. Additionally, larger models with billions of parameters encode significantly less information about code than models with only a few hundred million parameters.
 2024.findings-acl.939
@@ -18886,7 +18886,7 @@

 Visual In-Context Learning for Large Vision-Language Models
- YuchengZhouUniversity of Macau
+ YuchengZhouUniversity of Macau
 XiangLi
 QianningWang
 JianbingShenUniversity of Macau
@@ -18904,7 +18904,7 @@

 Si-QingChen
 FuruWeiMicrosoft Research
 DongyanZhaoPeking University
- RuiYanRenmin University of China
+ RuiYanRenmin University of China
 15903-15918
 In this paper, we introduce SCALE, a collaborative framework that connects a compact Specialized Translation Model (STM) and a general-purpose Large Language Model (LLM) as one unified translation engine. By introducing translation from STM into the triplet in-context demonstrations, SCALE unlocks refinement and pivoting ability of LLM, thus 1) mitigating language bias of LLMs and parallel data bias of STMs, 2) enhancing LLM speciality without sacrificing generality, and 3) facilitating continual learning in an LLM-tuning-free way. Our comprehensive experiments show that SCALE significantly outperforms both LLMs (GPT-4, GPT-3.5) and supervised models (NLLB, M2M) in either high-resource or challenging low-resource settings.
Moreover, SCALE shows great scalability by only updating the lightweight STM and witnesses consistent system improvement, an average gain of 4 BLEURT points across 4 languages without tuning the LLM. Interestingly, SCALE could also effectively exploit the existing language bias of LLMs by using an English-centric STM as a pivot to conduct translation between any language pairs, outperforming GPT-4 by an average of 6 COMET points across eight translation directions. Furthermore, we provide an in-depth analysis of SCALE’s robustness, translation characteristics, latency costs and inherent language bias, providing a solid foundation for future studies exploring the potential synergy between LLMs and more specialized models.
 2024.findings-acl.941

@@ -18926,12 +18926,12 @@

 Retrieval-Augmented Retrieval: Large Language Models are Strong Zero-Shot Retriever
 TaoShenOracle
- GuodongLongUniversity of Technology Sydney
+ GuodongLongUniversity of Technology Sydney
 XiuboGengMicrosoft
 ChongyangTaoBeihang University
 YibinLeiUniversity of Amsterdam
- TianyiZhouUniversity of Maryland, College Park
- MichaelBlumensteinUniversity of Technology Sydney
+ TianyiZhouUniversity of Maryland, College Park
+ MichaelBlumensteinUniversity of Technology Sydney
 DaxinJiangMicrosoft
 15933-15946
 We propose a simple method that applies a large language model (LLM) to large-scale retrieval in zero-shot scenarios. Our method, the Large language model as Retriever (LameR), is built upon no other neural models but an LLM in a retrieval-augmented retrieval fashion, while breaking brute-force combinations of retrievers with LLMs and lifting the performance of zero-shot retrieval to be very competitive on benchmark datasets. Essentially, we propose to augment a query with its potential answers by prompting LLMs with a composition of the query and the query’s in-domain candidates. The candidates, regardless of whether they are correct or wrong, are obtained by a vanilla retrieval procedure on the target collection. As a part of the prompts, they are likely to help LLM generate more precise answers by pattern imitation or candidate summarization. Even if all the candidates are wrong, the prompts at least make LLM aware of in-collection patterns and genres. Moreover, due to the low performance of a self-supervised retriever, the LLM-based query augmentation becomes less effective as the retriever bottlenecks the whole pipeline. Therefore, we propose to leverage a non-parametric lexicon-based method (e.g., BM25) as the retrieval module to capture query-document overlap in a literal fashion. As such, LameR makes the retrieval procedure transparent to the LLM, thus circumventing the bottleneck.
@@ -18941,11 +18941,11 @@

 A Survey on Predicting the Factuality and the Bias of News Media
- PreslavNakov
+ PreslavNakov
 JisunAn
 HaewoonKwak
- Muhammad ArslanManzoor
- Zain MuhammadMujahid
+ Muhammad ArslanManzoor
+ Zain MuhammadMujahid
 Husrev TahaSencar
 15947-15962
 The present level of proliferation of fake, biased, and propagandistic content online has made it impossible to fact-check every single suspicious claim or article, either manually or automatically. An increasing number of scholars are focusing on a coarser granularity, aiming to profile entire news outlets, which allows fast identification of potential “fake news” by checking the reliability of their source. Source factuality is also an important element of systems for automatic fact-checking and “fake news” detection, as they need to assess the reliability of the evidence they retrieve online.
Political bias detection, which in the Western political landscape is about predicting left-center-right bias, is an equally important topic, which has experienced a similar shift toward profiling entire news outlets. Moreover, there is a clear connection between the two, as highly biased media are less likely to be factual; yet, the two problems have been addressed separately. In this survey, we review the state of the art on media profiling for factuality and bias, arguing for the need to model them jointly. We also shed light on some of the major challenges for modeling bias and factuality jointly. We further discuss interesting recent advances in using different information sources and modalities, which go beyond the text of the articles the target news outlet has published. Finally, we discuss current challenges and outline future research directions. @@ -18967,9 +18967,9 @@ Improving Multi-hop Logical Reasoning in Knowledge Graphs with Context-Aware Query Representation Learning JeonghoonKim - HeesooJung + HeesooJung HyejuJangIndiana University/Purdue University at Indianapolis - HogunParkSungkyunkwan University + HogunParkSungkyunkwan University 15978-15991 Multi-hop logical reasoning on knowledge graphs is a pivotal task in natural language processing, with numerous approaches aiming to answer First-Order Logic (FOL) queries. Recent geometry (e.g., box, cone) and probability (e.g., beta distribution)-based methodologies have effectively addressed complex FOL queries. However, a common challenge across these methods lies in determining accurate geometric bounds or probability parameters for these queries. The challenge arises because existing methods rely on linear sequential operations within their computation graphs, overlooking the logical structure of the query and the relation-induced information that can be gleaned from the relations of the query, which we call the context of the query. To address the problem, we propose a model-agnostic methodology that enhances the effectiveness of existing multi-hop logical reasoning approaches by fully integrating the context of the FOL query graph. Our approach distinctively discerns (1) the structural context inherent to the query structure and (2) the relation-induced context unique to each node in the query graph as delineated in the corresponding knowledge graph. This dual-context paradigm helps nodes within a query graph attain refined internal representations throughout the multi-hop reasoning steps. Through experiments on two datasets, our method consistently enhances the three multi-hop reasoning foundation models, achieving performance improvements of up to 19.5%. Our codes are available at https://github.com/kjh9503/caqr. 2024.findings-acl.946 @@ -18981,10 +18981,10 @@ YuzhaoHeng ChunyuanDengRice University YitongLi - YueYuGeorgia Institute of Technology - YinghaoLi + YueYuGeorgia Institute of Technology + YinghaoLi RongzhiZhangGeorgia Institute of Technology and Zhejiang University - ChaoZhangGeorgia Institute of Technology + ChaoZhangGeorgia Institute of Technology 15992-16030 Although Large Language Models (LLMs) exhibit remarkable adaptability across domains, these models often fall short in structured knowledge extraction tasks such as named entity recognition (NER). This paper explores an innovative, cost-efficient strategy to harness LLMs with modest NER capabilities for producing superior NER datasets. 
Our approach diverges from the basic class-conditional prompts by instructing LLMs to self-reflect on the specific domain, thereby generating domain-relevant attributes (such as category and emotions for movie reviews), which are utilized for creating attribute-rich training data. Furthermore, we preemptively generate entity terms and then develop NER context data around these entities, effectively bypassing the LLMs’ challenges with complex structures. Our experiments across both general and niche domains reveal significant performance enhancements over conventional data generation methods while being more cost-effective than existing alternatives. 2024.findings-acl.947 @@ -19007,8 +19007,8 @@ A Large Collection of Model-generated Contradictory Responses for Consistency-aware Dialogue Systems ShikiSatoCyberAgent, Inc. ReinaAkamaTohoku University and RIKEN - JunSuzukiTohoku University - KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University + JunSuzukiTohoku University + KentaroInuiMohamed bin Zayed University of Artificial Intelligence, RIKEN and Tohoku University 16047-16062 Mitigating the generation of contradictory responses poses a substantial challenge in dialogue response generation. The quality and quantity of available contradictory response data play a vital role in suppressing these contradictions, offering two significant benefits. First, having access to large contradiction data enables a comprehensive examination of their characteristics. Second, data-driven methods to mitigate contradictions may be enhanced with large-scale contradiction data for training. Nevertheless, no attempt has been made to build an extensive collection of model-generated contradictory responses. In this paper, we build a large dataset of response generation models’ contradictions for the first time. Then, we acquire valuable insights into the characteristics of model-generated contradictions through an extensive analysis of the collected responses. Lastly, we also demonstrate how this dataset substantially enhances the performance of data-driven contradiction suppression methods. 2024.findings-acl.949 @@ -19021,7 +19021,7 @@ RisakoAndoKeio University TakanobuMorishita HirohikoAbeKeio University - KojiMineshimaKeio University + KojiMineshimaKeio University MitsuhiroOkada 16063-16077 This paper explores the question of how accurately current large language models can perform logical reasoning in natural language, with an emphasis on whether these models exhibit reasoning biases similar to humans. Specifically, our study focuses on syllogistic reasoning, a form of deductive reasoning extensively studied in cognitive science as a natural form of human reasoning. We present a syllogism dataset called NeuBAROCO, which consists of syllogistic reasoning problems in English and Japanese. This dataset was originally designed for psychological experiments to assess human reasoning capabilities using various forms of syllogisms. Our experiments with leading large language models indicate that these models exhibit reasoning biases similar to humans, along with other error tendencies. Notably, there is significant room for improvement in reasoning problems where the relationship between premises and hypotheses is neither entailment nor contradiction. We also present experimental results and in-depth analysis using a new Chain-of-Thought prompting method, which asks LLMs to translate syllogisms into abstract logical expressions and then explain their reasoning process. 
Our analysis using this method suggests that the primary limitations of LLMs lie in the reasoning process itself rather than the interpretation of syllogisms. @@ -19048,7 +19048,7 @@ <fixed-case>DIMSIM</fixed-case>: Distilled Multilingual Critics for <fixed-case>I</fixed-case>ndic Text Simplification SnehaMondalGoogle RitikaRitikaGoogle - AshishAgrawal + AshishAgrawal PreethiJyothiIndian Institute of Technology Bombay AravindanRaghuveerGoogle 16093-16109 @@ -19062,7 +19062,7 @@ DongkyuLee ChandanaSatya PrakashAmazon JackFitzGeraldAmazon - JensLehmannAmazon, Technische Universität Dresden, University of Bonn and Fraunhofer IAIS + JensLehmannAmazon, Technische Universität Dresden, University of Bonn and Fraunhofer IAIS 16110-16121 Leveraging external knowledge is crucial for achieving high performance in knowledge-intensive tasks, such as question answering. The retrieve-and-read approach is widely adopted for integrating external knowledge into a language model. However, this approach suffers from increased computational cost and latency due to the long context length, which grows proportionally with the number of retrieved knowledge. Furthermore, existing retrieval-augmented models typically retrieve information from a single type of knowledge source, limiting their scalability to diverse knowledge sources with varying structures. In this work, we introduce an efficient memory-augmented transformer called MATTER, designed to retrieve relevant knowledge from multiple heterogeneous knowledge sources. Specifically, our model retrieves and reads from both unstructured sources (paragraphs) and semi-structured sources (QA pairs) in the form of fixed-length neural memories. We demonstrate that our model outperforms existing efficient retrieval-augmented models on popular QA benchmarks in terms of both accuracy and speed. Furthermore, MATTER achieves competitive results compared to conventional read-and-retrieve models while having 100x throughput during inference. 2024.findings-acl.953 @@ -19084,12 +19084,12 @@ Chain-of-History Reasoning for Temporal Knowledge Graph Forecasting - YuweiXia + YuweiXia DingWang - QiangLiuInstitute of Automation, Chinese Academy of Sciences - LiangWang + QiangLiuInstitute of Automation, Chinese Academy of Sciences + LiangWang ShuWuInstitute of automation, Chinese academy of science, Chinese Academy of Sciences - Xiao-YuZhangInstitute of Information Engineering, Chinese Academy of Sciences + Xiao-YuZhangInstitute of Information Engineering, Chinese Academy of Sciences 16144-16159 Temporal Knowledge Graph (TKG) forecasting aims to predict future facts based on given histories. Most recent graph-based models excel at capturing structural information within TKGs but lack semantic comprehension abilities. Nowadays, with the surge of LLMs, the LLM-based TKG prediction model has emerged. However, the existing LLM-based model exhibits three shortcomings: (1) It only focuses on the first-order history for prediction while ignoring high-order historical information, resulting in the provided information for LLMs being extremely limited. (2) LLMs struggle with optimal reasoning performance under heavy historical information loads. (3) For TKG prediction, the temporal reasoning capability of LLM alone is limited. To address the first two challenges, we propose Chain-of-History (CoH) reasoning which explores high-order histories step-by-step, achieving effective utilization of high-order historical information for LLMs on TKG prediction. 
To address the third issue, we design CoH as a plug-and-play module to enhance the performance of graph-based models for TKG prediction. Extensive experiments on three datasets and backbones demonstrate the effectiveness of CoH. 2024.findings-acl.955 @@ -19098,10 +19098,10 @@ Can <fixed-case>LLM</fixed-case>s Speak For Diverse People? Tuning <fixed-case>LLM</fixed-case>s via Debate to Generate Controllable Controversial Statements - MingLiUniversity of Maryland, College Park + MingLiUniversity of Maryland, College Park JiuhaiChen LichangChen - TianyiZhouUniversity of Maryland, College Park + TianyiZhouUniversity of Maryland, College Park 16160-16176 Making LLMs speak for different, especially minority groups of people, and generate statements supporting their diverse or even controversial perspectives is critical to creating an inclusive environment. However, existing LLMs lack sufficient controllability to the stance of their generated content, which often contains inconsistent, neutral, or biased statements. In this paper, we improve the controllability of LLMs in generating statements supporting an argument the user defined in the prompt. We find that multi-round debates between two LLMs with opposite stances generate higher-quality and more salient statements for each, which are important training data to improve the controllability of LLMs. Motivated by this, we develop a novel debate & tuning (“DEBATUNE”) pipeline finetuning LLMs to generate the statements obtained via debate. To examine DEBATUNE, we curate the largest dataset of debate topics so far, which covers 710 controversial topics and corresponding arguments for each topic. Evaluations by the GPT-4 judge with a novel controversy controllability metric show that LLMs’ capability of generating diverse perspectives is significantly improved by DEBATUNE. Moreover, such controllability can be generalized to unseen topics, generating high-quality statements supporting controversial arguments. 2024.findings-acl.956 @@ -19111,10 +19111,10 @@ Label-aware Hard Negative Sampling Strategies with Momentum Contrastive Learning for Implicit Hate Speech Detection JaehoonKimHanyang University - SeungwanJin + SeungwanJin SohyunPark SomeenPark - KyungsikHanHanyang University + KyungsikHanHanyang University 16177-16188 Detecting implicit hate speech that is not directly hateful remains a challenge. Recent research has attempted to detect implicit hate speech by applying contrastive learning to pre-trained language models such as BERT and RoBERTa, but the proposed models still do not have a significant advantage over cross-entropy loss-based learning. We found that contrastive learning based on randomly sampled batch data does not encourage the model to learn hard negative samples. In this work, we propose Label-aware Hard Negative sampling strategies (LAHN) that encourage the model to learn detailed features from hard negative samples, instead of naive negative samples in random batch, using momentum-integrated contrastive learning. LAHN outperforms the existing models for implicit hate speech detection both in- and cross-datasets. 
The code is available at https://github.com/Hanyang-HCC-Lab/LAHN
 2024.findings-acl.957

@@ -19123,12 +19123,12 @@

 Selective Reflection-Tuning: Student-Selected Data Recycling for <fixed-case>LLM</fixed-case> Instruction-Tuning
- MingLiUniversity of Maryland, College Park
+ MingLiUniversity of Maryland, College Park
 LichangChen
 JiuhaiChen
 ShwaiHeUniversity of Maryland, College Park
 JiuxiangGuAdobe Systems
- TianyiZhouUniversity of Maryland, College Park
+ TianyiZhouUniversity of Maryland, College Park
 16189-16211
 Instruction tuning is critical to large language models (LLMs) for achieving better instruction following and task adaptation capabilities but its success heavily relies on the training data quality. Many recent methods focus on improving the data quality but often overlook the compatibility of the data with the student model being finetuned. This paper introduces Selective Reflection-Tuning, a novel paradigm that synergizes a teacher LLM’s reflection and introspection for improving existing data quality with the data selection capability of the student LLM, to automatically refine existing instruction-tuning data. This teacher-student collaboration produces high-quality and student-compatible instruction-response pairs, resulting in sample-efficient instruction tuning and LLMs of superior performance. Selective Reflection-Tuning is a data augmentation and synthesis technique that generally improves LLM finetuning and self-improvement without collecting brand-new data. We apply our method to Alpaca and WizardLM data and achieve much stronger and top-tier 7B and 13B LLMs.
 2024.findings-acl.958

@@ -19164,17 +19164,17 @@

 <fixed-case>C</fixed-case>ontext<fixed-case>BLIP</fixed-case>: Doubly Contextual Alignment for Contrastive Image Retrieval from Linguistically Complex Descriptions
- HonglinLin
+ HonglinLin
 SiyuLi
 GuoshunNanBeijing University of Posts and Telecommunications
- ChaoyueTang
- XuetingWang
- JingxinXuBeijing University of Posts and Telecommunications
+ ChaoyueTang
+ XuetingWang
+ JingxinXuBeijing University of Posts and Telecommunications
 RongYankai
- ZhouzhiliZhouzhiliGuangzhou University
+ ZhouzhiliZhouzhiliGuangzhou University
 YutongGaoBeijing Jiaotong University, National Taipei University of Technology, Northeastern University and Minzu University of China
 QimeiCuiBeijing University of Posts and Telecommunications
- XiaofengTao
+ XiaofengTao
 16240-16258
 Image retrieval from contextual descriptions (IRCD) aims to identify an image within a set of minimally contrastive candidates based on linguistically complex text. Despite the success of VLMs, they still significantly lag behind human performance in IRCD. The main challenges lie in aligning key contextual cues in two modalities, where these subtle cues are concealed in tiny areas of multiple contrastive images and within the complex linguistics of textual descriptions. This motivates us to propose ContextBLIP, a simple yet effective method that relies on a doubly contextual alignment scheme for challenging IRCD. Specifically, 1) our model comprises a multi-scale adapter, a matching loss, and a text-guided masking loss. The adapter learns to capture fine-grained visual cues. The two losses enable iterative supervision for the adapter, gradually highlighting the focal patches of a single image to the key textual cues. We term such a way as intra-contextual alignment. 2) Then, ContextBLIP further employs an inter-context encoder to learn dependencies among candidates, facilitating alignment between the text and multiple images.
We term this step as inter-contextual alignment. Consequently, the nuanced cues concealed in each modality can be effectively aligned. Experiments on two benchmarks show the superiority of our method. We observe that ContextBLIP can yield comparable results with GPT-4V, despite involving about 7,500 times fewer parameters.
 2024.findings-acl.961

@@ -19201,7 +19201,7 @@

 JaehongKim
 ChaeyoonJeong
 SeongchanPark
- MeeyoungChaKorea Advanced Institute of Science & Technology
+ MeeyoungChaKorea Advanced Institute of Science & Technology
 WonjaeLeeKorea Advanced Institute of Science and Technology
 16274-16289
 Understanding the interplay between emotions in language and user behaviors is critical. We study how moral emotions shape the political participation of users based on cross-cultural online petition data. To quantify moral emotions, we employ a context-aware NLP model that is designed to capture the subtle nuances of emotions across cultures. For model training, we construct and share a moral emotion dataset comprising nearly 50,000 petition sentences in Korean and English each, along with emotion labels annotated by a fine-tuned LLM. We examine two distinct types of user participation: general support (i.e., registered signatures of petitions) and active support (i.e., sharing petitions on social media). We discover that moral emotions like other-suffering increase both forms of participation and help petitions go viral, while self-conscious emotions have the opposite effect. The most prominent moral emotion, other-condemning, led to polarizing responses among the audience. In contrast, other-praising was perceived differently by culture; it led to a rise in active support in Korea but a decline in the UK. Our findings suggest that both moral emotions embedded in language and cultural perceptions are critical to shaping the public’s political discourse.

@@ -19225,8 +19225,8 @@

 <fixed-case>CF</fixed-case>-<fixed-case>TCIR</fixed-case>: A Compositor-Free Framework for Hierarchical Text-Conditioned Image Retrieval
 YuchenYang
- YuWangShanghai Jiao Tong University
- YanfengWangShanghai Jiao Tong University
+ YuWangShanghai Jiao Tong University
+ YanfengWangShanghai Jiao Tong University
 16315-16325
 In text-conditioned image retrieval (TCIR), the combination of a reference image and modification text forms a query tuple, aiming to locate the most congruent target image within a dataset. The advantages of rich image semantic information and text flexibility are combined in this manner for more accurate retrieval. While traditional techniques often employ attention-driven compositors to craft a unified image-text representation, our paper introduces a compositor-free framework, CF-TCIR, which eschews the standard compositor. Compositor-based methods are designed to learn a joint representation of images and text, but they struggle to directly capture the correlations between attributes across the image and text modalities. Instead, we reformulate the retrieval process as a cross-modal interaction between a synthesized image feature and its corresponding text descriptor. This novel methodology offers advantages in terms of computational efficiency, scalability, and superior performance. To optimize the retrieval performance, we advocate a tiered retrieval mechanism, blending both coarse-grain and fine-grain paradigms.
Moreover, to enrich the contextual relationship within the query tuple, we integrate a generative cross-modal alignment technique, ensuring synchronization of sequential attributes between image and text data. 2024.findings-acl.965 @@ -19237,7 +19237,7 @@ <fixed-case>DMIN</fixed-case>: A Discourse-specific Multi-granularity Integration Network for Conversational Aspect-based Sentiment Quadruple Analysis PeijieHuangSouth China Agricultural University XishengXiaoSouth China Agricultural University - YuhongXuSouth China Agricultural University + YuhongXuSouth China Agricultural University JiaweiChen 16326-16338 Conversational Aspect-based Sentiment Quadruple Analysis (DiaASQ) aims to extract fine-grained sentiment quadruples from dialogues. Previous research has primarily concentrated on enhancing token-level interactions, still lacking in sufficient modeling of the discourse structure information in dialogue. Firstly, it does not incorporate interactions among different utterances in the encoding stage, resulting in a limited token-level context understanding for subsequent modules. Secondly, it ignores the critical fact that discourse information is naturally organized at the utterance level and learning it solely at the token level is incomplete. In this work, we strengthen the token-level encoder by utilizing a discourse structure called “thread” and graph convolutional networks to enhance the token interaction among different utterances. Moreover, we propose an utterance-level encoder to learn the structured speaker and reply information, providing a macro understanding of dialogue discourse. Furthermore, we introduce a novel Multi-granularities Integrator to integrate token-level and utterance-level representations, resulting in a comprehensive and cohesive dialogue contextual understanding. Experiments on two datasets demonstrate that our model achieves state-of-the-art performance. Our codes are publicly available at https://github.com/SIGSDSscau/DMIN. @@ -19248,7 +19248,7 @@ Are Decoder-Only Language Models Better than Encoder-Only Language Models in Understanding Word Meaning? Muhammad RezaQorib - GeonsikMoon + GeonsikMoon Hwee TouNg 16339-16347 The natural language processing field has been evolving around language models for the past few years, from the usage of n-gram language models for re-ranking, to transfer learning with encoder-only (BERT-like) language models, and finally to large language models (LLMs) as general solvers. LLMs are dominated by the decoder-only type, and they are popular for their efficacy in numerous tasks. LLMs are regarded as having strong comprehension abilities and strong capabilities to solve new unseen tasks. As such, people may quickly assume that decoder-only LLMs always perform better than the encoder-only ones, especially for understanding word meaning. In this paper, we demonstrate that decoder-only LLMs perform worse on word meaning comprehension than an encoder-only language model that has vastly fewer parameters. 
@@ -19270,11 +19270,11 @@ On the Robustness of Document-Level Relation Extraction Models to Entity Name Variations ShiaoMengTsinghua University - XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology + XumingHuThe Hong Kong University of Science and Technology (Guangzhou) and Hong Kong University of Science and Technology AiweiLiuTsinghua University, Tsinghua University FukunMaTsinghua University, Tsinghua University YawenYangTsinghua University, Tsinghua University - ShuangLiTencent + ShuangLiTencent LijieWenSchool of Software, Tsinghua University 16362-16374 Driven by the demand for cross-sentence and large-scale relation extraction, document-level relation extraction (DocRE) has attracted increasing research interest. Despite the continuous improvement in performance, we find that existing DocRE models which initially perform well may make more mistakes when merely changing the entity names in the document, hindering the generalization to novel entity names. To this end, we systematically investigate the robustness of DocRE models to entity name variations in this work. We first propose a principled pipeline to generate entity-renamed documents by replacing the original entity names with names from Wikidata. By applying the pipeline to DocRED and Re-DocRED datasets, we construct two novel benchmarks named Env-DocRED and Env-Re-DocRED for robustness evaluation. Experimental results show that both three representative DocRE models and two in-context learned large language models consistently lack sufficient robustness to entity name variations, particularly on cross-sentence relation instances and documents with more entities. Finally, we propose an entity variation robust training method which not only improves the robustness of DocRE models but also enhances their understanding and reasoning capabilities. We further verify that the basic idea of this method can be effectively transferred to in-context learning for DocRE as well. @@ -19284,7 +19284,7 @@ <fixed-case>RESEMO</fixed-case>: A Benchmark <fixed-case>C</fixed-case>hinese Dataset for Studying Responsive Emotion from Social Media Content - BoHuUniversity of Science and Technology of China + BoHuUniversity of Science and Technology of China MengZhang ChenfeiXieUniversity of Science and Technology of China YuanheTianUniversity of Washington, Seattle @@ -19311,7 +19311,7 @@ <fixed-case>KEEP</fixed-case> <fixed-case>CHATTING</fixed-case>! An Attractive Dataset for Continuous Conversation Agents YiheWang - JinLiuWuhan University + JinLiuWuhan University YaoWanHuazhong University of Science and Technology YitongLiHuawei Technologies Co., Ltd. ZifengLiu @@ -19325,10 +19325,10 @@ <fixed-case>R</fixed-case>e<fixed-case>P</fixed-case>air: Automated Program Repair with Process-based Feedback YuzeZhaoUniversity of Science and Technology of China - ZhenyaHuangUniversity of Science and Technology of China - YixiaoMaUniversity of Science and Technology of China - RuiLi - KaiZhang + ZhenyaHuangUniversity of Science and Technology of China + YixiaoMaUniversity of Science and Technology of China + RuiLi + KaiZhang HaoJiangUniversity of Science and Technology of China QiLiuUniversity of Science and Technology of China LinboZhu @@ -19344,7 +19344,7 @@ YangXuHarbin Institute of Technology YunlongFeng HonglinMuHarbin Institute Of Technology - YutaiHou + YutaiHou YitongLiHuawei Technologies Co., Ltd. XinghaoWang WanjunZhongByteDance Inc. 
@@ -19365,7 +19365,7 @@ JialiChengUniversity of Massachusetts at Lowell NidhiVakilUniversity of Massachusetts, Lowell HadiAmiriUniversity of Massachusetts Lowell - Leo AnthonyCeliMassachusetts Institute of Technology and Beth Israel Deaconess Medical Center + Leo AnthonyCeliMassachusetts Institute of Technology and Beth Israel Deaconess Medical Center 16442-16455 Medical decisions directly impact individuals’ health and well-being. Extracting decision spans from clinical notes plays a crucial role in understanding medical decision-making processes. In this paper, we develop a new dataset called “MedDec,” which contains clinical notes of eleven different phenotypes (diseases) annotated by ten types of medical decisions. We introduce the task of medical decision extraction, aiming to jointly extract and classify different types of medical decisions within clinical notes. We provide a comprehensive analysis of the dataset, develop a span detection model as a baseline for this task, evaluate recent span detection approaches, and employ a few metrics to measure the complexity of data samples. Our findings shed light on the complexities inherent in clinical decision extraction and enable future work in this area of research. The dataset and code are available through https://github.com/CLU-UML/MedDec. 2024.findings-acl.975 diff --git a/data/xml/2024.genbench.xml b/data/xml/2024.genbench.xml index c0cd452318..c345a4a400 100644 --- a/data/xml/2024.genbench.xml +++ b/data/xml/2024.genbench.xml @@ -35,10 +35,10 @@ From Language to Pixels: Task Recognition and Task Learning in <fixed-case>LLM</fixed-case>s - JanekFalkenstein - Carolin M.Schuster + JanekFalkenstein + Carolin M.Schuster Alexander H.BergerTechnische Universität München - GeorgGrohTechnical University Munich + GeorgGrohTechnical University Munich 27-41 LLMs can perform unseen tasks by learning from a few in-context examples. How in-context learning works is still uncertain. We investigate the mechanisms of in-context learning on a challenging non-language task. The task requires the LLM to generate pixel matrices representing images of basic shapes. We introduce a framework to analyze if this task is solved by recognizing similar formats from the training data (task recognition) or by understanding the instructions and learning the skill de novo during inference (task learning). Our experiments demonstrate that LLMs generate meaningful pixel matrices with task recognition and fail to learn such tasks when encountering unfamiliar formats. Our findings offer insights into LLMs’ learning mechanisms and their generalization ability to guide future research on their seemingly human-like behavior. 2024.genbench-1.2 @@ -47,8 +47,8 @@ The <fixed-case>S</fixed-case>lay<fixed-case>QA</fixed-case> benchmark of social reasoning: testing gender-inclusive generalization with neopronouns - BastianBunzeckUniversität Bielefeld - SinaZarrießBielefeld University + BastianBunzeckUniversität Bielefeld + SinaZarrießBielefeld University 42-53 We introduce SlayQA, a novel benchmark data set designed to evaluate language models’ ability to handle gender-inclusive language, specifically the use of neopronouns, in a question-answering setting. Derived from the Social IQa data set, SlayQA modifies context-question-answer triples to include gender-neutral pronouns, creating a significant linguistic distribution shift in comparison to common pre-training corpora like C4 or Dolma. 
Our results show that state-of-the-art language models struggle with the challenge, exhibiting small, but noticeable performance drops when answering questions containing neopronouns compared to those without.
 2024.genbench-1.3

@@ -72,7 +72,7 @@

 SarthakJain
 PaulKantorUniversity of Wisconsin - Madison, Rutgers University, New Brunswick and Paul B Kantor, Consultant
 JacobFeldmanRutgers University
- LazarosGallosRutgers University
+ LazarosGallosRutgers University
 HaoWangRutgers University
 69-85
 We propose MMLU-SR, a novel dataset designed to measure the true comprehension abilities of Large Language Models (LLMs) by challenging their performance in question-answering tasks with modified terms. We reasoned that an agent that “truly” understands a concept can still evaluate it when key terms are replaced by suitably defined alternate terms, and sought to differentiate such comprehension from mere text replacement. In our study, we modified standardized test questions by replacing a key term with a dummy word along with its definition. The key term could be in the context of questions, answers, or both questions and answers. Notwithstanding the high scores achieved by recent popular LLMs on the MMLU leaderboard, we found a substantial reduction in model performance after such replacement, suggesting poor comprehension. This new benchmark provides a rigorous benchmark for testing true model comprehension, and poses a challenge to the broader scientific community.

@@ -83,7 +83,7 @@

 <fixed-case>ML</fixed-case>issard: Multilingual Long and Simple Sequential Reasoning Benchmarks
 Mirelle CandidaBueno
- RobertoLotufoUniversity of Campinas, Universidade Estadual de Campinas
+ RobertoLotufoUniversity of Campinas, Universidade Estadual de Campinas
 RodrigoFrassetto NogueiraUniversidade Estadual de Campinas
 86-95
 Language models are now capable of solving tasks that require dealing with long sequences consisting of hundreds of thousands of tokens. However, they often fail on tasks that require repetitive use of simple rules, even on sequences that are much shorter than those seen during training. For example, state-of-the-art LLMs can find common items in two lists with up to 20 items but fail when lists have 80 items. In this paper, we introduce MLissard, a multilingual benchmark designed to evaluate models’ abilities to process and generate texts of varied lengths and offers a mechanism for controlling sequence complexity. Our evaluation of open-source and proprietary models shows a consistent decline in performance across all models and languages as the complexity of the sequence increases. Surprisingly, the use of in-context examples in languages other than English helps increase extrapolation performance significantly.
@@ -93,12 +93,12 @@ <fixed-case>M</fixed-case>ulti<fixed-case>P</fixed-case>rag<fixed-case>E</fixed-case>val: Multilingual Pragmatic Evaluation of Large Language Models - DojunParkSeoul National University + DojunParkSeoul National University JiwooLeeNA SeohyunParkNA HyeyunJeongNA YoungeunKooNA - SoonhaHwangYonsei University + SoonhaHwangYonsei University SeonwooParkNA SungeunLeeNA 96-119 @@ -110,7 +110,7 @@ Beyond the Numbers: Transparency in Relation Extraction Benchmark Creation and Leaderboards VarvaraArzt - AllanHanburyComplexity Science Hub and Technische Universität Wien + AllanHanburyComplexity Science Hub and Technische Universität Wien 120-130 This paper investigates the transparency in the creation of benchmarks and the use of leaderboards for measuring progress in NLP, with a focus on the relation extraction (RE) task. Existing RE benchmarks often suffer from insufficient documentation, lacking crucial details such as data sources, inter-annotator agreement, the algorithms used for the selection of instances for datasets, and information on potential biases like dataset imbalance. Progress in RE is frequently measured by leaderboards that rank systems based on evaluation methods, typically limited to aggregate metrics like F1-score. However, the absence of detailed performance analysis beyond these metrics can obscure the true generalisation capabilities of models. Our analysis reveals that widely used RE benchmarks, such as TACRED and NYT, tend to be highly imbalanced and contain noisy labels. Moreover, the lack of class-based performance metrics fails to accurately reflect model performance across datasets with a large number of relation types. These limitations should be carefully considered when reporting progress in RE. While our discussion centers on the transparency of RE benchmarks and leaderboards, the observations we discuss are broadly applicable to other NLP tasks as well. Rather than undermining the significance and value of existing RE benchmarks and the development of new models, this paper advocates for improved documentation and more rigorous evaluation to advance the field. 2024.genbench-1.8 @@ -119,8 +119,8 @@ Is artificial intelligence still intelligence? <fixed-case>LLM</fixed-case>s generalize to novel adjective-noun pairs, but don’t mimic the full human distribution - HayleyRossHarvard University, Harvard University - KathrynDavidsonHarvard University + HayleyRossHarvard University, Harvard University + KathrynDavidsonHarvard University NajoungKimBoston University and Google 131-153 Inferences from adjective-noun combinations like “Is artificial intelligence still intelligence?” provide a good test bed for LLMs’ understanding of meaning and compositional generalization capability, since there are many combinations which are novel to both humans and LLMs but nevertheless elicit convergent human judgments. We study a range of LLMs and find that the largest models we tested are able to draw human-like inferences when the inference is determined by context and can generalize to unseen adjective-noun combinations. We also propose three methods to evaluate LLMs on these inferences out of context, where there is a distribution of human-like answers rather than a single correct answer. We find that LLMs show a human-like distribution on at most 75% of our dataset, which is promising but still leaves room for improvement. 
<fixed-case>CHIE</fixed-case>: Generative <fixed-case>MRC</fixed-case> Evaluation for in-context <fixed-case>QA</fixed-case> with Correctness, Helpfulness, Irrelevancy, and Extraneousness Aspects
 WannaphongPhatthiyaphaibunVidyasirimedhi Institute of Science and Technology
 SuraponNonesungSCB 10X
 PeeratLimkonchotiwatAI Singapore
 CanUdomcharoenchaikitVidyasirimedhi Institute of Science and Technology (VISTEC)
 JitkapatSawatphol
 EkapolChuangsuwanichChulalongkorn University
 SaranaNutanong
 154-164
 The evaluation of generative models in Machine Reading Comprehension (MRC) presents distinct difficulties, as traditional metrics like BLEU, ROUGE, METEOR, Exact Match, and F1 score often struggle to capture the nuanced and diverse responses. While embedding-based metrics such as BERTScore and BARTScore focus on semantic similarity, they still fail to fully address aspects such as recognizing additional helpful information and rewarding contextual faithfulness. Recent advances in large language model (LLM) based metrics offer more fine-grained evaluations, but challenges such as score clustering remain. This paper introduces a multi-aspect evaluation framework, CHIE, incorporating aspects of Correctness, Helpfulness, Irrelevance, and Extraneousness. Our approach, which uses binary categorical values rather than continuous rating scales, aligns well with human judgments, indicating its potential as a comprehensive and effective evaluation method.
 2024.genbench-1.10

@@ -148,8 +148,8 @@

 RitamDutt
 SagnikRay Choudhury
 Varun VenkatRao
 CarolynRose
 V.G.VinodVydiswaran
 165-182
 Generalization refers to the ability of machine learning models to perform well on dataset distributions different from the one it was trained on. While several pre-existing works have characterized the generalizability of NLP models across different dimensions, such as domain shift, adversarial perturbations, or compositional variations, most studies were carried out in a stand-alone setting, emphasizing a single dimension of interest. We bridge this gap by systematically investigating the generalizability of pre-trained language models across different architectures, sizes, and training strategies, over multiple dimensions for the task of natural language inference and question answering. Our results indicate that model instances typically exhibit consistent generalization trends, i.e., they generalize equally well (or poorly) across most scenarios, and this ability is correlated with model architecture, base dataset performance, size, and training mechanism. We hope this research motivates further work in a) developing a multi-dimensional generalization benchmark for systematic evaluation and b) examining the reasons behind models’ generalization abilities. The code and data are available at https://github.com/sagnik/md-gen-nlp, and the trained models are released at https://huggingface.co/varun-v-rao.
2024.genbench-1.11 @@ -158,14 +158,14 @@ <fixed-case>O</fixed-case>mni<fixed-case>D</fixed-case>ialog: A Multimodal Benchmark for Generalization Across Text, Visual, and Audio Modalities - AntonRazzhigaev + AntonRazzhigaev MaximKurkinSkolkovo Institute of Science and Technology and Artificial Intelligence Research Institute - ElizavetaGoncharovaHigher School of Economics - IrinaAbdullaeva + ElizavetaGoncharovaHigher School of Economics + IrinaAbdullaeva AnastasiaLysenko AlexanderPanchenkoSkoltech - AndreyKuznetsovAIRI, Sber and Samara National Research University - DenisDimitrovAIRI and Sber + AndreyKuznetsovAIRI, Sber and Samara National Research University + DenisDimitrovAIRI and Sber 183-195 We introduce \textit{OmniDialog} — the first trimodal comprehensive benchmark grounded in a knowledge graph (Wikidata) to evaluate the generalization of Large Multimodal Models (LMMs) across three modalities. Our benchmark consists of more than 4,000 dialogues, each averaging 10 turns, all annotated and cross-validated by human experts. The dialogues in our dataset are designed to prevent shortcut learning by incorporating various formats and misleading or irrelevant multimodal cues. We also evaluate both multimodal and unimodal models to gain insights into how they process modality inputs introduced in the conversation. 2024.genbench-1.12 @@ -174,7 +174,7 @@ Towards a new Benchmark for Emotion Detection in <fixed-case>NLP</fixed-case>: A Unifying Framework of Recent Corpora - AnnaKoufakouFlorida Gulf Coast University + AnnaKoufakouFlorida Gulf Coast University ElijahNievesNA JohnPellerNA 196-206 diff --git a/data/xml/2024.mrl.xml b/data/xml/2024.mrl.xml index ea4bbb9f82..4a5881ecd4 100644 --- a/data/xml/2024.mrl.xml +++ b/data/xml/2024.mrl.xml @@ -49,7 +49,7 @@ Adapting Open-Source Generative Large Language Models for Low-Resource Languages: A Case Study for <fixed-case>T</fixed-case>urkish - CagriToramanMETU, Middle East Technical University + CagriToramanMETU, Middle East Technical University 30-44 Despite advancements in English-dominant generative large language models, further development is needed for low-resource languages to enhance global accessibility. The primary methods for representing these languages are monolingual and multilingual pretraining. Monolingual pretraining is expensive due to hardware requirements, and multilingual models often have uneven performance across languages. This study explores an alternative solution by adapting large language models, primarily trained on English, to low-resource languages. We assess various strategies, including continual training, instruction fine-tuning, task-specific fine-tuning, and vocabulary extension. The results show that continual training improves language comprehension, as reflected in perplexity scores, and task-specific tuning generally enhances performance of downstream tasks. However, extending the vocabulary shows no substantial benefits. Additionally, while larger models improve task performance with few-shot tuning, multilingual models perform worse than their monolingual counterparts when adapted. 
2024.mrl-1.3

 An Efficient Approach for Studying Cross-Lingual Transfer in Multilingual Language Models
 FahimFaisal, George Mason University
 AntoniosAnastasopoulosAthena Research Center and George Mason University
 45-92
 The capacity and effectiveness of pre-trained multilingual models (MLMs) for zero-shot cross-lingual transfer is well established. However, phenomena of positive or negative transfer, and the effect of language choice still need to be fully understood, especially in the complex setting of massively multilingual LMs. We propose an efficient method to study transfer language influence in zero-shot performance on another target language. Unlike previous work, our approach disentangles downstream tasks from language, using dedicated adapter units. Our findings suggest that some languages do not largely affect others, while some languages, especially ones unseen during pre-training, can be extremely beneficial or detrimental for different target languages. We find that no transfer language is beneficial for all target languages. We do, curiously, observe languages previously unseen by MLMs consistently benefit from transfer from almost any language. We additionally use our modular approach to quantify negative interference efficiently and categorize languages accordingly. Furthermore, we provide a list of promising transfer-target language configurations that consistently lead to target language performance improvements.
 2024.mrl-1.4

 Are You Sure? Rank Them Again: Repeated Ranking For Better Preference Datasets
 PeterDevine
 93-105
 Training Large Language Models (LLMs) with Reinforcement Learning from AI Feedback (RLAIF) aligns model outputs more closely with human preferences. This involves an evaluator model ranking multiple candidate responses to user prompts. However, the rankings from popular evaluator models such as GPT-4 can be inconsistent. We propose the Repeat Ranking method, in which we evaluate the same responses multiple times and train only on those responses which are consistently ranked. Using 2,714 training prompts in 62 languages, we generated responses from 7 top multilingual LLMs and had GPT-4 rank them five times each. Evaluating on MT-Bench chat benchmarks in six languages, our method outperformed the standard practice of training on all available prompts. Our work highlights the quality versus quantity trade-off in RLAIF dataset generation and offers a stackable strategy for enhancing dataset and thus model quality.
 2024.mrl-1.5

 Tagengo: A Multilingual Chat Dataset
 PeterDevine
 106-113
 Open source large language models (LLMs) have shown great improvements in recent times. However, many of these models are focused solely on popular spoken languages. We present a high quality dataset of more than 70k prompt-response pairs in 74 languages which consist of human generated prompts and synthetic responses. We use this dataset to train a state-of-the-art open source English LLM to chat multilingually. We evaluate our model on MT-Bench chat benchmarks in 6 languages, finding that our multilingual model outperforms previous state-of-the-art open source LLMs across each language.
 We further find that training on more multilingual data is beneficial to the performance in a chosen target language (Japanese) compared to simply training on only data in that language.These results indicate the necessity of training on large amounts of high quality multilingual data to make a more accessible LLM.
 2024.mrl-1.6
@@ -103,8 +103,8 @@
 DemiZhang
 BushiXiao
 ChaoGao
- SangpilYoumUniversity of Florida
- Bonnie JDorrUniversity of Florida
+ SangpilYoumUniversity of Florida
+ Bonnie JDorrUniversity of Florida
 127-136
 This study evaluates the performance of Recurrent Neural Network (RNN) and Transformer models in replicating cross-language structural priming, a key indicator of abstract grammatical representations in human language processing. Focusing on Chinese-English priming, which involves two typologically distinct languages, we examine how these models handle the robust phenomenon of structural priming, where exposure to a particular sentence structure increases the likelihood of selecting a similar structure subsequently. Our findings indicate that transformers outperform RNNs in generating primed sentence structures, with accuracy rates that exceed 25.84% to 33.33%. This challenges the conventional belief that human sentence processing primarily involves recurrent and immediate processing and suggests a role for cue-based retrieval mechanisms. This work contributes to our understanding of how computational models may reflect human cognitive processes across diverse language families.
 2024.mrl-1.8
@@ -115,7 +115,7 @@ Recipe for Zero-shot <fixed-case>POS</fixed-case> Tagging: Is It Useful in Realistic Scenarios?
 ZenoVandenbulckeKU Leuven, KU Leuven
 LukasVermeire
- Miryamde LhoneuxKU Leuven
+ Miryamde LhoneuxKU Leuven
 137-147
 POS tagging plays a fundamental role in numerous applications. While POS taggers are highly accurate in well-resourced settings, they lag behind in cases of limited or missing training data. This paper focuses on POS tagging for languages with limited data. We seek to identify favourable characteristics of datasets for training POS tagging models using related languages without specific training on the target language. This is a zero-shot approach. We investigate both mono- and multilingual models trained on related languages and compare their accuracies. Additionally, we compare these results with models trained directly on the target language itself. We do this for three target low-resource languages, for each of which we select several support languages. Our research highlights the importance of accurate dataset selection for developing effective zero-shot POS tagging models. Particularly, a strong linguistic relationship and high-quality datasets ensure optimal results. For extremely low-resource languages, zero-shot training proves to be a viable option.
 2024.mrl-1.9
@@ -124,8 +124,8 @@ Gender-specific Machine Translation with Large Language Models
- EduardoSánchezUniversity College London, University of London and Meta
- PierreAndrews
+ EduardoSánchezUniversity College London, University of London and Meta
+ PierreAndrews
 PontusStenetorpUniversity College London
 MikelArtetxeReka AI
 Marta R.Costa-jussàMeta
@@ -139,7 +139,7 @@ <fixed-case>J</fixed-case>ina-<fixed-case>C</fixed-case>ol<fixed-case>BERT</fixed-case>-v2: A General-Purpose Multilingual Late Interaction Retriever
 RohanJhaThe University of Texas at Austin
 BoWangJina AI
- MichaelGüntherJina AI
+ MichaelGüntherJina AI
 GeorgiosMastrapasJina AI
 SabaSturuaJina AI
 IsabelleMohrJina AI
@@ -158,7 +158,7 @@
 DipendraYadav
 SumaiyaSuravee
 TobiasStraußUniversität Rostock
- KristinaYordanovaErnst-Moritz-Arndt Universität Greifswald
+ KristinaYordanovaErnst-Moritz-Arndt Universität Greifswald
 167-174
 This study investigates the potential of cross-lingual transfer learning for Named Entity Recognition (NER) between Hindi and Nepali, two languages that, despite their linguistic similarities, face significant disparities in available resources. By leveraging multilingual BERT models, including RemBERT, BERT Multilingual, MuRIL, and DistilBERT Multilingual, the research examines whether pre-training them on a resource-rich language like Hindi can enhance NER performance in a resource-constrained language like Nepali and vice versa. The study conducts experiments in both monolingual and cross-lingual settings to evaluate the models’ effectiveness in transferring linguistic knowledge between the two languages. The findings reveal that while RemBERT and MuRIL perform well in monolingual contexts—RemBERT excelling in Hindi and MuRIL in Nepali—BERT Multilingual performs comparatively best in cross-lingual scenarios, in generalizing features across the languages. Although DistilBERT Multilingual demonstrates slightly lower performance in cross-lingual tasks, it balances efficiency with competitive results. The study underscores the importance of model selection based on linguistic and resource-specific contexts, highlighting that general-purpose models like BERT Multilingual are particularly well-suited for cross-lingual applications.
 2024.mrl-1.12
@@ -168,7 +168,7 @@ Parameter-efficient Adaptation of Multilingual Multimodal Models for Low-resource <fixed-case>ASR</fixed-case>
 AbhishekGupta
- AmrutaParulekar
+ AmrutaParulekar
 SameepChattopadhyay
 PreethiJyothiIndian Institute of Technology Bombay
 175-185
@@ -190,10 +190,10 @@ Vikhr: Constructing a State-of-the-art Bilingual Open-Source Instruction-Following Large Language Model for <fixed-case>R</fixed-case>ussian
 AleksandrNikolich
- KonstantinKorolev
+ KonstantinKorolev
 SergeiBratchikovMisis
- IgorKiselevUniversity of Waterloo
- ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence
+ IgorKiselevUniversity of Waterloo
+ ArtemShelmanovMohamed bin Zayed University of Artificial Intelligence
 189-199
 There has been a surge in the development of various Large Language Models (LLMs). However, text generation for languages other than English often faces significant challenges, including poor generation quality and reduced computational performance due to the disproportionate representation of tokens in the model’s vocabulary. In this work, we address these issues by developing a pipeline for adaptation of English-oriented pre-trained models to other languages and constructing efficient bilingual LLMs.
 Using this pipeline, we construct Vikhr, a state-of-the-art bilingual open-source instruction-following LLM designed specifically for the Russian language. “Vikhr” refers to the name of the Mistral LLM series and means a “strong gust of wind.”Unlike previous Russian-language models that typically rely on LoRA adapters on top of English-oriented models, sacrificing performance for lower training costs, Vikhr features an adapted tokenizer vocabulary and undergoes the continued pre-training and instruction tuning of all weights. This not only enhances the model’s performance but also significantly improves its computational and contextual efficiency.The remarkable performance of Vikhr across various Russian-language benchmarks can also be attributed to our efforts in expanding instruction datasets and corpora for continued pre-training. Vikhr not only sets the new state of the art among open-source LLMs for Russian but even outperforms some proprietary closed-source models on certain benchmarks. The model weights, instruction sets, and code are publicly available.
 2024.mrl-1.15
@@ -218,7 +218,7 @@ Leveraging Adapters for Improved Cross-lingual Transfer for Low-Resource Creole <fixed-case>MT</fixed-case>
 Marcell RichardFekete
- ErnestsLavrinovics
+ ErnestsLavrinovics
 Nathaniel RomneyRobinsonDepartment of Computer Science, Whiting School of Engineering
 HeatherLentAalborg University
 RajDabreNational Institute of Information and Communications Technology (NICT), National Institute of Advanced Industrial Science and Technology
@@ -246,15 +246,15 @@ Community <fixed-case>OSCAR</fixed-case>: A Community Effort for Multilingual Web Data
 ManuelBrackGerman Research Center for AI and Technische Universität Darmstadt
 MalteOstendorffDeutsche Telekom
- PedroOrtiz SuarezCommon Crawl Foundation
- José JavierSaizBarcelona Supercomputing Center
+ PedroOrtiz SuarezCommon Crawl Foundation
+ José JavierSaizBarcelona Supercomputing Center
 Iñaki LacunzaCastillaBarcelona Supercomputing Center
- JorgePalomar-GinerBarcelona Supercomputing Center
- AlexanderShvetsBarcelona Supercomputing Center
- PatrickSchramowskiGerman Research Center for AI
- GeorgRehmHumboldt Universität Berlin and Deutsches Forschungszentrum für Künstliche Intelligenz
- MartaVillegasBarcelona Supercomputing Center, Universitat Pompeu Fabra and Universitat Autònoma de Barcelona
- KristianKerstingGerman Research Center for AI, The Hessian Center for AI and TU Darmstadt
+ JorgePalomar-GinerBarcelona Supercomputing Center
+ AlexanderShvetsBarcelona Supercomputing Center
+ PatrickSchramowskiGerman Research Center for AI
+ GeorgRehmHumboldt Universität Berlin and Deutsches Forschungszentrum für Künstliche Intelligenz
+ MartaVillegasBarcelona Supercomputing Center, Universitat Pompeu Fabra and Universitat Autònoma de Barcelona
+ KristianKerstingGerman Research Center for AI, The Hessian Center for AI and TU Darmstadt
 232-235
 The development of large language models (LLMs) relies heavily on extensive, high-quality datasets. Publicly available datasets focus predominantly on English, leaving other language communities behind. To address this issue, we introduce Community OSCAR, a multilingual dataset initiative designed to address the gap between English and non-English data availability. Through a collective effort, Community OSCAR covers over 150 languages with 45 billion documents, totaling over 345 TiB of data.
 Initial results indicate that Community OSCAR provides valuable raw data for training LLMs and enhancing the performance of multilingual models. This work aims to contribute to the ongoing advancements in multilingual NLP and to support a more inclusive AI ecosystem by making high-quality, multilingual data more accessible to those working with low-resource languages.
 2024.mrl-1.19
@@ -265,7 +265,7 @@ Leveraging <fixed-case>LLM</fixed-case>s for Translating and Classifying Mental Health Data
 KonstantinosSkianisUniversity of Ioannina
 A. SezaDoğruözGhent University
- JohnPavlopoulosAthens University of Economics and Business
+ JohnPavlopoulosAthens University of Economics and Business
 236-241
 Large language models (LLMs) are increasingly used in medical fields. In mental health support, the early identification of linguistic markers associated with mental health conditions can provide valuable support to mental health professionals, and reduce long waiting times for patients.Despite the benefits of LLMs for mental health support, there is limited research on their application in mental health systems for languages other than English. Our study addresses this gap by focusing on the detection of depression severity in Greek through user-generated posts which are automatically translated from English. Our results show that GPT3.5-turbo is not very successful in identifying the severity of depression in English, and it has a varying performance in Greek as well. Our study underscores the necessity for further research, especially in languages with less resources.Also, careful implementation is necessary to ensure that LLMs are used effectively in mental health platforms, and human supervision remains crucial to avoid misdiagnosis.
 2024.mrl-1.20
@@ -288,7 +288,7 @@
 QiuhaiZeng
 ZimengQiuAmazon
 Dae YonHwangAmazon AGI
- XinHeAmazon
+ XinHeAmazon
 William M.Campbell
 269-279
 Dense retrieval systems are commonly used for information retrieval (IR). They rely on learning text representations through an encoder and usually require supervised modeling via labelled data which can be costly to obtain or simply unavailable. In this study, we introduce a novel unsupervised text representation learning technique via instruction-tuning the pre-trained encoder-decoder large language model (LLM) under the dual-encoder retrieval framework. We demonstrate on multiple languages that the corpus representation can be augmented by the representations of relevant synthetic queries generated by the instruct-tuned LLM founded on the Rao-Blackwell theorem. Furthermore, we effectively align the query and corpus text representation with self-instruct tuning. We evaluate our proposed method under low-resource settings on three English, two German and one Portuguese retrieval datasets measuring NDCG@10, MRR@100, Recall@100. We significantly improve the average zero-shot retrieval performance on all metrics, increasing out-of-box FLAN-T5 model variations by [4.73%, 6.15%] in absolute NDCG@10 and exceeding four supervised dense retrievers.
@@ -300,7 +300,7 @@ Language Bias in Multilingual Information Retrieval: The Nature of the Beast and Mitigation Methods
 JinruiYangThe University of Melbourne
 FanJiang
- TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne
+ TimothyBaldwinMohamed bin Zayed University of Artificial Intelligence and The University of Melbourne
 280-292
 Language fairness in multilingual information retrieval (MLIR) systems is crucial for ensuring equitable access to information across diverse languages. This paper sheds light on the issue, based on the assumption that queries in different languages, but with identical semantics, should yield equivalent ranking lists when retrieving on the same multilingual documents. We evaluate the degree of fairness using both traditional retrieval methods, and a DPR neural ranker based on mBERT and XLM-R. Additionally, we introduce ‘LaKDA’, a novel loss designed to mitigate language biases in neural MLIR approaches. Our analysis exposes intrinsic language biases in current MLIR technologies, with notable disparities across the retrieval methods, and the effectiveness of LaKDA in enhancing language fairness.
 2024.mrl-1.23
@@ -321,7 +321,7 @@ Generalization Measures for Zero-Shot Cross-Lingual Transfer
- SakshamBassiNew York University
+ SakshamBassiNew York University
 DuyguAtamanNew York University
 KyunghyunChoGenentech and New York University
 298-309
@@ -332,8 +332,8 @@ Detecting and Translating Language Ambiguity with Multilingual <fixed-case>LLM</fixed-case>s
- BehrangMehrparvar
- SandroPezzelleUniversity of Amsterdam
+ BehrangMehrparvar
+ SandroPezzelleUniversity of Amsterdam
 310-323
 Most languages could be ambiguous, which means the same conveyed text or speech, results in different actions by different readers or listeners. In this project, we propose a method to detect the ambiguity of a sentence using translation by multilingual LLMs. In particular, we hypothesize that a good machine translator should preserve the ambiguity of sentences in all target languages. Therefore, we investigate whether ambiguity is encoded in the hidden representation of a translation model or, instead, if only a single meaning is encoded. In our experiments, we have been able to predict ambiguity of sentences with high accuracy using machine translation without direct use of semantics and only based on the reconstruction error of a function that maps the forward and backward translation hidden representations to each other. The potential applications of the proposed approach span i) detecting ambiguous sentences, ii) fine-tuning existing multilingual LLMs to preserve ambiguous information, and iii) developing AI systems that can generate ambiguity-free languages when needed.
 2024.mrl-1.26
@@ -347,7 +347,7 @@ An Attempt towards Generalized Retriever for In-Context Learning
 KazumaHashimotoGoogle Research
 Arjun ReddyAkulaGoogle Research
 KarthikRamanGoogle
- MichaelBenderskyGoogle
+ MichaelBenderskyGoogle
 324-345
 This paper presents Multi-Lingual/Task Demonstration Retrieval (MLT-DR) for in-context learning with Large Language Models (LLMs).Our goal is to investigate how dense demonstration retrieval models are generalized across languages and tasks.We first convert 81 tasks into a common format, covering various languages, task types, and domains.For 8 English-based tasks among them, we use machine translation to create synthetic multi/cross-lingual tasks, by translating the examples into non-English languages to explicitly cover more than 130 languages.We then use an instruction-tuned LLM to estimate utility of demonstrations for all the tasks to train the demonstration retrieval models.In our experiments, we show an interesting counterintuitive observation; to compute embeddings of demonstrations, using both the input and ground-truth output hurts the generalization ability of the retriever on unseen tasks whose output space is quite different from those in the seen task set.We also examine that our retriever robustly works even with LLMs that we did not touch during the development of the models.The retrieval models’ checkpoints are publicly available at URL-available-upon-publication.
 2024.mrl-1.27
@@ -359,7 +359,7 @@ An Attempt towards Generalized Retriever for In-Context Learning
 SenyuLi
 HaoYu
 JessicaOjoLelapa AI
- David IfeoluwaAdelani
+ David IfeoluwaAdelani
 346-356
 We present our systems for the three tasks and five languages included in the MRL 2024 Shared Task on Multilingual Multi-task Information Retrieval: (1) Named Entity Recognition, (2) Free-form Question Answering, and (3) Multiple-choice Question Answering. For each task, we explored the impact of selecting different multilingual language models for fine-tuning across various target languages, and implemented an ensemble system that generates final outputs based on predictions from multiple fine-tuned models. All models are large language models fine-tuned on task-specific data. Our experimental results show that a more balanced dataset would yield better results. However, when training data for certain languages are scarce, fine-tuning on a large amount of English data supplemented by a small amount of “triggering data” in the target language can produce decent results.
 2024.mrl-1.28
@@ -370,9 +370,9 @@ An Attempt towards Generalized Retriever for In-Context Learning
 <fixed-case>CUNI</fixed-case> and <fixed-case>LMU</fixed-case> Submission to the <fixed-case>MRL</fixed-case> 2024 Shared Task on Multi-lingual Multi-task Information Retrieval
 KatharinaHämmerl
 Andrei-AlexandruManea
- GianlucaVicoCharles University Prague
- JindřichHelclCharles University
- JindřichLibovickýCharles University Prague
+ GianlucaVicoCharles University Prague
+ JindřichHelclCharles University
+ JindřichLibovickýCharles University Prague
 357-364
 We present the joint CUNI and LMU submission to the MRL 2024 Shared Task on Multi-lingual Multi-task Information Retrieval.The shared task objective was to explore how we can deploy modern methods in NLP in multi-lingual low-resource settings, tested on two sub-tasks: Named-entity recognition and question answering.Our solutions to the subtasks are based on data acquisition and model adaptation.We compare the performance of our submitted systems with the translate-test approachwhich proved to be the most useful in the previous edition of the shared task.Our results show that using more data as well as fine-tuning recent multilingual pre-trained models leads to considerable improvements over the translate-test baseline.Our code is available at https://github.com/ufal/mrl2024-multilingual-ir-shared-task.
 2024.mrl-1.29
diff --git a/data/xml/2024.nlp4pi.xml b/data/xml/2024.nlp4pi.xml
index c0fd700381..cd491a5d12 100644
--- a/data/xml/2024.nlp4pi.xml
+++ b/data/xml/2024.nlp4pi.xml
@@ -27,7 +27,7 @@ What is the social benefit of hate speech detection research? A Systematic Review
- Sidney Gig-JanWongUniversity of Canterbury
+ Sidney Gig-JanWongUniversity of Canterbury
 1-12
 While NLP research into hate speech detection has grown exponentially in the last three decades, there has been minimal uptake or engagement from policy makers and non-profit organisations. We argue the absence of ethical frameworks have contributed to this rift between current practice and best practice. By adopting appropriate ethical frameworks, NLP researchers may enable the social impact potential of hate speech research. This position paper is informed by reviewing forty-eight hate speech detection systems associated with thirty-seven publications from different venues.
 2024.nlp4pi-1.1
 Multilingual Fact-Checking using <fixed-case>LLM</fixed-case>s
- AryanSinghalUniversity of California, Santa Barbara
+ AryanSinghalUniversity of California, Santa Barbara
 ThomasLaw
- CobyKassner
+ CobyKassner
 AyushmanGupta
 EvanDuan
 AviralDamle
@@ -51,8 +51,8 @@ Transferring Fairness using Multi-Task Learning with Limited Demographic Information
- Carlos AlejandroAguirreJohns Hopkins University
- MarkDredzeDepartment of Computer Science, Whiting School of Engineering
+ Carlos AlejandroAguirreJohns Hopkins University
+ MarkDredzeDepartment of Computer Science, Whiting School of Engineering
 32-49
 Training supervised machine learning systems with a fairness loss can improve prediction fairness across different demographic groups. However, doing so requires demographic annotations for training data, without which we cannot produce debiased classifiers for most tasks. Drawing inspiration from transfer learning methods, we investigate whether we can utilize demographic data from a related task to improve the fairness of a target task.
 We adapt a single-task fairness loss to a multi-task setting to exploit demographic labels from a related task in debiasing a target task, and demonstrate that demographic fairness objectives transfer fairness within a multi-task framework. Additionally, we show that this approach enables intersectional fairness by transferring between two datasets with different single-axis demographics. We explore different data domains to show how our loss can improve fairness domains and tasks.
 2024.nlp4pi-1.3
@@ -61,10 +61,10 @@ Selecting Shots for Demographic Fairness in Few-Shot Learning with Large Language Models
- Carlos AlejandroAguirreJohns Hopkins University
+ Carlos AlejandroAguirreJohns Hopkins University
 KuleenSasse
 Isabel AlyssaCacholaDepartment of Computer Science, Whiting School of Engineering
- MarkDredzeDepartment of Computer Science, Whiting School of Engineering
+ MarkDredzeDepartment of Computer Science, Whiting School of Engineering
 50-67
 Recently, work in NLP has shifted to few-shot (in-context) learning, with large language models (LLMs) performing well across a range of tasks. However, while fairness evaluations have become a standard for supervised methods, little is known about the fairness of LLMs as prediction systems. Further, common standard methods for fairness involve access to model weights or are applied during finetuning, which are not applicable in few-shot learning. Do LLMs exhibit prediction biases when used for standard NLP tasks?In this work, we analyze the effect of shots, which directly affect the performance of models, on the fairness of LLMs as NLP classification systems. We consider how different shot selection strategies, both existing and new demographically sensitive methods, affect model fairness across three standard fairness datasets. We find that overall the performance of LLMs is not indicative of their fairness, and there is not a single method that fits all scenarios. In light of these facts, we discuss how future work can include LLM fairness in evaluations.
 2024.nlp4pi-1.4
@@ -91,7 +91,7 @@
 SpandanaGellaAmazon
 ApurvVermaBloomberg
 TagyoungChungAmazon
- JingHuangAmazon Alexa AI
+ JingHuangAmazon Alexa AI
 NanyunPengUniversity of California, Los Angeles
 78-97
 Creating children’s stories through text generation is a creative task that requires stories to be both entertaining and suitable for young audiences. However, since current story generation systems often rely on pre-trained language models fine-tuned with limited story data, they may not always prioritize child-friendliness. This can lead to the unintended generation of stories containing problematic elements such as violence, profanity, and biases. Regrettably, despite the significance of these concerns, there is a lack of clear guidelines and benchmark datasets for ensuring content safety for children. In this paper, we introduce a taxonomy specifically tailored to assess content safety in text, with a strong emphasis on children’s well-being. We present PG-Story, a dataset that includes detailed annotations for both sentence-level and discourse-level safety. We demonstrate the potential of identifying unsafe content through self-diagnosis and employing controllable generation techniques during the decoding phase to minimize unsafe elements in generated stories.
@@ -101,7 +101,7 @@ Towards Explainable Multi-Label Text Classification: A Multi-Task Rationalisation Framework for Identifying Indicators of Forced Labour
- Erick MendezGuzman
+ Erick MendezGuzman
 ViktorSchlegelImperial College London
 RizaBatista-NavarroUniversity of Manchester
 98-112
@@ -113,9 +113,9 @@ All Models are Wrong, But Some are Deadly: Inconsistencies in Emotion Detection in Suicide-related Tweets
 Annika MarieSchoeneInstitute for Experiential AI Northeastern University
- ResmiRamachandranpillaiInstitute for Experiential AI and Linköping University
- TomoLazovichU.S. Census Bureau
- Ricardo A.Baeza-YatesNortheastern University, Universitat Pompeu Fabra and Universidad de Chile
+ ResmiRamachandranpillaiInstitute for Experiential AI and Linköping University
+ TomoLazovichU.S. Census Bureau
+ Ricardo A.Baeza-YatesNortheastern University, Universitat Pompeu Fabra and Universidad de Chile
 113-122
 Recent work in psychology has shown that people who experience mental health challenges are more likely to express their thoughts, emotions, and feelings on social media than share it with a clinical professional. Distinguishing suicide-related content, such as suicide mentioned in a humorous context, from genuine expressions of suicidal ideation is essential to better understanding context and risk. In this paper, we give a first insight and analysis into the differences between emotion labels annotated by humans and labels predicted by three fine-tuned language models (LMs) for suicide-related content. We find that (i) there is little agreement between LMs and humans for emotion labels of suicide-related Tweets and (ii) individual LMs predict similar emotion labels for all suicide-related categories. Our findings lead us to question the credibility and usefulness of such methods in high-risk scenarios such as suicide ideation detection.
 2024.nlp4pi-1.9
@@ -124,7 +124,7 @@ Efficient Aspect-Based Summarization of Climate Change Reports with Small Language Models
- IacopoGhinassiQueen Mary University of London
+ IacopoGhinassiQueen Mary University of London
 LeonardoCatalanoUniversity of Pisa
 TommasoColellaUniversita’ di Pisa, University of Pisa
 123-139
@@ -136,7 +136,7 @@ An <fixed-case>NLP</fixed-case> Case Study on Predicting the Before and After of the <fixed-case>U</fixed-case>kraine–<fixed-case>R</fixed-case>ussia and Hamas–<fixed-case>I</fixed-case>srael Conflicts
 JordanMiner
- John E.OrtegaNortheastern University, Columbia University and New York University
+ John E.OrtegaNortheastern University, Columbia University and New York University
 140-151
 We propose a method to predict toxicity and other textual attributes through the use of natural language processing (NLP) techniques for two recent events: the Ukraine-Russia and Hamas-Israel conflicts. This article provides a basis for exploration in future conflicts with hopes to mitigate risk through the analysis of social media before and after a conflict begins. Our work compiles several datasets from Twitter and Reddit for both conflicts in a before and after separation with an aim of predicting a future state of social media for avoidance. More specifically, we show that: (1) there is a noticeable difference in social media discussion leading up to and following a conflict and (2) social media discourse on platforms like Twitter and Reddit is useful in identifying future conflicts before they arise.
 Our results show that through the use of advanced NLP techniques (both supervised and unsupervised) toxicity and other attributes about language before and after a conflict is predictable with a low error of nearly 1.2 percent for both conflicts.
 2024.nlp4pi-1.14
@@ -145,10 +145,10 @@ Exploring the Jungle of Bias: Political Bias Attribution in Language Models via Dependency Analysis
- David F.JennyETHZ - ETH Zurich and ETHZ - ETH Zurich
- YannBilleterZHAW - Zürcher Hochschule für Angewandte Wissenschaften
- BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
- ZhijingJinDepartment of Computer Science, University of Toronto
+ David F.JennyETHZ - ETH Zurich and ETHZ - ETH Zurich
+ YannBilleterZHAW - Zürcher Hochschule für Angewandte Wissenschaften
+ BernhardSchölkopfELLIS Institute and Max Planck Institute for Intelligent Systems, Max-Planck Institute
+ ZhijingJinDepartment of Computer Science, University of Toronto
 152-178
 The rapid advancement of Large Language Models (LLMs) has sparked intense debate regarding the prevalence of bias in these models and its mitigation. Yet, as exemplified by both results on debiasing methods in the literature and reports of alignment-related defects from the wider community, bias remains a poorly understood topic despite its practical relevance. To enhance the understanding of the internal causes of bias, we analyse LLM bias through the lens of causal fairness analysis, which enables us to both comprehend the origins of bias and reason about its downstream consequences and mitigation. To operationalize this framework, we propose a prompt-based method for the extraction of confounding and mediating attributes which contribute to the LLM decision process. By applying Activity Dependency Networks (ADNs), we then analyse how these attributes influence an LLM’s decision process. We apply our method to LLM ratings of argument quality in political debates. We find that the observed disparate treatment can at least in part be attributed to confounding and mitigating attributes and model misalignment, and discuss the consequences of our findings for human-AI alignment and bias mitigation.
 2024.nlp4pi-1.15
@@ -158,7 +158,7 @@ <fixed-case>A</fixed-case>gri<fixed-case>LLM</fixed-case>:Harnessing Transformers for Framer Queries
 KrishDidwania
- PratinavSethArya.ai
+ PratinavSethArya.ai
 AdityaKasliwal
 AmitAgarwalWells Fargo
 179-187
@@ -180,7 +180,7 @@ Investigating Ableism in <fixed-case>LLM</fixed-case>s through Multi-turn Conversation
 GuojunWuUniversity of Zurich
- SarahEblingUniversity of Zurich
+ SarahEblingUniversity of Zurich
 202-210
 To reveal ableism (i.e., bias against persons with disabilities) in large language models (LLMs), we introduce a novel approach involving multi-turn conversations, enabling a comparative assessment. Initially, we prompt the LLM to elaborate short biographies, followed by a request to incorporate information about a disability. Finally, we employ several methods to identify the top words that distinguish the disability-integrated biographies from those without. This comparative setting helps us uncover how LLMs handle disability-related information and reveal underlying biases. We observe that LLMs tend to highlight disabilities in a manner that can be perceived as patronizing or as implying that overcoming challenges is unexpected due to the disability.
 2024.nlp4pi-1.18
@@ -199,7 +199,7 @@ Inferring Mental Burnout Discourse Across <fixed-case>R</fixed-case>eddit Communities
- NazaninSabri
+ NazaninSabri
 Anh C.Pham
 IshitaKakkar
 MaiElSheriefNortheastern University
@@ -214,7 +214,7 @@
 RongLiUniversity of Zurich
 AshwiniKamarajUniversity of Zurich
 JingMaUniversity of Zurich
- SarahEblingUniversity of Zurich
+ SarahEblingUniversity of Zurich
 232-249
 With the pervasive use of large language models (LLMs) across various domains, addressing the inherent ableist biases within these models requires more attention and resolution. This paper examines ableism in three LLMs (GPT-3.5, GPT-4, and Llama 3) by analyzing the intersection of disability with two additional social categories: gender and social class. Utilizing two task-specific prompts, we generated and analyzed text outputs with two metrics, VADER and regard, to evaluate sentiment and social perception biases within the responses. Our results indicate a marked improvement in bias mitigation from GPT-3.5 to GPT-4, with the latter demonstrating more positive sentiments overall, while Llama 3 showed comparatively weaker performance. Additionally, our findings underscore the complexity of intersectional biases: These biases are shaped by the combined effects of disability, gender, and class, which alter the expression and perception of ableism in LLM outputs. This research highlights the necessity for more nuanced and inclusive bias mitigation strategies in AI development, contributing to the ongoing dialogue on ethical AI practices.
 2024.nlp4pi-1.22
@@ -223,7 +223,7 @@ Explainable Identification of Hate Speech towards Islam using Graph Neural Networks
- Azmine ToushikWasi
+ Azmine ToushikWasi
 250-257
 Islamophobic language on online platforms fosters intolerance, making detection and elimination crucial for promoting harmony. Traditional hate speech detection models rely on NLP techniques like tokenization, part-of-speech tagging, and encoder-decoder models. However, Graph Neural Networks (GNNs), with their ability to utilize relationships between data points, offer more effective detection and greater explainability. In this work, we represent speeches as nodes and connect them with edges based on their context and similarity to develop the graph. This study introduces a novel paradigm using GNNs to identify and explain hate speech towards Islam. Our model leverages GNNs to understand the context and patterns of hate speech by connecting texts via pretrained NLP-generated word embeddings, achieving state-of-the-art performance and enhancing detection accuracy while providing valuable explanations. This highlights the potential of GNNs in combating online hate speech and fostering a safer, more inclusive online environment.
 2024.nlp4pi-1.23
@@ -232,9 +232,9 @@ From Text to Maps: <fixed-case>LLM</fixed-case>-Driven Extraction and Geotagging of Epidemiological Data
- Karlyn K.HarrodOak Ridge National Laboratory
+ Karlyn K.HarrodOak Ridge National Laboratory
 PrabinBhandariGeorge Mason University
- AntoniosAnastasopoulosAthena Research Center and George Mason University
+ AntoniosAnastasopoulosAthena Research Center and George Mason University
 258-270
 Epidemiological datasets are essential for public health analysis and decision-making, yet they remain scarce and often difficult to compile due to inconsistent data formats, language barriers, and evolving political boundaries.
 Traditional methods of creating such datasets involve extensive manual effort and are prone to errors in accurate location extraction. To address these challenges, we propose utilizing large language models (LLMs) to automate the extraction and geotagging of epidemiological data from textual documents. Our approach significantly reduces the manual effort required, limiting human intervention to validating a subset of records against text snippets and verifying the geotagging reasoning, as opposed to reviewing multiple entire documents manually to extract, clean, and geotag. Additionally, the LLMs identify information often overlooked by human annotators, further enhancing the dataset’s completeness. Our findings demonstrate that LLMs can be effectively used to semi-automate the extraction and geotagging of epidemiological data, offering several key advantages: (1) comprehensive information extraction with minimal risk of missing critical details; (2) minimal human intervention; (3) higher-resolution data with more precise geotagging; and (4) significantly reduced resource demands compared to traditional methods.
 2024.nlp4pi-1.24
@@ -247,8 +247,8 @@
 DanicaRovóTechnische Universität München
 ShaghayeghkolliShaghayeghkolli
 RabiaVarolTechnische Universität München
- GeorgGrohTechnical University Munich
- DarynaDementieva
+ GeorgGrohTechnical University Munich
+ DarynaDementieva
 271-307
 In the era dominated by information overload and its facilitation with Large Language Models (LLMs), the prevalence of misinformation poses a significant threat to public discourse and societal well-being. A critical concern at present involves the identification of machine-generated news. In this work, we take a significant step by introducing a benchmark dataset designed for neural news detection in four languages: English, Turkish, Hungarian, and Persian. The dataset incorporates outputs from multiple multilingual generators (in both, zero-shot and fine-tuned setups) such as BloomZ, LLaMa-2, Mistral, Mixtral, and GPT-4. Next, we experiment with a variety of classifiers, ranging from those based on linguistic features to advanced Transformer-based models and LLMs prompting. We present the detection results aiming to delve into the interpretablity and robustness of machine-generated texts detectors across all target languages.
 2024.nlp4pi-1.25
 <fixed-case>M</fixed-case>ulti<fixed-case>C</fixed-case>limate: Multimodal Stance Detection on Climate Change Videos
- JiawenWang
+ JiawenWang
 LongfeiZuo
 SiyaoPengLudwig-Maximilians-Universität München
 BarbaraPlankLudwig-Maximilians-Universität München and IT University of Copenhagen
@@ -308,9 +308,9 @@ Improving Industrial Safety by Auto-Generating Case-specific Preventive Recommendations
 SangameshwarPatilIndian Institute of Technology, Madras and Tata Consultancy Services Limited, India
- SumitKoundanyaTata Consultancy Services Limited, India
- ShubhamKumbharTata Consultancy Services Limited, India
- AlokKumarTata Consultancy Services Limited, India
+ SumitKoundanyaTata Consultancy Services Limited, India
+ ShubhamKumbharTata Consultancy Services Limited, India
+ AlokKumarTata Consultancy Services Limited, India
 349-353
 In this paper, we propose a novel application to improve industrial safety by generating preventive recommendations using LLMs.
 Using a dataset of 275 incidents representing 11 different incident types sampled from real-life OSHA incidents, we compare three different LLMs to evaluate the quality of preventive recommendations generated by them. We also show that LLMs are not a panacea for the preventive recommendation generation task. They have limitations and can produce responses that are incorrect or irrelevant. We found that about 65% of the output from Vicuna model was not acceptable at all at the basic readability and other sanity checks level. Mistral and Phi_3 are better than Vicuna, but not all of their recommendations are of similar quality. We find that for a given safety incident case, the generated recommendations can be categorized as specific, generic, or irrelevant. This helps us to better quantify and compare the performance of the models. This paper is among the initial and novel work for the preventive recommendation generation problem. We believe it will pave way for use of NLP to positively impact the industrial safety.
 2024.nlp4pi-1.30
diff --git a/data/xml/2024.nlp4science.xml b/data/xml/2024.nlp4science.xml
index 63cbb20af4..21c6481698 100644
--- a/data/xml/2024.nlp4science.xml
+++ b/data/xml/2024.nlp4science.xml
@@ -53,8 +53,8 @@ <fixed-case>P</fixed-case>sycho<fixed-case>L</fixed-case>ex: Unveiling the Psychological Mind of Large Language Models
- Mohammad AminAbbasiIran University of Science and Technology Tehran, University of Tehran
- Farnaz SadatMirnezami
+ Mohammad AminAbbasiIran University of Science and Technology Tehran, University of Tehran
+ Farnaz SadatMirnezami
 HassanNaderi
 24-35
 This paper explores the intersection of psychology and artificial intelligence through the development and evaluation of specialized Large Language Models (LLMs). We introduce PsychoLex , a suite of resources designed to enhance LLMs’ proficiency in psychological tasks in both Persian and English. Key contributions include the PsychoLexQA dataset for instructional content and the PsychoLexEval dataset for rigorous evaluation of LLMs in complex psychological scenarios. Additionally, we present the PsychoLexLLaMA model, optimized specifically for psychological applications, demonstrating superior performance compared to general-purpose models. The findings underscore the potential of tailored LLMs for advancing psychological research and applications, while also highlighting areas for further refinement. This research offers a foundational step towards integrating LLMs into specialized psychological domains, with implications for future advancements in AI-driven psychological practice.
@@ -77,7 +77,7 @@ <fixed-case>GCD</fixed-case>-<fixed-case>TM</fixed-case>: Graph-Driven Community Detection for Topic Modelling in Psychiatry Texts
- AnusuyaKrishnanUnited Arab Emirates University
+ AnusuyaKrishnanUnited Arab Emirates University
 Isaias MehariGhebrehiwet
 47-57
 Psychiatry texts provide critical insights into patient mental states and therapeutic interactions. These texts are essential for understanding psychiatric conditions, treatment dynamics, and patient responses. However, the complex and diverse nature of psychiatric communications poses significant challenges for traditional topic modeling methods. The intricate language, subtle psychological nuances, and varying lengths of text segments make it difficult to extract coherent and meaningful topics. Conventional approaches often struggle to capture the depth and overlap of themes present in these texts.
 In this study, we present a novel approach to topic modeling that addresses these limitations by reformulating the problem as a community detection task within a graph constructed from the text corpus. Our methodology includes lemmatization for data standardization, TF-IDF vectorization to create a term-document matrix, and cosine similarity computation to produce a similarity matrix. This matrix is then binarized to form a graph, on which community detection is performed using the Louvain method. The detected communities are subsequently analyzed with Latent Dirichlet Allocation (LDA) to extract topics. Our approach outperforms traditional topic modeling methods, offering more accurate and interpretable topic extraction with improved coherence and lower perplexity.
@@ -91,7 +91,7 @@
 SaiMunikotiPacific Northwest National Laboratory
 IanStewartPacific Northwest National Laboratory
 HenryKvingePacific Northwest National Laboratory
- KarlPazdernikNorth Carolina State University, Pacific Northwest National Laboratory and Deep Football
+ KarlPazdernikNorth Carolina State University, Pacific Northwest National Laboratory and Deep Football
 58-72
 Instruction finetuning is a popular paradigm to align large language models (LLM) with human intent. Despite its popularity, this idea is less explored in improving LLMs to align existing foundation models with scientific disciplines, concepts and goals. In this work, we present SciTune as a tuning framework to improve the ability of LLMs to follow multimodal instructions generated from scientific publications. To test our methodology, we train a large multimodal model LLaMA-SciTune that connects a vision encoder and LLM for science-focused visual and language understanding. LLaMA-SciTune significantly outperforms the state-of-the-art models in the generated figure types and captions in SciCap and VisText benchmarks. In comparison to the models that are finetuned with synthetic data only, LLaMA-SciTune surpasses human performance on average and in many sub-categories on the ScienceQA benchmark. Our results demonstrate that human-generated scientific multimodal instructions remain highly valuable in tuning LLMs to perform well on science tasks, despite their lower volume and relative scarcity compared to synthetic data.
 2024.nlp4science-1.7
 <fixed-case>RACER</fixed-case>: An <fixed-case>LLM</fixed-case>-powered Methodology for Scalable Analysis of Semi-structured Mental Health Interviews
- Satpreet HarcharanSinghHarvard University
+ Satpreet HarcharanSinghHarvard University
 KevinJiangNA
 KanchanBhasinNA
 AshutoshSabharwal
 Soft Measures for Extracting Causal Collective Intelligence
- MaryamBerijanianMichigan State University
+ MaryamBerijanianMichigan State University
 SpencerDork
 KuldeepSingh
 Michael RileyMillikan
 AshlinRiggs
 AadarshSwaminathan
- Sarah L.GibbsUniversity of South Alabama
- Scott E.FriedmanSIFT
+ Sarah L.GibbsUniversity of South Alabama
+ Scott E.FriedmanSIFT
 NathanBrugnoneTwo Six Technologies
 99-116
 Understanding and modeling collective intelligence is essential for addressing complex social systems. Directed graphs called fuzzy cognitive maps (FCMs) offer a powerful tool for encoding causal mental models, but extracting high-integrity FCMs from text is challenging. This study presents an approach using large language models (LLMs) to automate FCM extraction.
 We introduce novel graph-based similarity measures and evaluate them by correlating their outputs with human judgments through the Elo rating system. Results show positive correlations with human evaluations, but even the best-performing measure exhibits limitations in capturing FCM nuances. Fine-tuning LLMs improves performance, but existing measures still fall short. This study highlights the need for soft similarity measures tailored to FCM extraction, advancing collective intelligence modeling with NLP.
@@ -144,7 +144,7 @@ Dreaming with <fixed-case>C</fixed-case>hat<fixed-case>GPT</fixed-case>: Unraveling the Challenges of <fixed-case>LLM</fixed-case>s Dream Generation
- HarelBerger
+ HarelBerger
 HadarKingNA
 OmerDavidNA
 140-147
@@ -155,7 +155,7 @@ <fixed-case>LLM</fixed-case>s and <fixed-case>NLP</fixed-case> for Generalized Learning in <fixed-case>AI</fixed-case>-Enhanced Educational Videos and Powering Curated Videos with Generative Intelligence
- NainaChaturvedi
+ NainaChaturvedi
 148-154
 LLMs and NLP for Generalized Learning in AI-Enhanced Educational Videos and Powering Curated Videos with Generative IntelligenceAuthors - Naina Chaturvedi, Rutgers UniversityAnanda Gunawardena, Rutgers UniversityContact: cnaina1601@gmail.com or nc832@cs.rutgers.eduThe rapid advancement of Large Language Models (LLMs) and Natural Language Processing (NLP) technologies has opened new frontiers in educational content creation and consumption. This paper explores the intersection of these technologies with instructional videos in computer science education, addressing the crucial aspect of generalization in NLP models within an educational context.With 78% of computer science students utilizing YouTube to supplement traditional learning materials, there’s a clear demand for high-quality video content. However, the challenge of finding appropriate resources has led 73% of students to prefer curated video libraries. We propose a novel approach that leverages LLMs and NLP techniques to revolutionize this space, focusing on the ability of these models to generalize across diverse educational content and contexts.Our research utilizes the cubits.ai platform, developed at Princeton University, to demonstrate how generative AI, powered by advanced LLMs, can transform standard video playlists into interactive, AI-enhanced learning experiences. We present a framework for creating AI-generated video summaries, on-demand questions, and in-depth topic explorations, all while considering the challenges posed by LLMs trained on vast, often opaque datasets. Our approach not only enhances student engagement but also provides a unique opportunity to study how well these models generalize across different educational topics and student needs.Drawing insights from computer science courses at Princeton and Rutgers Universities, we highlight the transformative potential of AI-enhanced videos in promoting active learning, particularly in large classes. This research contributes to the ongoing dialogue about generalization in NLP while simultaneously demonstrating practical applications in educational technology. By bridging these domains, we aim to establish a shared platform for state-of-the-art generalization testing in NLP within an educational framework.Our findings not only demonstrate how educators can enhance existing video playlists using AI but also provide insights into the challenges and opportunities of using LLMs in educational settings.
 This work serves as a cornerstone for catalyzing research on generalization in the NLP community, particularly focusing on the application and evaluation of LLMs in adaptive, personalized learning environments.Keywords: Instructional videos; AI-enhanced learning; Large Language Models (LLMs); Natural Language Processing (NLP); generalization in NLP; computer science education; cubits.ai platform; AI-generated content; interactive video experiences; video summarization; on-demand questions; personalized learning; active learning; data-driven insights; generative AI; educational technology; adaptive learning environments
 2024.nlp4science-1.12
@@ -167,7 +167,7 @@
 RenjieCao
 MiaoyanHu
 JiahanWei
- BahaIhnainiWenzhou Kean University
+ BahaIhnainiWenzhou Kean University
 155-165
 Moral sentiments expressed in natural language significantly influence both online and offline environments, shaping behavioral styles and interaction patterns, including social media self-presentation, cyberbullying, adherence to social norms, and ethical decision-making. To effectively measure moral sentiments in natural language processing texts, it is crucial to utilize large, annotated datasets that provide nuanced understanding for accurate analysis and model training. However, existing corpora, while valuable, often face linguistic limitations. To address this gap in the Chinese language domain, we introduce the Moral Foundation Weibo Corpus. This corpus consists of 25,671 Chinese comments on Weibo, encompassing six diverse topic areas. Each comment is manually annotated by at least three systematically trained annotators based on ten moral categories derived from a grounded theory of morality. To assess annotator reliability, we present the kappa test results, a gold standard for measuring consistency. Additionally, we apply several the latest large language models to supplement the manual annotations, conducting analytical experiments to compare their performance and report baseline results for moral sentiment classification.
 2024.nlp4science-1.13
@@ -211,7 +211,7 @@ Exploring Scientific Hypothesis Generation with Mamba
 MiaosenChai
 EmilyHerronOak Ridge National Laboratory
- ErickCervantes
+ ErickCervantes
 TirthankarGhosalOak Ridge National Laboratory
 197-207
 Generating scientifically grounded hypotheses is a challenging frontier task for generative AI models in science. The difficulty arises from the inherent subjectivity of the task and the extensive knowledge of prior work required to assess the validity of a generated hypothesis. Large Language Models (LLMs), trained on vast datasets from diverse sources, have shown a strong ability to utilize the knowledge embedded in their training data. Recent research has explored using transformer-based models for scientific hypothesis generation, leveraging their advanced capabilities. However, these models often require a significant number of parameters to manage Long sequences, which can be a limitation. State Space Models, such as Mamba, offer an alternative by effectively handling very Long sequences with fewer parameters than transformers. In this work, we investigate the use of Mamba for scientific hypothesis generation. Our preliminary findings indicate that Mamba achieves similar performance w.r.t. transformer-based models of similar sizes for a higher-order complex task like hypothesis generation.
 We have made our code available here: https://github.com/fglx-c/Exploring-Scientific-Hypothesis-Generation-with-Mamba
@@ -221,8 +221,8 @@ Benchmarking Automated Theorem Proving with Large Language Models
- VanessaLamaOak Ridge National Laboratory
- CatherineMa
+ VanessaLamaOak Ridge National Laboratory
+ CatherineMa
 TirthankarGhosalOak Ridge National Laboratory
 208-218
 Theorem proving presents a significant challenge for large language models (LLMs) due to the requirement for formal proofs to be rigorously checked by proof assistants, such as Lean, eliminating any margin for error or hallucination. While existing LLM-based theorem provers attempt to operate autonomously, they often struggle with novel and complex theorems where human insights are essential. Lean Copilot is a novel framework that integrates LLM inference into the Lean proof assistant environment. In this work, we benchmark performance of several LLMs including general and math-specific models for theorem proving using the Lean Copilot framework. Our initial investigation suggests that a general-purpose large model like LLaMa-70B still has edge over math-specific smaller models for the task under consideration. We provide useful insights into the performance of different LLMs we chose for the task.
@@ -265,7 +265,7 @@ <fixed-case>C</fixed-case>og<fixed-case>E</fixed-case>rg<fixed-case>LLM</fixed-case>: Exploring Large Language Model Systems Design Perspective Using Cognitive Ergonomics
- Azmine ToushikWasi
+ Azmine ToushikWasi
 Mst RafiaIslam
 249-258
 Integrating cognitive ergonomics with LLMs is crucial for improving safety, reliability, and user satisfaction in human-AI interactions. Current LLM designs often lack this integration, resulting in systems that may not fully align with human cognitive capabilities and limitations. This oversight exacerbates biases in LLM outputs and leads to suboptimal user experiences due to inconsistent application of user-centered design principles. Researchers are increasingly leveraging NLP, particularly LLMs, to model and understand human behavior across social sciences, psychology, psychiatry, health, and neuroscience. Our position paper explores the need to integrate cognitive ergonomics into LLM design, providing a comprehensive framework and practical guidelines for ethical development. By addressing these challenges, we aim to advance safer, more reliable, and ethically sound human-AI interactions.
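A note for readers skimming the hunks above: the paired -/+ author lines often look identical because the XML markup, where the per-author change actually lives, does not survive this plain-text rendering of the diffs. The sketch below is illustrative only and is not code from this patch series; it assumes the ORCID is recorded as an attribute on the Anthology <author> element, and the file path, author name, and ORCID value are placeholders rather than data from these commits.

```python
#!/usr/bin/env python3
"""Illustrative sketch (not the series' actual ingestion script): attach an
ORCID to a matching <author> element in an Anthology-style volume file.
The `orcid` attribute name is an assumption; the name and value below are
placeholders, not real data."""

import lxml.etree as etree


def add_orcid(xml_path: str, first: str, last: str, orcid: str) -> None:
    tree = etree.parse(xml_path)
    for author in tree.iter("author"):
        # Match on the <first>/<last> children of each <author> element.
        if author.findtext("first") == first and author.findtext("last") == last:
            author.set("orcid", orcid)  # assumed attribute name
    tree.write(xml_path, encoding="UTF-8", xml_declaration=True)


if __name__ == "__main__":
    # Placeholder values for illustration only.
    add_orcid("data/xml/2024.mrl.xml", "Jane", "Doe", "0000-0000-0000-0000")
```

Matching purely on first/last name is enough for a sketch, but a real ingestion pass would also have to disambiguate distinct authors who share a name before writing anything back.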