Xiangru (Robert) Tang

Recent Preprints

[2] MedAgentsBench: Benchmarking Thinking Models and Agent Frameworks for Complex Medical Reasoning
Xiangru Tang, Daniel Shao, Jiwoong Sohn, Jiapeng Chen, Jiayi Zhang, Jinyu Xiang, Fang Wu, Yilun Zhao, Chenglin Wu, Wenqi Shi, Arman Cohan, Mark Gerstein.
arXiv preprint arXiv:2503.07459, Patterns (in review)
"Thinking models (DeepSeek R1 and OpenAI o3) show exceptional performance on medical QA tasks."
[PDF] [Abstract] [Bib]

MedAgentsBench

@article{tang2025medagentsbench,
  author        = {Tang, Xiangru and Shao, Daniel and Sohn, Jiwoong and Chen, Jiapeng and Zhang, Jiayi and Xiang, Jinyu and Wu, Fang and Zhao, Yilun and Wu, Chenglin and Shi, Wenqi and Cohan, Arman and Gerstein, Mark},
  title         = {{MedAgentsBench}: Benchmarking Thinking Models and Agent Frameworks for Complex Medical Reasoning},
  journal       = {arXiv preprint arXiv:2503.07459},
  year          = {2025},
  eprint        = {2503.07459},
  archiveprefix = {arXiv},
}

[1] BC-Design: A Biochemistry-Aware Framework for High-Precision Inverse Protein Folding
Xiangru Tang*, Xinwu Ye*, Fang Wu*, Yanjun Shao, Yin Fang, Siming Chen, Dong Xu, Mark Gerstein.
bioRxiv 2024, Nature (in review)
"A quantum leap in inverse protein folding from 67% to 88%!"
[PDF] [Abstract] [Bib]

BC-Design

@article{tang2024bc,
  author    = {Tang, Xiangru and Ye, Xinwu and Wu, Fang and Shao, Yanjun and Fang, Yin and Chen, Siming and Xu, Dong and Gerstein, Mark},
  title     = {{BC-Design}: A Biochemistry-Aware Framework for High-Precision Inverse Protein Folding},
  journal   = {bioRxiv},
  year      = {2024},
  publisher = {Cold Spring Harbor Laboratory},
}

Selected Publications

[18] MMSciBench: Benchmarking Language Models on Multimodal Scientific Problems
Xinwu Ye, Chengfan Li, Siming Chen, Wei Wei, Xiangru Tang
ACL 2025 Findings
[PDF] [Abstract] [Bib]

MMSciBench

@article{ye2025mmscibench,
  author        = {Ye, Xinwu and Li, Chengfan and Chen, Siming and Tang, Xiangru and Wei, Wei},
  title         = {{MMSciBench}: Benchmarking Language Models on Multimodal Scientific Problems},
  journal       = {arXiv preprint arXiv:2503.01891},
  year          = {2025},
  eprint        = {2503.01891},
  archiveprefix = {arXiv},
}

[17] LocAgent: Graph-Guided LLM Agents for Code Localization
Zhaoling Chen*, Xiangru Tang*, Gangda Deng*, Fang Wu, Jialong Wu, Zhiwei Jiang, Viktor Prasanna, Arman Cohan, Xingyao Wang.
ACL 2025
"No need to embed the entire repo, agent + graph-based indexing is all you need!"
[PDF] [Abstract] [Bib]

LocAgent

@article{chen2025locagent,
  author        = {Chen, Zhaoling and Tang, Xiangru and Deng, Gangda and Wu, Fang and Wu, Jialong and Jiang, Zhiwei and Prasanna, Viktor and Cohan, Arman and Wang, Xingyao},
  title         = {{LocAgent}: Graph-Guided {LLM} Agents for Code Localization},
  journal       = {arXiv preprint arXiv:2503.09089},
  year          = {2025},
  eprint        = {2503.09089},
  archiveprefix = {arXiv},
}

[16] ML-Bench: Evaluating Large Language Models and Agents for Machine Learning Tasks on Repository-Level Code
Xiangru Tang*, Yuliang Liu*, Zefan Cai*, Junjie Lu, Yichi Zhang, Yanjun Shao, Zexuan Deng, Helan Hu, Kaikai An, Ruijun Huang, Shuzheng Si, Sheng Chen, Haozhe Zhao, Liang Chen, Yan Wang, Tianyu Liu, Zhiwei Jiang, Baobao Chang, Yujia Qin, Wangchunshu Zhou, Yilun Zhao, Arman Cohan, Mark Gerstein.
Nature Computational Science (in review)
ICLR 2025 Deep Learning for Code
ICLR 2025 Agentic AI for Scientific Discovery
"Can LLMs do machine learning tasks?"
[PDF] [Abstract] [Bib]

ML-Bench

@article{tang2023ml,
  author        = {Tang, Xiangru and Liu, Yuliang and Cai, Zefan and Shao, Yanjun and Lu, Junjie and Zhang, Yichi and Deng, Zexuan and Hu, Helan and Yang, Zengxian and An, Kaikai and others},
  title         = {{ML-Bench}: Evaluating Large Language Models and Agents for Machine Learning Tasks on Repository-Level Code},
  journal       = {arXiv preprint arXiv:2311.09835},
  year          = {2023},
  eprint        = {2311.09835},
  archiveprefix = {arXiv},
}

[15] Risks of AI Scientists: Prioritizing Safeguarding Over Autonomy
Xiangru Tang, Qiao Jin, Kunlun Zhu, Tongxin Yuan, Yichi Zhang, Wangchunshu Zhou, Meng Qu, Yilun Zhao, Jian Tang, Zhuosheng Zhang, Arman Cohan, Zhiyong Lu, Mark Gerstein.
Nature Communications, 2025 (IF 14.7)
ICLR 2024 Workshop on LLM Agents
[PDF] [Abstract] [Bib]

@article{tang2024prioritizing,
  author        = {Tang, Xiangru and Jin, Qiao and Zhu, Kunlun and Yuan, Tongxin and Zhang, Yichi and Zhou, Wangchunshu and Qu, Meng and Zhao, Yilun and Tang, Jian and Zhang, Zhuosheng and Cohan, Arman and Lu, Zhiyong and Gerstein, Mark},
  title         = {Prioritizing Safeguarding Over Autonomy: Risks of {LLM} Agents for Science},
  journal       = {arXiv preprint arXiv:2402.04247},
  year          = {2024},
  eprint        = {2402.04247},
  archiveprefix = {arXiv},
}

[14] ChemAgent: Self-updating Memories in Large Language Models Improves Chemical Reasoning
Xiangru Tang*, Tianyu Hu*, Muyang Ye*, Yanjun Shao*, Xunjian Yin, Siru Ouyang, Wangchunshu Zhou, Pan Lu, Zhuosheng Zhang, Yilun Zhao, Arman Cohan, Mark Gerstein.
ICLR 2025
"Enable LLMs to continuously improve through experience."
[PDF] [Abstract] [Bib]

ChemAgent

@inproceedings{tang2025chemagent,
  author    = {Tang, Xiangru and Hu, Tianyu and Ye, Muyang and Shao, Yanjun and Yin, Xunjian and Ouyang, Siru and Zhou, Wangchunshu and Lu, Pan and Zhang, Zhuosheng and Zhao, Yilun and Cohan, Arman and Gerstein, Mark},
  title     = {{ChemAgent}: Self-updating Library in Large Language Models Improves Chemical Reasoning},
  booktitle = {The Thirteenth International Conference on Learning Representations},
  year      = {2025},
}

[13] Fast, Sensitive Detection of Protein Homologs Using Deep Dense Retrieval
Liang Hong*, Zhihang Hu*, Siqi Sun*, Xiangru Tang*, Jiuming Wang, Qingxiong Tan, Liangzhen Zheng, Sheng Wang, Sheng Xu, Irwin King, Mark Gerstein, Yu Li.
Nature Biotechnology, 2024 (IF 33.1)
"Up to 28,700 times faster than HMMER!"
[PDF] [Abstract] [Bib]

DPR

@article{hong2024fast,
  author    = {Hong, Liang and Hu, Zhihang and Sun, Siqi and Tang, Xiangru and Wang, Jiuming and Tan, Qingxiong and Zheng, Liangzhen and Wang, Sheng and Xu, Sheng and King, Irwin and Gerstein, Mark and Li, Yu},
  title     = {Fast, Sensitive Detection of Protein Homologs Using Deep Dense Retrieval},
  journal   = {Nature Biotechnology},
  pages     = {1--13},
  year      = {2024},
  publisher = {Nature Publishing Group US},
  address   = {New York},
}

[12] MIMIR: A Customizable Agent Tuning Platform for Enhanced Scientific Applications
Xiangru Tang*, Chunyuan Deng*, Hanmin Wang*, Haoran Wang*, Yilun Zhao, Wenqi Shi, Yi Fung, Wangchunshu Zhou, Jiannan Cao, Heng Ji, Arman Cohan, Mark Gerstein.
EMNLP 2024
[PDF] [Abstract] [Bib]

MIMIR

@inproceedings{tang-etal-2024-mimir,
  author    = {Tang, Xiangru and Deng, Chunyuan and Wang, Hanmin and Wang, Haoran and Zhao, Yilun and Shi, Wenqi and Fung, Yi and Zhou, Wangchunshu and Cao, Jiannan and Ji, Heng and Cohan, Arman and Gerstein, Mark},
  editor    = {Hernandez Farias, Delia Irazu and Hope, Tom and Li, Manling},
  title     = {{MIMIR}: A Customizable Agent Tuning Platform for Enhanced Scientific Applications},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-demo.49},
  pages     = {486--496},
}

[11] Step-Back Profiling: Distilling User History for Personalized Scientific Writing
Xiangru Tang, Xingyao Zhang, Yanjun Shao, Jie Wu, Yilun Zhao, Arman Cohan, Ming Gong, Dongmei Zhang, Mark Gerstein.
IJCAI 2024 Workshop on AI4Research (Best Paper Award)
[PDF] [Abstract] [Bib]

Step-Back Profiling

@article{tang2024step,
  author        = {Tang, Xiangru and Zhang, Xingyao and Shao, Yanjun and Wu, Jie and Zhao, Yilun and Cohan, Arman and Gong, Ming and Zhang, Dongmei and Gerstein, Mark},
  title         = {{Step-Back} Profiling: Distilling User History for Personalized Scientific Writing},
  journal       = {arXiv preprint arXiv:2406.14275},
  year          = {2024},
  eprint        = {2406.14275},
  archiveprefix = {arXiv},
}

[10] A Survey of Generative AI for De Novo Drug Design: New Frontiers in Molecule and Protein Generation
Xiangru Tang*, Howard Dai*, Elizabeth Knight*, Fang Wu, Yunyang Li, Tianxiao Li, Mark Gerstein.
Briefings in Bioinformatics, 2024 (IF 13.99, JCR "Q1")
"An introductory overview with a clear breakdown of datasets, benchmarks, & models."
[PDF] [Abstract] [Bib]

GenAI4Drug

@article{tang2024survey,
  author    = {Tang, Xiangru and Dai, Howard and Knight, Elizabeth and Wu, Fang and Li, Yunyang and Li, Tianxiao and Gerstein, Mark},
  title     = {A Survey of Generative {AI} for De Novo Drug Design: New Frontiers in Molecule and Protein Generation},
  journal   = {Briefings in Bioinformatics},
  volume    = {25},
  number    = {4},
  year      = {2024},
  publisher = {Oxford Academic},
}

[9] MolLM: A Unified Language Model for Integrating Biomedical Text with 2D and 3D Molecular Representations
Xiangru Tang, Andrew Tran, Jeffrey Tan, Mark Gerstein.
ISMB 2024, Proceedings in Bioinformatics (IF 6.93, JCR "Q1")
[PDF] [Abstract] [Bib]

MolLM

@article{10.1093/bioinformatics/btae260,
  author   = {Tang, Xiangru and Tran, Andrew and Tan, Jeffrey and Gerstein, Mark B},
  title    = {{MolLM}: a unified language model for integrating biomedical text with {2D} and {3D} molecular representations},
  journal  = {Bioinformatics},
  volume   = {40},
  number   = {Supplement_1},
  pages    = {i357--i368},
  year     = {2024},
  month    = jun,
  abstract = {The current paradigm of deep learning models for the joint representation of molecules and text primarily relies on 1D or 2D molecular formats, neglecting significant 3D structural information that offers valuable physical insight. This narrow focus inhibits the models' versatility and adaptability across a wide range of modalities. Conversely, the limited research focusing on explicit 3D representation tends to overlook textual data within the biomedical domain. We present a unified pre-trained language model, MolLM, that concurrently captures 2D and 3D molecular information alongside biomedical text. MolLM consists of a text Transformer encoder and a molecular Transformer encoder, designed to encode both 2D and 3D molecular structures. To support MolLM's self-supervised pre-training, we constructed 160K molecule-text pairings. Employing contrastive learning as a supervisory signal for learning, MolLM demonstrates robust molecular representation capabilities across four downstream tasks, including cross-modal molecule and text matching, property prediction, captioning, and text-prompted molecular editing. Through ablation, we demonstrate that the inclusion of explicit 3D representations improves performance in these downstream tasks. Our code, data, pre-trained model weights, and examples of using our model are all available at https://github.com/gersteinlab/MolLM. In particular, we provide Jupyter Notebooks offering step-by-step guidance on how to use MolLM to extract embeddings for both molecules and text.},
  issn     = {1367-4811},
  doi      = {10.1093/bioinformatics/btae260},
  url      = {https://doi.org/10.1093/bioinformatics/btae260},
}

[8] BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models
Xiangru Tang, Bill Qian, Rick Gao, Jiakang Chen, Xinyun Chen, Mark Gerstein.
ISMB 2024, Proceedings in Bioinformatics (IF 6.93, JCR "Q1")
"BioCoder input covers repository-level potential package dependencies, class declarations, & global variables."
[PDF] [Abstract] [Bib]

BioCoder

@article{10.1093/bioinformatics/btae230,
  author   = {Tang, Xiangru and Qian, Bill and Gao, Rick and Chen, Jiakang and Chen, Xinyun and Gerstein, Mark B},
  title    = {{BioCoder}: a benchmark for bioinformatics code generation with large language models},
  journal  = {Bioinformatics},
  volume   = {40},
  number   = {Supplement_1},
  pages    = {i266--i276},
  year     = {2024},
  month    = jun,
  abstract = {Pretrained large language models (LLMs) have significantly improved code generation. As these models scale up, there is an increasing need for the output to handle more intricate tasks and to be appropriately specialized to particular domains. Here, we target bioinformatics due to the amount of domain knowledge, algorithms, and data operations this discipline requires. We present BioCoder, a benchmark developed to evaluate LLMs in generating bioinformatics-specific code. BioCoder spans much of the field, covering cross-file dependencies, class declarations, and global variables. It incorporates 1026 Python functions and 1243 Java methods extracted from GitHub, along with 253 examples from the Rosalind Project, all pertaining to bioinformatics. Using topic modeling, we show that the overall coverage of the included code is representative of the full spectrum of bioinformatics calculations. BioCoder incorporates a fuzz-testing framework for evaluation. We have applied it to evaluate various models including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+, GPT-3.5, and GPT-4. Furthermore, we fine-tuned one model (StarCoder), demonstrating that our training dataset can enhance the performance on our testing benchmark (by $>$15\% in terms of Pass@K under certain prompt configurations and always $>$3\%). The results highlight two key aspects of successful models: (i) Successful models accommodate a long prompt ($>$2600 tokens) with full context, including functional dependencies. (ii) They contain domain-specific knowledge of bioinformatics, beyond just general coding capability. This is evident from the performance gain of GPT-3.5/4 compared to the smaller models on our benchmark (50\% versus up to 25\%). All datasets, benchmark, Docker images, and scripts required for testing are available at: https://github.com/gersteinlab/biocoder and https://biocoder-benchmark.github.io/.},
  issn     = {1367-4811},
  doi      = {10.1093/bioinformatics/btae230},
  url      = {https://doi.org/10.1093/bioinformatics/btae230},
}

[7] MedAgents: Large Language Models as Collaborators for Zero-shot Medical Reasoning
Xiangru Tang*, Anni Zou*, Zhuosheng Zhang, Yilun Zhao, Xingyao Zhang, Arman Cohan, Mark Gerstein.
ACL 2024 Findings
"The first multi-agent framework within the medical context!"
[PDF] [Abstract] [Bib]

MedAgents

@article{tang2023medagents,
  author        = {Tang, Xiangru and Zou, Anni and Zhang, Zhuosheng and Zhao, Yilun and Zhang, Xingyao and Cohan, Arman and Gerstein, Mark},
  title         = {{MedAgents}: Large Language Models as Collaborators for Zero-shot Medical Reasoning},
  journal       = {arXiv preprint arXiv:2311.10537},
  year          = {2023},
  eprint        = {2311.10537},
  archiveprefix = {arXiv},
}

[6] Struc-Bench: Are Large Language Models Good at Generating Complex Structured Tabular Data?
Xiangru Tang, Yiming Zong, Jason Phang, Yilun Zhao, Wangchunshu Zhou, Arman Cohan, Mark Gerstein.
NAACL 2024 (Oral)
[PDF] [Abstract] [Bib]

Struc-Bench

@inproceedings{tang-etal-2024-struc,
  author    = {Tang, Xiangru and Zong, Yiming and Phang, Jason and Zhao, Yilun and Zhou, Wangchunshu and Cohan, Arman and Gerstein, Mark},
  editor    = {Duh, Kevin and Gomez, Helena and Bethard, Steven},
  title     = {{Struc-Bench}: Are Large Language Models Good at Generating Complex Structured Tabular Data?},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.naacl-short.2},
  pages     = {12--34},
  abstract  = {Despite the remarkable capabilities of Large Language Models (LLMs) like GPT-4, producing complex, structured tabular data remains challenging. Our study assesses LLMs{'} proficiency in structuring tables and introduces a novel fine-tuning method, cognizant of data structures, to bolster their performance. We unveil Struc-Bench, a comprehensive benchmark featuring prominent LLMs (GPT-NeoX-20B, GPT-3.5, GPT-4, and Vicuna), which spans text tables, HTML, and LaTeX formats. Our proposed FormatCoT aids in crafting format-specific instructions from the intended outputs to populate this benchmark. Addressing the gap in task-centered evaluation, we propose two innovative metrics, P-Score (Prompting Score) and H-Score (Heuristical Score), to more accurately gauge LLM performance. Our experiments show that applying our structure-aware fine-tuning to LLaMA-7B leads to substantial performance gains, outshining its LLM counterparts across most measures. In-depth error analysis and creating an ability map across six dimensions, coverage, formatting, reasoning, comprehension, pragmatics, and hallucination, highlight areas for future enhancements and suggest forthcoming research trajectories. Our code and models can be found at https://github.com/gersteinlab/Struc-Bench.},
}

[5] Meta-CoT: Generalizable Chain-of-Thought Prompting in Mixed-task Scenarios with Large Language Models
Anni Zou, Zhuosheng Zhang, Hai Zhao, Xiangru Tang.
IEEE Transactions on Audio, Speech and Language Processing (In Review)
"Bridge the gap between performance and generalization when using the CoT prompting!"
[PDF] [Abstract] [Bib]

Meta-CoT

@article{zou2023metacot,
  author        = {Zou, Anni and Zhang, Zhuosheng and Zhao, Hai and Tang, Xiangru},
  title         = {{Meta-CoT}: Generalizable {Chain-of-Thought} Prompting in Mixed-task Scenarios with Large Language Models},
  journal       = {arXiv preprint arXiv:2310.06692},
  year          = {2023},
  eprint        = {2310.06692},
  archiveprefix = {arXiv},
}

[4] Aligning Factual Consistency for Clinical Studies Summarization through Reinforcement Learning
Xiangru Tang, Arman Cohan, Mark Gerstein.
ACL 2023 Clinical Natural Language Processing
[PDF] [Abstract] [Bib]

@inproceedings{tang-etal-2023-aligning,
  author    = {Tang, Xiangru and Cohan, Arman and Gerstein, Mark},
  editor    = {Naumann, Tristan and Ben Abacha, Asma and Bethard, Steven and Roberts, Kirk and Rumshisky, Anna},
  title     = {Aligning Factual Consistency for Clinical Studies Summarization through Reinforcement Learning},
  booktitle = {Proceedings of the 5th Clinical Natural Language Processing Workshop},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.clinicalnlp-1.7},
  doi       = {10.18653/v1/2023.clinicalnlp-1.7},
  pages     = {48--58},
  abstract  = {In the rapidly evolving landscape of medical research, accurate and concise summarization of clinical studies is crucial to support evidence-based practice. This paper presents a novel approach to clinical studies summarization, leveraging reinforcement learning to enhance factual consistency and align with human annotator preferences. Our work focuses on two tasks: Conclusion Generation and Review Generation. We train a CONFIT summarization model that outperforms GPT-3 and previous state-of-the-art models on the same datasets and collects expert and crowd-worker annotations to evaluate the quality and factual consistency of the generated summaries. These annotations enable us to measure the correlation of various automatic metrics, including modern factual evaluation metrics like QAFactEval, with human-assessed factual consistency. By employing top-correlated metrics as objectives for a reinforcement learning model, we demonstrate improved factuality in generated summaries that are preferred by human annotators.},
}

[3] GersteinLab at MEDIQA-Chat 2023: Clinical Note Summarization from Doctor-Patient Conversations through Fine-tuning and In-context Learning
Xiangru Tang, Andrew Tran, Jeffrey Tan, Mark Gerstein.
ACL 2023 Clinical Natural Language Processing
[PDF] [Abstract] [Bib]

MEDIQA

@inproceedings{tang-etal-2023-gersteinlab,
  author    = {Tang, Xiangru and Tran, Andrew and Tan, Jeffrey and Gerstein, Mark},
  editor    = {Naumann, Tristan and Ben Abacha, Asma and Bethard, Steven and Roberts, Kirk and Rumshisky, Anna},
  title     = {{GersteinLab} at {MEDIQA-Chat} 2023: Clinical Note Summarization from Doctor-Patient Conversations through Fine-tuning and In-context Learning},
  booktitle = {Proceedings of the 5th Clinical Natural Language Processing Workshop},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.clinicalnlp-1.58},
  doi       = {10.18653/v1/2023.clinicalnlp-1.58},
  pages     = {546--554},
  abstract  = {This paper presents our contribution to the MEDIQA-2023 Dialogue2Note shared task, encompassing both subtask A and subtask B. We approach the task as a dialogue summarization problem and implement two distinct pipelines: (a) a fine-tuning of a pre-trained dialogue summarization model and GPT-3, and (b) few-shot in-context learning (ICL) using a large language model, GPT-4. Both methods achieve excellent results in terms of ROUGE-1 F1, BERTScore F1 (deberta-xlarge-mnli), and BLEURT, with scores of 0.4011, 0.7058, and 0.5421, respectively. Additionally, we predict the associated section headers using RoBERTa and SciBERT based classification models. Our team ranked fourth among all teams, while each team is allowed to submit three runs as part of their submission. We also utilize expert annotations to demonstrate that the notes generated through the ICL GPT-4 are better than all other baselines. The code for our submission is available.},
}

[2] CONFIT: Toward Faithful Dialogue Summarization with Linguistically-Informed Contrastive Fine-tuning
Xiangru Tang, Arjun Nair, Borui Wang, Bingyao Wang, Jai Desai, Aaron Wade, Haoran Li, Asli Celikyilmaz, Yashar Mehdad, Dragomir Radev.
NAACL 2022 (Oral)
[PDF] [Abstract] [Bib]

@inproceedings{tang-etal-2022-confit,
  author    = {Tang, Xiangru and Nair, Arjun and Wang, Borui and Wang, Bingyao and Desai, Jai and Wade, Aaron and Li, Haoran and Celikyilmaz, Asli and Mehdad, Yashar and Radev, Dragomir},
  editor    = {Carpuat, Marine and de Marneffe, Marie-Catherine and Meza Ruiz, Ivan Vladimir},
  title     = {{CONFIT}: Toward Faithful Dialogue Summarization with Linguistically-Informed Contrastive Fine-tuning},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jul,
  year      = {2022},
  address   = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.naacl-main.415},
  doi       = {10.18653/v1/2022.naacl-main.415},
  pages     = {5657--5668},
}

[1] Investigating Crowdsourcing Protocols for Evaluating the Factual Consistency of Summaries
Xiangru Tang, Alexander Fabbri, Haoran Li, Ziming Mao, Griffin Adams, Borui Wang, Asli Celikyilmaz, Yashar Mehdad, Dragomir Radev.
NAACL 2022
[PDF] [Abstract] [Bib]

@inproceedings{tang-etal-2022-investigating,
  author    = {Tang, Xiangru and Fabbri, Alexander and Li, Haoran and Mao, Ziming and Adams, Griffin and Wang, Borui and Celikyilmaz, Asli and Mehdad, Yashar and Radev, Dragomir},
  editor    = {Carpuat, Marine and de Marneffe, Marie-Catherine and Meza Ruiz, Ivan Vladimir},
  title     = {Investigating Crowdsourcing Protocols for Evaluating the Factual Consistency of Summaries},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jul,
  year      = {2022},
  address   = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.naacl-main.417},
  doi       = {10.18653/v1/2022.naacl-main.417},
  pages     = {5680--5692},
}

Other Publications

[18] MultiAgentBench: Evaluating the Collaboration and Competition of LLM agents
Kunlun Zhu, Hongyi Du, Zhaochen Hong, Xiaocheng Yang, Shuyi Guo, Zhe Wang, Zhenhailong Wang, Cheng Qian, Xiangru Tang, Heng Ji, Jiaxuan You
ACL 2025
[PDF] [Abstract] [Bib]

MultiAgentBench

@article{zhu2025multiagentbench,
  author        = {Zhu, Kunlun and Du, Hongyi and Hong, Zhaochen and Yang, Xiaocheng and Guo, Shuyi and Wang, Zhe and Wang, Zhenhailong and Qian, Cheng and Tang, Xiangru and Ji, Heng and You, Jiaxuan},
  title         = {{MultiAgentBench}: Evaluating the Collaboration and Competition of {LLM} agents},
  journal       = {arXiv preprint arXiv:2503.01935},
  year          = {2025},
  eprint        = {2503.01935},
  archiveprefix = {arXiv},
}

[17] OpenHands: An Open Platform for AI Software Developers as Generalist Agents
Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, Graham Neubig
ICLR 2025
"AI agents function as software developers, capable of command execution, web browsing & API interaction."
[PDF] [Abstract] [Bib]

OpenHands

@inproceedings{wang2025openhands,
  author    = {Wang, Xingyao and Li, Boxuan and Song, Yufan and Xu, Frank F. and Tang, Xiangru and Zhuge, Mingchen and Pan, Jiayi and Song, Yueqi and Li, Bowen and Singh, Jaskirat and Tran, Hoang H. and Li, Fuqiang and Ma, Ren and Zheng, Mingzhang and Qian, Bill and Shao, Yanjun and Muennighoff, Niklas and Zhang, Yizhe and Hui, Binyuan and Lin, Junyang and Brennan, Robert and Peng, Hao and Ji, Heng and Neubig, Graham},
  title     = {{OpenHands}: An Open Platform for {AI} Software Developers as Generalist Agents},
  booktitle = {The Thirteenth International Conference on Learning Representations},
  year      = {2025},
}

[16] MMVU: Measuring Expert-Level Multi-Discipline Video Understanding
Yilun Zhao, Lujing Xie, Haowei Zhang, Guo Gan, Yitao Long, Zhiyuan Hu, Tongyan Hu, Weiyuan Chen, Chuhan Li, Junyang Song, Zhijian Xu, Chengye Wang, Weifeng Pan, Ziyao Shangguan, Xiangru Tang, Zhenwen Liang, Yixin Liu, Chen Zhao, Arman Cohan.
CVPR 2025
[PDF] [Abstract] [Bib]

MMVU

@article{zhao2025mmvu,
  author        = {Zhao, Yilun and Xie, Lujing and Zhang, Haowei and Gan, Guo and Long, Yitao and Hu, Zhiyuan and Hu, Tongyan and Chen, Weiyuan and Li, Chuhan and Song, Junyang and Xu, Zhijian and Wang, Chengye and Pan, Weifeng and Shangguan, Ziyao and Tang, Xiangru and Liang, Zhenwen and Liu, Yixin and Zhao, Chen and Cohan, Arman},
  title         = {{MMVU}: Measuring Expert-Level Multi-Discipline Video Understanding},
  journal       = {arXiv preprint arXiv:2501.12380},
  year          = {2025},
  eprint        = {2501.12380},
  archiveprefix = {arXiv},
}

[15] Igniting Language Intelligence: The Hitchhiker's Guide From Chain-of-Thought Reasoning to Language Agents
Zhuosheng Zhang, Yao Yao, Aston Zhang, Xiangru Tang, Xinbei Ma, Zhiwei He, Yiming Wang, Mark Gerstein, Rui Wang, Gongshen Liu, Hai Zhao.
ACM Computing Surveys, 2024 (IF 23.8)
"Generalization, efficiency, customization, scaling, and safety related to CoT and agents."
[PDF] [Abstract] [Bib]

CoT-Igniting-Agent

@article{zhang2025igniting,
  author  = {Zhang, Zhuosheng and Yao, Yao and Zhang, Aston and Tang, Xiangru and Ma, Xinbei and He, Zhiwei and Wang, Yiming and Gerstein, Mark and Wang, Rui and Liu, Gongshen and Zhao, Hai},
  title   = {Igniting Language Intelligence: The Hitchhiker's Guide From Chain-of-Thought Reasoning to Language Agents},
  journal = {ACM Computing Surveys},
  year    = {2025},
}

[14] Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity
Cunxiang Wang*, Xiaoze Liu*, Yuanhao Yue*, Xiangru Tang, Tianhang Zhang, Cheng Jiayang, Yunzhi Yao, Wenyang Gao, Xuming Hu, Zehan Qi, Yidong Wang, Linyi Yang, Jindong Wang, Xing Xie, Zheng Zhang, Yue Zhang.
ACM Computing Surveys, 2024 (IF 23.8)
[PDF] [Abstract] [Bib]

LLM-Factuality-Survey

@article{wang2025survey,
  author  = {Wang, Cunxiang and Liu, Xiaoze and Yue, Yuanhao and Tang, Xiangru and Zhang, Tianhang and Jiayang, Cheng and Yao, Yunzhi and Gao, Wenyang and Hu, Xuming and Qi, Zehan and Wang, Yidong and Yang, Linyi and Wang, Jindong and Xie, Xing and Zhang, Zheng and Zhang, Yue},
  title   = {Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity},
  journal = {ACM Computing Surveys},
  year    = {2025},
}

[13] OctoPack: Instruction Tuning Code Large Language Models
Niklas Muennighoff, Qian Liu, Armel Randy Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro Von Werra, Shayne Longpre.
ICLR 2024
[PDF] [Abstract] [Bib]

OctoPack

@inproceedings{muennighoffoctopack,
  author    = {Muennighoff, Niklas and Liu, Qian and Zebaze, Armel Randy and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne},
  title     = {{OctoPack}: Instruction Tuning Code Large Language Models},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year      = {2024},
}

[12] ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs
Yujia Qin, Shihao Liang, Yining Ye, Kunlun Zhu, Lan Yan, Yaxi Lu, Yankai Lin, Xin Cong, Xiangru Tang, Bill Qian, Sihan Zhao, Lauren Hong, Runchu Tian, Ruobing Xie, Jie Zhou, Mark Gerstein, Dahai Li, Zhiyuan Liu, Maosong Sun.
ICLR 2024
[PDF] [Abstract] [Bib]

ToolBench

Despite the advancements of open-source large language models (LLMs), e.g., LLaMA, they remain significantly limited in tool-use capabilities, i.e., using external tools (APIs) to fulfill human instructions. The reason is that current instruction tuning largely focuses on basic language tasks but ignores the tool-use domain. This is in contrast to the excellent tool-use capabilities of state-of-the-art (SOTA) closed-source LLMs, e.g., ChatGPT. To bridge this gap, we introduce ToolLLM, a general tool-use framework encompassing data construction, model training, and evaluation. We first present ToolBench, an instruction-tuning dataset for tool use, which is constructed automatically using ChatGPT. Specifically, the construction can be divided into three stages: (i) API collection: we collect 16,464 real-world RESTful APIs spanning 49 categories from RapidAPI Hub; (ii) instruction generation: we prompt ChatGPT to generate diverse instructions involving these APIs, covering both single-tool and multi-tool scenarios; (iii) solution path annotation: we use ChatGPT to search for a valid solution path (chain of API calls) for each instruction. To enhance the reasoning capabilities of LLMs, we develop a novel depth-first search-based decision tree algorithm. It enables LLMs to evaluate multiple reasoning traces and expand the search space. Moreover, to evaluate the tool-use capabilities of LLMs, we develop an automatic evaluator: ToolEval. Based on ToolBench, we fine-tune LLaMA to obtain an LLM ToolLLaMA, and equip it with a neural API retriever to recommend appropriate APIs for each instruction. Experiments show that ToolLLaMA demonstrates a remarkable ability to execute complex instructions and generalize to unseen APIs, and exhibits comparable performance to ChatGPT. Our ToolLLaMA also demonstrates strong zero-shot generalization ability in an out-of-distribution tool-use dataset: APIBench.

@article{qin2023toolllm,
  author        = {Qin, Yujia and Liang, Shihao and Ye, Yining and Zhu, Kunlun and Yan, Lan and Lu, Yaxi and Lin, Yankai and Cong, Xin and Tang, Xiangru and Qian, Bill and Zhao, Sihan and Hong, Lauren and Tian, Runchu and Xie, Ruobing and Zhou, Jie and Gerstein, Mark and Li, Dahai and Liu, Zhiyuan and Sun, Maosong},
  title         = {{ToolLLM}: Facilitating Large Language Models to Master 16000+ Real-world {APIs}},
  journal       = {arXiv preprint arXiv:2307.16789},
  year          = {2023},
  eprint        = {2307.16789},
  archiveprefix = {arXiv},
}

[11] DocMath-Eval: Evaluating Math Reasoning Capabilities of LLMs in Understanding Long and Specialized Documents
Yilun Zhao, Yitao Long, Hongjun Liu, Ryo Kamoi, Linyong Nan, Lyuhao Chen, Yixin Liu, Xiangru Tang, Rui Zhang, Arman Cohan.
ACL 2024
[PDF] [Abstract] [Bib]

DocMath-Eval

Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing specialized documents containing both text and tables. We conduct an extensive evaluation of 48 LLMs with Chain-of-Thought and Program-of-Thought prompting methods, aiming to comprehensively assess the capabilities and limitations of existing LLMs in DocMath-Eval. We found that even the current best-performing system (i.e., GPT-4o) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe that DocMath-Eval can serve as a valuable benchmark for evaluating LLMs' capabilities in solving challenging numerical reasoning problems within expert domains.

@inproceedings{zhao-etal-2024-docmath,
    title = "{DocMath-Eval}: Evaluating Math Reasoning Capabilities of {LLM}s in Understanding Long and Specialized Documents",
    author = "Zhao, Yilun  and
      Long, Yitao  and
      Liu, Hongjun  and
      Kamoi, Ryo  and
      Nan, Linyong  and
      Chen, Lyuhao  and
      Liu, Yixin  and
      Tang, Xiangru  and
      Zhang, Rui  and
      Cohan, Arman",
    editor = "Ku, Lun-Wei  and
      Martins, Andre  and
      Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-long.852/",
    doi = "10.18653/v1/2024.acl-long.852",
    pages = "16103--16120",
    abstract = "Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing specialized documents containing both text and tables. We conduct an extensive evaluation of 48 LLMs with Chain-of-Thought and Program-of-Thought prompting methods, aiming to comprehensively assess the capabilities and limitations of existing LLMs in DocMath-Eval. We found that even the current best-performing system (i.e., GPT-4o) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe that DocMath-Eval can serve as a valuable benchmark for evaluating LLMs' capabilities in solving challenging numerical reasoning problems within expert domains."
}

[10] Unveiling the Spectrum of Data Contamination in Language Model: A Survey from Detection to Remediation
Chunyuan Deng, Yilun Zhao, Yuzhao Heng, Yitong Li, Jiannan Cao, Xiangru Tang, Arman Cohan.
ACL 2024 Findings
[PDF] [Abstract] [Bib]

Contamination-Survey

@inproceedings{deng-etal-2024-unveiling,
  author    = {Deng, Chunyuan and Zhao, Yilun and Heng, Yuzhao and Li, Yitong and Cao, Jiannan and Tang, Xiangru and Cohan, Arman},
  title     = {Unveiling the Spectrum of Data Contamination in Language Model: A Survey from Detection to Remediation},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  publisher = {Association for Computational Linguistics},
  address   = {Bangkok, Thailand},
  month     = aug,
  year      = {2024},
  pages     = {16078--16092},
  doi       = {10.18653/v1/2024.findings-acl.951},
  url       = {https://aclanthology.org/2024.findings-acl.951/},
  abstract  = {Data contamination has garnered increased attention in the era of Large language models (LLMs) due to the reliance on extensive internet-derived training corpora. The issue of training corpus overlap with evaluation benchmarks{---}referred to as contamination{---}has been the focus of significant recent research. This body of work aims to identify contamination, understand its impacts, and explore mitigation strategies from diverse perspectives. However, comprehensive studies that provide a clear pathway from foundational concepts to advanced insights are lacking in this nascent field. Therefore, we present the first survey in the field of data contamination. We begin by examining the effects of data contamination across various stages and forms. We then provide a detailed analysis of current contamination detection methods, categorizing them to highlight their focus, assumptions, strengths, and limitations. We also discuss mitigation strategies, offering a clear guide for future research. This survey serves as a succinct overview of the most recent advancements in data contamination research, providing a straightforward guide for the benefit of future research endeavors.}
}

[9] FinDVer: Explainable Claim Verification over Long and Hybrid-content Financial Documents
Yilun Zhao, Yitao Long, Tintin Jiang, Chengye Wang, Weiyuan Chen, Hongjun Liu, Xiangru Tang, Yiming Zhang, Chen Zhao, Arman Cohan.
EMNLP 2024
[PDF] [Abstract] [Bib]

@inproceedings{zhao-etal-2024-findver,
    title = "{FinDVer}: Explainable Claim Verification over Long and Hybrid-content Financial Documents",
    author = "Zhao, Yilun  and
      Long, Yitao  and
      Jiang, Tintin  and
      Wang, Chengye  and
      Chen, Weiyuan  and
      Liu, Hongjun  and
      Tang, Xiangru  and
      Zhang, Yiming  and
      Zhao, Chen  and
      Cohan, Arman",
    editor = "Al-Onaizan, Yaser  and
      Bansal, Mohit  and
      Chen, Yun-Nung",
    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2024",
    address = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.emnlp-main.818/",
    doi = "10.18653/v1/2024.emnlp-main.818",
    pages = "14739--14752",
    abstract = "We introduce FinDVer, a comprehensive benchmark specifically designed to evaluate the explainable claim verification capabilities of LLMs in the context of understanding and analyzing long, hybrid-content financial documents. FinDVer contains 4,000 expert-annotated examples across four subsets, each focusing on a type of scenario that frequently arises in real-world financial domains. We assess a broad spectrum of 25 LLMs under long-context and RAG settings. Our results show that even the current best-performing system (i.e., GPT-4o) significantly lags behind human experts. Our detailed findings and insights highlight the strengths and limitations of existing LLMs in this new task. We believe FinDVer can serve as a valuable benchmark for evaluating LLM capabilities in claim verification over complex, expert-domain documents."
}

[8] PRESTO: Progressive Pretraining Enhances Synthetic Chemistry Outcomes
He Cao, Yanjun Shao, Zhiyuan Liu, Zijing Liu, Xiangru Tang, Yuan Yao, Yu Li.
EMNLP 2024 Findings
[PDF] [Abstract] [Bib]

PRESTO

@inproceedings{cao-etal-2024-presto,
  author    = {Cao, He and Shao, Yanjun and Liu, Zhiyuan and Liu, Zijing and Tang, Xiangru and Yao, Yuan and Li, Yu},
  title     = {{PRESTO}: Progressive Pretraining Enhances Synthetic Chemistry Outcomes},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  month     = nov,
  year      = {2024},
  pages     = {10197--10224},
  doi       = {10.18653/v1/2024.findings-emnlp.597},
  url       = {https://aclanthology.org/2024.findings-emnlp.597/},
  abstract  = {Multimodal Large Language Models (MLLMs) have seen growing adoption across various scientific disciplines. These advancements encourage the investigation of molecule-text modeling within synthetic chemistry, a field dedicated to designing and conducting chemical reactions to synthesize new compounds with desired properties and applications. Current approaches, however, often neglect the critical role of multi-molecule graph interaction in understanding chemical reactions, leading to suboptimal performance in synthetic chemistry tasks. This study introduces PRESTO (Progressive Pretraining Enhances Synthetic Chemistry Outcomes), a new framework that bridges the molecule-text modality gap by integrating a comprehensive benchmark of pretraining strategies and dataset configurations. It progressively improves multimodal LLMs through cross-modal alignment and multi-graph understanding. Our extensive experiments demonstrate that PRESTO offers competitive results in downstream synthetic chemistry tasks. The code can be found at https://github.com/IDEA-XL/PRESTO.}
}

[7] Investigating Data Contamination in Modern Benchmarks for Large Language Models
Chunyuan Deng, Yilun Zhao, Xiangru Tang, Mark Gerstein, Arman Cohan.
NAACL 2024
[PDF] [Abstract] [Bib]

@inproceedings{deng-etal-2024-investigating,
  author    = {Deng, Chunyuan and Zhao, Yilun and Tang, Xiangru and Gerstein, Mark and Cohan, Arman},
  title     = {Investigating Data Contamination in Modern Benchmarks for Large Language Models},
  editor    = {Duh, Kevin and Gomez, Helena and Bethard, Steven},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Mexico City, Mexico},
  month     = jun,
  year      = {2024},
  pages     = {8706--8719},
  doi       = {10.18653/v1/2024.naacl-long.482},
  url       = {https://aclanthology.org/2024.naacl-long.482/},
  abstract  = {Recent observations have underscored a disparity between the inflated benchmark scores and the actual performance of LLMs, raising concerns about potential contamination of evaluation benchmarks. This issue is especially critical for closed-source models and certain open-source models where training data transparency is lacking. In this paper we study data contamination by proposing two methods tailored for both open-source and proprietary LLMs. We first introduce a retrieval-based system to explore potential overlaps between evaluation benchmarks and pretraining corpora. We further present a novel investigation protocol named Testset Slot Guessing (TS-Guessing), applicable to both open and proprietary models. This approach entails masking a wrong answer in a multiple-choice question and prompting the model to fill in the gap. Additionally, it involves obscuring an unlikely word in an evaluation example and asking the model to produce it. We find that certain commercial LLMs could surprisingly guess the missing option in various test sets. Specifically, in the MMLU benchmark, ChatGPT and GPT-4 demonstrated an exact match rate of 52{\%} and 57{\%}, respectively, in guessing the missing options in benchmark test data. We hope these results underscore the need for more robust evaluation methodologies and benchmarks in the field.}
}

[6] Data preparation for Deep Learning based Code Smell Detection: A systematic literature review
Fengji Zhang, Zexian Zhang, Jacky Wai Keung, Xiangru Tang, Zhen Yang, Xiao Yu, Wenhua Hu.
Journal of Systems and Software, 2024 (IF 3.7, JCR Q1)
[PDF] [Abstract] [Bib]

@article{zhang2024data,
  author    = {Zhang, Fengji and
               Zhang, Zexian and
               Keung, Jacky Wai and
               Tang, Xiangru and
               Yang, Zhen and
               Yu, Xiao and
               Hu, Wenhua},
  title     = {Data preparation for deep learning based code smell detection: A systematic literature review},
  journal   = {Journal of Systems and Software},
  publisher = {Elsevier},
  year      = {2024},
  pages     = {112131}
}

[5] FAVOR-GPT: a generative natural language interface to whole genome variant functional annotations
Thomas Cheng Li, Hufeng Zhou, Vineet Verma, Xiangru Tang, Yanjun Shao, Eric Van Buren, Zhiping Weng, Mark Gerstein, Benjamin Neale, Shamil R Sunyaev, Xihong Lin.
Bioinformatics Advances, 2024 (IF 2.32, JCR Q2)
[PDF] [Abstract] [Bib]

@article{10.1093/bioadv/vbae143,
    author = {Li, Thomas Cheng and Zhou, Hufeng and Verma, Vineet and Tang, Xiangru and Shao, Yanjun and Van Buren, Eric and Weng, Zhiping and Gerstein, Mark and Neale, Benjamin and Sunyaev, Shamil R and Lin, Xihong},
    title = {{FAVOR-GPT}: a generative natural language interface to whole genome variant functional annotations},
    journal = {Bioinformatics Advances},
    volume = {4},
    number = {1},
    pages = {vbae143},
    year = {2024},
    month = sep,
    abstract = {Functional Annotation of genomic Variants Online Resources (FAVOR) offers multi-faceted, whole genome variant functional annotations, which is essential for Whole Genome and Exome Sequencing (WGS/WES) analysis and the functional prioritization of disease-associated variants. A versatile chatbot designed to facilitate informative interpretation and interactive, user-centric summary of the whole genome variant functional annotation data in the FAVOR database is needed. We have developed FAVOR-GPT, a generative natural language interface powered by integrating large language models (LLMs) and FAVOR. It is developed based on the Retrieval Augmented Generation (RAG) approach, and complements the original FAVOR portal, enhancing usability for users, especially those without specialized expertise. FAVOR-GPT simplifies raw annotations by providing interpretable explanations and result summaries in response to the user's prompt. It shows high accuracy when cross-referencing with the FAVOR database, underscoring the robustness of the retrieval framework. Researchers can access FAVOR-GPT at FAVOR's main website (https://favor.genohub.org).},
    issn = {2635-0041},
    doi = {10.1093/bioadv/vbae143},
    url = {https://doi.org/10.1093/bioadv/vbae143},
    eprint = {https://academic.oup.com/bioinformaticsadvances/article-pdf/4/1/vbae143/59645690/vbae143.pdf},
}

[4] RobuT: A Systematic Study of Table QA Robustness Against Human-Annotated Adversarial Perturbations
Yilun Zhao, Chen Zhao, Linyong Nan, Zhenting Qi, Wenlin Zhang, Xiangru Tang, Boyu Mi, Dragomir Radev.
ACL 2023
[PDF] [Abstract] [Bib]

@inproceedings{zhao-etal-2023-robut,
    title = "{RobuT}: A Systematic Study of Table {QA} Robustness Against Human-Annotated Adversarial Perturbations",
    author = "Zhao, Yilun  and
      Zhao, Chen  and
      Nan, Linyong  and
      Qi, Zhenting  and
      Zhang, Wenlin  and
      Tang, Xiangru  and
      Mi, Boyu  and
      Radev, Dragomir",
    editor = "Rogers, Anna  and
      Boyd-Graber, Jordan  and
      Okazaki, Naoaki",
    booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2023",
    address = "Toronto, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.334/",
    doi = "10.18653/v1/2023.acl-long.334",
    pages = "6064--6081",
    abstract = "Despite significant progress having been made in question answering on tabular data (Table QA), it's unclear whether, and to what extent existing Table QA models are robust to task-specific perturbations, e.g., replacing key question entities or shuffling table columns. To systematically study the robustness of Table QA models, we propose a benchmark called RobuT, which builds upon existing Table QA datasets (WTQ, WikiSQL-Weak, and SQA) and includes human-annotated adversarial perturbations in terms of table header, table content, and question. Our results indicate that both state-of-the-art Table QA models and large language models (e.g., GPT-3) with few-shot learning falter in these adversarial sets. We propose to address this problem by using large language models to generate adversarial examples to enhance training, which significantly improves the robustness of Table QA models."
}

[3] QTSumm: Query-Focused Summarization over Tabular Data
Yilun Zhao, Zhenting Qi, Linyong Nan, Boyu Mi, Yixin Liu, Weijin Zou, Simeng Han, Ruizhe Chen, Xiangru Tang, Yumo Xu, Dragomir Radev, Arman Cohan.
EMNLP 2023
[PDF] [Abstract] [Bib]

@article{zhao2023qtsumm,
  author        = {Zhao, Yilun and Qi, Zhenting and Nan, Linyong and Mi, Boyu and Liu, Yixin and Zou, Weijin and Han, Simeng and Chen, Ruizhe and Tang, Xiangru and Xu, Yumo and others},
  title         = {{QTSumm}: Query-Focused Summarization over Tabular Data},
  journal       = {arXiv preprint arXiv:2305.14303},
  year          = {2023},
  eprint        = {2305.14303},
  archiveprefix = {arXiv}
}

[2] Investigating Table-to-Text Generation Capabilities of Large Language Models in Real-World Information Seeking Scenarios
Yilun Zhao, Haowei Zhang, Shengyun Si, Linyong Nan, Xiangru Tang, Arman Cohan.
EMNLP 2023 Industry Track
[PDF] [Abstract] [Bib]

@inproceedings{zhao-etal-2023-investigating,
  author    = {Zhao, Yilun and Zhang, Haowei and Si, Shengyun and Nan, Linyong and Tang, Xiangru and Cohan, Arman},
  title     = {Investigating Table-to-Text Generation Capabilities of Large Language Models in Real-World Information Seeking Scenarios},
  editor    = {Wang, Mingxuan and Zitouni, Imed},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  month     = dec,
  year      = {2023},
  pages     = {160--175},
  doi       = {10.18653/v1/2023.emnlp-industry.17},
  url       = {https://aclanthology.org/2023.emnlp-industry.17/},
  abstract  = {Tabular data is prevalent across various industries, necessitating significant time and effort for users to understand and manipulate for their information-seeking purposes. The advancements in large language models (LLMs) have shown enormous potential to improve user efficiency. However, the adoption of LLMs in real-world applications for table information seeking remains underexplored. In this paper, we investigate the table-to-text capabilities of different LLMs using four datasets within two real-world information seeking scenarios. These include the LogicNLG and our newly-constructed LoTNLG datasets for data insight generation, along with the FeTaQA and our newly-constructed F2WTQ datasets for query-based generation. We structure our investigation around three research questions, evaluating the performance of LLMs in table-to-text generation, automated evaluation, and feedback generation, respectively. Experimental results indicate that the current high-performing LLM, specifically GPT-4, can effectively serve as a table-to-text generator, evaluator, and feedback generator, facilitating users' information seeking purposes in real-world scenarios. However, a significant performance gap still exists between other open-sourced LLMs (e.g., Vicuna and LLaMA-2) and GPT-4 models. Our data and code are publicly available at https://github.com/yale-nlp/LLM-T2T.}
}

[1] RWKV: Reinventing RNNs for the Transformer Era
Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Stella Biderman, Huanqi Cao, Xin Cheng, Michael Chung, Leon Derczynski, Xingjian Du, Matteo Grella, Kranthi Gv, Xuzheng He, Haowen Hou, Przemyslaw Kazienko, Jan Kocon, Jiaming Kong, Bartłomiej Koptyra, Hayden Lau, Jiaju Lin, Krishna Sri Ipsit Mantri, Ferdinand Mom, Atsushi Saito, Guangyu Song, Xiangru Tang, Johan Wind, Stanisław Woźniak, Zhenyuan Zhang, Qinghua Zhou, Jian Zhu, Rui-Jie Zhu.
EMNLP 2023 Findings
[PDF] [Abstract] [Bib]

@inproceedings{peng-etal-2023-rwkv,
  author    = {Peng, Bo and Alcaide, Eric and Anthony, Quentin and Albalak, Alon and
               Arcadinho, Samuel and Biderman, Stella and Cao, Huanqi and Cheng, Xin and
               Chung, Michael and Derczynski, Leon and Du, Xingjian and Grella, Matteo and
               Gv, Kranthi and He, Xuzheng and Hou, Haowen and Kazienko, Przemyslaw and
               Kocon, Jan and Kong, Jiaming and Koptyra, Bart{\l}omiej and Lau, Hayden and
               Lin, Jiaju and Mantri, Krishna Sri Ipsit and Mom, Ferdinand and Saito, Atsushi and
               Song, Guangyu and Tang, Xiangru and Wind, Johan and Wo{\'z}niak, Stanis{\l}aw and
               Zhang, Zhenyuan and Zhou, Qinghua and Zhu, Jian and Zhu, Rui-Jie},
  title     = {{RWKV}: Reinventing {RNN}s for the Transformer Era},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023},
  publisher = {Association for Computational Linguistics},
  address   = {Singapore},
  month     = dec,
  year      = {2023},
  pages     = {14048--14077},
  doi       = {10.18653/v1/2023.findings-emnlp.936},
  url       = {https://aclanthology.org/2023.findings-emnlp.936/},
  abstract  = {Transformers have revolutionized almost all natural language processing (NLP) tasks but suffer from memory and computational complexity that scales quadratically with sequence length. In contrast, recurrent neural networks (RNNs) exhibit linear scaling in memory and computational requirements but struggle to match the same performance as Transformers due to limitations in parallelization and scalability. We propose a novel model architecture, Receptance Weighted Key Value (RWKV), that combines the efficient parallelizable training of transformers with the efficient inference of RNNs. Our approach leverages a linear attention mechanism and allows us to formulate the model as either a Transformer or an RNN, thus parallelizing computations during training and maintains constant computational and memory complexity during inference. We scale our models as large as 14 billion parameters, by far the largest dense RNN ever trained, and find RWKV performs on par with similarly sized Transformers, suggesting future work can leverage this architecture to create more efficient models. This work presents a significant step towards reconciling trade-offs between computational efficiency and model performance in sequence processing tasks.}
}

Recent Talks

07/2025 Talk at ISMB 2025 3DSIG Section.
11/2024 Talk at Takeda Pharmaceutical.
07/2024 Talk at Yale Department of Biomedical Informatics & Data Science.
07/2024 Talk at ISMB 2024 Text Mining Section.
07/2024 Talk at Multimodal Large Language Model.
02/2024 Talk at AI in Medicine Symposium at Yale School of Medicine.
01/2024 Talk at PSB 2024 Workshop on LLMs for Biomedicine.
07/2023 Talk at ISMB/ECCB 2023 Text Mining Section.

Workshop & Tutorial Organizers

Workshop Organizer: ICML 2025 Workshop on Multi-Agent Systems, ICCV 2025 Workshop on Knowledge-Intensive Multimodal Reasoning, ICLR 2024 Workshop on LLM Agents, SIGDIAL/INLG 2023 Workshop on Taming LLMs.
Tutorial Organizer: ISMB 2024 Tutorial on A Practical Introduction to LLMs in Biomedical Research.
Session Chair: ACL 2024 BoF on AI for Science, NAACL 2024 BoF on LLMs for Science.

Services

Area Chair: ACL ARR (ACL, EMNLP, NAACL, etc.).
Conference Program Committee / Reviewer: NeurIPS, ICML, ACL, EMNLP, CIKM, NAACL, INLG, IEEE BigData, COLM.
Journal Reviewer: npj Digital Medicine, TPAMI, Neurocomputing, Briefings in Bioinformatics, PLOS Computational Biology, BMC Bioinformatics, PLOS ONE, Health Data Science.
Workshop Reviewer: KDD 2023 Workshop on Data Mining in Bioinformatics, ACL 2023 Workshop on Building Educational Apps, ACL 2023 Workshop on Clinical NLP, ICML 2023 Workshop on Neural Conv AI, ICML 2023 Workshop on Interpretable ML in Healthcare, NAACL-HLT 2021 Workshop on Language and Vision Research.

Teaching

Teaching Fellow - CPSC 452/CPSC 552/AMTH 552/CB&B 663 Deep Learning Theory and Applications, Yale University, 2023 Spring.
Teaching Fellow - CPSC 437/CPSC 537 Introduction to Database Systems, Yale University, 2023 Fall.
Teaching Fellow - CPSC 452/CPSC 552/AMTH 552/CB&B 663 Deep Learning Theory and Applications, Yale University, 2024 Spring.
Teaching Fellow - CPSC 437/CPSC 537 Database Systems, Yale University, 2024 Fall.

Misc.

I took 12 courses (and 3 additional project credits) at Yale: CPSC 523 Principles of Operating Systems, 537 Intro to Database, 539 Software Engineering, 552 Deep Learning Theory, 553 Unsupervised Learning, 569 Randomized Algorithms, 577 NLP, 583 Deep Learning on Graph, 668 Blockchain Research, 677 Adv NLP, 680 Trustworthy Deep Learning, 752 Biomedical Data Sci.
Interestingly, this course load matches the entire requirement for a Yale undergraduate B.S. degree in Computer Science (which requires 11 courses + 1 project credit) and exceeds what's needed for a B.A. (which requires only 9 courses + 1 project credit).