Recent Preprints
-
[4] MedAgentsBench: Benchmarking Thinking Models and Agent Frameworks for Complex Medical Reasoning
Xiangru Tang, Daniel Shao, Jiwoong Sohn, Jiapeng Chen, Jiayi Zhang, Jinyu Xiang, Fang Wu, Yilun Zhao, Chenglin Wu, Wenqi Shi, Arman Cohan, Mark Gerstein.
arXiv preprint arXiv:2503.07459
"Thinking models (DeepSeek R1 and OpenAI o3) show exceptional performance on medical QA tasks."
[PDF] [Abstract] [Bib]MedagentsBenchLarge Language Models (LLMs) have shown impressive performance on existing medical question-answering benchmarks. This high performance makes it increasingly difficult to meaningfully evaluate and differentiate advanced methods. We present MedAgentsBench, a benchmark that focuses on challenging medical questions requiring multi-step clinical reasoning, diagnosis formulation, and treatment planning-scenarios where current models still struggle despite their strong performance on standard tests. Drawing from seven established medical datasets, our benchmark addresses three key limitations in existing evaluations: (1) the prevalence of straightforward questions where even base models achieve high performance, (2) inconsistent sampling and evaluation protocols across studies, and (3) lack of systematic analysis of the interplay between performance, cost, and inference time. Through experiments with various base models and reasoning methods, we demonstrate that the latest thinking models, DeepSeek R1 and OpenAI o3, exhibit exceptional performance in complex medical reasoning tasks. Additionally, advanced search-based agent methods offer promising performance-to-cost ratios compared to traditional approaches. Our analysis reveals substantial performance gaps between model families on complex questions and identifies optimal model selections for different computational constraints. Our benchmark and evaluation framework are publicly available at https://github.com/gersteinlab/medagents-benchmark.
@article{tang2025medagentsbench, title={MedAgentsBench: Benchmarking Thinking Models and Agent Frameworks for Complex Medical Reasoning}, author={Tang, Xiangru and Shao, Daniel and Sohn, Jiwoong and Chen, Jiapeng and Zhang, Jiayi and Xiang, Jinyu and Wu, Fang and Zhao, Yilun and Wu, Chenglin and Shi, Wenqi and others}, journal={arXiv preprint arXiv:2503.07459}, year={2025} }
-
[3] BC-Design: A Biochemistry-Aware Framework for High-Precision Inverse Protein Folding
Xiangru Tang*, Xinwu Ye*, Fang Wu*, Yanjun Shao, Yin Fang, Siming Chen, Dong Xu, Mark Gerstein.
bioRxiv 2024
"A quantum leap in inverse protein folding from 67% to 88%!"
[PDF] [Abstract] [Bib]BC-DesignInverse protein folding, which aims to design amino acid sequences for desired protein structures, is fundamental to protein engineering and therapeutic development. While recent deep-learning approaches have made remarkable progress in addressing this challenge, they typically represent biochemical properties as discrete features associated with individual residues. Here, we present BC-Design, an approach that explicitly represents these properties as decorations on randomly sampled points on exterior surfaces and within internally bound regions representing the complete molecular extent of the protein. This provides a more natural way to capture the spatial distribution of properties. We demonstrate that BC-Design significantly outperforms all current methods, improving sequence recovery from 67% to 88.37% over the state-of-the-art methods (a 21.32% absolute improvement) and reducing perplexity from 2.4 to 1.47 (a 39.51% relative improvement) on the CATH 4.2 benchmark. Notably, our model exhibits robust generalization across diverse protein characteristics, achieving consistently high performance on proteins of varying sizes (50-500 residues), structural complexity (measured by contact order), and all major CATH fold classes. Through ablation tests, we compare the relative contribution of both structure encoding information and the encoded property information, and we show that both substantially contribute equally to this strong performance. Overall, this opens new avenues for computational protein engineering and drug discovery.
@article{tang2024bc, title={BC-Design: A Biochemistry-Aware Framework for High-Precision Inverse Protein Folding}, author={Tang, Xiangru and Ye, Xinwu and Wu, Fang and Shao, Yanjun and Fang, Yin and Chen, Siming and Xu, Dong and Gerstein, Mark}, journal={bioRxiv}, pages={2024--10}, year={2024}, publisher={Cold Spring Harbor Laboratory} }
-
[2] LocAgent: Graph-Guided LLM Agents for Code Localization
Zhaoling Chen*, Xiangru Tang*, Gangda Deng*, Fang Wu, Jialong Wu, Zhiwei Jiang, Viktor Prasanna, Arman Cohan, Xingyao Wang.
arXiv preprint arXiv:2503.09089
"No need to embed the entire repo, agent + graph-based indexing is all you need!"
[PDF] [Abstract] [Bib]LocAgentCode localization--identifying precisely where in a codebase changes need to be made--is a fundamental yet challenging task in software maintenance. Existing approaches struggle to efficiently navigate complex codebases when identifying relevant code sections. The challenge lies in bridging natural language problem descriptions with the appropriate code elements, often requiring reasoning across hierarchical structures and multiple dependencies. We introduce LocAgent, a framework that addresses code localization through graph-based representation. By parsing codebases into directed heterogeneous graphs, LocAgent creates a lightweight representation that captures code structures (files, classes, functions) and their dependencies (imports, invocations, inheritance), enabling LLM agents to effectively search and locate relevant entities through powerful multi-hop reasoning. Experimental results on real-world benchmarks demonstrate that our approach significantly enhances accuracy in code localization. Notably, our method with the fine-tuned Qwen-2.5-Coder-Instruct-32B model achieves comparable results to SOTA proprietary models at greatly reduced cost (approximately 86% reduction), reaching up to 92.7% accuracy on file-level localization while improving downstream GitHub issue resolution success rates by 12% for multiple attempts (Pass@10).
@article{chen2025locagent, title={LocAgent: Graph-Guided LLM Agents for Code Localization}, author={Chen, Zhaoling and Tang, Xiangru and Deng, Gangda and Wu, Fang and Wu, Jialong and Jiang, Zhiwei and Prasanna, Viktor and Cohan, Arman and Wang, Xingyao}, journal={arXiv preprint arXiv:2503.09089}, year={2025} }
-
[1] D-Flow: Multi-modality Flow Matching for D-peptide Design
Fang Wu, Tinson Xu, Shuting Jin, Xiangru Tang, Zerui Xu, James Zou, Brian Hie.
bioRxiv 2024
[PDF] [Abstract] [Bib] PeptideDesign: Proteins play crucial roles in biological processes, with therapeutic peptides emerging as promising pharmaceutical agents. They open new possibilities for leveraging target binding sites that were previously undruggable. While deep learning (DL) has advanced peptide discovery, generating D-proteins composed of D-amino acids remains challenging due to the scarcity of natural examples. This paper proposes D-Flow, a full-atom flow-based framework for de novo D-peptide design. D-Flow is conditioned on receptor binding and utilizes a comprehensive representation of peptide structure, incorporating backbone frames, side-chain angles, and discrete amino acid types. A mirror-image algorithm, which converts the chirality of L-receptors, is implemented to address the lack of training data for D-proteins. Furthermore, we enhance D-Flow's capacity by integrating large protein language models (PLMs) with structural awareness through a lightweight structural adapter. A two-stage training pipeline and a controlling toolkit also enable D-Flow to transition from general protein design to targeted binder design while preserving pretraining knowledge. Extensive experimental results on the PepMerge benchmark demonstrate D-Flow's effectiveness, particularly in developing peptides with entire D-residues. This approach represents a significant advancement in computational D-peptide design, offering unique opportunities for bioorthogonal and stable molecular tools and diagnostics. The code is available at https://github.com/smiles724/PeptideDesign.
@article{wu2024d, title={D-Flow: Multi-modality Flow Matching for D-peptide Design}, author={Wu, Fang and Xu, Tinson and Jin, Shuting and Tang, Xiangru and Xu, Zerui and Zou, James and Hie, Brian}, journal={arXiv preprint arXiv:2411.10618}, year={2024} }
Selected Publications
-
[16] ML-Bench: Evaluating Large Language Models and Agents for Machine Learning Tasks on Repository-Level Code
Xiangru Tang*, Yuliang Liu*, Zefan Cai*, Junjie Lu, Yichi Zhang, Yanjun Shao, Zexuan Deng, Helan Hu, Kaikai An, Ruijun Huang, Shuzheng Si, Sheng Chen, Haozhe Zhao, Liang Chen, Yan Wang, Tianyu Liu, Zhiwei Jiang, Baobao Chang, Yujia Qin, Wangchunshu Zhou, Yilun Zhao, Arman Cohan, Mark Gerstein.
ICLR 2025 Workshop on Deep Learning for Code
ICLR 2025 Workshop on Agentic AI for Scientific Discovery
"Can LLMs do machine learning tasks?"
[PDF] [Abstract] [Bib]ML-BenchDespite Large Language Models (LLMs) like GPT-4 achieving impressive results in function-level code generation, they struggle with repository-scale code understanding (e.g., coming up with the right arguments for calling routines), requiring a deeper comprehension of complex file interactions. Also, recently, people have developed LLM agents that attempt to interact with repository code (e.g., compiling and evaluating its execution), prompting the need to evaluate their performance. These gaps have motivated our development of ML-Bench, a benchmark rooted in real-world programming applications that leverage existing code repositories to perform tasks. Addressing the need for LLMs to interpret long code contexts and translate instructions into precise, executable scripts, ML-Bench encompasses annotated 9,641 examples across 18 GitHub repositories, challenging LLMs to accommodate user-specified arguments and documentation intricacies effectively. To evaluate both LLMs and AI agents, two setups are employed: ML-LLM-Bench for assessing LLMs' text-to-code conversion within a predefined deployment environment, and ML-Agent-Bench for testing autonomous agents in an end-to-end task execution within a Linux sandbox environment. Our findings indicate that while GPT-4o leads with a Pass@5 rate surpassing 50%, there remains significant scope for improvement, highlighted by issues such as hallucinated outputs and difficulties with bash script generation. Notably, in the more demanding ML-Agent-Bench, GPT-4o achieves a 76.47% success rate, reflecting the efficacy of iterative action and feedback in complex task resolution.
@article{tang2023ml, title={ML-Bench: Evaluating Large Language Models and Agents for Machine Learning Tasks on Repository-Level Code}, author={Tang, Xiangru and Liu, Yuliang and Cai, Zefan and Shao, Yanjun and Lu, Junjie and Zhang, Yichi and Deng, Zexuan and Hu, Helan and Yang, Zengxian and An, Kaikai and others}, journal={arXiv preprint arXiv:2311.09835}, year={2023} }
-
[15] Risks of AI Scientists: Prioritizing Safeguarding Over Autonomy
Xiangru Tang, Qiao Jin, Kunlun Zhu, Tongxin Yuan, Yichi Zhang, Wangchunshu Zhou, Meng Qu, Yilun Zhao, Jian Tang, Zhuosheng Zhang, Arman Cohan, Zhiyong Lu, Mark Gerstein.
Nature Communications, 2025 (IF 14.7)
ICLR 2024 Workshop on LLM Agents
[PDF] [Abstract] [Bib]Intelligent agents powered by large language models (LLMs) have demonstrated substantial promise in autonomously conducting experiments and facilitating scientific discoveries across various disciplines. While their capabilities are promising, they also introduce novel vulnerabilities that demand careful consideration for safety. However, there exists a notable gap in the literature, as there has been no comprehensive exploration of these vulnerabilities. This position paper fills this gap by conducting a thorough examination of vulnerabilities in LLM-based agents within scientific domains, shedding light on potential risks associated with their misuse and emphasizing the need for safety measures. We begin by providing a comprehensive overview of the potential risks inherent to scientific LLM agents, taking into account user intent, the specific scientific domain, and their potential impact on the external environment. Then, we delve into the origins of these vulnerabilities and provide a scoping review of the limited existing works. Based on our analysis, we propose a triadic framework involving human regulation, agent alignment, and an understanding of environmental feedback (agent regulation) to mitigate these identified risks. Furthermore, we highlight the limitations and challenges associated with safeguarding scientific agents and advocate for the development of improved models, robust benchmarks, and comprehensive regulations to address these issues effectively.
@article{tang2024prioritizing, title={Prioritizing Safeguarding Over Autonomy: Risks of LLM Agents for Science}, author={Tang, Xiangru and Jin, Qiao and Zhu, Kunlun and Yuan, Tongxin and Zhang, Yichi and Zhou, Wangchunshu and Qu, Meng and Zhao, Yilun and Tang, Jian and Zhang, Zhuosheng and others}, journal={arXiv preprint arXiv:2402.04247}, year={2024} }
-
[14] ChemAgent: Self-updating Memories in Large Language Models Improves Chemical Reasoning
Xiangru Tang*, Tianyu Hu*, Muyang Ye*, Yanjun Shao*, Xunjian Yin, Siru Ouyang, Wangchunshu Zhou, Pan Lu, Zhuosheng Zhang, Yilun Zhao, Arman Cohan, Mark Gerstein.
ICLR 2025
"Enable LLMs to continuously improve through experience."
[PDF] [Abstract] [Bib]ChemAgentChemical reasoning usually involves complex, multi-step processes that demand precise calculations, where even minor errors can lead to cascading failures. Furthermore, large language models (LLMs) encounter difficulties handling domain-specific formulas, executing reasoning steps accurately, and integrating code effectively when tackling chemical reasoning tasks. To address these challenges, we present ChemAgent, a novel framework designed to improve the performance of LLMs through a dynamic, self-updating library. This library is developed by decomposing chemical tasks into sub-tasks and compiling these sub-tasks into a structured collection that can be referenced for future queries. Then, when presented with a new problem, ChemAgent retrieves and refines pertinent information from the library, which we call memory, facilitating effective task decomposition and the generation of solutions. Our method designs three types of memory and a library-enhanced reasoning component, enabling LLMs to improve over time through experience. Experimental results on four chemical reasoning datasets from SciBench demonstrate that ChemAgent achieves performance gains of up to 46% (GPT-4), significantly outperforming existing methods. Our findings suggest substantial potential for future applications, including tasks such as drug discovery and materials science.
@inproceedings{tang2025chemagent, title={ChemAgent: Self-updating Library in Large Language Models Improves Chemical Reasoning}, author={Tang, Xiangru and Hu, Tianyu and Ye, Muyang and Shao, Yanjun and Yin, Xunjian and Ouyang, Siru and Zhou, Wangchunshu and Lu, Pan and Zhang, Zhuosheng and Zhao, Yilun and others}, booktitle={The Thirteenth International Conference on Learning Representations} }
-
[13] Fast, Sensitive Detection of Protein Homologs Using Deep Dense Retrieval
Liang Hong*, Zhihang Hu*, Siqi Sun*, Xiangru Tang*, Jiuming Wang, Qingxiong Tan, Liangzhen Zheng, Sheng Wang, Sheng Xu, Irwin King, Mark Gerstein, Yu Li.
Nature Biotechnology, 2024 (IF 33.1)
"Up to 28,700 times faster than HMMER!"
[PDF] [Abstract] [Bib]DPRThe identification of protein homologs in large databases using conventional methods, such as protein sequence comparison, often misses remote homologs. Here, we offer an ultrafast, highly sensitive method, dense homolog retriever (DHR), for detecting homologs on the basis of a protein language model and dense retrieval techniques. Its dual-encoder architecture generates different embeddings for the same protein sequence and easily locates homologs by comparing these representations. Its alignment-free nature improves speed and the protein language model incorporates rich evolutionary and structural information within DHR embeddings. DHR achieves a >10% increase in sensitivity compared to previous methods and a >56% increase in sensitivity at the superfamily level for samples that are challenging to identify using alignment-based approaches. It is up to 22 times faster than traditional methods such as PSI-BLAST and DIAMOND and up to 28,700 times faster than HMMER. The new remote homologs exclusively found by DHR are useful for revealing connections between well-characterized proteins and improving our knowledge of protein evolution, structure and function.
@article{hong2024fast, title={Fast, sensitive detection of protein homologs using deep dense retrieval}, author={Hong, Liang and Hu, Zhihang and Sun, Siqi and Tang, Xiangru and Wang, Jiuming and Tan, Qingxiong and Zheng, Liangzhen and Wang, Sheng and Xu, Sheng and King, Irwin and others}, journal={Nature Biotechnology}, pages={1--13}, year={2024}, publisher={Nature Publishing Group US New York} }
-
[12] MIMIR: A Customizable Agent Tuning Platform for Enhanced Scientific Applications
Xiangru Tang*, Chunyuan Deng*, Hanmin Wang*, Haoran Wang*, Yilun Zhao, Wenqi Shi, Yi Fung, Wangchunshu Zhou, Jiannan Cao, Heng Ji, Arman Cohan, Mark Gerstein.
EMNLP 2024
[PDF] [Abstract] [Bib] MIMIR: Recently, large language models (LLMs) have evolved into interactive agents, proficient in planning, tool use, and task execution across various tasks. However, without agent-tuning, open-source models like LLaMA2 currently struggle to match the efficiency of larger models such as GPT-4 in scientific applications due to a lack of agent-tuning datasets. In response, we introduce MIMIR, a streamlined platform offering a customizable pipeline that enables users to leverage both private knowledge and publicly available, legally compliant datasets at scale for agent tuning. Additionally, MIMIR supports the generation of general instruction-tuning datasets from the same input. This dual capability ensures LLM agents developed through the platform possess specific agent abilities and general competencies. MIMIR integrates these features into an end-to-end platform, facilitating everything from the uploading of scientific data to one-click agent fine-tuning. MIMIR is publicly released and actively maintained at https://github.com/gersteinlab/MIMIR, along with a demo video for a quick start, calling for broader development.
@inproceedings{tang-etal-2024-mimir, title = "{MIMIR}: A Customizable Agent Tuning Platform for Enhanced Scientific Applications", author = "Tang, Xiangru and Deng, Chunyuan and Wang, Hanmin and Wang, Haoran and Zhao, Yilun and Shi, Wenqi and Fung, Yi and Zhou, Wangchunshu and Cao, Jiannan and Ji, Heng and Cohan, Arman and Gerstein, Mark", editor = "Hernandez Farias, Delia Irazu and Hope, Tom and Li, Manling", booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", month = nov, year = "2024", address = "Miami, Florida, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.emnlp-demo.49", pages = "486--496", }
-
[11] Step-Back Profiling: Distilling User History for Personalized Scientific Writing
Xiangru Tang, Xingyao Zhang, Yanjun Shao, Jie Wu, Yilun Zhao, Arman Cohan, Ming Gong, Dongmei Zhang, Mark Gerstein.
IJCAI 2024 Workshop on AI4Research (Best Paper Award)
[PDF] [Abstract] [Bib]Step-Back ProfilingLarge language models (LLMs) excel at a variety of natural language processing tasks, yet they struggle to generate personalized content for individuals, particularly in real-world scenarios like scientific writing. Addressing this challenge, we introduce Step-Back Profiling to personalize LLMs by distilling user history into concise profiles, including essential traits and preferences of users. Regarding our experiments, we construct a Personalized Scientific Writing (PSW) dataset to study multiuser personalization. PSW requires the models to write scientific papers given specialized author groups with diverse academic backgrounds. As for the results, we demonstrate the effectiveness of capturing user characteristics via Step-Back Profiling for collaborative writing. Moreover, our approach outperforms the baselines by up to 3.6 points on the general personalization benchmark (LaMP), including 7 personalization LLM tasks. Our extensive ablation studies validate the contributions of different components in our method and provide insights into our task definition.
@article{tang2024step, title={Step-Back Profiling: Distilling User History for Personalized Scientific Writing}, author={Xiangru Tang and Xingyao Zhang and Yanjun Shao and Jie Wu and Yilun Zhao and Arman Cohan and Ming Gong and Dongmei Zhang and Mark Gerstein}, journal={arXiv preprint arXiv:2406.14275}, year={2024} }
-
[10] A Survey of Generative AI for De Novo Drug Design: New Frontiers in Molecule and Protein Generation
Xiangru Tang*, Howard Dai*, Elizabeth Knight*, Fang Wu, Yunyang Li, Tianxiao Li, Mark Gerstein.
Briefings in Bioinformatics, 2024 (IF 13.99, JCR "Q1")
"An introductory overview with a clear breakdown of datasets, benchmarks, & models."
[PDF] [Abstract] [Bib]GenAI4DrugArtificial intelligence (AI)-driven methods can vastly improve the historically costly drug design process, with various generative models already in widespread use. Generative models for de novo drug design, in particular, focus on the creation of novel biological compounds entirely from scratch, representing a promising future direction. Rapid development in the field, combined with the inherent complexity of the drug design process, creates a difficult landscape for new researchers to enter. In this survey, we organize de novo drug design into two overarching themes: small molecule and protein generation. Within each theme, we identify a variety of subtasks and applications, highlighting important datasets, benchmarks, and model architectures and comparing the performance of top models. We take a broad approach to AI-driven drug design, allowing for both micro-level comparisons of various methods within each subtask and macro-level observations across different fields. We discuss parallel challenges and approaches between the two applications and highlight future directions for AI-driven de novo drug design as a whole. An organized repository of all covered sources is available at https://github.com/gersteinlab/GenAI4Drug.
@article{tang2024survey, title={A survey of generative ai for de novo drug design: new frontiers in molecule and protein generation}, author={Tang, Xiangru and Dai, Howard and Knight, Elizabeth and Wu, Fang and Li, Yunyang and Li, Tianxiao and Gerstein, Mark}, journal={Briefings in Bioinformatics}, volume={25}, number={4}, year={2024}, publisher={Oxford Academic} }
-
[9] MolLM: A Unified Language Model for Integrating Biomedical Text with 2D and 3D Molecular Representations
Xiangru Tang, Andrew Tran, Jeffrey Tan, Mark Gerstein.
ISMB 2024, Proceedings in Bioinformatics (IF 6.93, JCR "Q1")
[PDF] [Abstract] [Bib]MolLMThe present paradigm of deep learning models for molecular representation relies mostly on 1D or 2D formats, neglecting significant 3D structural information that offers valuable physical insight. This narrow focus inhibits the model's versatility and adaptability across a wide range of modalities. Conversely, the smaller amount of research that focuses on explicit 3D representation tends to overlook textual data within the biomedical domain. We present a unified pre-trained language model that concurrently captures biomedical text, 2D, and 3D molecular information. Our model (the three-modality molecular language model, MolLM) consists of a text Transformer encoder and a molecular Transformer encoder, which encodes both 2D and 3D molecular structures. Our approach employs contrastive learning as a supervisory signal for cross-modal information learning, and we assemble a multimodality dataset using cheminformatics-based molecular modifications and a wealth of chemical text. MolLM demonstrates robust molecular representation capabilities in numerous downstream tasks, including cross-modality molecule and text matching, property prediction, captioning, and text-prompted editing. Through ablating the 3D functionality of our model, we demonstrate that the inclusion of text, 2D, and 3D representations significantly improves performance on the downstream tasks. Our code, data, and pre-trained model weights are all available at https://github.com/gersteinlab/MolLM.
@article{10.1093/bioinformatics/btae260, author = {Tang, Xiangru and Tran, Andrew and Tan, Jeffrey and Gerstein, Mark B}, title = "{MolLM: a unified language model for integrating biomedical text with 2D and 3D molecular representations}", journal = {Bioinformatics}, volume = {40}, number = {Supplement_1}, pages = {i357-i368}, year = {2024}, month = {06}, abstract = "{The current paradigm of deep learning models for the joint representation of molecules and text primarily relies on 1D or 2D molecular formats, neglecting significant 3D structural information that offers valuable physical insight. This narrow focus inhibits the models’ versatility and adaptability across a wide range of modalities. Conversely, the limited research focusing on explicit 3D representation tends to overlook textual data within the biomedical domain.We present a unified pre-trained language model, MolLM, that concurrently captures 2D and 3D molecular information alongside biomedical text. MolLM consists of a text Transformer encoder and a molecular Transformer encoder, designed to encode both 2D and 3D molecular structures. To support MolLM’s self-supervised pre-training, we constructed 160K molecule-text pairings. Employing contrastive learning as a supervisory signal for learning, MolLM demonstrates robust molecular representation capabilities across four downstream tasks, including cross-modal molecule and text matching, property prediction, captioning, and text-prompted molecular editing. Through ablation, we demonstrate that the inclusion of explicit 3D representations improves performance in these downstream tasks.Our code, data, pre-trained model weights, and examples of using our model are all available at https://github.com/gersteinlab/MolLM. In particular, we provide Jupyter Notebooks offering step-by-step guidance on how to use MolLM to extract embeddings for both molecules and text.}", issn = {1367-4811}, doi = {10.1093/bioinformatics/btae260}, url = {https://doi.org/10.1093/bioinformatics/btae260}, eprint = {https://academic.oup.com/bioinformatics/article-pdf/40/Supplement\_1/i357/58355106/btae260.pdf}, }
-
[8] BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models
Xiangru Tang, Bill Qian, Rick Gao, Jiakang Chen, Xinyun Chen, Mark Gerstein.
ISMB 2024, Proceedings in Bioinformatics (IF 6.93, JCR "Q1")
"BioCoder input covers repository-level potential package dependencies, class declarations, & global variables."
[PDF] [Abstract] [Bib]BioCoderPre-trained language models like ChatGPT have significantly improved code generation. As these models scale up, there is an increasing need for the output to handle more intricate tasks. Moreover, in bioinformatics, generating functional programs poses additional notable challenges due to the amount of domain knowledge, the need for complicated data operations, and intricate functional dependencies between the operations. Here, we present BioCoder, a benchmark developed to evaluate existing pre-trained models in generating bioinformatics code. In relation to function-code generation, BioCoder covers potential package dependencies, class declarations, and global variables. It incorporates 1026 functions and 1243 methods in Python and Java from GitHub and 253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing framework for evaluation, and we have applied it to evaluate many models including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes the importance of domain knowledge, pragmatic code generation, and contextual understanding. Our dataset, benchmark, Docker images, and scripts required for testing are all available at https://github.com/gersteinlab/biocoder.
@article{10.1093/bioinformatics/btae230, author = {Tang, Xiangru and Qian, Bill and Gao, Rick and Chen, Jiakang and Chen, Xinyun and Gerstein, Mark B}, title = "{BioCoder: a benchmark for bioinformatics code generation with large language models}", journal = {Bioinformatics}, volume = {40}, number = {Supplement_1}, pages = {i266-i276}, year = {2024}, month = {06}, abstract = "{Pretrained large language models (LLMs) have significantly improved code generation. As these models scale up, there is an increasing need for the output to handle more intricate tasks and to be appropriately specialized to particular domains. Here, we target bioinformatics due to the amount of domain knowledge, algorithms, and data operations this discipline requires. We present BioCoder, a benchmark developed to evaluate LLMs in generating bioinformatics-specific code. BioCoder spans much of the field, covering cross-file dependencies, class declarations, and global variables. It incorporates 1026 Python functions and 1243 Java methods extracted from GitHub, along with 253 examples from the Rosalind Project, all pertaining to bioinformatics. Using topic modeling, we show that the overall coverage of the included code is representative of the full spectrum of bioinformatics calculations. BioCoder incorporates a fuzz-testing framework for evaluation. We have applied it to evaluate various models including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+, GPT-3.5, and GPT-4. Furthermore, we fine-tuned one model (StarCoder), demonstrating that our training dataset can enhance the performance on our testing benchmark (by \\>15\\% in terms of Pass@K under certain prompt configurations and always \\>3\\%). The results highlight two key aspects of successful models: (i) Successful models accommodate a long prompt (\\>2600 tokens) with full context, including functional dependencies. (ii) They contain domain-specific knowledge of bioinformatics, beyond just general coding capability. This is evident from the performance gain of GPT-3.5/4 compared to the smaller models on our benchmark (50\\% versus up to 25\\%).All datasets, benchmark, Docker images, and scripts required for testing are available at: https://github.com/gersteinlab/biocoder and https://biocoder-benchmark.github.io/.}", issn = {1367-4811}, doi = {10.1093/bioinformatics/btae230}, url = {https://doi.org/10.1093/bioinformatics/btae230}, eprint = {https://academic.oup.com/bioinformatics/article-pdf/40/Supplement\_1/i266/58354818/btae230.pdf}, }
-
[7] MedAgents: Large Language Models as Collaborators for Zero-shot Medical Reasoning
Xiangru Tang*, Anni Zou*, Zhuosheng Zhang, Yilun Zhao, Xingyao Zhang, Arman Cohan, Mark Gerstein.
ACL 2024 Findings
"The first multi-agent framework within the medical context!"
[PDF] [Abstract] [Bib] MedAgents: Large Language Models (LLMs), despite their remarkable progress across various general domains, encounter significant barriers in medicine and healthcare. This field faces unique challenges such as domain-specific terminologies and reasoning over specialized knowledge. To address these obstinate issues, we propose a novel Multi-disciplinary Collaboration (MC) framework for the medical domain that leverages role-playing LLM-based agents who participate in a collaborative multi-round discussion, thereby enhancing LLM proficiency and reasoning capabilities. This training-free and interpretable framework encompasses five critical steps: gathering domain experts, proposing individual analyses, summarizing these analyses into a report, iterating over discussions until a consensus is reached, and ultimately making a decision. Our work particularly focuses on the zero-shot scenario; our results on nine datasets (MedQA, MedMCQA, PubMedQA, and six subtasks from MMLU) establish that our proposed MC framework excels at mining and harnessing the medical expertise in LLMs, as well as extending their reasoning abilities. Based on these outcomes, we further conduct a human evaluation to pinpoint and categorize common errors within our method, as well as ablation studies aimed at understanding the impact of various factors on overall performance.
@article{tang2023medagents, title={MedAgents: Large Language Models as Collaborators for Zero-shot Medical Reasoning}, author={Tang, Xiangru and Zou, Anni and Zhang, Zhuosheng and Zhao, Yilun and Zhang, Xingyao and Cohan, Arman and Gerstein, Mark}, journal={arXiv preprint arXiv:2311.10537}, year={2023} }
-
[6] Struc-Bench: Are Large Language Models Good at Generating Complex Structured Tabular Data?
Xiangru Tang, Yiming Zong, Jason Phang, Yilun Zhao, Wangchunshu Zhou, Arman Cohan, Mark Gerstein.
NAACL 2024 (Oral)
[PDF] [Abstract] [Bib]Struc-BenchDespite the remarkable capabilities of Large Language Models (LLMs) like GPT-4, producing complex, structured tabular data remains challenging. Our study assesses LLMs' proficiency in structuring tables and introduces a novel fine-tuning method, cognizant of data structures, to bolster their performance. We unveil Struc-Bench, a comprehensive benchmark featuring prominent LLMs (GPT-NeoX-20B, GPT-3.5, GPT-4, and Vicuna), which spans text tables, HTML, and LaTeX formats. Our proposed FormatCoT aids in crafting format-specific instructions from the intended outputs to populate this benchmark. Addressing the gap in task-centered evaluation, we propose two innovative metrics, P-Score (Prompting Score) and H-Score (Heuristical Score), to more accurately gauge LLM performance. Our experiments show that applying our structure-aware fine-tuning to LLaMA-7B leads to substantial performance gains, outshining its LLM counterparts across most measures. In-depth error analysis and creating an ability map across six dimensions, coverage, formatting, reasoning, comprehension, pragmatics, and hallucination, highlight areas for future enhancements and suggest forthcoming research trajectories. Our code and models can be found at https://github.com/gersteinlab/Struc-Bench.
@inproceedings{tang-etal-2024-struc, title = "Struc-Bench: Are Large Language Models Good at Generating Complex Structured Tabular Data?", author = "Tang, Xiangru and Zong, Yiming and Phang, Jason and Zhao, Yilun and Zhou, Wangchunshu and Cohan, Arman and Gerstein, Mark", editor = "Duh, Kevin and Gomez, Helena and Bethard, Steven", booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)", month = jun, year = "2024", address = "Mexico City, Mexico", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.naacl-short.2", pages = "12--34", abstract = "Despite the remarkable capabilities of Large Language Models (LLMs) like GPT-4, producing complex, structured tabular data remains challenging. Our study assesses LLMs{'} proficiency in structuring tables and introduces a novel fine-tuning method, cognizant of data structures, to bolster their performance. We unveil Struc-Bench, a comprehensive benchmark featuring prominent LLMs (GPT-NeoX-20B, GPT-3.5, GPT-4, and Vicuna), which spans text tables, HTML, and LaTeX formats. Our proposed FormatCoT aids in crafting format-specific instructions from the intended outputs to populate this benchmark. Addressing the gap in task-centered evaluation, we propose two innovative metrics, P-Score (Prompting Score) and H-Score (Heuristical Score), to more accurately gauge LLM performance. Our experiments show that applying our structure-aware fine-tuning to LLaMA-7B leads to substantial performance gains, outshining its LLM counterparts across most measures. In-depth error analysis and creating an ability map across six dimensions, coverage, formatting, reasoning, comprehension, pragmatics, and hallucination, highlight areas for future enhancements and suggest forthcoming research trajectories. Our code and models can be found at https://github.com/gersteinlab/Struc-Bench.", }
-
[5] Meta-CoT: Generalizable Chain-of-Thought Prompting in Mixed-task Scenarios with Large Language Models
Anni Zou, Zhuosheng Zhang, Hai Zhao, Xiangru Tang.
IEEE Transactions on Audio, Speech and Language Processing (In Review)
"Bridge the gap between performance and generalization when using the CoT prompting!"
[PDF] [Abstract] [Bib]Meta-CoTLarge language models (LLMs) have unveiled remarkable reasoning capabilities by exploiting chain-of-thought (CoT) prompting, which generates intermediate reasoning chains to serve as the rationale for deriving the answer. However, current CoT methods either simply employ general prompts such as Let's think step by step, or heavily rely on handcrafted task-specific demonstrations to attain preferable performances, thereby engendering an inescapable gap between performance and generalization. To bridge this gap, we propose Meta-CoT, a generalizable CoT prompting method in mixed-task scenarios where the type of input questions is unknown. Meta-CoT firstly categorizes the scenario based on the input question and subsequently constructs diverse demonstrations from the corresponding data pool in an automatic pattern. Meta-CoT simultaneously enjoys remarkable performances on ten public benchmark reasoning tasks and superior generalization capabilities. Notably, Meta-CoT achieves the state-of-the-art result on SVAMP (93.7%) without any additional program-aided methods. Our further experiments on five out-of-distribution datasets verify the stability and generality of Meta-CoT.
@article{zou2023metacot, title={Meta-CoT: Generalizable Chain-of-Thought Prompting in Mixed-task Scenarios with Large Language Models}, author={Anni Zou and Zhuosheng Zhang and Hai Zhao and Xiangru Tang}, journal={arXiv preprint arXiv:2310.06692}, year={2023} }
-
[4] Aligning Factual Consistency for Clinical Studies Summarization through Reinforcement Learning
Xiangru Tang, Arman Cohan, Mark Gerstein.
ACL 2023 Clinical Natural Language Processing
[PDF] [Abstract] [Bib]In the rapidly evolving landscape of medical research, accurate and concise summarization of clinical studies is crucial to support evidence-based practice. This paper presents a novel approach to clinical studies summarization, leveraging reinforcement learning to enhance factual consistency and align with human annotator preferences. Our work focuses on two tasks: Conclusion Generation and Review Generation. We train a CONFIT summarization model that outperforms GPT-3 and previous state-of-the-art models on the same datasets and collects expert and crowd-worker annotations to evaluate the quality and factual consistency of the generated summaries. These annotations enable us to measure the correlation of various automatic metrics, including modern factual evaluation metrics like QAFactEval, with human-assessed factual consistency. By employing top-correlated metrics as objectives for a reinforcement learning model, we demonstrate improved factuality in generated summaries that are preferred by human annotators.
@inproceedings{tang-etal-2023-aligning, title = "Aligning Factual Consistency for Clinical Studies Summarization through Reinforcement Learning", author = "Tang, Xiangru and Cohan, Arman and Gerstein, Mark", editor = "Naumann, Tristan and Ben Abacha, Asma and Bethard, Steven and Roberts, Kirk and Rumshisky, Anna", booktitle = "Proceedings of the 5th Clinical Natural Language Processing Workshop", month = jul, year = "2023", address = "Toronto, Canada", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.clinicalnlp-1.7", doi = "10.18653/v1/2023.clinicalnlp-1.7", pages = "48--58", abstract = "In the rapidly evolving landscape of medical research, accurate and concise summarization of clinical studies is crucial to support evidence-based practice. This paper presents a novel approach to clinical studies summarization, leveraging reinforcement learning to enhance factual consistency and align with human annotator preferences. Our work focuses on two tasks: Conclusion Generation and Review Generation. We train a CONFIT summarization model that outperforms GPT-3 and previous state-of-the-art models on the same datasets and collects expert and crowd-worker annotations to evaluate the quality and factual consistency of the generated summaries. These annotations enable us to measure the correlation of various automatic metrics, including modern factual evaluation metrics like QAFactEval, with human-assessed factual consistency. By employing top-correlated metrics as objectives for a reinforcement learning model, we demonstrate improved factuality in generated summaries that are preferred by human annotators.", }
-
[3] GersteinLab at MEDIQA-Chat 2023: Clinical Note Summarization from Doctor-Patient Conversations through Fine-tuning and In-context Learning
Xiangru Tang, Andrew Tran, Jeffrey Tan, Mark Gerstein.
ACL 2023 Clinical Natural Language Processing
[PDF] [Abstract] [Bib]MEDIQAThis paper presents our contribution to the MEDIQA-2023 Dialogue2Note shared task, encompassing both subtask A and subtask B. We approach the task as a dialogue summarization problem and implement two distinct pipelines: (a) a fine-tuning of a pre-trained dialogue summarization model and GPT-3, and (b) few-shot in-context learning (ICL) using a large language model, GPT-4. Both methods achieve excellent results in terms of ROUGE-1 F1, BERTScore F1 (deberta-xlarge-mnli), and BLEURT, with scores of 0.4011, 0.7058, and 0.5421, respectively. Additionally, we predict the associated section headers using RoBERTa and SciBERT based classification models. Our team ranked fourth among all teams, while each team is allowed to submit three runs as part of their submission. We also utilize expert annotations to demonstrate that the notes generated through the ICL GPT-4 are better than all other baselines.
@inproceedings{tang-etal-2023-gersteinlab, title = "{G}erstein{L}ab at {MEDIQA}-Chat 2023: Clinical Note Summarization from Doctor-Patient Conversations through Fine-tuning and In-context Learning", author = "Tang, Xiangru and Tran, Andrew and Tan, Jeffrey and Gerstein, Mark", editor = "Naumann, Tristan and Ben Abacha, Asma and Bethard, Steven and Roberts, Kirk and Rumshisky, Anna", booktitle = "Proceedings of the 5th Clinical Natural Language Processing Workshop", month = jul, year = "2023", address = "Toronto, Canada", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.clinicalnlp-1.58", doi = "10.18653/v1/2023.clinicalnlp-1.58", pages = "546--554", abstract = "This paper presents our contribution to the MEDIQA-2023 Dialogue2Note shared task, encompassing both subtask A and subtask B. We approach the task as a dialogue summarization problem and implement two distinct pipelines: (a) a fine-tuning of a pre-trained dialogue summarization model and GPT-3, and (b) few-shot in-context learning (ICL) using a large language model, GPT-4. Both methods achieve excellent results in terms of ROUGE-1 F1, BERTScore F1 (deberta-xlarge-mnli), and BLEURT, with scores of 0.4011, 0.7058, and 0.5421, respectively. Additionally, we predict the associated section headers using RoBERTa and SciBERT based classification models. Our team ranked fourth among all teams, while each team is allowed to submit three runs as part of their submission. We also utilize expert annotations to demonstrate that the notes generated through the ICL GPT-4 are better than all other baselines. The code for our submission is available.", }
-
[2] CONFIT: Toward Faithful Dialogue Summarization with Linguistically-Informed Contrastive Fine-tuning
Xiangru Tang, Arjun Nair, Borui Wang, Bingyao Wang, Jai Desai, Aaron Wade, Haoran Li, Asli Celikyilmaz, Yashar Mehdad, Dragomir Radev.
NAACL 2022 (Oral)
[PDF] [Abstract] [Bib]Factual inconsistencies in generated summaries severely limit the practical applications of abstractive dialogue summarization. Although significant progress has been achieved by using pre-trained neural language models, substantial amounts of hallucinated content are found during the human evaluation. In this work, we first devised a typology of factual errors to better understand the types of hallucinations generated by current models and conducted human evaluation on popular dialog summarization dataset. We further propose a training strategy that improves the factual consistency and overall quality of summaries via a novel contrastive fine-tuning, called CONFIT. To tackle top factual errors from our annotation, we introduce additional contrastive loss with carefully designed hard negative samples and self-supervised dialogue-specific loss to capture the key information between speakers. We show that our model significantly reduces all kinds of factual errors on both SAMSum dialogue summarization and AMI meeting summarization. On both datasets, we achieve significant improvements over state-of-the-art baselines using both automatic metrics, ROUGE and BARTScore, and human evaluation.
@inproceedings{tang-etal-2022-confit, title = "{CONFIT}: Toward Faithful Dialogue Summarization with Linguistically-Informed Contrastive Fine-tuning", author = "Tang, Xiangru and Nair, Arjun and Wang, Borui and Wang, Bingyao and Desai, Jai and Wade, Aaron and Li, Haoran and Celikyilmaz, Asli and Mehdad, Yashar and Radev, Dragomir", editor = "Carpuat, Marine and de Marneffe, Marie-Catherine and Meza Ruiz, Ivan Vladimir", booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", month = jul, year = "2022", address = "Seattle, United States", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2022.naacl-main.415", doi = "10.18653/v1/2022.naacl-main.415", pages = "5657--5668", }
-
[1] Investigating Crowdsourcing Protocols for Evaluating the Factual Consistency of Summaries
Xiangru Tang, Alexander Fabbri, Haoran Li, Ziming Mao, Griffin Adams, Borui Wang, Asli Celikyilmaz, Yashar Mehdad, Dragomir Radev.
NAACL 2022
[PDF] [Abstract] [Bib]Current pre-trained models applied for summarization are prone to factual inconsistencies that misrepresent the source text. Evaluating the factual consistency of summaries is thus necessary to develop better models. However, the human evaluation setup for evaluating factual consistency has not been standardized. To determine the factors that affect the reliability of the human evaluation, we crowdsource evaluations for factual consistency across state-of-the-art models on two news summarization datasets using the rating-based Likert Scale and ranking-based Best-Worst Scaling. Our analysis reveals that the ranking-based Best-Worst Scaling offers a more reliable measure of summary quality across datasets and that the reliability of Likert ratings highly depends on the target dataset and the evaluation design. To improve crowdsourcing reliability, we extend the scale of the Likert rating and present a scoring algorithm for Best-Worst Scaling that we call value learning. Our crowdsourcing guidelines will be publicly available to facilitate future work on factual consistency in summarization.
@inproceedings{tang-etal-2022-investigating, title = "Investigating Crowdsourcing Protocols for Evaluating the Factual Consistency of Summaries", author = "Tang, Xiangru and Fabbri, Alexander and Li, Haoran and Mao, Ziming and Adams, Griffin and Wang, Borui and Celikyilmaz, Asli and Mehdad, Yashar and Radev, Dragomir", editor = "Carpuat, Marine and de Marneffe, Marie-Catherine and Meza Ruiz, Ivan Vladimir", booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", month = jul, year = "2022", address = "Seattle, United States", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2022.naacl-main.417", doi = "10.18653/v1/2022.naacl-main.417", pages = "5680--5692", }
Other Publications
-
[17] OpenHands: An Open Platform for AI Software Developers as Generalist Agents
Xingyao Wang, Boxuan Li, Yufan Song, Frank F. Xu, Xiangru Tang, Mingchen Zhuge, Jiayi Pan, Yueqi Song, Bowen Li, Jaskirat Singh, Hoang H. Tran, Fuqiang Li, Ren Ma, Mingzhang Zheng, Bill Qian, Yanjun Shao, Niklas Muennighoff, Yizhe Zhang, Binyuan Hui, Junyang Lin, Robert Brennan, Hao Peng, Heng Ji, Graham Neubig
ICLR 2025
"AI agents function as software developers, capable of command execution, web browsing & API interaction."
[PDF] [Abstract] [Bib]OpenHandsSoftware is one of the most powerful tools that we humans have at our disposal; it allows a skilled programmer to interact with the world in complex and profound ways. At the same time, thanks to improvements in large language models (LLMs), there has also been a rapid development in AI agents that interact with and effect change in their surrounding environments. In this paper, we introduce OpenHands, a platform for the development of powerful and flexible AI agents that interact with the world in similar ways to a human developer: by writing code, interacting with a command line, and browsing the web. We describe how the platform allows for the implementation of new agents, utilization of various LLMs, safe interaction with sandboxed environments for code execution, and incorporation of evaluation benchmarks. Based on our currently incorporated benchmarks, we perform an evaluation of agents over 13 challenging tasks, including software engineering (e.g., SWE-Bench) and web browsing (e.g., WebArena), amongst others. Released under the permissive MIT license, OpenHands is a community project spanning academia and industry with more than 2K contributions from over 186 contributors in less than six months of development, and will improve going forward.
@inproceedings{wang2025openhands, title={{OpenHands: An Open Platform for AI Software Developers as Generalist Agents}}, author={Xingyao Wang and Boxuan Li and Yufan Song and Frank F. Xu and Xiangru Tang and Mingchen Zhuge and Jiayi Pan and Yueqi Song and Bowen Li and Jaskirat Singh and Hoang H. Tran and Fuqiang Li and Ren Ma and Mingzhang Zheng and Bill Qian and Yanjun Shao and Niklas Muennighoff and Yizhe Zhang and Binyuan Hui and Junyang Lin and Robert Brennan and Hao Peng and Heng Ji and Graham Neubig}, booktitle={The Thirteenth International Conference on Learning Representations} }
-
[16] MMVU: Measuring Expert-Level Multi-Discipline Video Understanding
Yilun Zhao, Lujing Xie, Haowei Zhang, Guo Gan, Yitao Long, Zhiyuan Hu, Tongyan Hu, Weiyuan Chen, Chuhan Li, Junyang Song, Zhijian Xu, Chengye Wang, Weifeng Pan, Ziyao Shangguan, Xiangru Tang, Zhenwen Liang, Yixin Liu, Chen Zhao, Arman Cohan.
CVPR 2025
[PDF] [Abstract] [Bib] MMVU: We introduce MMVU, a comprehensive expert-level, multi-discipline benchmark for evaluating foundation models in video understanding. MMVU includes 3,000 expert-annotated questions spanning 27 subjects across four core disciplines: Science, Healthcare, Humanities & Social Sciences, and Engineering. Compared to prior benchmarks, MMVU features three key advancements. First, it challenges models to apply domain-specific knowledge and perform expert-level reasoning to analyze specialized-domain videos, moving beyond the basic visual perception typically assessed in current video benchmarks. Second, each example is annotated by human experts from scratch. We implement strict data quality controls to ensure the high quality of the dataset. Finally, each example is enriched with expert-annotated reasoning rationales and relevant domain knowledge, facilitating in-depth analysis. We conduct an extensive evaluation of 32 frontier multimodal foundation models on MMVU. The latest System-2-capable models, o1 and Gemini 2.0 Flash Thinking, achieve the highest performance among the tested models. However, they still fall short of matching human expertise. Through in-depth error analyses and case studies, we offer actionable insights for future advancements in expert-level, knowledge-intensive video understanding for specialized domains.
@article{zhao2025mmvu, title={MMVU: Measuring Expert-Level Multi-Discipline Video Understanding}, author={Zhao, Yilun and Xie, Lujing and Zhang, Haowei and Gan, Guo and Long, Yitao and Hu, Zhiyuan and Hu, Tongyan and Chen, Weiyuan and Li, Chuhan and Song, Junyang and others}, journal={arXiv preprint arXiv:2501.12380}, year={2025} }
-
[15] Igniting Language Intelligence: The Hitchhiker's Guide From Chain-of-Thought Reasoning to Language Agents
Zhuosheng Zhang, Yao Yao, Aston Zhang, Xiangru Tang, Xinbei Ma, Zhiwei He, Yiming Wang, Mark Gerstein, Rui Wang, Gongshen Liu, Hai Zhao.
ACM Computing Surveys, 2024 (IF 23.8)
"Generalization, efficiency, customization, scaling, and safety related to CoT and agents."
[PDF] [Abstract] [Bib]CoT-Igniting-AgentLarge language models (LLMs) have dramatically enhanced the field of language intelligence, as demonstrably evidenced by their formidable empirical performance across a spectrum of complex reasoning tasks. Additionally, theoretical proofs have illuminated their emergent reasoning capabilities, providing a compelling showcase of their advanced cognitive abilities in linguistic contexts. Critical to their remarkable efficacy in handling complex reasoning tasks, LLMs leverage the intriguing chain-of-thought (CoT) reasoning techniques, obliging them to formulate intermediate steps en route to deriving an answer. The CoT reasoning approach has not only exhibited proficiency in amplifying reasoning performance but also in enhancing interpretability, controllability, and flexibility. In light of these merits, recent research endeavors have extended CoT reasoning methodologies to nurture the development of autonomous language agents, which adeptly adhere to language instructions and execute actions within varied environments. This survey paper orchestrates a thorough discourse, penetrating vital research dimensions, encompassing: (i) the foundational mechanics of CoT techniques, with a focus on elucidating the circumstances and justification behind its efficacy; (ii) the paradigm shift in CoT; and (iii) the burgeoning of language agents fortified by CoT approaches. Prospective research avenues envelop explorations into generalization, efficiency, customization, scaling, and safety. This paper caters to a wide audience, including beginners seeking comprehensive knowledge of CoT reasoning and language agents, as well as experienced researchers interested in foundational mechanics and engaging in cutting-edge discussions on these topics.
@article{zhang2025igniting, title={Igniting Language Intelligence: The Hitchhiker's Guide From Chain-of-Thought Reasoning to Language Agents}, author={Zhang, Zhuosheng and Yao, Yao and Zhang, Aston and Tang, Xiangru and Ma, Xinbei and He, Zhiwei and Wang, Yiming and Gerstein, Mark and Wang, Rui and Liu, Gongshen and others}, journal={ACM Computing Surveys}, year={2025} }
-
[14] Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity
Cunxiang Wang*, Xiaoze Liu*, Yuanhao Yue*, Xiangru Tang, Tianhang Zhang, Cheng Jiayang, Yunzhi Yao, Wenyang Gao, Xuming Hu, Zehan Qi, Yidong Wang, Linyi Yang, Jindong Wang, Xing Xie, Zheng Zhang, Yue Zhang.
ACM Computing Surveys, 2024 (IF 23.8)
[PDF] [Abstract] [Bib] LLM-Factuality-Survey: This survey addresses the crucial issue of factuality in Large Language Models (LLMs). As LLMs find applications across diverse domains, the reliability and accuracy of their outputs become vital. We define the Factuality Issue as the probability of LLMs producing content inconsistent with established facts. We first delve into the implications of these inaccuracies, highlighting the potential consequences and challenges posed by factual errors in LLM outputs. Subsequently, we analyze the mechanisms through which LLMs store and process facts, seeking the primary causes of factual errors. Our discussion then transitions to methodologies for evaluating LLM factuality, emphasizing key metrics, benchmarks, and studies. We further explore strategies for enhancing LLM factuality, including approaches tailored for specific domains. We focus on two primary LLM configurations, standalone LLMs and Retrieval-Augmented LLMs that utilize external data, and we detail their unique challenges and potential enhancements. Our survey offers a structured guide for researchers aiming to fortify the factual reliability of LLMs.
@article{wang2025survey, title={Survey on Factuality in Large Language Models: Knowledge, Retrieval and Domain-Specificity}, author={Wang, Cunxiang and Liu, Xiaoze and Yue, Yuanhao and Tang, Xiangru and Zhang, Tianhang and Jiayang, Cheng and Yao, Yunzhi and Gao, Wenyang and Hu, Xuming and Qi, Zehan and others}, journal={ACM Computing Surveys}, year={2025} }
-
[13] OctoPack: Instruction Tuning Code Large Language Models
Niklas Muennighoff, Qian Liu, Armel Randy Zebaze, Qinkai Zheng, Binyuan Hui, Terry Yue Zhuo, Swayam Singh, Xiangru Tang, Leandro Von Werra, Shayne Longpre.
ICLR 2024
[PDF] [Abstract] [Bib]octopackFinetuning large language models (LLMs) on instructions leads to vast performance improvements on natural language tasks. We apply instruction tuning using code, leveraging the natural structure of Git commits, which pair code changes with human instructions. We compile CommitPack: 4 terabytes of Git commits across 350 programming languages. We benchmark CommitPack against other natural and synthetic code instructions (xP3x, Self-Instruct, OASST) on the 16B parameter StarCoder model, and achieve state-of-the-art performance among models not trained on OpenAI outputs, on the HumanEval Python benchmark (46.2% pass@1). We further introduce HumanEvalPack, expanding the HumanEval benchmark to a total of 3 coding tasks (Code Repair, Code Explanation, Code Synthesis) across 6 languages (Python, JavaScript, Java, Go, C++, Rust). Our models, OctoCoder and OctoGeeX, achieve the best performance across HumanEvalPack among all permissive models, demonstrating CommitPack's benefits in generalizing to a wider set of languages and natural coding tasks. Code, models and data are freely available at https://github.com/bigcode-project/octopack.
@inproceedings{muennighoffoctopack, title={OctoPack: Instruction Tuning Code Large Language Models}, author={Muennighoff, Niklas and Liu, Qian and Zebaze, Armel Randy and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne}, booktitle={The Twelfth International Conference on Learning Representations} }
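The CommitPack idea above pairs human-written commit messages with the corresponding code changes as instruction data. Below is a minimal sketch of mining such (instruction, diff) pairs from a local Git repository; the record fields and the absence of any quality filtering are assumptions, and this does not reproduce the authors' pipeline.
```python
# Minimal sketch of mining (instruction, diff) pairs from a local Git repo,
# in the spirit of CommitPack as described above. NOT the authors' pipeline;
# the record schema and lack of filtering heuristics are assumptions.
import json
import subprocess

def list_commits(repo: str, limit: int = 100) -> list[str]:
    out = subprocess.run(
        ["git", "-C", repo, "log", f"-{limit}", "--pretty=%H"],
        capture_output=True, text=True, check=True,
    )
    return out.stdout.split()

def commit_record(repo: str, sha: str) -> dict:
    # Commit subject line acts as the "instruction"; the diff is the "output".
    msg = subprocess.run(
        ["git", "-C", repo, "show", "-s", "--pretty=%s", sha],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    diff = subprocess.run(
        ["git", "-C", repo, "show", "--format=", sha],
        capture_output=True, text=True, check=True,
    ).stdout
    return {"instruction": msg, "diff": diff}

if __name__ == "__main__":
    records = [commit_record(".", sha) for sha in list_commits(".", limit=10)]
    print(json.dumps(records[:1], indent=2)[:500])
```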
-
[12] ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs
Yujia Qin, Shihao Liang, Yining Ye, Kunlun Zhu, Lan Yan, Yaxi Lu, Yankai Lin, Xin Cong, Xiangru Tang, Bill Qian, Sihan Zhao, Lauren Hong, Runchu Tian, Ruobing Xie, Jie Zhou, Mark Gerstein, dahai li, Zhiyuan Liu, Maosong Sun.
ICLR 2024
[PDF] [Abstract] [Bib]ToolBenchDespite the advancements of open-source large language models (LLMs), e.g., LLaMA, they remain significantly limited in tool-use capabilities, i.e., using external tools (APIs) to fulfill human instructions. The reason is that current instruction tuning largely focuses on basic language tasks but ignores the tool-use domain. This is in contrast to the excellent tool-use capabilities of state-of-the-art (SOTA) closed-source LLMs, e.g., ChatGPT. To bridge this gap, we introduce ToolLLM, a general tool-use framework encompassing data construction, model training, and evaluation. We first present ToolBench, an instruction-tuning dataset for tool use, which is constructed automatically using ChatGPT. Specifically, the construction can be divided into three stages: (i) API collection: we collect 16,464 real-world RESTful APIs spanning 49 categories from RapidAPI Hub; (ii) instruction generation: we prompt ChatGPT to generate diverse instructions involving these APIs, covering both single-tool and multi-tool scenarios; (iii) solution path annotation: we use ChatGPT to search for a valid solution path (chain of API calls) for each instruction. To enhance the reasoning capabilities of LLMs, we develop a novel depth-first search-based decision tree algorithm. It enables LLMs to evaluate multiple reasoning traces and expand the search space. Moreover, to evaluate the tool-use capabilities of LLMs, we develop an automatic evaluator: ToolEval. Based on ToolBench, we fine-tune LLaMA to obtain an LLM ToolLLaMA, and equip it with a neural API retriever to recommend appropriate APIs for each instruction. Experiments show that ToolLLaMA demonstrates a remarkable ability to execute complex instructions and generalize to unseen APIs, and exhibits comparable performance to ChatGPT. Our ToolLLaMA also demonstrates strong zero-shot generalization ability in an out-of-distribution tool-use dataset: APIBench.
@article{qin2023toolllm, title={Toolllm: Facilitating large language models to master 16000+ real-world apis}, author={Qin, Yujia and Liang, Shihao and Ye, Yining and Zhu, Kunlun and Yan, Lan and Lu, Yaxi and Lin, Yankai and Cong, Xin and Tang, Xiangru and Qian, Bill and others}, journal={arXiv preprint arXiv:2307.16789}, year={2023} }
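As an illustration of the depth-first search over chains of API calls described above, here is a minimal sketch; the `propose_calls` and `is_solution` callbacks stand in for the LLM-driven expansion and evaluation steps and are assumptions, not the paper's DFSDT implementation.
```python
# Minimal sketch of a depth-first search over chains of API calls, in the
# spirit of the decision-tree search described above. `propose_calls` and
# `is_solution` are placeholders for the model-driven steps (assumptions).
from typing import Callable, Optional

def dfs_solution_path(
    state: list[str],
    propose_calls: Callable[[list[str]], list[str]],
    is_solution: Callable[[list[str]], bool],
    max_depth: int = 5,
) -> Optional[list[str]]:
    """Return the first chain of API calls judged to solve the instruction."""
    if is_solution(state):
        return state
    if len(state) >= max_depth:
        return None  # prune this branch and backtrack
    for call in propose_calls(state):
        found = dfs_solution_path(state + [call], propose_calls, is_solution, max_depth)
        if found is not None:
            return found
    return None

if __name__ == "__main__":
    # Toy example: a path "solves" the task once it ends with get_weather -> format_answer.
    propose = lambda s: ["search_city", "get_weather", "format_answer"]
    solved = lambda s: s[-2:] == ["get_weather", "format_answer"]
    print(dfs_solution_path([], propose, solved, max_depth=3))
```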
-
[11] DocMath-Eval: Evaluating Math Reasoning Capabilities of LLMs in Understanding Long and Specialized Documents
Yilun Zhao, Yitao Long, Hongjun Liu, Ryo Kamoi, Linyong Nan, Lyuhao Chen, Yixin Liu, Xiangru Tang, Rui Zhang, Arman Cohan.
ACL 2024
[PDF] [Abstract] [Bib]DocMath-EvalRecent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing specialized documents containing both text and tables. We conduct an extensive evaluation of 48 LLMs with Chain-of-Thought and Program-of-Thought prompting methods, aiming to comprehensively assess the capabilities and limitations of existing LLMs in DocMath-Eval. We found that even the current best-performing system (i.e., GPT-4o) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe that DocMath-Eval can serve as a valuable benchmark for evaluating LLMs' capabilities in solving challenging numerical reasoning problems within expert domains.
@inproceedings{zhao-etal-2024-docmath, title = "{D}oc{M}ath-Eval: Evaluating Math Reasoning Capabilities of {LLM}s in Understanding Long and Specialized Documents", author = "Zhao, Yilun and Long, Yitao and Liu, Hongjun and Kamoi, Ryo and Nan, Linyong and Chen, Lyuhao and Liu, Yixin and Tang, Xiangru and Zhang, Rui and Cohan, Arman", editor = "Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek", booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", month = aug, year = "2024", address = "Bangkok, Thailand", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.acl-long.852", doi = "10.18653/v1/2024.acl-long.852", pages = "16103--16120", abstract = "Recent LLMs have demonstrated remarkable performance in solving exam-like math word problems. However, the degree to which these numerical reasoning skills are effective in real-world scenarios, particularly in expert domains, is still largely unexplored. This paper introduces DocMath-Eval, a comprehensive benchmark specifically designed to evaluate the numerical reasoning capabilities of LLMs in the context of understanding and analyzing specialized documents containing both text and tables. We conduct an extensive evaluation of 48 LLMs with Chain-of-Thought and Program-of-Thought prompting methods, aiming to comprehensively assess the capabilities and limitations of existing LLMs in DocMath-Eval. We found that even the current best-performing system (i.e., GPT-4o) still significantly lags behind human experts in solving complex numerical reasoning problems grounded in long contexts. We believe that DocMath-Eval can serve as a valuable benchmark for evaluating LLMs' capabilities in solving challenging numerical reasoning problems within expert domains." }
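The evaluation above uses Chain-of-Thought and Program-of-Thought prompting. The sketch below illustrates the Program-of-Thought pattern, where the model emits Python that computes the numeric answer and the code is then executed; the prompt template and the `call_llm` placeholder are assumptions rather than the benchmark's evaluation code.
```python
# Minimal sketch of Program-of-Thought (PoT) style answering: ask the model
# for Python that stores the result in `answer`, then execute that code.
# `call_llm` is a placeholder for any chat-completion client (an assumption).

POT_TEMPLATE = (
    "Read the document and table below, then write Python code that stores the\n"
    "final numeric answer in a variable named `answer`.\n\n{context}\n\nQuestion: {question}\n"
)

def call_llm(prompt: str) -> str:
    # Placeholder: return canned code so the sketch runs end to end.
    return "revenue = 120.0\ncost = 45.5\nanswer = revenue - cost"

def answer_with_pot(context: str, question: str) -> float:
    code = call_llm(POT_TEMPLATE.format(context=context, question=question))
    namespace: dict = {}
    exec(code, {"__builtins__": {}}, namespace)  # toy sandbox; real use needs stronger isolation
    return float(namespace["answer"])

if __name__ == "__main__":
    print(answer_with_pot("Revenue: 120.0m; Cost: 45.5m", "What is the profit in millions?"))
```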
-
[10] Unveiling the Spectrum of Data Contamination in Language Model: A Survey from Detection to Remediation
Chunyuan Deng, Yilun Zhao, Yuzhao Heng, Yitong Li, Jiannan Cao, Xiangru Tang, Arman Cohan.
ACL 2024 Findings
[PDF] [Abstract] [Bib]Contamination-SurveyData contamination has garnered increased attention in the era of Large language models (LLMs) due to the reliance on extensive internet-derived training corpora. The issue of training corpus overlap with evaluation benchmarks—referred to as contamination—has been the focus of significant recent research. This body of work aims to identify contamination, understand its impacts, and explore mitigation strategies from diverse perspectives. However, comprehensive studies that provide a clear pathway from foundational concepts to advanced insights are lacking in this nascent field. Therefore, we present the first survey in the field of data contamination. We begin by examining the effects of data contamination across various stages and forms. We then provide a detailed analysis of current contamination detection methods, categorizing them to highlight their focus, assumptions, strengths, and limitations. We also discuss mitigation strategies, offering a clear guide for future research. This survey serves as a succinct overview of the most recent advancements in data contamination research, providing a straightforward guide for the benefit of future research endeavors.
@inproceedings{deng-etal-2024-unveiling, title = "Unveiling the Spectrum of Data Contamination in Language Model: A Survey from Detection to Remediation", author = "Deng, Chunyuan and Zhao, Yilun and Heng, Yuzhao and Li, Yitong and Cao, Jiannan and Tang, Xiangru and Cohan, Arman", editor = "Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek", booktitle = "Findings of the Association for Computational Linguistics: ACL 2024", month = aug, year = "2024", address = "Bangkok, Thailand", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.findings-acl.951/", doi = "10.18653/v1/2024.findings-acl.951", pages = "16078--16092", abstract = "Data contamination has garnered increased attention in the era of Large language models (LLMs) due to the reliance on extensive internet-derived training corpora. The issue of training corpus overlap with evaluation benchmarks{---}referred to as contamination{---}has been the focus of significant recent research. This body of work aims to identify contamination, understand its impacts, and explore mitigation strategies from diverse perspectives. However, comprehensive studies that provide a clear pathway from foundational concepts to advanced insights are lacking in this nascent field. Therefore, we present the first survey in the field of data contamination. We begin by examining the effects of data contamination across various stages and forms. We then provide a detailed analysis of current contamination detection methods, categorizing them to highlight their focus, assumptions, strengths, and limitations. We also discuss mitigation strategies, offering a clear guide for future research. This survey serves as a succinct overview of the most recent advancements in data contamination research, providing a straightforward guide for the benefit of future research endeavors." }
-
[9] FinDVer: Explainable Claim Verification over Long and Hybrid-content Financial Documents
Yilun Zhao, Yitao Long, Tintin Jiang, Chengye Wang, Weiyuan Chen, Hongjun Liu, Xiangru Tang, Yiming Zhang, Chen Zhao, Arman Cohan.
EMNLP 2024
[PDF] [Abstract] [Bib]We introduce FinDVer, a comprehensive benchmark specifically designed to evaluate the explainable claim verification capabilities of LLMs in the context of understanding and analyzing long, hybrid-content financial documents. FinDVer contains 4,000 expert-annotated examples across four subsets, each focusing on a type of scenario that frequently arises in real-world financial domains. We assess a broad spectrum of 25 LLMs under long-context and RAG settings. Our results show that even the current best-performing system (i.e., GPT-4o) significantly lags behind human experts. Our detailed findings and insights highlight the strengths and limitations of existing LLMs in this new task. We believe FinDVer can serve as a valuable benchmark for evaluating LLM capabilities in claim verification over complex, expert-domain documents.
@inproceedings{zhao-etal-2024-findver, title = "{F}in{DV}er: Explainable Claim Verification over Long and Hybrid-content Financial Documents", author = "Zhao, Yilun and Long, Yitao and Jiang, Tintin and Wang, Chengye and Chen, Weiyuan and Liu, Hongjun and Tang, Xiangru and Zhang, Yiming and Zhao, Chen and Cohan, Arman", editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung", booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing", month = nov, year = "2024", address = "Miami, Florida, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.emnlp-main.818/", doi = "10.18653/v1/2024.emnlp-main.818", pages = "14739--14752", abstract = "We introduce FinDVer, a comprehensive benchmark specifically designed to evaluate the explainable claim verification capabilities of LLMs in the context of understanding and analyzing long, hybrid-content financial documents. FinDVer contains 4,000 expert-annotated examples across four subsets, each focusing on a type of scenario that frequently arises in real-world financial domains. We assess a broad spectrum of 25 LLMs under long-context and RAG settings. Our results show that even the current best-performing system (i.e., GPT-4o) significantly lags behind human experts. Our detailed findings and insights highlight the strengths and limitations of existing LLMs in this new task. We believe FinDVer can serve as a valuable benchmark for evaluating LLM capabilities in claim verification over complex, expert-domain documents." }
-
[8] PRESTO: Progressive Pretraining Enhances Synthetic Chemistry Outcomes
He Cao, Yanjun Shao, Zhiyuan Liu, Zijing Liu, Xiangru Tang, Yuan Yao, Yu Li.
EMNLP 2024 Findings
[PDF] [Abstract] [Bib]PRESTOMultimodal Large Language Models (MLLMs) have seen growing adoption across various scientific disciplines. These advancements encourage the investigation of molecule-text modeling within synthetic chemistry, a field dedicated to designing and conducting chemical reactions to synthesize new compounds with desired properties and applications. Current approaches, however, often neglect the critical role of multi-molecule graph interaction in understanding chemical reactions, leading to suboptimal performance in synthetic chemistry tasks. This study introduces PRESTO (Progressive Pretraining Enhances Synthetic Chemistry Outcomes), a new framework that bridges the molecule-text modality gap by integrating a comprehensive benchmark of pretraining strategies and dataset configurations. It progressively improves multimodal LLMs through cross-modal alignment and multi-graph understanding. Our extensive experiments demonstrate that PRESTO offers competitive results in downstream synthetic chemistry tasks. The code can be found at https://github.com/IDEA-XL/PRESTO.
@inproceedings{cao-etal-2024-presto, title = "{PRESTO}: Progressive Pretraining Enhances Synthetic Chemistry Outcomes", author = "Cao, He and Shao, Yanjun and Liu, Zhiyuan and Liu, Zijing and Tang, Xiangru and Yao, Yuan and Li, Yu", editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung", booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", month = nov, year = "2024", address = "Miami, Florida, USA", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.findings-emnlp.597/", doi = "10.18653/v1/2024.findings-emnlp.597", pages = "10197--10224", abstract = "Multimodal Large Language Models (MLLMs) have seen growing adoption across various scientific disciplines. These advancements encourage the investigation of molecule-text modeling within synthetic chemistry, a field dedicated to designing and conducting chemical reactions to synthesize new compounds with desired properties and applications. Current approaches, however, often neglect the critical role of multi-molecule graph interaction in understanding chemical reactions, leading to suboptimal performance in synthetic chemistry tasks. This study introduces PRESTO (Progressive Pretraining Enhances Synthetic Chemistry Outcomes), a new framework that bridges the molecule-text modality gap by integrating a comprehensive benchmark of pretraining strategies and dataset configurations. It progressively improves multimodal LLMs through cross-modal alignment and multi-graph understanding. Our extensive experiments demonstrate that PRESTO offers competitive results in downstream synthetic chemistry tasks. The code can be found at https://github.com/IDEA-XL/PRESTO." }
-
[7] Investigating Data Contamination in Modern Benchmarks for Large Language Models
Chunyuan Deng, Yilun Zhao, Xiangru Tang, Mark Gerstein, Arman Cohan.
NAACL 2024
[PDF] [Abstract] [Bib]Recent observations have underscored a disparity between the inflated benchmark scores and the actual performance of LLMs, raising concerns about potential contamination of evaluation benchmarks. This issue is especially critical for closed-source models and certain open-source models where training data transparency is lacking. In this paper we study data contamination by proposing two methods tailored for both open-source and proprietary LLMs. We first introduce a retrieval-based system to explore potential overlaps between evaluation benchmarks and pretraining corpora. We further present a novel investigation protocol named Testset Slot Guessing (TS-Guessing), applicable to both open and proprietary models. This approach entails masking a wrong answer in a multiple-choice question and prompting the model to fill in the gap. Additionally, it involves obscuring an unlikely word in an evaluation example and asking the model to produce it. We find that certain commercial LLMs could surprisingly guess the missing option in various test sets. Specifically, in the MMLU benchmark, ChatGPT and GPT-4 demonstrated an exact match rate of 52% and 57%, respectively, in guessing the missing options in benchmark test data. We hope these results underscore the need for more robust evaluation methodologies and benchmarks in the field.
@inproceedings{deng-etal-2024-investigating, title = "Investigating Data Contamination in Modern Benchmarks for Large Language Models", author = "Deng, Chunyuan and Zhao, Yilun and Tang, Xiangru and Gerstein, Mark and Cohan, Arman", editor = "Duh, Kevin and Gomez, Helena and Bethard, Steven", booktitle = "Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)", month = jun, year = "2024", address = "Mexico City, Mexico", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2024.naacl-long.482/", doi = "10.18653/v1/2024.naacl-long.482", pages = "8706--8719", abstract = "Recent observations have underscored a disparity between the inflated benchmark scores and the actual performance of LLMs, raising concerns about potential contamination of evaluation benchmarks. This issue is especially critical for closed-source models and certain open-source models where training data transparency is lacking. In this paper we study data contamination by proposing two methods tailored for both open-source and proprietary LLMs. We first introduce a retrieval-based system to explore potential overlaps between evaluation benchmarks and pretraining corpora. We further present a novel investigation protocol named Testset Slot Guessing (TS-Guessing), applicable to both open and proprietary models. This approach entails masking a wrong answer in a multiple-choice question and prompting the model to fill in the gap. Additionally, it involves obscuring an unlikely word in an evaluation example and asking the model to produce it. We find that certain commercial LLMs could surprisingly guess the missing option in various test sets. Specifically, in the MMLU benchmark, ChatGPT and GPT-4 demonstrated an exact match rate of 52{\%} and 57{\%}, respectively, in guessing the missing options in benchmark test data. We hope these results underscore the need for more robust evaluation methodologies and benchmarks in the field." }
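A minimal sketch of the Testset Slot Guessing idea described above: one incorrect option of a multiple-choice item is masked and the model is asked to reproduce it, with the exact-match rate over a test set serving as the contamination signal. The prompt wording, scoring details, and toy example are assumptions, not the paper's exact protocol.
```python
# Minimal sketch of TS-Guessing as summarized above: hide a *wrong* option of
# a multiple-choice item and check whether the model reproduces it verbatim.
import random

def build_ts_guessing_prompt(question: str, options: dict[str, str], answer_key: str) -> tuple[str, str]:
    wrong_keys = [k for k in options if k != answer_key]
    masked_key = random.choice(wrong_keys)
    shown = "\n".join(
        f"{k}. {'[MASK]' if k == masked_key else v}" for k, v in sorted(options.items())
    )
    prompt = (
        f"{question}\n{shown}\n"
        f"Fill in the missing option {masked_key} exactly as it appears in the benchmark."
    )
    return prompt, options[masked_key]

def exact_match_rate(guesses: list[str], references: list[str]) -> float:
    hits = sum(g.strip().lower() == r.strip().lower() for g, r in zip(guesses, references))
    return hits / len(references)

if __name__ == "__main__":
    q = "Which planet is known as the Red Planet?"
    opts = {"A": "Venus", "B": "Mars", "C": "Jupiter", "D": "Mercury"}
    prompt, hidden = build_ts_guessing_prompt(q, opts, answer_key="B")
    print(prompt, "\nHidden option:", hidden)
```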
-
[6] Data preparation for Deep Learning based Code Smell Detection: A systematic literature review
Fengji Zhang, Zexian Zhang, Jacky Wai Keung, Xiangru Tang, Zhen Yang, Xiao Yu, Wenhua Hu.
Journal of Systems and Software, 2024 (IF 3.7, JCR Q1)
[PDF] [Abstract] [Bib]Code Smell Detection (CSD) plays a crucial role in improving software quality and maintainability. And Deep Learning (DL) techniques have emerged as a promising approach for CSD due to their superior performance. However, the effectiveness of DL-based CSD methods heavily relies on the quality of the training data. Despite its importance, little attention has been paid to analyzing the data preparation process. This systematic literature review analyzes the data preparation techniques used in DL-based CSD methods. We identify 36 relevant papers published by December 2023 and provide a thorough analysis of the critical considerations in constructing CSD datasets, including data requirements, collection, labeling, and cleaning. We also summarize seven primary challenges and corresponding solutions in the literature. Finally, we offer actionable recommendations for preparing and accessing high-quality CSD data, emphasizing the importance of data diversity, standardization, and accessibility. This survey provides valuable insights for researchers and practitioners to harness the full potential of DL techniques in CSD.
@article{zhang2024data, title={Data preparation for deep learning based code smell detection: A systematic literature review}, author={Zhang, Fengji and Zhang, Zexian and Keung, Jacky Wai and Tang, Xiangru and Yang, Zhen and Yu, Xiao and Hu, Wenhua}, journal={Journal of Systems and Software}, pages={112131}, year={2024}, publisher={Elsevier} }
-
[5] FAVOR-GPT: a generative natural language interface to whole genome variant functional annotations
Thomas Cheng Li, Hufeng Zhou, Vineet Verma, Xiangru Tang, Yanjun Shao, Eric Van Buren, Zhiping Weng, Mark Gerstein, Benjamin Neale, Shamil R Sunyaev, Xihong Lin.
Bioinformatics Advances, 2024 (IF 2.32, JCR Q2)
[PDF] [Abstract] [Bib]Functional Annotation of genomic Variants Online Resources (FAVOR) offers multi-faceted, whole genome variant functional annotations, which is essential for Whole Genome and Exome Sequencing (WGS/WES) analysis and the functional prioritization of disease-associated variants. A versatile chatbot designed to facilitate informative interpretation and interactive, user-centric summary of the whole genome variant functional annotation data in the FAVOR database is needed. We have developed FAVOR-GPT, a generative natural language interface powered by integrating large language models (LLMs) and FAVOR. It is developed based on the Retrieval Augmented Generation (RAG) approach, and complements the original FAVOR portal, enhancing usability for users, especially those without specialized expertise. FAVOR-GPT simplifies raw annotations by providing interpretable explanations and result summaries in response to the user’s prompt. It shows high accuracy when cross-referencing with the FAVOR database, underscoring the robustness of the retrieval framework.
@article{10.1093/bioadv/vbae143, author = {Li, Thomas Cheng and Zhou, Hufeng and Verma, Vineet and Tang, Xiangru and Shao, Yanjun and Van Buren, Eric and Weng, Zhiping and Gerstein, Mark and Neale, Benjamin and Sunyaev, Shamil R and Lin, Xihong}, title = {FAVOR-GPT: a generative natural language interface to whole genome variant functional annotations}, journal = {Bioinformatics Advances}, volume = {4}, number = {1}, pages = {vbae143}, year = {2024}, month = {09}, abstract = {Functional Annotation of genomic Variants Online Resources (FAVOR) offers multi-faceted, whole genome variant functional annotations, which is essential for Whole Genome and Exome Sequencing (WGS/WES) analysis and the functional prioritization of disease-associated variants. A versatile chatbot designed to facilitate informative interpretation and interactive, user-centric summary of the whole genome variant functional annotation data in the FAVOR database is needed.We have developed FAVOR-GPT, a generative natural language interface powered by integrating large language models (LLMs) and FAVOR. It is developed based on the Retrieval Augmented Generation (RAG) approach, and complements the original FAVOR portal, enhancing usability for users, especially those without specialized expertise. FAVOR-GPT simplifies raw annotations by providing interpretable explanations and result summaries in response to the user’s prompt. It shows high accuracy when cross-referencing with the FAVOR database, underscoring the robustness of the retrieval framework.Researchers can access FAVOR-GPT at FAVOR’s main website (https://favor.genohub.org).}, issn = {2635-0041}, doi = {10.1093/bioadv/vbae143}, url = {https://doi.org/10.1093/bioadv/vbae143}, eprint = {https://academic.oup.com/bioinformaticsadvances/article-pdf/4/1/vbae143/59645690/vbae143.pdf}, }
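FAVOR-GPT is described above as a Retrieval Augmented Generation (RAG) system over FAVOR annotations. The sketch below illustrates the generic retrieve-then-prompt pattern with a toy keyword scorer; the record schema and helper names are assumptions, and the real system queries the FAVOR database with an LLM backend.
```python
# Minimal sketch of the RAG pattern described for FAVOR-GPT: retrieve the most
# relevant annotation records and place them in the prompt before asking the
# model to summarize. The toy scorer and record schema are assumptions.

ANNOTATIONS = [
    {"variant": "chr1-12345-A-G", "note": "missense, predicted deleterious"},
    {"variant": "chr2-67890-C-T", "note": "intronic, low conservation"},
]

def retrieve(query: str, records: list[dict], k: int = 1) -> list[dict]:
    """Rank records by naive keyword overlap with the query (stand-in for a real retriever)."""
    terms = set(query.lower().split())
    scored = sorted(
        records,
        key=lambda r: -len(terms & set(r["note"].lower().split() + [r["variant"].lower()])),
    )
    return scored[:k]

def build_rag_prompt(query: str) -> str:
    context = "\n".join(f"- {r['variant']}: {r['note']}" for r in retrieve(query, ANNOTATIONS))
    return f"Using only the annotations below, answer the question.\n{context}\nQuestion: {query}"

if __name__ == "__main__":
    print(build_rag_prompt("Summarize the annotation for chr1-12345-A-G"))
```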
-
[4] RobuT: A Systematic Study of Table QA Robustness Against Human-Annotated Adversarial Perturbations
Yilun Zhao, Chen Zhao, Linyong Nan, Zhenting Qi, Wenlin Zhang, Xiangru Tang, Boyu Mi, Dragomir Radev.
ACL 2023
[PDF] [Abstract] [Bib]Despite significant progress having been made in question answering on tabular data (Table QA), it’s unclear whether, and to what extent existing Table QA models are robust to task-specific perturbations, e.g., replacing key question entities or shuffling table columns. To systematically study the robustness of Table QA models, we propose a benchmark called RobuT, which builds upon existing Table QA datasets (WTQ, WikiSQL-Weak, and SQA) and includes human-annotated adversarial perturbations in terms of table header, table content, and question. Our results indicate that both state-of-the-art Table QA models and large language models (e.g., GPT-3) with few-shot learning falter in these adversarial sets. We propose to address this problem by using large language models to generate adversarial examples to enhance training, which significantly improves the robustness of Table QA models.
@inproceedings{zhao-etal-2023-robut, title = "{R}obu{T}: A Systematic Study of Table {QA} Robustness Against Human-Annotated Adversarial Perturbations", author = "Zhao, Yilun and Zhao, Chen and Nan, Linyong and Qi, Zhenting and Zhang, Wenlin and Tang, Xiangru and Mi, Boyu and Radev, Dragomir", editor = "Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki", booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", month = jul, year = "2023", address = "Toronto, Canada", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.acl-long.334/", doi = "10.18653/v1/2023.acl-long.334", pages = "6064--6081", abstract = "Despite significant progress having been made in question answering on tabular data (Table QA), it`s unclear whether, and to what extent existing Table QA models are robust to task-specific perturbations, e.g., replacing key question entities or shuffling table columns. To systematically study the robustness of Table QA models, we propose a benchmark called RobuT, which builds upon existing Table QA datasets (WTQ, WikiSQL-Weak, and SQA) and includes human-annotated adversarial perturbations in terms of table header, table content, and question. Our results indicate that both state-of-the-art Table QA models and large language models (e.g., GPT-3) with few-shot learning falter in these adversarial sets. We propose to address this problem by using large language models to generate adversarial examples to enhance training, which significantly improves the robustness of Table QA models." }
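One perturbation family mentioned above is shuffling table columns. The sketch below shows a programmatic version of that probe; RobuT's perturbations are human-annotated, so this is only an illustration of the kind of change a robust Table QA model should tolerate.
```python
# Minimal sketch of a column-shuffle perturbation for probing Table QA
# robustness. Illustrative only; RobuT's adversarial sets are human-annotated.
import random

def shuffle_columns(header: list[str], rows: list[list[str]], seed: int = 0) -> tuple[list[str], list[list[str]]]:
    """Apply the same random column permutation to the header and every row."""
    rng = random.Random(seed)
    order = list(range(len(header)))
    rng.shuffle(order)
    new_header = [header[i] for i in order]
    new_rows = [[row[i] for i in order] for row in rows]
    return new_header, new_rows

if __name__ == "__main__":
    header = ["Country", "Capital", "Population (M)"]
    rows = [["France", "Paris", "68"], ["Japan", "Tokyo", "125"]]
    print(shuffle_columns(header, rows))
```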
-
[3] QTSumm: Query-Focused Summarization over Tabular Data
Yilun Zhao, Zhenting Qi, Linyong Nan, Boyu Mi, Yixin Liu, Weijin Zou, Simeng Han, Ruizhe Chen, Xiangru Tang, Yumo Xu, Dragomir Radev, Arman Cohan.
EMNLP 2023
[PDF] [Abstract] [Bib]People primarily consult tables to conduct data analysis or answer specific questions. Text generation systems that can provide accurate table summaries tailored to users' information needs can facilitate more efficient access to relevant data insights. Motivated by this, we define a new query-focused table summarization task, where text generation models have to perform human-like reasoning and analysis over the given table to generate a tailored summary. We introduce a new benchmark named QTSumm for this task, which contains 7,111 human-annotated query-summary pairs over 2,934 tables covering diverse topics. We investigate a set of strong baselines on QTSumm, including text generation, table-to-text generation, and large language models. Experimental results and manual analysis reveal that the new task presents significant challenges in table-to-text generation for future research. Moreover, we propose a new approach named ReFactor, to retrieve and reason over query-relevant information from tabular data to generate several natural language facts. Experimental results demonstrate that ReFactor can bring improvements to baselines by concatenating the generated facts to the model input.
@article{zhao2023qtsumm, title={QTSumm: Query-focused summarization over tabular data}, author={Zhao, Yilun and Qi, Zhenting and Nan, Linyong and Mi, Boyu and Liu, Yixin and Zou, Weijin and Han, Simeng and Chen, Ruizhe and Tang, Xiangru and Xu, Yumo and others}, journal={arXiv preprint arXiv:2305.14303}, year={2023} }
-
[2] Investigating Table-to-Text Generation Capabilities of Large Language Models in Real-World Information Seeking Scenarios
Yilun Zhao, Haowei Zhang, Shengyun Si, Linyong Nan, Xiangru Tang, Arman Cohan.
EMNLP 2023
[PDF] [Abstract] [Bib]Tabular data is prevalent across various industries, necessitating significant time and effort for users to understand and manipulate for their information-seeking purposes. The advancements in large language models (LLMs) have shown enormous potential to improve user efficiency. However, the adoption of LLMs in real-world applications for table information seeking remains underexplored. In this paper, we investigate the table-to-text capabilities of different LLMs using four datasets within two real-world information seeking scenarios. These include the LogicNLG and our newly-constructed LoTNLG datasets for data insight generation, along with the FeTaQA and our newly-constructed F2WTQ datasets for query-based generation. We structure our investigation around three research questions, evaluating the performance of LLMs in table-to-text generation, automated evaluation, and feedback generation, respectively. Experimental results indicate that the current high-performing LLM, specifically GPT-4, can effectively serve as a table-to-text generator, evaluator, and feedback generator, facilitating users’ information seeking purposes in real-world scenarios. However, a significant performance gap still exists between other open-sourced LLMs (e.g., Vicuna and LLaMA-2) and GPT-4 models. Our data and code are publicly available at https://github.com/yale-nlp/LLM-T2T.
@inproceedings{zhao-etal-2023-investigating, title = "Investigating Table-to-Text Generation Capabilities of Large Language Models in Real-World Information Seeking Scenarios", author = "Zhao, Yilun and Zhang, Haowei and Si, Shengyun and Nan, Linyong and Tang, Xiangru and Cohan, Arman", editor = "Wang, Mingxuan and Zitouni, Imed", booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Industry Track", month = dec, year = "2023", address = "Singapore", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.emnlp-industry.17/", doi = "10.18653/v1/2023.emnlp-industry.17", pages = "160--175", abstract = "Tabular data is prevalent across various industries, necessitating significant time and effort for users to understand and manipulate for their information-seeking purposes. The advancements in large language models (LLMs) have shown enormous potential to improve user efficiency. However, the adoption of LLMs in real-world applications for table information seeking remains underexplored. In this paper, we investigate the table-to-text capabilities of different LLMs using four datasets within two real-world information seeking scenarios. These include the LogicNLG and our newly-constructed LoTNLG datasets for data insight generation, along with the FeTaQA and our newly-constructed F2WTQ datasets for query-based generation. We structure our investigation around three research questions, evaluating the performance of LLMs in table-to-text generation, automated evaluation, and feedback generation, respectively. Experimental results indicate that the current high-performing LLM, specifically GPT-4, can effectively serve as a table-to-text generator, evaluator, and feedback generator, facilitating users' information seeking purposes in real-world scenarios. However, a significant performance gap still exists between other open-sourced LLMs (e.g., Vicuna and LLaMA-2) and GPT-4 models. Our data and code are publicly available at https://github.com/yale-nlp/LLM-T2T." }
-
[1] RWKV: Reinventing RNNs for the Transformer Era
Bo Peng, Eric Alcaide, Quentin Anthony, Alon Albalak, Samuel Arcadinho, Stella Biderman, Huanqi Cao, Xin Cheng, Michael Chung, Leon Derczynski, Xingjian Du, Matteo Grella, Kranthi Gv, Xuzheng He, Haowen Hou, Przemyslaw Kazienko, Jan Kocon, Jiaming Kong, Bartłomiej Koptyra, Hayden Lau, Jiaju Lin, Krishna Sri Ipsit Mantri, Ferdinand Mom, Atsushi Saito, Guangyu Song, Xiangru Tang, Johan Wind, Stanisław Woźniak, Zhenyuan Zhang, Qinghua Zhou, Jian Zhu, Rui-Jie Zhu.
EMNLP 2023 Findings
[PDF] [Abstract] [Bib]Transformers have revolutionized almost all natural language processing (NLP) tasks but suffer from memory and computational complexity that scales quadratically with sequence length. In contrast, recurrent neural networks (RNNs) exhibit linear scaling in memory and computational requirements but struggle to match the same performance as Transformers due to limitations in parallelization and scalability. We propose a novel model architecture, Receptance Weighted Key Value (RWKV), that combines the efficient parallelizable training of transformers with the efficient inference of RNNs. Our approach leverages a linear attention mechanism and allows us to formulate the model as either a Transformer or an RNN, thus parallelizing computations during training and maintains constant computational and memory complexity during inference. We scale our models as large as 14 billion parameters, by far the largest dense RNN ever trained, and find RWKV performs on par with similarly sized Transformers, suggesting future work can leverage this architecture to create more efficient models. This work presents a significant step towards reconciling trade-offs between computational efficiency and model performance in sequence processing tasks.
@inproceedings{peng-etal-2023-rwkv, title = "{RWKV}: Reinventing {RNN}s for the Transformer Era", author = "Peng, Bo and Alcaide, Eric and Anthony, Quentin and Albalak, Alon and Arcadinho, Samuel and Biderman, Stella and Cao, Huanqi and Cheng, Xin and Chung, Michael and Derczynski, Leon and Du, Xingjian and Grella, Matteo and Gv, Kranthi and He, Xuzheng and Hou, Haowen and Kazienko, Przemyslaw and Kocon, Jan and Kong, Jiaming and Koptyra, Bart{\l}omiej and Lau, Hayden and Lin, Jiaju and Mantri, Krishna Sri Ipsit and Mom, Ferdinand and Saito, Atsushi and Song, Guangyu and Tang, Xiangru and Wind, Johan and Wo{\'z}niak, Stanis{\l}aw and Zhang, Zhenyuan and Zhou, Qinghua and Zhu, Jian and Zhu, Rui-Jie", editor = "Bouamor, Houda and Pino, Juan and Bali, Kalika", booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023", month = dec, year = "2023", address = "Singapore", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2023.findings-emnlp.936/", doi = "10.18653/v1/2023.findings-emnlp.936", pages = "14048--14077", abstract = "Transformers have revolutionized almost all natural language processing (NLP) tasks but suffer from memory and computational complexity that scales quadratically with sequence length. In contrast, recurrent neural networks (RNNs) exhibit linear scaling in memory and computational requirements but struggle to match the same performance as Transformers due to limitations in parallelization and scalability. We propose a novel model architecture, Receptance Weighted Key Value (RWKV), that combines the efficient parallelizable training of transformers with the efficient inference of RNNs. Our approach leverages a linear attention mechanism and allows us to formulate the model as either a Transformer or an RNN, thus parallelizing computations during training and maintains constant computational and memory complexity during inference. We scale our models as large as 14 billion parameters, by far the largest dense RNN ever trained, and find RWKV performs on par with similarly sized Transformers, suggesting future work can leverage this architecture to create more efficient models. This work presents a significant step towards reconciling trade-offs between computational efficiency and model performance in sequence processing tasks." }
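As a rough illustration of the decayed key-value recurrence behind RWKV's linear attention in RNN mode, the sketch below keeps a running numerator and denominator per channel; it omits the current-token bonus term, token/channel mixing, and gating, so it shows the recurrence style rather than the released architecture.
```python
# Simplified sketch of a decayed key-value recurrence run in RNN mode with
# O(1) state per channel, in the spirit of RWKV's linear attention. This is an
# illustration of the recurrence style, not the official implementation.
import numpy as np

def wkv_recurrence(k: np.ndarray, v: np.ndarray, w: float) -> np.ndarray:
    """k, v: (T, C) keys and values; w: positive decay rate. Returns (T, C) outputs."""
    T, C = k.shape
    num = np.zeros(C)   # running weighted sum of values
    den = np.zeros(C)   # running weight mass
    out = np.empty((T, C))
    decay = np.exp(-w)
    for t in range(T):
        num = decay * num + np.exp(k[t]) * v[t]
        den = decay * den + np.exp(k[t])
        out[t] = num / den
    return out

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    k, v = rng.normal(size=(8, 4)), rng.normal(size=(8, 4))
    print(wkv_recurrence(k, v, w=0.5).shape)  # (8, 4)
```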
Recent Talks
11/2024 Talk at Takeda Pharmaceutical.
07/2024 Talk at Yale Department of Biomedical Informatics & Data Science.
07/2024 Talk at ISMB 2024 Text Mining Section.
07/2024 Talk on Multimodal Large Language Models.
02/2024 Talk at AI in Medicine Symposium at Yale School of Medicine.
01/2024 Talk at PSB 2024 Workshop on LLMs for Biomedicine.
07/2023 Talk at ISMB/ECCB 2023 Text Mining Section.
Workshop & Tutorial Organizers
Workshop Organizer: ICLR 2024 Workshop on LLM Agents, SIGDIAL/INLG 2023 Workshop on Taming LLMs.
Tutorial Organizer: ISMB 2024 Tutorial on A Practical Introduction to LLMs in Biomedical Research.
Session Chair: ACL 2024 BoF on AI for Science, NAACL 2024 BoF on LLMs for Science.
Services
Area Chair: ACL ARR (ACL, EMNLP, NAACL, etc.).
Conference Program Committee / Reviewer: NeurIPS, ICML, ACL, EMNLP, CIKM, NAACL, INLG, IEEE BigData, COLM.
Journal Reviewer: npj Digital Medicine, TPAMI, Neurocomputing, Briefings in Bioinformatics, PLOS Computational Biology, BMC Bioinformatics, PLOS ONE, Health Data Science.
Workshop Reviewer: KDD 2023 Workshop on Data Mining in Bioinformatics, ACL 2023 Workshop on Building Educational Apps, ACL 2023 Workshop on Clinical NLP, ICML 2023 Workshop on Neural Conv AI, ICML 2023 Workshop on Interpretable ML in Healthcare, NAACL-HLT 2021 Workshop on Language and Vision Research.
Teaching
Teaching Fellow - CPSC 452/CPSC 552/AMTH 552/CB&B 663 Deep Learning Theory and Applications, Yale University, 2023 Spring.
Teaching Fellow - CPSC 437/CPSC 537 Introduction to Database Systems, Yale University, 2023 Fall.
Teaching Fellow - CPSC 452/CPSC 552/AMTH 552/CB&B 663 Deep Learning Theory and Applications, Yale University, 2024 Spring.
Teaching Fellow - CPSC 437/CPSC 537 Database Systems, Yale University, 2024 Fall.
Misc.
I took 12 courses (and 3 additional project credits) at Yale: CPSC 523 Principles of Operating Systems, 537 Intro to Database, 539 Software Engineering, 552 Deep Learning Theory, 553 Unsupervised Learning, 569 Randomized Algorithms, 577 NLP, 583 Deep Learning on Graph, 668 Blockchain Research, 677 Adv NLP, 680 Trustworthy Deep Learning, 752 Biomedical Data Sci.
Interestingly, this course load matches the entire requirement for a Yale undergraduate B.S. degree in Computer Science (which requires 11 courses + 1 project credit) and exceeds what's needed for a B.A. (which requires only 9 courses + 1 project credit).