Empirical Inference Conference Paper 2024

What Makes and Breaks Safety Fine-tuning? A Mechanistic Study

Author(s): Jain, S. and Lubana, E. S. and Oksuz, K. and Joy, T. and Torr, P. and Sanyal, A. and Dokania, P. K.
Book Title: Advances in Neural Information Processing Systems 37 (NeurIPS 2024)
Volume: 37
Pages: 93406--93478
Year: 2024
Month: December
Editors: A. Globerson and L. Mackey and D. Belgrave and A. Fan and U. Paquet and J. Tomczak and C. Zhang
Publisher: Curran Associates, Inc.
Bibtex Type: Conference Paper (conference)
Event Name: 38th Annual Conference on Neural Information Processing Systems
Event Place: Vancouver, Canada
State: Published
URL: https://proceedings.neurips.cc/paper_files/paper/2024/file/a9bef53eb7b0e5950d4f2d9c74a16006-Paper-Conference.pdf

BibTeX

@comment{
  NeurIPS 2024 conference paper. Uses the canonical inproceedings type
  (conference is only a compatibility alias for it) and the standard
  editor field so the proceedings editors are actually picked up by
  bibliography styles. The slug and month_numeric fields are site-specific
  metadata that standard styles ignore.
}
@inproceedings{Jainetal24,
  author        = {Jain, S. and Lubana, E. S. and Oksuz, K. and Joy, T. and Torr, P. and Sanyal, A. and Dokania, P. K.},
  title         = {What Makes and Breaks Safety Fine-tuning? {A} Mechanistic Study},
  booktitle     = {Advances in Neural Information Processing Systems 37 (NeurIPS 2024)},
  editor        = {A. Globerson and L. Mackey and D. Belgrave and A. Fan and U. Paquet and J. Tomczak and C. Zhang},
  volume        = {37},
  pages         = {93406--93478},
  publisher     = {Curran Associates, Inc.},
  month         = dec,
  year          = {2024},
  url           = {https://proceedings.neurips.cc/paper_files/paper/2024/file/a9bef53eb7b0e5950d4f2d9c74a16006-Paper-Conference.pdf},
  slug          = {jainetal24-721b15d0-65bd-4334-a544-01904e7d1787},
  month_numeric = {12}
}