Back
What Makes Safety Fine-tuning Methods Safe? A Mechanistic Study
% Workshop paper. The spotlight designation lives in `note` so that
% `booktitle` carries only the venue name.
@inproceedings{Jainetal24b,
  author        = {Jain, S. and Lubana, E. S. and Oksuz, K. and Joy, T. and Torr, P. H. S. and Sanyal, A. and Dokania, P. K.},
  title         = {What Makes Safety Fine-tuning Methods Safe? A Mechanistic Study},
  booktitle     = {{ICML} 2024 Workshop on Mechanistic Interpretability},
  month         = jul,
  year          = {2024},
  note          = {Spotlight},
  url           = {https://openreview.net/forum?id=BS2CbUkJpy},
  slug          = {jainetal24b},
  month_numeric = {7},
}