% Calibration Ledger — Beta cited findings
% https://calibrationledger.com/beta/
% License: CC-BY-4.0 (the compilation; individual papers retain their own copyright)
% Last updated: 2026-04-27

% Mellers et al. (2015): Good Judgment Project study of what drives
% geopolitical forecasting accuracy.
% The approximate Brier scores in the note use $\sim$ because a bare tilde
% is a non-breaking space in LaTeX and would not be printed.
@article{mellers2015psychology,
  author    = {Mellers, Barbara and Stone, Eric and Atanasov, Pavel and Rohrbaugh, Nick and Metz, S. Emlen and Ungar, Lyle and Bishop, Michael M. and Horowitz, Michael and Merkle, Edgar and Tetlock, Philip},
  title     = {The psychology of intelligence analysis: {Drivers} of prediction accuracy in world politics},
  journal   = {Journal of Experimental Psychology: Applied},
  volume    = {21},
  number    = {1},
  pages     = {1--14},
  year      = {2015},
  doi       = {10.1037/xap0000040},
  url       = {https://doi.org/10.1037/xap0000040},
  note      = {Good Judgment Project Superforecasters; Brier $\sim$0.25 vs $\sim$0.37 control}
}

% Metaculus track-record dashboard (web resource, corporate author).
% The double braces keep the organisation name from being parsed as a
% personal name.
@misc{metaculus_track_record,
  author       = {{Metaculus}},
  title        = {Track Record + Scoring Methodology},
  year         = {2026},
  howpublished = {Publicly maintained dashboard},
  note         = {Aggregated community-prediction calibration across all resolved binary questions},
  url          = {https://www.metaculus.com/questions/track-record/}
}

% Manifold Markets live calibration plot (web resource, corporate author).
% The double braces keep the organisation name from being parsed as a
% personal name.
@misc{manifold_calibration,
  author       = {{Manifold Markets}},
  title        = {Calibration Plot},
  year         = {2026},
  howpublished = {Public live dashboard},
  note         = {Live calibration plot of all resolved binary markets on the platform},
  url          = {https://manifold.markets/calibration}
}

% OpenAI GPT-4 Technical Report (arXiv:2303.08774).
% The arXiv id is kept in "number" for classic BibTeX styles and also given
% in eprint/archiveprefix/primaryclass for eprint-aware styles and tools.
% The calibration discussion is in Section 5 (Limitations, Fig. 8) of the
% report; Section 3.2 covers HumanEval capability scaling.
@techreport{openai2023gpt4,
  author        = {{OpenAI}},
  title         = {{GPT-4} Technical Report},
  institution   = {OpenAI},
  number        = {arXiv:2303.08774},
  year          = {2023},
  eprint        = {2303.08774},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2303.08774},
  note          = {\S{5} Limitations (Fig.~8), calibration: pre-RLHF well-calibrated; RLHF degrades calibration on multiple-choice benchmarks}
}

% Bradshaw (2011): working paper surveying research on analysts' forecasts,
% distributed via SSRN. The required "note" field of this entry type carries
% the working-paper affiliation.
% "What" is braced as a whole word so sentence-casing styles keep the
% capital after the colon.
@unpublished{bradshaw2011analysts,
  author = {Bradshaw, Mark T.},
  title  = {Analysts' Forecasts: {What} Do We Know After Decades of Work?},
  year   = {2011},
  note   = {Working paper, Boston College Carroll School of Management},
  url    = {https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1880339}
}

% Open Science Collaboration (2015): large-scale replication study of
% psychology findings, published in Science.
% "pages" holds the article number, not a page range.
@article{osc2015reproducibility,
  author  = {{Open Science Collaboration}},
  title   = {Estimating the reproducibility of psychological science},
  year    = {2015},
  journal = {Science},
  volume  = {349},
  number  = {6251},
  pages   = {aac4716},
  note    = {36\% replication rate; mean effect size halved on replication},
  doi     = {10.1126/science.aac4716},
  url     = {https://doi.org/10.1126/science.aac4716}
}

% Kadavath et al. (2022): arXiv preprint on language-model self-knowledge
% and calibration. The author list is truncated with "and others".
% "howpublished" is kept for classic BibTeX styles; the eprint fields expose
% the arXiv id to eprint-aware styles and reference managers.
@misc{kadavath2022models,
  author        = {Kadavath, Saurav and Conerly, Tom and Askell, Amanda and Henighan, Tom and Drain, Dawn and Perez, Ethan and Schiefer, Nicholas and Hatfield-Dodds, Zac and DasSarma, Nova and Tran-Johnson, Eli and others},
  title         = {Language Models (Mostly) Know What They Know},
  howpublished  = {arXiv preprint arXiv:2207.05221},
  year          = {2022},
  eprint        = {2207.05221},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2207.05221},
  note          = {Anthropic; well-calibrated P(IK) and P(True) for base LMs; calibration improves with scale}
}

% Camerer et al. (2018): replication study of social science experiments
% originally published in Nature and Science. The author list is truncated
% with "and others".
% The journal names in the title are braced as whole words to survive
% sentence-casing styles. The note uses $\sim$ because a bare tilde is a
% non-breaking space in LaTeX and would not be printed.
@article{camerer2018evaluating,
  author  = {Camerer, Colin F. and Dreber, Anna and Holzmeister, Felix and Ho, Teck-Hua and Huber, J{\"u}rgen and Johannesson, Magnus and Kirchler, Michael and Nave, Gideon and Nosek, Brian A. and Pfeiffer, Thomas and others},
  title   = {Evaluating the replicability of social science experiments in {Nature} and {Science} between 2010 and 2015},
  journal = {Nature Human Behaviour},
  volume  = {2},
  number  = {9},
  pages   = {637--644},
  year    = {2018},
  doi     = {10.1038/s41562-018-0399-z},
  url     = {https://doi.org/10.1038/s41562-018-0399-z},
  note    = {62\% replication rate; effect sizes $\sim$50\% of original}
}

% Philadelphia Fed Survey of Professional Forecasters: dataset landing page
% (web resource, corporate author).
% The double braces keep the institution name from being parsed as a
% personal name.
@misc{philadelphia_fed_spf,
  author       = {{Federal Reserve Bank of Philadelphia}},
  title        = {Survey of Professional Forecasters --- Documentation and Forecast Accuracy},
  year         = {2026},
  howpublished = {Public dataset and accuracy reports},
  note         = {Quarterly survey of US macroeconomic forecasters; data back to 1968},
  url          = {https://www.philadelphiafed.org/surveys-and-data/real-time-data-research/survey-of-professional-forecasters}
}

% Hausfather et al. (2020): retrospective evaluation of past climate model
% projections against observed warming.
% "pages" holds the article number, not a page range. The year range in the
% note uses "--" so it is typeset as an en dash.
@article{hausfather2020evaluating,
  author  = {Hausfather, Zeke and Drake, Henri F. and Abbott, Tristan and Schmidt, Gavin A.},
  title   = {Evaluating the Performance of Past Climate Model Projections},
  journal = {Geophysical Research Letters},
  volume  = {47},
  number  = {1},
  pages   = {e2019GL085378},
  year    = {2020},
  doi     = {10.1029/2019GL085378},
  url     = {https://doi.org/10.1029/2019GL085378},
  note    = {14 of 17 climate models from 1970--2007 within natural-variability range of observed warming when adjusted for actual emissions}
}
