1use regex::Regex;
92use std::sync::LazyLock;
93
/// Matches arXiv abs/pdf URLs and captures the bare arXiv id (optionally versioned).
static ARXIV_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)").unwrap());
/// Matches href attributes that point directly at a .pdf file.
static PDF_HREF_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"href="(https?://[^"]+\.pdf)""#).unwrap());
/// Matches SSRN abstract-page links in search-result HTML.
static SSRN_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"href="(https?://papers\.ssrn\.com/sol3/papers\.cfm\?abstract_id=\d+)""#).unwrap()
});
/// Bare-URL matcher; stops at whitespace, comma, or closing paren.
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://[^\s,)]+").unwrap());

/// Tokio runtime backing the blocking `resolve_pdf*` entry points; built once on first use.
static PDF_RUNTIME: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();
110
/// Returns the shared tokio runtime, building it on first call.
fn pdf_runtime() -> &'static tokio::runtime::Runtime {
    PDF_RUNTIME.get_or_init(|| {
        tokio::runtime::Builder::new_multi_thread()
            .worker_threads(2) // resolver work is I/O-bound; two workers suffice
            .enable_all()
            .build()
            .expect("failed to build PDF tokio runtime")
    })
}
120
/// A PDF link discovered by one of the resolver sources.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ResolvedPdf {
    /// Direct URL of the PDF (or, for SSRN, the abstract page that hosts it).
    pub url: String,
    /// Name of the source that produced this hit, e.g. "arxiv", "openalex".
    pub source: String,
    /// False when the URL sits on a known paywalled/blocked domain.
    pub downloadable: bool,
}
141
/// Result of a full resolution run: the chosen PDF plus per-source outcomes.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ResolveReport {
    /// Best candidate after ranking (downloadable first, then priority), if any.
    pub pdf: Option<ResolvedPdf>,
    /// One (source name, Ok(url) | Err(reason)) entry per queried source.
    pub outcomes: Vec<(String, Result<String, String>)>,
}
151
152impl ResolveReport {
153 pub fn summary(&self) -> String {
155 let mut out = String::new();
156 if let Some(ref pdf) = self.pdf {
157 out.push_str(&format!(
158 "PDF found via {} (downloadable: {})\n {}\n\n",
159 pdf.source, pdf.downloadable, pdf.url
160 ));
161 } else {
162 out.push_str("No downloadable PDF found.\n\n");
163 }
164 out.push_str("Sources queried:\n");
165 for (name, outcome) in &self.outcomes {
166 match outcome {
167 Ok(url) => out.push_str(&format!(" {name}: found {url}\n")),
168 Err(reason) => out.push_str(&format!(" {name}: {reason}\n")),
169 }
170 }
171 out
172 }
173}
174
/// One resolver source with its on/off switch; list order defines priority.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SourceEntry {
    /// Source identifier; see [`SOURCE_NAMES`] for the known set.
    pub name: String,
    /// Whether this source participates in resolution.
    pub enabled: bool,
}
182
183impl SourceEntry {
184 pub fn new(name: impl Into<String>, enabled: bool) -> Self {
185 Self {
186 name: name.into(),
187 enabled,
188 }
189 }
190}
191
/// Base URLs for every external service; overridable for tests or mirrors.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Endpoints {
    pub openalex: String,
    pub core: String,
    pub google_scholar: String,
    pub unpaywall: String,
    pub crossref: String,
    pub zenodo: String,
    pub ssrn: String,
    pub semantic_scholar: String,
}
205
206impl Default for Endpoints {
207 fn default() -> Self {
208 Self {
209 openalex: "https://api.openalex.org".into(),
210 core: "https://api.core.ac.uk/v3".into(),
211 google_scholar: "https://scholar.google.com".into(),
212 unpaywall: "https://api.unpaywall.org/v2".into(),
213 crossref: "https://api.crossref.org".into(),
214 zenodo: "https://zenodo.org/api".into(),
215 ssrn: "https://papers.ssrn.com".into(),
216 semantic_scholar: "https://api.semanticscholar.org/graph/v1".into(),
217 }
218 }
219}
220
/// Tunable knobs for PDF resolution.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ResolverConfig {
    /// Contact email sent to polite-pool APIs; Unpaywall is skipped while this
    /// is still an example.com/example.org placeholder.
    pub email: String,
    /// User-Agent base used for Crossref requests.
    pub user_agent: String,
    /// Per-request HTTP timeout in seconds.
    pub timeout_secs: u64,
    /// Sources in priority order (earlier = higher priority).
    pub sources: Vec<SourceEntry>,
    /// Extra domains (substring match) to treat as non-downloadable.
    pub extra_blocked_domains: Vec<String>,
    /// Service base URLs.
    pub endpoints: Endpoints,
}
244
/// All known source names, in default priority order.
pub const SOURCE_NAMES: &[&str] = &[
    "arxiv",
    "openalex",
    "core",
    "google_scholar",
    "unpaywall",
    "crossref",
    "zenodo",
    "ssrn",
    "semantic_scholar",
];
257
258impl Default for ResolverConfig {
259 fn default() -> Self {
260 Self {
261 email: "biblion@example.com".into(),
262 user_agent: "biblion/0.1".into(),
263 timeout_secs: 20,
264 sources: SOURCE_NAMES
265 .iter()
266 .map(|&name| SourceEntry {
267 name: name.into(),
268 enabled: true,
269 })
270 .collect(),
271 extra_blocked_domains: vec![],
272 endpoints: Endpoints::default(),
273 }
274 }
275}
276
277impl ResolverConfig {
278 pub fn is_enabled(&self, name: &str) -> bool {
280 self.sources.iter().any(|s| s.name == name && s.enabled)
281 }
282
283 pub fn priority(&self, name: &str) -> u8 {
285 self.sources
286 .iter()
287 .position(|s| s.name == name)
288 .map(|p| (p + 1) as u8)
289 .unwrap_or(99)
290 }
291}
292
/// Publisher domains that paywall PDFs; matched by plain substring against
/// candidate URLs (see `is_downloadable`), so bare entries like "wiley.com"
/// also cover their subdomains.
const BLOCKED_DOMAINS: &[&str] = &[
    "academic.oup.com",
    "wiley.com",
    "www.sciencedirect.com",
    "link.springer.com",
    "www.nature.com",
    "www.tandfonline.com",
    "ieeexplore.ieee.org",
    "journals.sagepub.com",
    "silverchair.com",
];
305
306fn is_downloadable(url: &str) -> bool {
307 !BLOCKED_DOMAINS.iter().any(|d| url.contains(d))
308}
309
310fn is_downloadable_cfg(url: &str, config: &ResolverConfig) -> bool {
312 if !is_downloadable(url) {
313 return false;
314 }
315 !config
316 .extra_blocked_domains
317 .iter()
318 .any(|d| url.contains(d.as_str()))
319}
320
/// Blocking convenience wrapper: resolves with [`ResolverConfig::default`].
/// See [`resolve_pdf_with_config`].
pub fn resolve_pdf(
    doi: Option<&str>,
    url: Option<&str>,
    title: Option<&str>,
) -> Option<ResolvedPdf> {
    resolve_pdf_with_config(doi, url, title, &ResolverConfig::default())
}
332
333pub fn resolve_pdf_with_config(
338 doi: Option<&str>,
339 url: Option<&str>,
340 title: Option<&str>,
341 config: &ResolverConfig,
342) -> Option<ResolvedPdf> {
343 if config.is_enabled("arxiv") {
345 if let Some(doi) = doi
346 && let Some(id) = doi_to_arxiv_id(doi)
347 {
348 return Some(ResolvedPdf {
349 url: format!("https://arxiv.org/pdf/{id}.pdf"),
350 source: "arxiv".into(),
351 downloadable: true,
352 });
353 }
354 if let Some(url) = url
355 && let Some(id) = url_to_arxiv_id(url)
356 {
357 return Some(ResolvedPdf {
358 url: format!("https://arxiv.org/pdf/{id}.pdf"),
359 source: "arxiv".into(),
360 downloadable: true,
361 });
362 }
363 }
364
365 pdf_runtime().block_on(resolve_pdf_async(doi, url, title, config))
367}
368
369pub fn resolve_pdf_with_report(
374 doi: Option<&str>,
375 url: Option<&str>,
376 title: Option<&str>,
377 config: &ResolverConfig,
378) -> ResolveReport {
379 if config.is_enabled("arxiv") {
381 if let Some(doi) = doi
382 && let Some(id) = doi_to_arxiv_id(doi)
383 {
384 let pdf = ResolvedPdf {
385 url: format!("https://arxiv.org/pdf/{id}.pdf"),
386 source: "arxiv".into(),
387 downloadable: true,
388 };
389 return ResolveReport {
390 outcomes: vec![("arxiv".into(), Ok(pdf.url.clone()))],
391 pdf: Some(pdf),
392 };
393 }
394 if let Some(url) = url
395 && let Some(id) = url_to_arxiv_id(url)
396 {
397 let pdf = ResolvedPdf {
398 url: format!("https://arxiv.org/pdf/{id}.pdf"),
399 source: "arxiv".into(),
400 downloadable: true,
401 };
402 return ResolveReport {
403 outcomes: vec![("arxiv".into(), Ok(pdf.url.clone()))],
404 pdf: Some(pdf),
405 };
406 }
407 }
408
409 pdf_runtime().block_on(resolve_pdf_async_with_report(doi, url, title, config))
410}
411
412pub async fn resolve_pdf_async_with_report(
414 doi: Option<&str>,
415 url: Option<&str>,
416 title: Option<&str>,
417 config: &ResolverConfig,
418) -> ResolveReport {
419 if config.is_enabled("arxiv")
421 && let Some(url) = url
422 && let Some(id) = url_to_arxiv_id(url)
423 {
424 let pdf = ResolvedPdf {
425 url: format!("https://arxiv.org/pdf/{id}.pdf"),
426 source: "arxiv".into(),
427 downloadable: true,
428 };
429 return ResolveReport {
430 outcomes: vec![("arxiv".into(), Ok(pdf.url.clone()))],
431 pdf: Some(pdf),
432 };
433 }
434
435 let client = match reqwest::Client::builder()
436 .timeout(std::time::Duration::from_secs(config.timeout_secs))
437 .redirect(reqwest::redirect::Policy::limited(10))
438 .build()
439 {
440 Ok(c) => c,
441 Err(_) => {
442 return ResolveReport {
443 pdf: None,
444 outcomes: vec![("client".into(), Err("failed to build HTTP client".into()))],
445 };
446 }
447 };
448
449 type ReportFuture<'a> = std::pin::Pin<
450 Box<
451 dyn std::future::Future<Output = (String, u8, Result<ResolvedPdf, String>)> + Send + 'a,
452 >,
453 >;
454 let mut futures: Vec<ReportFuture<'_>> = Vec::new();
455
456 let ep = &config.endpoints;
457 for source in &config.sources {
458 if !source.enabled {
459 continue;
460 }
461 let pri = config.priority(&source.name);
462 let c = &client;
463 let name = source.name.clone();
464 match source.name.as_str() {
465 "arxiv" => {} "openalex" => futures.push(Box::pin(async move {
467 let r = try_openalex(c, doi, title, ep).await;
468 (name, pri, r.ok_or_else(|| "no OA location found".into()))
469 })),
470 "core" => futures.push(Box::pin(async move {
471 let r = try_core(c, doi, title, ep).await;
472 (name, pri, r.ok_or_else(|| "no result".into()))
473 })),
474 "google_scholar" => futures.push(Box::pin(async move {
475 let r = try_google_scholar_report(c, title, ep).await;
476 (name, pri, r)
477 })),
478 "unpaywall" => {
479 let email = config.email.clone();
480 if email.contains("example.com") || email.contains("example.org") {
481 futures.push(Box::pin(async move {
482 (
483 name,
484 pri,
485 Err("skipped — configure resolver.email in config.toml".into()),
486 )
487 }));
488 } else {
489 futures.push(Box::pin(async move {
490 let r = try_unpaywall(c, doi, &email, ep).await;
491 (name, pri, r.ok_or_else(|| "no OA PDF for this DOI".into()))
492 }));
493 }
494 }
495 "crossref" => {
496 let email = config.email.clone();
497 let ua = config.user_agent.clone();
498 futures.push(Box::pin(async move {
499 let r = try_crossref(c, doi, &email, &ua, ep).await;
500 (name, pri, r.ok_or_else(|| "no PDF link in metadata".into()))
501 }));
502 }
503 "zenodo" => futures.push(Box::pin(async move {
504 let r = try_zenodo(c, title, ep).await;
505 (name, pri, r.ok_or_else(|| "no result".into()))
506 })),
507 "ssrn" => futures.push(Box::pin(async move {
508 let r = try_ssrn(c, title, ep).await;
509 (name, pri, r.ok_or_else(|| "no result".into()))
510 })),
511 "semantic_scholar" => futures.push(Box::pin(async move {
512 let r = try_semantic_scholar(c, doi, title, ep).await;
513 (name, pri, r.ok_or_else(|| "no OA PDF found".into()))
514 })),
515 _ => {}
516 }
517 }
518
519 let results = futures::future::join_all(futures).await;
520
521 let mut outcomes: Vec<(String, Result<String, String>)> = Vec::new();
522 let mut candidates: Vec<(u8, ResolvedPdf)> = Vec::new();
523
524 for (name, pri, result) in results {
525 match result {
526 Ok(mut pdf) => {
527 if pdf.downloadable {
528 pdf.downloadable = is_downloadable_cfg(&pdf.url, config);
529 }
530 let url = pdf.url.clone();
531 if pdf.downloadable {
532 outcomes.push((name, Ok(url)));
533 } else {
534 outcomes.push((name, Err(format!("found {} but blocked domain", url))));
535 }
536 candidates.push((pri, pdf));
537 }
538 Err(reason) => {
539 outcomes.push((name, Err(reason)));
540 }
541 }
542 }
543
544 candidates.sort_by_key(|(pri, r)| (!r.downloadable, *pri));
545 let pdf = candidates.into_iter().next().map(|(_, r)| r);
546
547 ResolveReport { pdf, outcomes }
548}
549
/// Google Scholar scrape with diagnostic errors (rate limits, captchas).
///
/// Searches for the exact quoted title, then prefers PDF links hosted on
/// academic domains before accepting any non-blocked PDF link on the page.
async fn try_google_scholar_report(
    client: &reqwest::Client,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Result<ResolvedPdf, String> {
    let title = title.ok_or("no title provided")?;
    let resp = client
        .get(format!("{}/scholar", endpoints.google_scholar))
        .query(&[("q", &format!("\"{title}\"")), ("num", &"5".to_string())])
        // Browser-like headers: Scholar serves captchas to obvious bots.
        .header(
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )
        .header("Accept", "text/html,application/xhtml+xml")
        .header("Accept-Language", "en-US,en;q=0.9")
        .send()
        .await
        .map_err(|e| format!("request failed: {e}"))?;

    let status = resp.status();
    if status == reqwest::StatusCode::TOO_MANY_REQUESTS {
        return Err("rate-limited (429) — try again later".into());
    }
    if !status.is_success() {
        return Err(format!("HTTP {status}"));
    }

    let html = resp.text().await.map_err(|e| format!("body error: {e}"))?;

    // Scholar returns HTTP 200 even when interposing a captcha page.
    if html.contains("unusual traffic") || html.contains("captcha") {
        return Err("blocked by captcha".into());
    }

    // Hosts that typically serve author-posted, freely downloadable PDFs.
    let academic_hosts = [
        ".edu",
        ".ac.uk",
        "research.google",
        "hal.science",
        "eprint.iacr.org",
    ];

    // First pass: prefer PDFs on academic hosts.
    for cap in PDF_HREF_RE.captures_iter(&html) {
        let url = &cap[1];
        if academic_hosts.iter().any(|h| url.contains(h)) {
            return Ok(ResolvedPdf {
                url: url.into(),
                source: "google_scholar".into(),
                downloadable: true,
            });
        }
    }
    // Second pass: accept any PDF link not on a blocked publisher domain.
    for cap in PDF_HREF_RE.captures_iter(&html) {
        let url = &cap[1];
        if is_downloadable(url) {
            return Ok(ResolvedPdf {
                url: url.into(),
                source: "google_scholar".into(),
                downloadable: true,
            });
        }
    }
    Err("no PDF links found in results".into())
}
616
617pub async fn resolve_pdf_async(
622 doi: Option<&str>,
623 url: Option<&str>,
624 title: Option<&str>,
625 config: &ResolverConfig,
626) -> Option<ResolvedPdf> {
627 if config.is_enabled("arxiv")
629 && let Some(url) = url
630 && let Some(id) = url_to_arxiv_id(url)
631 {
632 return Some(ResolvedPdf {
633 url: format!("https://arxiv.org/pdf/{id}.pdf"),
634 source: "arxiv".into(),
635 downloadable: true,
636 });
637 }
638
639 let client = reqwest::Client::builder()
640 .timeout(std::time::Duration::from_secs(config.timeout_secs))
641 .redirect(reqwest::redirect::Policy::limited(10))
642 .build()
643 .ok()?;
644
645 type PdfFuture<'a> =
647 std::pin::Pin<Box<dyn std::future::Future<Output = Option<(u8, ResolvedPdf)>> + Send + 'a>>;
648 let mut futures: Vec<PdfFuture<'_>> = Vec::new();
649
650 let ep = &config.endpoints;
651 for source in &config.sources {
652 if !source.enabled {
653 continue;
654 }
655 let pri = config.priority(&source.name);
656 let c = &client;
657 match source.name.as_str() {
658 "arxiv" => {} "openalex" => futures.push(Box::pin(async move {
660 try_openalex(c, doi, title, ep).await.map(|r| (pri, r))
661 })),
662 "core" => futures.push(Box::pin(async move {
663 try_core(c, doi, title, ep).await.map(|r| (pri, r))
664 })),
665 "google_scholar" => futures.push(Box::pin(async move {
666 try_google_scholar(c, title, ep).await.map(|r| (pri, r))
667 })),
668 "unpaywall" => {
669 let email = config.email.clone();
670 futures.push(Box::pin(async move {
671 try_unpaywall(c, doi, &email, ep).await.map(|r| (pri, r))
672 }))
673 }
674 "crossref" => {
675 let email = config.email.clone();
676 let ua = config.user_agent.clone();
677 futures.push(Box::pin(async move {
678 try_crossref(c, doi, &email, &ua, ep)
679 .await
680 .map(|r| (pri, r))
681 }))
682 }
683 "zenodo" => futures.push(Box::pin(async move {
684 try_zenodo(c, title, ep).await.map(|r| (pri, r))
685 })),
686 "ssrn" => futures.push(Box::pin(async move {
687 try_ssrn(c, title, ep).await.map(|r| (pri, r))
688 })),
689 "semantic_scholar" => futures.push(Box::pin(async move {
690 try_semantic_scholar(c, doi, title, ep)
691 .await
692 .map(|r| (pri, r))
693 })),
694 _ => {} }
696 }
697
698 let results = futures::future::join_all(futures).await;
699
700 let mut candidates: Vec<(u8, ResolvedPdf)> = results
702 .into_iter()
703 .flatten()
704 .map(|(pri, mut r)| {
705 if r.downloadable {
707 r.downloadable = is_downloadable_cfg(&r.url, config);
708 }
709 (pri, r)
710 })
711 .collect();
712
713 candidates.sort_by_key(|(pri, r)| (!r.downloadable, *pri));
715 candidates.into_iter().next().map(|(_, r)| r)
716}
717
/// Extracts the arXiv id from an arXiv-issued DOI (`10.48550/arXiv.<id>`).
///
/// DOI names are case-insensitive per the DOI Handbook, so `arxiv.`/`ARXIV.`
/// spellings of the suffix are accepted too (the original exact-match only
/// recognized `arXiv.`). Returns `None` for any other DOI.
fn doi_to_arxiv_id(doi: &str) -> Option<String> {
    let suffix = doi.strip_prefix("10.48550/")?;
    // `get` avoids a panic on short or non-ASCII-boundary suffixes.
    let marker = suffix.get(..6)?;
    if marker.eq_ignore_ascii_case("arxiv.") {
        Some(suffix[6..].to_string())
    } else {
        None
    }
}
725
726fn url_to_arxiv_id(url: &str) -> Option<String> {
727 ARXIV_RE.captures(url).map(|c| c[1].to_string())
728}
729
/// Queries OpenAlex by DOI (exact lookup) or title (search, first hit), then
/// walks best_oa_location → open_access.oa_url → locations[] for a PDF link.
async fn try_openalex(
    client: &reqwest::Client,
    doi: Option<&str>,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let resp = if let Some(doi) = doi {
        client
            .get(format!("{}/works/doi:{doi}", endpoints.openalex))
            .query(&[("select", "open_access,locations,best_oa_location")])
            .send()
            .await
            .ok()?
    } else {
        let title = title?;
        client
            .get(format!("{}/works", endpoints.openalex))
            .query(&[
                ("search", title),
                ("per_page", "1"),
                ("select", "open_access,locations,best_oa_location"),
            ])
            .send()
            .await
            .ok()?
    };

    if !resp.status().is_success() {
        return None;
    }
    let data: serde_json::Value = resp.json().await.ok()?;

    // Title searches wrap hits in "results"; DOI lookups return the work itself.
    let work = if let Some(results) = data.get("results").and_then(|v| v.as_array()) {
        results.first()?
    } else {
        &data
    };

    // Preferred: the curated best OA location's direct PDF URL.
    if let Some(url) = work
        .pointer("/best_oa_location/pdf_url")
        .and_then(|v| v.as_str())
    {
        return Some(ResolvedPdf {
            url: url.into(),
            source: "openalex".into(),
            downloadable: is_downloadable(url),
        });
    }
    // Fallback: the generic OA URL, but only when it already looks like a PDF.
    if let Some(url) = work.pointer("/open_access/oa_url").and_then(|v| v.as_str())
        && url.ends_with(".pdf")
    {
        return Some(ResolvedPdf {
            url: url.into(),
            source: "openalex".into(),
            downloadable: is_downloadable(url),
        });
    }
    // Last resort: first location advertising any pdf_url.
    for loc in work
        .get("locations")
        .and_then(|v| v.as_array())
        .unwrap_or(&vec![])
    {
        if let Some(url) = loc.get("pdf_url").and_then(|v| v.as_str()) {
            return Some(ResolvedPdf {
                url: url.into(),
                source: "openalex".into(),
                downloadable: is_downloadable(url),
            });
        }
    }
    None
}
807
/// Searches the CORE aggregator by DOI (preferred) or title; the first hit's
/// `downloadUrl` wins.
/// NOTE(review): no API key is attached here — CORE's v3 API normally
/// requires one; confirm anonymous access is intended.
async fn try_core(
    client: &reqwest::Client,
    doi: Option<&str>,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let query = if let Some(doi) = doi {
        format!(r#"doi:"{doi}""#)
    } else {
        let title = title?;
        format!(r#"title:"{title}""#)
    };

    let resp = client
        .get(format!("{}/search/works", endpoints.core))
        .query(&[("q", &query), ("limit", &"1".to_string())])
        .send()
        .await
        .ok()?;

    if !resp.status().is_success() {
        return None;
    }
    let data: serde_json::Value = resp.json().await.ok()?;
    let work = data.get("results")?.as_array()?.first()?;

    if let Some(url) = work.get("downloadUrl").and_then(|v| v.as_str()) {
        return Some(ResolvedPdf {
            url: url.into(),
            source: "core".into(),
            downloadable: is_downloadable(url),
        });
    }
    None
}
847
/// Quiet Google Scholar scrape — the `Option`-returning twin of
/// `try_google_scholar_report`: same two-pass PDF-link scan, but every
/// failure collapses to `None`.
async fn try_google_scholar(
    client: &reqwest::Client,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let title = title?;
    let resp = client
        .get(format!("{}/scholar", endpoints.google_scholar))
        .query(&[("q", &format!("\"{title}\"")), ("num", &"5".to_string())])
        // Browser-like UA: Scholar blocks obvious bots.
        .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        .header("Accept", "text/html")
        .send().await.ok()?;

    if !resp.status().is_success() {
        return None;
    }
    let html = resp.text().await.ok()?;

    // Hosts that typically serve author-posted, freely downloadable PDFs.
    let academic_hosts = [
        ".edu",
        ".ac.uk",
        "research.google",
        "hal.science",
        "eprint.iacr.org",
    ];

    // First pass: prefer PDFs on academic hosts.
    for cap in PDF_HREF_RE.captures_iter(&html) {
        let url = &cap[1];
        if academic_hosts.iter().any(|h| url.contains(h)) {
            return Some(ResolvedPdf {
                url: url.into(),
                source: "google_scholar".into(),
                downloadable: true,
            });
        }
    }
    // Second pass: any PDF link not on a blocked publisher domain.
    for cap in PDF_HREF_RE.captures_iter(&html) {
        let url = &cap[1];
        if is_downloadable(url) {
            return Some(ResolvedPdf {
                url: url.into(),
                source: "google_scholar".into(),
                downloadable: true,
            });
        }
    }
    None
}
902
/// Unpaywall lookup — DOI only (the service has no title search here).
///
/// Sends the caller's contact email as required by Unpaywall. Tries the best
/// OA location first, then any `oa_locations` entry with a direct PDF URL.
async fn try_unpaywall(
    client: &reqwest::Client,
    doi: Option<&str>,
    email: &str,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let doi = doi?;
    let resp = client
        .get(format!("{}/{doi}", endpoints.unpaywall))
        .query(&[("email", email)])
        .send()
        .await
        .ok()?;

    if !resp.status().is_success() {
        return None;
    }
    let data: serde_json::Value = resp.json().await.ok()?;

    // Preferred: Unpaywall's own best OA location.
    if let Some(url) = data
        .pointer("/best_oa_location/url_for_pdf")
        .and_then(|v| v.as_str())
    {
        return Some(ResolvedPdf {
            url: url.into(),
            source: "unpaywall".into(),
            downloadable: is_downloadable(url),
        });
    }
    // Fallback: scan every OA location for a direct PDF URL.
    for loc in data
        .get("oa_locations")
        .and_then(|v| v.as_array())
        .unwrap_or(&vec![])
    {
        if let Some(url) = loc.get("url_for_pdf").and_then(|v| v.as_str()) {
            return Some(ResolvedPdf {
                url: url.into(),
                source: "unpaywall".into(),
                downloadable: is_downloadable(url),
            });
        }
    }
    None
}
951
/// Crossref metadata lookup for a DOI.
///
/// Accepts the primary resource URL when it ends in ".pdf"; otherwise scans
/// the "link" array for an entry with a PDF content-type.
async fn try_crossref(
    client: &reqwest::Client,
    doi: Option<&str>,
    email: &str,
    user_agent: &str,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let doi = doi?;
    let resp = client
        .get(format!("{}/works/{doi}", endpoints.crossref))
        // Crossref's "polite" pool asks for a mailto in the User-Agent.
        .header("User-Agent", format!("{user_agent} (mailto:{email})"))
        .send()
        .await
        .ok()?;

    if !resp.status().is_success() {
        return None;
    }
    let data: serde_json::Value = resp.json().await.ok()?;
    let msg = data.get("message")?;

    // Primary resource URL, only when it already points at a PDF.
    if let Some(url) = msg
        .pointer("/resource/primary/URL")
        .and_then(|v| v.as_str())
        && url.to_lowercase().ends_with(".pdf")
    {
        return Some(ResolvedPdf {
            url: url.into(),
            source: "crossref".into(),
            downloadable: is_downloadable(url),
        });
    }
    // Fallback: any "link" entry whose content-type mentions pdf.
    for link in msg
        .get("link")
        .and_then(|v| v.as_array())
        .unwrap_or(&vec![])
    {
        let ct = link
            .get("content-type")
            .and_then(|v| v.as_str())
            .unwrap_or("");
        if ct.contains("pdf")
            && let Some(url) = link.get("URL").and_then(|v| v.as_str())
        {
            return Some(ResolvedPdf {
                url: url.into(),
                source: "crossref".into(),
                downloadable: is_downloadable(url),
            });
        }
    }
    None
}
1011
/// Zenodo publication search by title (top 3 records); returns the first
/// attached file whose name ends in ".pdf". File links point at Zenodo
/// itself, so hits are marked downloadable.
async fn try_zenodo(
    client: &reqwest::Client,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let title = title?;
    let resp = client
        .get(format!("{}/records", endpoints.zenodo))
        .query(&[("q", title), ("size", "3"), ("type", "publication")])
        .send()
        .await
        .ok()?;

    if !resp.status().is_success() {
        return None;
    }
    let data: serde_json::Value = resp.json().await.ok()?;

    for hit in data
        .pointer("/hits/hits")
        .and_then(|v| v.as_array())
        .unwrap_or(&vec![])
    {
        for file in hit
            .get("files")
            .and_then(|v| v.as_array())
            .unwrap_or(&vec![])
        {
            // Case-insensitive filename check; the link is the file's own URL.
            if file
                .get("key")
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_lowercase()
                .ends_with(".pdf")
                && let Some(url) = file.pointer("/links/self").and_then(|v| v.as_str())
            {
                return Some(ResolvedPdf {
                    url: url.into(),
                    source: "zenodo".into(),
                    downloadable: true,
                });
            }
        }
    }
    None
}
1062
/// SSRN keyword search by title. SSRN serves PDFs behind the abstract page,
/// so a hit returns the abstract URL flagged `downloadable: false`.
async fn try_ssrn(
    client: &reqwest::Client,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let title = title?;
    let resp = client
        .get(format!("{}/sol3/results.cfm", endpoints.ssrn))
        .query(&[("txtKey_Words", title), ("npage", "1")])
        .header("User-Agent", "Mozilla/5.0")
        .header("Accept", "text/html")
        .send()
        .await
        .ok()?;

    if !resp.status().is_success() {
        return None;
    }
    let html = resp.text().await.ok()?;

    // First abstract-page link in the result HTML wins.
    if let Some(cap) = SSRN_RE.captures(&html) {
        return Some(ResolvedPdf {
            url: cap[1].to_string(),
            source: "ssrn".into(),
            downloadable: false,
        });
    }
    None
}
1096
/// Semantic Scholar Graph API lookup (DOI preferred, else title search).
///
/// Returns `openAccessPdf.url` when present. When the API returns only a
/// `disclaimer` string, salvages the first usable URL mentioned in it,
/// rewriting arXiv /abs/ links into direct /pdf/ links.
async fn try_semantic_scholar(
    client: &reqwest::Client,
    doi: Option<&str>,
    title: Option<&str>,
    endpoints: &Endpoints,
) -> Option<ResolvedPdf> {
    let resp = if let Some(doi) = doi {
        client
            .get(format!("{}/paper/DOI:{doi}", endpoints.semantic_scholar))
            .query(&[("fields", "openAccessPdf")])
            .send()
            .await
            .ok()?
    } else {
        let title = title?;
        client
            .get(format!("{}/paper/search", endpoints.semantic_scholar))
            .query(&[
                ("query", title),
                ("limit", "1"),
                ("fields", "openAccessPdf"),
            ])
            .send()
            .await
            .ok()?
    };

    if !resp.status().is_success() {
        return None;
    }
    let data: serde_json::Value = resp.json().await.ok()?;

    // Search responses wrap hits in "data"; DOI lookups return the paper itself.
    let work = if let Some(items) = data.get("data").and_then(|v| v.as_array()) {
        items.first()?
    } else {
        &data
    };

    let oa = work.get("openAccessPdf")?;
    if let Some(url) = oa.get("url").and_then(|v| v.as_str()) {
        return Some(ResolvedPdf {
            url: url.into(),
            source: "semantic_scholar".into(),
            downloadable: is_downloadable(url),
        });
    }
    // No direct URL: mine the human-readable disclaimer text for embedded links.
    if let Some(disclaimer) = oa.get("disclaimer").and_then(|v| v.as_str()) {
        for m in URL_RE.find_iter(disclaimer) {
            let url = m.as_str();
            if url.contains("arxiv.org/abs/") {
                // Convert an abstract-page link into the direct PDF form.
                let pdf_url = url.replace("/abs/", "/pdf/");
                return Some(ResolvedPdf {
                    url: format!("{pdf_url}.pdf"),
                    source: "semantic_scholar".into(),
                    downloadable: true,
                });
            }
            // Accept any non-arXiv URL, or an arXiv link already in /pdf/ form.
            if !url.contains("arxiv.org") || url.contains("/pdf/") {
                return Some(ResolvedPdf {
                    url: url.into(),
                    source: "semantic_scholar".into(),
                    downloadable: is_downloadable(url),
                });
            }
        }
    }
    None
}
1170
#[cfg(test)]
mod tests {
    use super::*;

    // Pure-function tests: arXiv id extraction and domain blocking. The two
    // resolve_* tests exercise only the offline arXiv fast path (no network).

    #[test]
    fn doi_to_arxiv_id_valid() {
        assert_eq!(
            doi_to_arxiv_id("10.48550/arXiv.2105.15183"),
            Some("2105.15183".into())
        );
    }

    #[test]
    fn doi_to_arxiv_id_invalid() {
        assert_eq!(doi_to_arxiv_id("10.1234/other"), None);
    }

    #[test]
    fn url_to_arxiv_id_abs() {
        assert_eq!(
            url_to_arxiv_id("https://arxiv.org/abs/2105.15183"),
            Some("2105.15183".into())
        );
    }

    #[test]
    fn url_to_arxiv_id_pdf_versioned() {
        // Version suffix (v2) is preserved in the captured id.
        assert_eq!(
            url_to_arxiv_id("https://arxiv.org/pdf/2105.15183v2"),
            Some("2105.15183v2".into())
        );
    }

    #[test]
    fn url_to_arxiv_id_non_arxiv() {
        assert_eq!(url_to_arxiv_id("https://example.com/paper"), None);
    }

    #[test]
    fn resolve_arxiv_doi_instant() {
        let result = resolve_pdf(Some("10.48550/arXiv.2105.15183"), None, None);
        let r = result.unwrap();
        assert_eq!(r.source, "arxiv");
        assert_eq!(r.url, "https://arxiv.org/pdf/2105.15183.pdf");
        assert!(r.downloadable);
    }

    #[test]
    fn resolve_arxiv_url_instant() {
        let result = resolve_pdf(None, Some("https://arxiv.org/abs/2301.01234"), None);
        let r = result.unwrap();
        assert_eq!(r.source, "arxiv");
        assert!(r.url.contains("2301.01234"));
    }

    #[test]
    fn is_downloadable_blocked() {
        assert!(!is_downloadable("https://ieeexplore.ieee.org/doc/123.pdf"));
        assert!(!is_downloadable(
            "https://www.sciencedirect.com/article.pdf"
        ));
    }

    #[test]
    fn is_downloadable_ok() {
        assert!(is_downloadable("https://arxiv.org/pdf/2105.15183.pdf"));
        assert!(is_downloadable("https://example.edu/paper.pdf"));
    }
}
1240
#[cfg(test)]
mod config_tests {
    use super::*;

    // Configuration semantics: source enablement, priority ordering, and how
    // resolve_pdf_with_config honors a disabled arxiv source.

    #[test]
    fn default_config_has_all_sources_enabled() {
        let config = ResolverConfig::default();
        assert_eq!(config.sources.len(), 9);
        for source in &config.sources {
            assert!(source.enabled, "Source {} should be enabled", source.name);
        }
    }

    #[test]
    fn is_enabled_true_for_enabled_source() {
        let config = ResolverConfig::default();
        assert!(config.is_enabled("arxiv"));
        assert!(config.is_enabled("openalex"));
        assert!(config.is_enabled("semantic_scholar"));
    }

    #[test]
    fn is_enabled_false_for_disabled_source() {
        let mut config = ResolverConfig::default();
        // Disable openalex only; the other sources stay on.
        config.sources[1].enabled = false;
        assert!(!config.is_enabled("openalex"));
        assert!(config.is_enabled("arxiv"));
    }

    #[test]
    fn is_enabled_false_for_unknown_source() {
        let config = ResolverConfig::default();
        assert!(!config.is_enabled("nonexistent"));
    }

    #[test]
    fn priority_reflects_position() {
        let config = ResolverConfig::default();
        assert_eq!(config.priority("arxiv"), 1);
        assert_eq!(config.priority("openalex"), 2);
        assert_eq!(config.priority("semantic_scholar"), 9);
    }

    #[test]
    fn priority_returns_99_for_unknown() {
        let config = ResolverConfig::default();
        assert_eq!(config.priority("nonexistent"), 99);
    }

    #[test]
    fn custom_source_order_changes_priority() {
        let config = ResolverConfig {
            sources: vec![
                SourceEntry {
                    name: "unpaywall".into(),
                    enabled: true,
                },
                SourceEntry {
                    name: "arxiv".into(),
                    enabled: true,
                },
            ],
            ..Default::default()
        };
        assert_eq!(config.priority("unpaywall"), 1);
        assert_eq!(config.priority("arxiv"), 2);
    }

    #[test]
    fn resolve_with_arxiv_disabled_skips_arxiv() {
        let mut config = ResolverConfig::default();
        config.sources[0].enabled = false;
        let result =
            resolve_pdf_with_config(Some("10.48550/arXiv.2105.15183"), None, None, &config);
        // Network sources may legitimately return nothing here; we only
        // require that a disabled arxiv source is never the origin.
        match result {
            None => {}
            Some(r) => assert_ne!(r.source, "arxiv", "Should not use disabled arxiv"),
        }
    }

    #[test]
    fn resolve_with_config_uses_arxiv_when_enabled() {
        let config = ResolverConfig::default();
        let result =
            resolve_pdf_with_config(Some("10.48550/arXiv.2105.15183"), None, None, &config);
        let r = result.unwrap();
        assert_eq!(r.source, "arxiv");
        assert!(r.downloadable);
    }
}
1335
1336#[cfg(test)]
1337mod mock_tests {
1338 use super::*;
1339 use wiremock::matchers::{method, path, path_regex};
1340 use wiremock::{Mock, MockServer, ResponseTemplate};
1341
1342 fn single_source_config(source_name: &str, base_uri: &str) -> ResolverConfig {
1345 let endpoints = Endpoints {
1346 openalex: base_uri.into(),
1347 core: base_uri.into(),
1348 google_scholar: base_uri.into(),
1349 unpaywall: base_uri.into(),
1350 crossref: base_uri.into(),
1351 zenodo: base_uri.into(),
1352 ssrn: base_uri.into(),
1353 semantic_scholar: base_uri.into(),
1354 };
1355 ResolverConfig {
1356 sources: vec![SourceEntry::new(source_name, true)],
1357 endpoints,
1358 ..Default::default()
1359 }
1360 }
1361
1362 #[tokio::test]
1367 async fn openalex_doi_happy_path() {
1368 let server = MockServer::start().await;
1369 Mock::given(method("GET"))
1370 .and(path_regex(r"/works/doi:.*"))
1371 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1372 "best_oa_location": {
1373 "pdf_url": "https://example.edu/paper.pdf"
1374 }
1375 })))
1376 .mount(&server)
1377 .await;
1378
1379 let config = single_source_config("openalex", &server.uri());
1380 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1381 let r = result.unwrap();
1382 assert_eq!(r.source, "openalex");
1383 assert_eq!(r.url, "https://example.edu/paper.pdf");
1384 assert!(r.downloadable);
1385 }
1386
1387 #[tokio::test]
1388 async fn openalex_title_search_happy_path() {
1389 let server = MockServer::start().await;
1390 Mock::given(method("GET"))
1391 .and(path_regex(r"/works$"))
1392 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1393 "results": [{
1394 "best_oa_location": {
1395 "pdf_url": "https://example.edu/search-result.pdf"
1396 }
1397 }]
1398 })))
1399 .mount(&server)
1400 .await;
1401
1402 let config = single_source_config("openalex", &server.uri());
1403 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1404 let r = result.unwrap();
1405 assert_eq!(r.source, "openalex");
1406 assert_eq!(r.url, "https://example.edu/search-result.pdf");
1407 }
1408
1409 #[tokio::test]
1410 async fn openalex_oa_url_fallback() {
1411 let server = MockServer::start().await;
1412 Mock::given(method("GET"))
1413 .and(path_regex(r"/works/doi:.*"))
1414 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1415 "open_access": {
1416 "oa_url": "https://example.edu/open.pdf"
1417 }
1418 })))
1419 .mount(&server)
1420 .await;
1421
1422 let config = single_source_config("openalex", &server.uri());
1423 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1424 let r = result.unwrap();
1425 assert_eq!(r.url, "https://example.edu/open.pdf");
1426 }
1427
1428 #[tokio::test]
1429 async fn openalex_locations_fallback() {
1430 let server = MockServer::start().await;
1431 Mock::given(method("GET"))
1432 .and(path_regex(r"/works/doi:.*"))
1433 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1434 "locations": [
1435 { "pdf_url": "https://example.edu/loc.pdf" }
1436 ]
1437 })))
1438 .mount(&server)
1439 .await;
1440
1441 let config = single_source_config("openalex", &server.uri());
1442 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1443 let r = result.unwrap();
1444 assert_eq!(r.url, "https://example.edu/loc.pdf");
1445 }
1446
1447 #[tokio::test]
1448 async fn openalex_404_returns_none() {
1449 let server = MockServer::start().await;
1450 Mock::given(method("GET"))
1451 .and(path_regex(r"/works/doi:.*"))
1452 .respond_with(ResponseTemplate::new(404))
1453 .mount(&server)
1454 .await;
1455
1456 let config = single_source_config("openalex", &server.uri());
1457 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1458 assert!(result.is_none());
1459 }
1460
1461 #[tokio::test]
1462 async fn openalex_blocked_domain_not_downloadable() {
1463 let server = MockServer::start().await;
1464 Mock::given(method("GET"))
1465 .and(path_regex(r"/works/doi:.*"))
1466 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1467 "best_oa_location": {
1468 "pdf_url": "https://www.sciencedirect.com/paper.pdf"
1469 }
1470 })))
1471 .mount(&server)
1472 .await;
1473
1474 let config = single_source_config("openalex", &server.uri());
1475 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1476 let r = result.unwrap();
1477 assert!(!r.downloadable);
1478 }
1479
1480 #[tokio::test]
1485 async fn core_doi_happy_path() {
1486 let server = MockServer::start().await;
1487 Mock::given(method("GET"))
1488 .and(path_regex(r"/search/works"))
1489 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1490 "results": [{
1491 "downloadUrl": "https://core.ac.uk/download/pdf/123.pdf"
1492 }]
1493 })))
1494 .mount(&server)
1495 .await;
1496
1497 let config = single_source_config("core", &server.uri());
1498 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1499 let r = result.unwrap();
1500 assert_eq!(r.source, "core");
1501 assert_eq!(r.url, "https://core.ac.uk/download/pdf/123.pdf");
1502 assert!(r.downloadable);
1503 }
1504
1505 #[tokio::test]
1506 async fn core_title_search_happy_path() {
1507 let server = MockServer::start().await;
1508 Mock::given(method("GET"))
1509 .and(path_regex(r"/search/works"))
1510 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1511 "results": [{
1512 "downloadUrl": "https://core.ac.uk/download/pdf/456.pdf"
1513 }]
1514 })))
1515 .mount(&server)
1516 .await;
1517
1518 let config = single_source_config("core", &server.uri());
1519 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1520 let r = result.unwrap();
1521 assert_eq!(r.source, "core");
1522 }
1523
1524 #[tokio::test]
1525 async fn core_404_returns_none() {
1526 let server = MockServer::start().await;
1527 Mock::given(method("GET"))
1528 .and(path_regex(r"/search/works"))
1529 .respond_with(ResponseTemplate::new(404))
1530 .mount(&server)
1531 .await;
1532
1533 let config = single_source_config("core", &server.uri());
1534 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1535 assert!(result.is_none());
1536 }
1537
1538 #[tokio::test]
1539 async fn core_empty_results_returns_none() {
1540 let server = MockServer::start().await;
1541 Mock::given(method("GET"))
1542 .and(path_regex(r"/search/works"))
1543 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1544 "results": []
1545 })))
1546 .mount(&server)
1547 .await;
1548
1549 let config = single_source_config("core", &server.uri());
1550 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1551 assert!(result.is_none());
1552 }
1553
1554 #[tokio::test]
1559 async fn google_scholar_happy_path_academic_host() {
1560 let server = MockServer::start().await;
1561 Mock::given(method("GET"))
1562 .and(path_regex(r"/scholar"))
1563 .respond_with(ResponseTemplate::new(200).set_body_string(
1564 r#"<html><body>
1565 <a href="https://cs.stanford.edu/paper.pdf">[PDF]</a>
1566 </body></html>"#,
1567 ))
1568 .mount(&server)
1569 .await;
1570
1571 let config = single_source_config("google_scholar", &server.uri());
1572 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1573 let r = result.unwrap();
1574 assert_eq!(r.source, "google_scholar");
1575 assert_eq!(r.url, "https://cs.stanford.edu/paper.pdf");
1576 assert!(r.downloadable);
1577 }
1578
1579 #[tokio::test]
1580 async fn google_scholar_fallback_non_academic_pdf() {
1581 let server = MockServer::start().await;
1582 Mock::given(method("GET"))
1583 .and(path_regex(r"/scholar"))
1584 .respond_with(ResponseTemplate::new(200).set_body_string(
1585 r#"<html><body>
1586 <a href="https://example.com/paper.pdf">PDF</a>
1587 </body></html>"#,
1588 ))
1589 .mount(&server)
1590 .await;
1591
1592 let config = single_source_config("google_scholar", &server.uri());
1593 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1594 let r = result.unwrap();
1595 assert_eq!(r.source, "google_scholar");
1596 assert_eq!(r.url, "https://example.com/paper.pdf");
1597 assert!(r.downloadable);
1598 }
1599
1600 #[tokio::test]
1601 async fn google_scholar_blocked_pdf_skipped() {
1602 let server = MockServer::start().await;
1603 Mock::given(method("GET"))
1605 .and(path_regex(r"/scholar"))
1606 .respond_with(ResponseTemplate::new(200).set_body_string(
1607 r#"<html><body>
1608 <a href="https://www.sciencedirect.com/paper.pdf">PDF</a>
1609 </body></html>"#,
1610 ))
1611 .mount(&server)
1612 .await;
1613
1614 let config = single_source_config("google_scholar", &server.uri());
1615 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1616 assert!(result.is_none());
1618 }
1619
1620 #[tokio::test]
1621 async fn google_scholar_no_title_returns_none() {
1622 let config = single_source_config("google_scholar", "http://unused");
1623 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1624 assert!(result.is_none());
1625 }
1626
1627 #[tokio::test]
1628 async fn google_scholar_404_returns_none() {
1629 let server = MockServer::start().await;
1630 Mock::given(method("GET"))
1631 .and(path_regex(r"/scholar"))
1632 .respond_with(ResponseTemplate::new(404))
1633 .mount(&server)
1634 .await;
1635
1636 let config = single_source_config("google_scholar", &server.uri());
1637 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1638 assert!(result.is_none());
1639 }
1640
1641 #[tokio::test]
1646 async fn unpaywall_happy_path() {
1647 let server = MockServer::start().await;
1648 Mock::given(method("GET"))
1649 .and(path_regex(r"/10\.1234/test"))
1650 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1651 "best_oa_location": {
1652 "url_for_pdf": "https://europepmc.org/paper.pdf"
1653 }
1654 })))
1655 .mount(&server)
1656 .await;
1657
1658 let config = single_source_config("unpaywall", &server.uri());
1659 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1660 let r = result.unwrap();
1661 assert_eq!(r.source, "unpaywall");
1662 assert_eq!(r.url, "https://europepmc.org/paper.pdf");
1663 assert!(r.downloadable);
1664 }
1665
1666 #[tokio::test]
1667 async fn unpaywall_oa_locations_fallback() {
1668 let server = MockServer::start().await;
1669 Mock::given(method("GET"))
1670 .and(path_regex(r"/10\.1234/test"))
1671 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1672 "oa_locations": [
1673 { "url_for_pdf": "https://repo.edu/fallback.pdf" }
1674 ]
1675 })))
1676 .mount(&server)
1677 .await;
1678
1679 let config = single_source_config("unpaywall", &server.uri());
1680 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1681 let r = result.unwrap();
1682 assert_eq!(r.url, "https://repo.edu/fallback.pdf");
1683 }
1684
1685 #[tokio::test]
1686 async fn unpaywall_no_doi_returns_none() {
1687 let config = single_source_config("unpaywall", "http://unused");
1688 let result = resolve_pdf_async(None, None, Some("title"), &config).await;
1689 assert!(result.is_none());
1690 }
1691
1692 #[tokio::test]
1693 async fn unpaywall_404_returns_none() {
1694 let server = MockServer::start().await;
1695 Mock::given(method("GET"))
1696 .and(path_regex(r"/10\.1234/test"))
1697 .respond_with(ResponseTemplate::new(404))
1698 .mount(&server)
1699 .await;
1700
1701 let config = single_source_config("unpaywall", &server.uri());
1702 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1703 assert!(result.is_none());
1704 }
1705
1706 #[tokio::test]
1707 async fn unpaywall_blocked_domain_not_downloadable() {
1708 let server = MockServer::start().await;
1709 Mock::given(method("GET"))
1710 .and(path_regex(r"/10\.1234/test"))
1711 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1712 "best_oa_location": {
1713 "url_for_pdf": "https://link.springer.com/paper.pdf"
1714 }
1715 })))
1716 .mount(&server)
1717 .await;
1718
1719 let config = single_source_config("unpaywall", &server.uri());
1720 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1721 let r = result.unwrap();
1722 assert!(!r.downloadable);
1723 }
1724
1725 #[tokio::test]
1730 async fn crossref_primary_url_happy_path() {
1731 let server = MockServer::start().await;
1732 Mock::given(method("GET"))
1733 .and(path_regex(r"/works/10\.1234/test"))
1734 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1735 "message": {
1736 "resource": {
1737 "primary": {
1738 "URL": "https://publisher.org/article.pdf"
1739 }
1740 }
1741 }
1742 })))
1743 .mount(&server)
1744 .await;
1745
1746 let config = single_source_config("crossref", &server.uri());
1747 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1748 let r = result.unwrap();
1749 assert_eq!(r.source, "crossref");
1750 assert_eq!(r.url, "https://publisher.org/article.pdf");
1751 assert!(r.downloadable);
1752 }
1753
1754 #[tokio::test]
1755 async fn crossref_link_array_fallback() {
1756 let server = MockServer::start().await;
1757 Mock::given(method("GET"))
1758 .and(path_regex(r"/works/10\.1234/test"))
1759 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1760 "message": {
1761 "link": [
1762 {
1763 "URL": "https://publisher.org/full.pdf",
1764 "content-type": "application/pdf"
1765 }
1766 ]
1767 }
1768 })))
1769 .mount(&server)
1770 .await;
1771
1772 let config = single_source_config("crossref", &server.uri());
1773 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1774 let r = result.unwrap();
1775 assert_eq!(r.source, "crossref");
1776 assert_eq!(r.url, "https://publisher.org/full.pdf");
1777 }
1778
1779 #[tokio::test]
1780 async fn crossref_no_doi_returns_none() {
1781 let config = single_source_config("crossref", "http://unused");
1782 let result = resolve_pdf_async(None, None, Some("title"), &config).await;
1783 assert!(result.is_none());
1784 }
1785
1786 #[tokio::test]
1787 async fn crossref_404_returns_none() {
1788 let server = MockServer::start().await;
1789 Mock::given(method("GET"))
1790 .and(path_regex(r"/works/10\.1234/test"))
1791 .respond_with(ResponseTemplate::new(404))
1792 .mount(&server)
1793 .await;
1794
1795 let config = single_source_config("crossref", &server.uri());
1796 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1797 assert!(result.is_none());
1798 }
1799
1800 #[tokio::test]
1805 async fn zenodo_happy_path() {
1806 let server = MockServer::start().await;
1807 Mock::given(method("GET"))
1808 .and(path_regex(r"/records"))
1809 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1810 "hits": {
1811 "hits": [{
1812 "files": [{
1813 "key": "paper.pdf",
1814 "links": {
1815 "self": "https://zenodo.org/records/123/files/paper.pdf"
1816 }
1817 }]
1818 }]
1819 }
1820 })))
1821 .mount(&server)
1822 .await;
1823
1824 let config = single_source_config("zenodo", &server.uri());
1825 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1826 let r = result.unwrap();
1827 assert_eq!(r.source, "zenodo");
1828 assert_eq!(r.url, "https://zenodo.org/records/123/files/paper.pdf");
1829 assert!(r.downloadable);
1830 }
1831
1832 #[tokio::test]
1833 async fn zenodo_no_pdf_files_returns_none() {
1834 let server = MockServer::start().await;
1835 Mock::given(method("GET"))
1836 .and(path_regex(r"/records"))
1837 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1838 "hits": {
1839 "hits": [{
1840 "files": [{
1841 "key": "data.csv",
1842 "links": {
1843 "self": "https://zenodo.org/records/123/files/data.csv"
1844 }
1845 }]
1846 }]
1847 }
1848 })))
1849 .mount(&server)
1850 .await;
1851
1852 let config = single_source_config("zenodo", &server.uri());
1853 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1854 assert!(result.is_none());
1855 }
1856
1857 #[tokio::test]
1858 async fn zenodo_no_title_returns_none() {
1859 let config = single_source_config("zenodo", "http://unused");
1860 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1861 assert!(result.is_none());
1862 }
1863
1864 #[tokio::test]
1865 async fn zenodo_404_returns_none() {
1866 let server = MockServer::start().await;
1867 Mock::given(method("GET"))
1868 .and(path_regex(r"/records"))
1869 .respond_with(ResponseTemplate::new(404))
1870 .mount(&server)
1871 .await;
1872
1873 let config = single_source_config("zenodo", &server.uri());
1874 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1875 assert!(result.is_none());
1876 }
1877
1878 #[tokio::test]
1883 async fn ssrn_happy_path() {
1884 let server = MockServer::start().await;
1885 Mock::given(method("GET"))
1886 .and(path_regex(r"/sol3/results\.cfm"))
1887 .respond_with(ResponseTemplate::new(200).set_body_string(
1888 r#"<html><body>
1889 <a href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1234567">Paper</a>
1890 </body></html>"#,
1891 ))
1892 .mount(&server)
1893 .await;
1894
1895 let config = single_source_config("ssrn", &server.uri());
1896 let result = resolve_pdf_async(None, None, Some("volatility modeling"), &config).await;
1897 let r = result.unwrap();
1898 assert_eq!(r.source, "ssrn");
1899 assert_eq!(
1900 r.url,
1901 "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1234567"
1902 );
1903 assert!(!r.downloadable);
1905 }
1906
1907 #[tokio::test]
1908 async fn ssrn_no_match_returns_none() {
1909 let server = MockServer::start().await;
1910 Mock::given(method("GET"))
1911 .and(path_regex(r"/sol3/results\.cfm"))
1912 .respond_with(
1913 ResponseTemplate::new(200)
1914 .set_body_string(r#"<html><body>No results found.</body></html>"#),
1915 )
1916 .mount(&server)
1917 .await;
1918
1919 let config = single_source_config("ssrn", &server.uri());
1920 let result = resolve_pdf_async(None, None, Some("nonexistent paper"), &config).await;
1921 assert!(result.is_none());
1922 }
1923
1924 #[tokio::test]
1925 async fn ssrn_no_title_returns_none() {
1926 let config = single_source_config("ssrn", "http://unused");
1927 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1928 assert!(result.is_none());
1929 }
1930
1931 #[tokio::test]
1936 async fn semantic_scholar_doi_happy_path() {
1937 let server = MockServer::start().await;
1938 Mock::given(method("GET"))
1939 .and(path_regex(r"/paper/DOI:.*"))
1940 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1941 "openAccessPdf": {
1942 "url": "https://example.edu/s2paper.pdf"
1943 }
1944 })))
1945 .mount(&server)
1946 .await;
1947
1948 let config = single_source_config("semantic_scholar", &server.uri());
1949 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1950 let r = result.unwrap();
1951 assert_eq!(r.source, "semantic_scholar");
1952 assert_eq!(r.url, "https://example.edu/s2paper.pdf");
1953 assert!(r.downloadable);
1954 }
1955
1956 #[tokio::test]
1957 async fn semantic_scholar_title_search_happy_path() {
1958 let server = MockServer::start().await;
1959 Mock::given(method("GET"))
1960 .and(path_regex(r"/paper/search"))
1961 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1962 "data": [{
1963 "openAccessPdf": {
1964 "url": "https://example.edu/s2search.pdf"
1965 }
1966 }]
1967 })))
1968 .mount(&server)
1969 .await;
1970
1971 let config = single_source_config("semantic_scholar", &server.uri());
1972 let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1973 let r = result.unwrap();
1974 assert_eq!(r.source, "semantic_scholar");
1975 assert_eq!(r.url, "https://example.edu/s2search.pdf");
1976 }
1977
1978 #[tokio::test]
1979 async fn semantic_scholar_disclaimer_arxiv_fallback() {
1980 let server = MockServer::start().await;
1981 Mock::given(method("GET"))
1982 .and(path_regex(r"/paper/DOI:.*"))
1983 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1984 "openAccessPdf": {
1985 "disclaimer": "See https://arxiv.org/abs/2105.15183 for the open access version."
1986 }
1987 })))
1988 .mount(&server)
1989 .await;
1990
1991 let config = single_source_config("semantic_scholar", &server.uri());
1992 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1993 let r = result.unwrap();
1994 assert_eq!(r.source, "semantic_scholar");
1995 assert_eq!(r.url, "https://arxiv.org/pdf/2105.15183.pdf");
1996 assert!(r.downloadable);
1997 }
1998
1999 #[tokio::test]
2000 async fn semantic_scholar_disclaimer_non_arxiv_url() {
2001 let server = MockServer::start().await;
2002 Mock::given(method("GET"))
2003 .and(path_regex(r"/paper/DOI:.*"))
2004 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2005 "openAccessPdf": {
2006 "disclaimer": "Available at https://example.edu/paper.pdf for download."
2007 }
2008 })))
2009 .mount(&server)
2010 .await;
2011
2012 let config = single_source_config("semantic_scholar", &server.uri());
2013 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2014 let r = result.unwrap();
2015 assert_eq!(r.source, "semantic_scholar");
2016 assert!(r.url.contains("example.edu"));
2017 }
2018
2019 #[tokio::test]
2020 async fn semantic_scholar_404_returns_none() {
2021 let server = MockServer::start().await;
2022 Mock::given(method("GET"))
2023 .and(path_regex(r"/paper/DOI:.*"))
2024 .respond_with(ResponseTemplate::new(404))
2025 .mount(&server)
2026 .await;
2027
2028 let config = single_source_config("semantic_scholar", &server.uri());
2029 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2030 assert!(result.is_none());
2031 }
2032
2033 #[tokio::test]
2034 async fn semantic_scholar_no_oa_pdf_returns_none() {
2035 let server = MockServer::start().await;
2036 Mock::given(method("GET"))
2037 .and(path_regex(r"/paper/DOI:.*"))
2038 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2039 "title": "Some paper"
2040 })))
2041 .mount(&server)
2042 .await;
2043
2044 let config = single_source_config("semantic_scholar", &server.uri());
2045 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2046 assert!(result.is_none());
2047 }
2048
2049 #[test]
2054 fn report_arxiv_doi_has_outcome() {
2055 let report = resolve_pdf_with_report(
2056 Some("10.48550/arXiv.2105.15183"),
2057 None,
2058 None,
2059 &ResolverConfig::default(),
2060 );
2061 assert!(report.pdf.is_some());
2062 assert_eq!(report.pdf.as_ref().unwrap().source, "arxiv");
2063 assert!(!report.outcomes.is_empty());
2064 assert!(report.outcomes[0].1.is_ok());
2065 }
2066
2067 #[test]
2068 fn report_summary_no_pdf_shows_sources() {
2069 let report = ResolveReport {
2070 pdf: None,
2071 outcomes: vec![
2072 ("openalex".into(), Err("closed access".into())),
2073 ("unpaywall".into(), Err("no email configured".into())),
2074 ],
2075 };
2076 let s = report.summary();
2077 assert!(s.contains("No downloadable PDF found"));
2078 assert!(s.contains("openalex: closed access"));
2079 assert!(s.contains("unpaywall: no email configured"));
2080 }
2081
2082 #[test]
2083 fn report_summary_with_pdf_shows_url() {
2084 let report = ResolveReport {
2085 pdf: Some(ResolvedPdf {
2086 url: "https://example.edu/paper.pdf".into(),
2087 source: "google_scholar".into(),
2088 downloadable: true,
2089 }),
2090 outcomes: vec![(
2091 "google_scholar".into(),
2092 Ok("https://example.edu/paper.pdf".into()),
2093 )],
2094 };
2095 let s = report.summary();
2096 assert!(s.contains("PDF found via google_scholar"));
2097 assert!(s.contains("example.edu/paper.pdf"));
2098 }
2099
2100 #[tokio::test]
2101 async fn report_unpaywall_skipped_with_placeholder_email() {
2102 let server = MockServer::start().await;
2103 let config = ResolverConfig {
2106 sources: vec![SourceEntry::new("unpaywall", true)],
2107 email: "biblion@example.com".into(),
2108 endpoints: Endpoints {
2109 unpaywall: server.uri(),
2110 ..Default::default()
2111 },
2112 ..Default::default()
2113 };
2114 let report = resolve_pdf_async_with_report(None, None, Some("test"), &config).await;
2115 assert!(report.pdf.is_none());
2116 let unpaywall_outcome = report.outcomes.iter().find(|(n, _)| n == "unpaywall");
2117 assert!(unpaywall_outcome.is_some());
2118 assert!(
2119 unpaywall_outcome
2120 .unwrap()
2121 .1
2122 .as_ref()
2123 .err()
2124 .unwrap()
2125 .contains("email")
2126 );
2127 }
2128
2129 #[tokio::test]
2130 async fn report_google_scholar_429_reports_rate_limit() {
2131 let server = MockServer::start().await;
2132 Mock::given(method("GET"))
2133 .and(path("/scholar"))
2134 .respond_with(ResponseTemplate::new(429))
2135 .mount(&server)
2136 .await;
2137
2138 let config = single_source_config("google_scholar", &server.uri());
2139 let report =
2140 resolve_pdf_async_with_report(None, None, Some("mutation testing"), &config).await;
2141 assert!(report.pdf.is_none());
2142 let gs_outcome = report.outcomes.iter().find(|(n, _)| n == "google_scholar");
2143 assert!(gs_outcome.is_some());
2144 assert!(
2145 gs_outcome
2146 .unwrap()
2147 .1
2148 .as_ref()
2149 .err()
2150 .unwrap()
2151 .contains("429")
2152 );
2153 }
2154
2155 #[tokio::test]
2156 async fn report_openalex_closed_access_reports_reason() {
2157 let server = MockServer::start().await;
2158 Mock::given(method("GET"))
2159 .and(path_regex(r"/works/doi:.*"))
2160 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2161 "open_access": { "is_oa": false, "oa_status": "closed" }
2162 })))
2163 .mount(&server)
2164 .await;
2165
2166 let config = single_source_config("openalex", &server.uri());
2167 let report = resolve_pdf_async_with_report(Some("10.1234/test"), None, None, &config).await;
2168 assert!(report.pdf.is_none());
2169 let oa_outcome = report.outcomes.iter().find(|(n, _)| n == "openalex");
2170 assert!(oa_outcome.is_some());
2171 assert!(oa_outcome.unwrap().1.is_err());
2172 }
2173
2174 #[tokio::test]
2175 async fn report_multiple_sources_collects_all_outcomes() {
2176 let server = MockServer::start().await;
2177 Mock::given(method("GET"))
2179 .respond_with(ResponseTemplate::new(404))
2180 .mount(&server)
2181 .await;
2182
2183 let config = ResolverConfig {
2184 sources: vec![
2185 SourceEntry::new("openalex", true),
2186 SourceEntry::new("core", true),
2187 ],
2188 endpoints: Endpoints {
2189 openalex: server.uri(),
2190 core: server.uri(),
2191 ..Default::default()
2192 },
2193 ..Default::default()
2194 };
2195 let report =
2196 resolve_pdf_async_with_report(Some("10.1234/test"), None, Some("test"), &config).await;
2197 assert!(report.pdf.is_none());
2198 assert_eq!(report.outcomes.len(), 2);
2199 }
2200
2201 #[tokio::test]
2206 async fn semantic_scholar_blocked_domain_not_downloadable() {
2207 let server = MockServer::start().await;
2208 Mock::given(method("GET"))
2209 .and(path_regex(r"/paper/DOI:.*"))
2210 .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2211 "openAccessPdf": {
2212 "url": "https://ieeexplore.ieee.org/paper.pdf"
2213 }
2214 })))
2215 .mount(&server)
2216 .await;
2217
2218 let config = single_source_config("semantic_scholar", &server.uri());
2219 let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2220 let r = result.unwrap();
2221 assert!(!r.downloadable);
2222 }
2223}