paper_resolver/
lib.rs

1//! Concurrent PDF resolver for academic papers — 9 open-access sources.
2//!
3//! # What it does
4//!
5//! Given a DOI, URL, or title, queries up to 9 academic sources in parallel
6//! and returns the best downloadable PDF URL. No Zotero, no reference manager
7//! dependency — just `(doi, url, title) → Option<ResolvedPdf>`.
8//!
9//! # Sources (by priority)
10//!
11//! | # | Source | Coverage | Method |
12//! |---|--------|----------|--------|
13//! | 1 | **arXiv** | arXiv papers | Instant DOI/URL pattern match (no network) |
14//! | 2 | **OpenAlex** | 250M+ works | Structured OA location data |
15//! | 3 | **CORE** | 300M+ OA works | Institutional repositories |
16//! | 4 | **Google Scholar** | Widest coverage | HTML scraping (rate-limit risk) |
17//! | 5 | **Unpaywall** | 30M+ OA articles | DOI lookup (requires email) |
18//! | 6 | **Crossref** | Publisher links | DOI metadata |
19//! | 7 | **Zenodo** | Cross-disciplinary | CERN preprint repository |
20//! | 8 | **SSRN** | Finance/economics | Preprint abstracts |
21//! | 9 | **Semantic Scholar** | CS/bio/med | OA PDFs + disclaimer parsing |
22//!
23//! # Quick start
24//!
25//! ```no_run
//! // Resolve a PDF by DOI (sync — runs on a shared internal tokio runtime):
27//! let result = paper_resolver::resolve_pdf(
28//!     Some("10.48550/arXiv.1706.03762"), // DOI
29//!     None,                              // URL
30//!     Some("Attention Is All You Need"), // title (fallback)
31//! );
32//!
33//! if let Some(pdf) = result {
34//!     println!("Found: {} (via {})", pdf.url, pdf.source);
35//!     println!("Downloadable: {}", pdf.downloadable);
36//! }
37//! ```
38//!
39//! # Detailed reporting
40//!
41//! Use [`resolve_pdf_with_report`] to see why each source succeeded or failed:
42//!
43//! ```no_run
44//! let report = paper_resolver::resolve_pdf_with_report(
45//!     Some("10.1109/TSE.2010.62"), None,
46//!     Some("mutation testing"),
47//!     &paper_resolver::ResolverConfig::default(),
48//! );
49//! println!("{}", report.summary());
50//! // PDF found via google_scholar (downloadable: true)
51//! //   https://mutationtesting.uni.lu/TR-09-06.pdf
52//! //
53//! // Sources queried:
54//! //   openalex: no OA location found
55//! //   google_scholar: found https://...
56//! //   unpaywall: skipped — configure resolver.email
57//! //   ...
58//! ```
59//!
60//! # Custom configuration
61//!
62//! ```ignore
63//! use paper_resolver::{ResolverConfig, SourceEntry};
64//!
65//! let mut config = ResolverConfig::default();
66//! config.email = "researcher@university.edu".into();
67//! config.timeout_secs = 10;
68//! config.sources = vec![
69//!     SourceEntry::new("arxiv", true),
70//!     SourceEntry::new("openalex", true),
71//!     SourceEntry::new("unpaywall", true),
72//!     // disable the rest
73//! ];
74//!
75//! let result = paper_resolver::resolve_pdf_with_config(
76//!     Some("10.1234/example"), None, None, &config,
77//! );
78//! ```
79//!
80//! # Design decisions
81//!
82//! - **Concurrent by default**: all enabled sources are queried simultaneously
83//!   via `futures::future::join_all`. First-to-return wins by priority.
84//! - **Blocked domains**: publisher paywalls (IEEE, Springer, Elsevier, etc.)
85//!   are detected and marked `downloadable: false` rather than silently failing.
86//! - **No file I/O**: this crate has zero filesystem dependency. Configuration
87//!   is passed as a struct — the caller owns config file parsing.
88//! - **Standalone**: no Zotero dependency. Usable in any Rust project that
89//!   needs academic PDF resolution.
90
use regex::Regex;
use std::sync::LazyLock;

/// Matches arXiv `abs`/`pdf` URLs and captures the new-style `YYMM.NNNNN`
/// identifier (optionally versioned, e.g. `1706.03762v5`). Old-style ids
/// such as `hep-th/9901001` are NOT matched by this pattern.
static ARXIV_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"arxiv\.org/(?:abs|pdf)/(\d{4}\.\d{4,5}(?:v\d+)?)").unwrap());
/// Captures absolute `href="…"` targets ending in `.pdf` from scraped HTML.
static PDF_HREF_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"href="(https?://[^"]+\.pdf)""#).unwrap());
/// Captures SSRN abstract-page links (`papers.cfm?abstract_id=NNN`).
static SSRN_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"href="(https?://papers\.ssrn\.com/sol3/papers\.cfm\?abstract_id=\d+)""#).unwrap()
});
/// Grabs a bare URL, terminating at whitespace, a comma, or a closing paren.
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://[^\s,)]+").unwrap());
102
103/// Shared tokio runtime for PDF resolution. Created once, reused across calls.
104///
105/// Uses `new_multi_thread` (not `new_current_thread`) because in SSE mode,
106/// multiple concurrent requests may call `resolve_pdf()` from different
107/// `spawn_blocking` threads. A current-thread runtime would deadlock or panic
108/// when a second `block_on()` is called on the same runtime.
109static PDF_RUNTIME: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();
110
111fn pdf_runtime() -> &'static tokio::runtime::Runtime {
112    PDF_RUNTIME.get_or_init(|| {
113        tokio::runtime::Builder::new_multi_thread()
114            .worker_threads(2)
115            .enable_all()
116            .build()
117            .expect("failed to build PDF tokio runtime")
118    })
119}
120
/// Result of PDF URL resolution.
///
/// Bundles the resolved URL with the source that produced it and a flag
/// telling whether the link is expected to be directly fetchable (as opposed
/// to a paywall landing page).
///
/// # Fields
///
/// - `url` — the resolved PDF URL (may or may not be directly downloadable)
/// - `source` — which source found it (e.g., `"arxiv"`, `"openalex"`, `"google_scholar"`)
/// - `downloadable` — `false` if the URL points to a known paywall domain
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ResolvedPdf {
    /// The resolved PDF URL.
    pub url: String,
    /// Which source found this URL (e.g., `"arxiv"`, `"google_scholar"`).
    pub source: String,
    /// Whether the URL is directly downloadable (`false` for paywall domains).
    pub downloadable: bool,
}

/// Detailed resolution result — includes per-source failure reasons.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ResolveReport {
    /// The best PDF found (if any).
    pub pdf: Option<ResolvedPdf>,
    /// Per-source outcomes: `(source_name, Ok(url) | Err(reason))`.
    pub outcomes: Vec<(String, Result<String, String>)>,
}

impl ResolveReport {
    /// Format a human-readable summary: the winning PDF (or a "not found"
    /// header), followed by one line per queried source.
    pub fn summary(&self) -> String {
        use std::fmt::Write as _;

        let mut text = String::new();
        match &self.pdf {
            Some(pdf) => {
                // Writing to a String is infallible; discard the fmt::Result.
                let _ = write!(
                    text,
                    "PDF found via {} (downloadable: {})\n  {}\n\n",
                    pdf.source, pdf.downloadable, pdf.url
                );
            }
            None => text.push_str("No downloadable PDF found.\n\n"),
        }
        text.push_str("Sources queried:\n");
        for (name, outcome) in &self.outcomes {
            match outcome {
                Ok(url) => {
                    let _ = writeln!(text, "  {name}: found {url}");
                }
                Err(reason) => {
                    let _ = writeln!(text, "  {name}: {reason}");
                }
            }
        }
        text
    }
}
174
/// A source entry — name + enabled flag.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct SourceEntry {
    pub name: String,
    pub enabled: bool,
}

impl SourceEntry {
    /// Build an entry from anything convertible to a `String`.
    pub fn new(name: impl Into<String>, enabled: bool) -> Self {
        let name = name.into();
        Self { name, enabled }
    }
}
191
/// Base URLs for each source — overridable for testing.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Endpoints {
    /// OpenAlex REST API root.
    pub openalex: String,
    /// CORE v3 API root.
    pub core: String,
    /// Google Scholar web root (scraped HTML, not an API).
    pub google_scholar: String,
    /// Unpaywall v2 API root.
    pub unpaywall: String,
    /// Crossref REST API root.
    pub crossref: String,
    /// Zenodo API root.
    pub zenodo: String,
    /// SSRN web root.
    pub ssrn: String,
    /// Semantic Scholar Graph v1 API root.
    pub semantic_scholar: String,
}

impl Default for Endpoints {
    /// Production endpoints; tests point these at mock servers instead.
    fn default() -> Self {
        let owned = |s: &str| s.to_string();
        Self {
            openalex: owned("https://api.openalex.org"),
            core: owned("https://api.core.ac.uk/v3"),
            google_scholar: owned("https://scholar.google.com"),
            unpaywall: owned("https://api.unpaywall.org/v2"),
            crossref: owned("https://api.crossref.org"),
            zenodo: owned("https://zenodo.org/api"),
            ssrn: owned("https://papers.ssrn.com"),
            semantic_scholar: owned("https://api.semanticscholar.org/graph/v1"),
        }
    }
}
220
/// Configuration for the paper resolver.
///
/// Controls which sources are queried, their priority (order in the vec),
/// timeouts, and API identification. Callers construct this from their
/// own config files (TOML, env vars, etc.) — paper-resolver has no
/// file I/O dependency.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ResolverConfig {
    /// Email for Unpaywall/Crossref polite pool (required by their ToS).
    /// If this is left at a placeholder on `example.com`/`example.org`,
    /// the reporting path skips Unpaywall with an explanatory reason.
    pub email: String,
    /// User-Agent string for HTTP requests.
    pub user_agent: String,
    /// HTTP request timeout in seconds.
    pub timeout_secs: u64,
    /// Ordered list of sources. Position = priority (first = highest).
    /// Disabled sources are skipped; names not in [`SOURCE_NAMES`] are
    /// silently ignored by the resolver dispatch.
    pub sources: Vec<SourceEntry>,
    /// Extra domains to treat as non-downloadable (appended to defaults).
    pub extra_blocked_domains: Vec<String>,
    /// Base URLs for each source — override for testing with mock servers.
    pub endpoints: Endpoints,
}
244
/// All available source names.
///
/// The order here is the default priority order: [`ResolverConfig::default`]
/// enables every source in this sequence, and position in the list
/// determines priority (first = highest).
pub const SOURCE_NAMES: &[&str] = &[
    "arxiv",
    "openalex",
    "core",
    "google_scholar",
    "unpaywall",
    "crossref",
    "zenodo",
    "ssrn",
    "semantic_scholar",
];
257
258impl Default for ResolverConfig {
259    fn default() -> Self {
260        Self {
261            email: "biblion@example.com".into(),
262            user_agent: "biblion/0.1".into(),
263            timeout_secs: 20,
264            sources: SOURCE_NAMES
265                .iter()
266                .map(|&name| SourceEntry {
267                    name: name.into(),
268                    enabled: true,
269                })
270                .collect(),
271            extra_blocked_domains: vec![],
272            endpoints: Endpoints::default(),
273        }
274    }
275}
276
277impl ResolverConfig {
278    /// Check if a source is enabled by name.
279    pub fn is_enabled(&self, name: &str) -> bool {
280        self.sources.iter().any(|s| s.name == name && s.enabled)
281    }
282
283    /// Get the priority (position index) for a source.
284    pub fn priority(&self, name: &str) -> u8 {
285        self.sources
286            .iter()
287            .position(|s| s.name == name)
288            .map(|p| (p + 1) as u8)
289            .unwrap_or(99)
290    }
291}
292
/// Default domains known to block programmatic downloads.
const BLOCKED_DOMAINS: &[&str] = &[
    "academic.oup.com",
    "wiley.com",
    "www.sciencedirect.com",
    "link.springer.com",
    "www.nature.com",
    "www.tandfonline.com",
    "ieeexplore.ieee.org",
    "journals.sagepub.com",
    "silverchair.com",
];

/// `true` when `url` contains none of the default paywall domains.
fn is_downloadable(url: &str) -> bool {
    BLOCKED_DOMAINS.iter().all(|domain| !url.contains(domain))
}
309
310/// Config-aware version that also checks extra_blocked_domains.
311fn is_downloadable_cfg(url: &str, config: &ResolverConfig) -> bool {
312    if !is_downloadable(url) {
313        return false;
314    }
315    !config
316        .extra_blocked_domains
317        .iter()
318        .any(|d| url.contains(d.as_str()))
319}
320
321/// Resolve a PDF URL using all available sources (default config).
322///
323/// Convenience wrapper that uses [`ResolverConfig::default()`].
324/// For custom configuration, use [`resolve_pdf_with_config`].
325pub fn resolve_pdf(
326    doi: Option<&str>,
327    url: Option<&str>,
328    title: Option<&str>,
329) -> Option<ResolvedPdf> {
330    resolve_pdf_with_config(doi, url, title, &ResolverConfig::default())
331}
332
333/// Resolve a PDF URL with custom configuration.
334///
335/// Sync version — creates a tokio runtime internally.
336/// For async callers, use [`resolve_pdf_async`].
337pub fn resolve_pdf_with_config(
338    doi: Option<&str>,
339    url: Option<&str>,
340    title: Option<&str>,
341    config: &ResolverConfig,
342) -> Option<ResolvedPdf> {
343    // 1. arXiv — instant, no network
344    if config.is_enabled("arxiv") {
345        if let Some(doi) = doi
346            && let Some(id) = doi_to_arxiv_id(doi)
347        {
348            return Some(ResolvedPdf {
349                url: format!("https://arxiv.org/pdf/{id}.pdf"),
350                source: "arxiv".into(),
351                downloadable: true,
352            });
353        }
354        if let Some(url) = url
355            && let Some(id) = url_to_arxiv_id(url)
356        {
357            return Some(ResolvedPdf {
358                url: format!("https://arxiv.org/pdf/{id}.pdf"),
359                source: "arxiv".into(),
360                downloadable: true,
361            });
362        }
363    }
364
365    // 2-9. Concurrent HTTP queries via tokio (shared runtime)
366    pdf_runtime().block_on(resolve_pdf_async(doi, url, title, config))
367}
368
369/// Resolve with detailed per-source reporting.
370///
371/// Returns a [`ResolveReport`] with both the best PDF (if any) and
372/// per-source outcomes explaining why each source succeeded or failed.
373pub fn resolve_pdf_with_report(
374    doi: Option<&str>,
375    url: Option<&str>,
376    title: Option<&str>,
377    config: &ResolverConfig,
378) -> ResolveReport {
379    // 1. arXiv — instant, no network
380    if config.is_enabled("arxiv") {
381        if let Some(doi) = doi
382            && let Some(id) = doi_to_arxiv_id(doi)
383        {
384            let pdf = ResolvedPdf {
385                url: format!("https://arxiv.org/pdf/{id}.pdf"),
386                source: "arxiv".into(),
387                downloadable: true,
388            };
389            return ResolveReport {
390                outcomes: vec![("arxiv".into(), Ok(pdf.url.clone()))],
391                pdf: Some(pdf),
392            };
393        }
394        if let Some(url) = url
395            && let Some(id) = url_to_arxiv_id(url)
396        {
397            let pdf = ResolvedPdf {
398                url: format!("https://arxiv.org/pdf/{id}.pdf"),
399                source: "arxiv".into(),
400                downloadable: true,
401            };
402            return ResolveReport {
403                outcomes: vec![("arxiv".into(), Ok(pdf.url.clone()))],
404                pdf: Some(pdf),
405            };
406        }
407    }
408
409    pdf_runtime().block_on(resolve_pdf_async_with_report(doi, url, title, config))
410}
411
/// Async version with detailed per-source reporting.
///
/// Mirrors [`resolve_pdf_async`] but records a per-source outcome
/// (`Ok(url)` or `Err(reason)`) for every source that was considered.
///
/// NOTE(review): this async entry point short-circuits arXiv only from the
/// URL, not from a DOI — the sync wrapper `resolve_pdf_with_report` performs
/// the DOI check before delegating here, so direct async callers holding an
/// arXiv DOI fall through to the network sources. Confirm whether that
/// asymmetry is intended.
pub async fn resolve_pdf_async_with_report(
    doi: Option<&str>,
    url: Option<&str>,
    title: Option<&str>,
    config: &ResolverConfig,
) -> ResolveReport {
    // arXiv from URL (same as sync path) — instant, no network round-trip.
    if config.is_enabled("arxiv")
        && let Some(url) = url
        && let Some(id) = url_to_arxiv_id(url)
    {
        let pdf = ResolvedPdf {
            url: format!("https://arxiv.org/pdf/{id}.pdf"),
            source: "arxiv".into(),
            downloadable: true,
        };
        return ResolveReport {
            outcomes: vec![("arxiv".into(), Ok(pdf.url.clone()))],
            pdf: Some(pdf),
        };
    }

    // One shared HTTP client for all sources; a builder failure is surfaced
    // as a pseudo-source outcome named "client" rather than a panic.
    let client = match reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(config.timeout_secs))
        .redirect(reqwest::redirect::Policy::limited(10))
        .build()
    {
        Ok(c) => c,
        Err(_) => {
            return ResolveReport {
                pdf: None,
                outcomes: vec![("client".into(), Err("failed to build HTTP client".into()))],
            };
        }
    };

    // Boxed futures so differently-shaped per-source futures fit in one Vec.
    // Each resolves to (source_name, priority, Ok(pdf) | Err(reason)).
    type ReportFuture<'a> = std::pin::Pin<
        Box<
            dyn std::future::Future<Output = (String, u8, Result<ResolvedPdf, String>)> + Send + 'a,
        >,
    >;
    let mut futures: Vec<ReportFuture<'_>> = Vec::new();

    // One future per enabled source; priority comes from position in
    // config.sources. Unknown names fall through to `_ => {}` and are skipped.
    let ep = &config.endpoints;
    for source in &config.sources {
        if !source.enabled {
            continue;
        }
        let pri = config.priority(&source.name);
        let c = &client;
        let name = source.name.clone();
        match source.name.as_str() {
            "arxiv" => {} // Already handled synchronously above
            "openalex" => futures.push(Box::pin(async move {
                let r = try_openalex(c, doi, title, ep).await;
                (name, pri, r.ok_or_else(|| "no OA location found".into()))
            })),
            "core" => futures.push(Box::pin(async move {
                let r = try_core(c, doi, title, ep).await;
                (name, pri, r.ok_or_else(|| "no result".into()))
            })),
            "google_scholar" => futures.push(Box::pin(async move {
                let r = try_google_scholar_report(c, title, ep).await;
                (name, pri, r)
            })),
            // Unpaywall requires a real contact email; when the placeholder
            // address is still configured, report an actionable skip reason
            // instead of sending a request that violates their ToS.
            "unpaywall" => {
                let email = config.email.clone();
                if email.contains("example.com") || email.contains("example.org") {
                    futures.push(Box::pin(async move {
                        (
                            name,
                            pri,
                            Err("skipped — configure resolver.email in config.toml".into()),
                        )
                    }));
                } else {
                    futures.push(Box::pin(async move {
                        let r = try_unpaywall(c, doi, &email, ep).await;
                        (name, pri, r.ok_or_else(|| "no OA PDF for this DOI".into()))
                    }));
                }
            }
            "crossref" => {
                let email = config.email.clone();
                let ua = config.user_agent.clone();
                futures.push(Box::pin(async move {
                    let r = try_crossref(c, doi, &email, &ua, ep).await;
                    (name, pri, r.ok_or_else(|| "no PDF link in metadata".into()))
                }));
            }
            "zenodo" => futures.push(Box::pin(async move {
                let r = try_zenodo(c, title, ep).await;
                (name, pri, r.ok_or_else(|| "no result".into()))
            })),
            "ssrn" => futures.push(Box::pin(async move {
                let r = try_ssrn(c, title, ep).await;
                (name, pri, r.ok_or_else(|| "no result".into()))
            })),
            "semantic_scholar" => futures.push(Box::pin(async move {
                let r = try_semantic_scholar(c, doi, title, ep).await;
                (name, pri, r.ok_or_else(|| "no OA PDF found".into()))
            })),
            _ => {}
        }
    }

    // Run every source concurrently and wait for all outcomes.
    let results = futures::future::join_all(futures).await;

    let mut outcomes: Vec<(String, Result<String, String>)> = Vec::new();
    let mut candidates: Vec<(u8, ResolvedPdf)> = Vec::new();

    // Demote hits on blocked domains (including config.extra_blocked_domains)
    // and record a per-source outcome either way. Demoted results still enter
    // the candidate list so they can be returned as a last resort.
    for (name, pri, result) in results {
        match result {
            Ok(mut pdf) => {
                if pdf.downloadable {
                    pdf.downloadable = is_downloadable_cfg(&pdf.url, config);
                }
                let url = pdf.url.clone();
                if pdf.downloadable {
                    outcomes.push((name, Ok(url)));
                } else {
                    outcomes.push((name, Err(format!("found {} but blocked domain", url))));
                }
                candidates.push((pri, pdf));
            }
            Err(reason) => {
                outcomes.push((name, Err(reason)));
            }
        }
    }

    // Downloadable results first, then by priority (lowest number wins);
    // a blocked-domain result is still returned if nothing better exists.
    candidates.sort_by_key(|(pri, r)| (!r.downloadable, *pri));
    let pdf = candidates.into_iter().next().map(|(_, r)| r);

    ResolveReport { pdf, outcomes }
}
549
550/// Google Scholar with error reporting (instead of Option).
551async fn try_google_scholar_report(
552    client: &reqwest::Client,
553    title: Option<&str>,
554    endpoints: &Endpoints,
555) -> Result<ResolvedPdf, String> {
556    let title = title.ok_or("no title provided")?;
557    let resp = client
558        .get(format!("{}/scholar", endpoints.google_scholar))
559        .query(&[("q", &format!("\"{title}\"")), ("num", &"5".to_string())])
560        .header(
561            "User-Agent",
562            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
563        )
564        .header("Accept", "text/html,application/xhtml+xml")
565        .header("Accept-Language", "en-US,en;q=0.9")
566        .send()
567        .await
568        .map_err(|e| format!("request failed: {e}"))?;
569
570    let status = resp.status();
571    if status == reqwest::StatusCode::TOO_MANY_REQUESTS {
572        return Err("rate-limited (429) — try again later".into());
573    }
574    if !status.is_success() {
575        return Err(format!("HTTP {status}"));
576    }
577
578    let html = resp.text().await.map_err(|e| format!("body error: {e}"))?;
579
580    if html.contains("unusual traffic") || html.contains("captcha") {
581        return Err("blocked by captcha".into());
582    }
583
584    let academic_hosts = [
585        ".edu",
586        ".ac.uk",
587        "research.google",
588        "hal.science",
589        "eprint.iacr.org",
590    ];
591
592    // Prefer academic hosts
593    for cap in PDF_HREF_RE.captures_iter(&html) {
594        let url = &cap[1];
595        if academic_hosts.iter().any(|h| url.contains(h)) {
596            return Ok(ResolvedPdf {
597                url: url.into(),
598                source: "google_scholar".into(),
599                downloadable: true,
600            });
601        }
602    }
603    // Fallback: any downloadable PDF
604    for cap in PDF_HREF_RE.captures_iter(&html) {
605        let url = &cap[1];
606        if is_downloadable(url) {
607            return Ok(ResolvedPdf {
608                url: url.into(),
609                source: "google_scholar".into(),
610                downloadable: true,
611            });
612        }
613    }
614    Err("no PDF links found in results".into())
615}
616
617/// Async version with configuration — caller owns the tokio runtime.
618///
619/// All enabled sources are queried concurrently. Disabled sources are skipped.
620/// Source priority is determined by position in `config.sources` (first = highest).
621pub async fn resolve_pdf_async(
622    doi: Option<&str>,
623    url: Option<&str>,
624    title: Option<&str>,
625    config: &ResolverConfig,
626) -> Option<ResolvedPdf> {
627    // arXiv from URL (same as sync path)
628    if config.is_enabled("arxiv")
629        && let Some(url) = url
630        && let Some(id) = url_to_arxiv_id(url)
631    {
632        return Some(ResolvedPdf {
633            url: format!("https://arxiv.org/pdf/{id}.pdf"),
634            source: "arxiv".into(),
635            downloadable: true,
636        });
637    }
638
639    let client = reqwest::Client::builder()
640        .timeout(std::time::Duration::from_secs(config.timeout_secs))
641        .redirect(reqwest::redirect::Policy::limited(10))
642        .build()
643        .ok()?;
644
645    // Fire enabled sources concurrently via join_all pattern
646    type PdfFuture<'a> =
647        std::pin::Pin<Box<dyn std::future::Future<Output = Option<(u8, ResolvedPdf)>> + Send + 'a>>;
648    let mut futures: Vec<PdfFuture<'_>> = Vec::new();
649
650    let ep = &config.endpoints;
651    for source in &config.sources {
652        if !source.enabled {
653            continue;
654        }
655        let pri = config.priority(&source.name);
656        let c = &client;
657        match source.name.as_str() {
658            "arxiv" => {} // Already handled synchronously above
659            "openalex" => futures.push(Box::pin(async move {
660                try_openalex(c, doi, title, ep).await.map(|r| (pri, r))
661            })),
662            "core" => futures.push(Box::pin(async move {
663                try_core(c, doi, title, ep).await.map(|r| (pri, r))
664            })),
665            "google_scholar" => futures.push(Box::pin(async move {
666                try_google_scholar(c, title, ep).await.map(|r| (pri, r))
667            })),
668            "unpaywall" => {
669                let email = config.email.clone();
670                futures.push(Box::pin(async move {
671                    try_unpaywall(c, doi, &email, ep).await.map(|r| (pri, r))
672                }))
673            }
674            "crossref" => {
675                let email = config.email.clone();
676                let ua = config.user_agent.clone();
677                futures.push(Box::pin(async move {
678                    try_crossref(c, doi, &email, &ua, ep)
679                        .await
680                        .map(|r| (pri, r))
681                }))
682            }
683            "zenodo" => futures.push(Box::pin(async move {
684                try_zenodo(c, title, ep).await.map(|r| (pri, r))
685            })),
686            "ssrn" => futures.push(Box::pin(async move {
687                try_ssrn(c, title, ep).await.map(|r| (pri, r))
688            })),
689            "semantic_scholar" => futures.push(Box::pin(async move {
690                try_semantic_scholar(c, doi, title, ep)
691                    .await
692                    .map(|r| (pri, r))
693            })),
694            _ => {} // Unknown source name, skip
695        }
696    }
697
698    let results = futures::future::join_all(futures).await;
699
700    // Collect successful results and apply config-aware downloadability check
701    let mut candidates: Vec<(u8, ResolvedPdf)> = results
702        .into_iter()
703        .flatten()
704        .map(|(pri, mut r)| {
705            // Re-check downloadability with extra_blocked_domains from config
706            if r.downloadable {
707                r.downloadable = is_downloadable_cfg(&r.url, config);
708            }
709            (pri, r)
710        })
711        .collect();
712
713    // Prefer downloadable, then highest priority (lowest number)
714    candidates.sort_by_key(|(pri, r)| (!r.downloadable, *pri));
715    candidates.into_iter().next().map(|(_, r)| r)
716}
717
718// ---------------------------------------------------------------------------
719// arXiv helpers
720// ---------------------------------------------------------------------------
721
/// Extract an arXiv id from an arXiv DataCite DOI (`10.48550/arXiv.<id>`).
///
/// DOI names are case-insensitive by spec, and the `arXiv.` segment appears
/// with varying capitalization in the wild (`arXiv.`, `arxiv.`, `ARXIV.`),
/// so the prefix is matched ASCII-case-insensitively. An exact-case prefix
/// (the only form the previous implementation accepted) still matches.
fn doi_to_arxiv_id(doi: &str) -> Option<String> {
    const PREFIX: &str = "10.48550/arXiv.";
    // `get` returns None for too-short input or a non-char-boundary split,
    // so this never panics on arbitrary strings.
    let head = doi.get(..PREFIX.len())?;
    if head.eq_ignore_ascii_case(PREFIX) {
        Some(doi[PREFIX.len()..].to_string())
    } else {
        None
    }
}
725
726fn url_to_arxiv_id(url: &str) -> Option<String> {
727    ARXIV_RE.captures(url).map(|c| c[1].to_string())
728}
729
730// ---------------------------------------------------------------------------
731// OpenAlex
732// ---------------------------------------------------------------------------
733
734async fn try_openalex(
735    client: &reqwest::Client,
736    doi: Option<&str>,
737    title: Option<&str>,
738    endpoints: &Endpoints,
739) -> Option<ResolvedPdf> {
740    let resp = if let Some(doi) = doi {
741        client
742            .get(format!("{}/works/doi:{doi}", endpoints.openalex))
743            .query(&[("select", "open_access,locations,best_oa_location")])
744            .send()
745            .await
746            .ok()?
747    } else {
748        let title = title?;
749        client
750            .get(format!("{}/works", endpoints.openalex))
751            .query(&[
752                ("search", title),
753                ("per_page", "1"),
754                ("select", "open_access,locations,best_oa_location"),
755            ])
756            .send()
757            .await
758            .ok()?
759    };
760
761    if !resp.status().is_success() {
762        return None;
763    }
764    let data: serde_json::Value = resp.json().await.ok()?;
765
766    let work = if let Some(results) = data.get("results").and_then(|v| v.as_array()) {
767        results.first()?
768    } else {
769        &data
770    };
771
772    // Try best_oa_location.pdf_url → open_access.oa_url → locations[].pdf_url
773    if let Some(url) = work
774        .pointer("/best_oa_location/pdf_url")
775        .and_then(|v| v.as_str())
776    {
777        return Some(ResolvedPdf {
778            url: url.into(),
779            source: "openalex".into(),
780            downloadable: is_downloadable(url),
781        });
782    }
783    if let Some(url) = work.pointer("/open_access/oa_url").and_then(|v| v.as_str())
784        && url.ends_with(".pdf")
785    {
786        return Some(ResolvedPdf {
787            url: url.into(),
788            source: "openalex".into(),
789            downloadable: is_downloadable(url),
790        });
791    }
792    for loc in work
793        .get("locations")
794        .and_then(|v| v.as_array())
795        .unwrap_or(&vec![])
796    {
797        if let Some(url) = loc.get("pdf_url").and_then(|v| v.as_str()) {
798            return Some(ResolvedPdf {
799                url: url.into(),
800                source: "openalex".into(),
801                downloadable: is_downloadable(url),
802            });
803        }
804    }
805    None
806}
807
808// ---------------------------------------------------------------------------
809// CORE
810// ---------------------------------------------------------------------------
811
812async fn try_core(
813    client: &reqwest::Client,
814    doi: Option<&str>,
815    title: Option<&str>,
816    endpoints: &Endpoints,
817) -> Option<ResolvedPdf> {
818    let query = if let Some(doi) = doi {
819        format!(r#"doi:"{doi}""#)
820    } else {
821        let title = title?;
822        format!(r#"title:"{title}""#)
823    };
824
825    let resp = client
826        .get(format!("{}/search/works", endpoints.core))
827        .query(&[("q", &query), ("limit", &"1".to_string())])
828        .send()
829        .await
830        .ok()?;
831
832    if !resp.status().is_success() {
833        return None;
834    }
835    let data: serde_json::Value = resp.json().await.ok()?;
836    let work = data.get("results")?.as_array()?.first()?;
837
838    if let Some(url) = work.get("downloadUrl").and_then(|v| v.as_str()) {
839        return Some(ResolvedPdf {
840            url: url.into(),
841            source: "core".into(),
842            downloadable: is_downloadable(url),
843        });
844    }
845    None
846}
847
848// ---------------------------------------------------------------------------
849// Google Scholar
850// ---------------------------------------------------------------------------
851
852async fn try_google_scholar(
853    client: &reqwest::Client,
854    title: Option<&str>,
855    endpoints: &Endpoints,
856) -> Option<ResolvedPdf> {
857    let title = title?;
858    let resp = client
859        .get(format!("{}/scholar", endpoints.google_scholar))
860        .query(&[("q", &format!("\"{title}\"")), ("num", &"5".to_string())])
861        .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
862        .header("Accept", "text/html")
863        .send().await.ok()?;
864
865    if !resp.status().is_success() {
866        return None;
867    }
868    let html = resp.text().await.ok()?;
869
870    let academic_hosts = [
871        ".edu",
872        ".ac.uk",
873        "research.google",
874        "hal.science",
875        "eprint.iacr.org",
876    ];
877
878    // Prefer academic hosts
879    for cap in PDF_HREF_RE.captures_iter(&html) {
880        let url = &cap[1];
881        if academic_hosts.iter().any(|h| url.contains(h)) {
882            return Some(ResolvedPdf {
883                url: url.into(),
884                source: "google_scholar".into(),
885                downloadable: true,
886            });
887        }
888    }
889    // Fallback: any downloadable PDF
890    for cap in PDF_HREF_RE.captures_iter(&html) {
891        let url = &cap[1];
892        if is_downloadable(url) {
893            return Some(ResolvedPdf {
894                url: url.into(),
895                source: "google_scholar".into(),
896                downloadable: true,
897            });
898        }
899    }
900    None
901}
902
903// ---------------------------------------------------------------------------
904// Unpaywall
905// ---------------------------------------------------------------------------
906
907async fn try_unpaywall(
908    client: &reqwest::Client,
909    doi: Option<&str>,
910    email: &str,
911    endpoints: &Endpoints,
912) -> Option<ResolvedPdf> {
913    let doi = doi?;
914    let resp = client
915        .get(format!("{}/{doi}", endpoints.unpaywall))
916        .query(&[("email", email)])
917        .send()
918        .await
919        .ok()?;
920
921    if !resp.status().is_success() {
922        return None;
923    }
924    let data: serde_json::Value = resp.json().await.ok()?;
925
926    if let Some(url) = data
927        .pointer("/best_oa_location/url_for_pdf")
928        .and_then(|v| v.as_str())
929    {
930        return Some(ResolvedPdf {
931            url: url.into(),
932            source: "unpaywall".into(),
933            downloadable: is_downloadable(url),
934        });
935    }
936    for loc in data
937        .get("oa_locations")
938        .and_then(|v| v.as_array())
939        .unwrap_or(&vec![])
940    {
941        if let Some(url) = loc.get("url_for_pdf").and_then(|v| v.as_str()) {
942            return Some(ResolvedPdf {
943                url: url.into(),
944                source: "unpaywall".into(),
945                downloadable: is_downloadable(url),
946            });
947        }
948    }
949    None
950}
951
952// ---------------------------------------------------------------------------
953// Crossref
954// ---------------------------------------------------------------------------
955
956async fn try_crossref(
957    client: &reqwest::Client,
958    doi: Option<&str>,
959    email: &str,
960    user_agent: &str,
961    endpoints: &Endpoints,
962) -> Option<ResolvedPdf> {
963    let doi = doi?;
964    let resp = client
965        .get(format!("{}/works/{doi}", endpoints.crossref))
966        .header("User-Agent", format!("{user_agent} (mailto:{email})"))
967        .send()
968        .await
969        .ok()?;
970
971    if !resp.status().is_success() {
972        return None;
973    }
974    let data: serde_json::Value = resp.json().await.ok()?;
975    let msg = data.get("message")?;
976
977    // Check resource.primary.URL
978    if let Some(url) = msg
979        .pointer("/resource/primary/URL")
980        .and_then(|v| v.as_str())
981        && url.to_lowercase().ends_with(".pdf")
982    {
983        return Some(ResolvedPdf {
984            url: url.into(),
985            source: "crossref".into(),
986            downloadable: is_downloadable(url),
987        });
988    }
989    // Check link[] array
990    for link in msg
991        .get("link")
992        .and_then(|v| v.as_array())
993        .unwrap_or(&vec![])
994    {
995        let ct = link
996            .get("content-type")
997            .and_then(|v| v.as_str())
998            .unwrap_or("");
999        if ct.contains("pdf")
1000            && let Some(url) = link.get("URL").and_then(|v| v.as_str())
1001        {
1002            return Some(ResolvedPdf {
1003                url: url.into(),
1004                source: "crossref".into(),
1005                downloadable: is_downloadable(url),
1006            });
1007        }
1008    }
1009    None
1010}
1011
1012// ---------------------------------------------------------------------------
1013// Zenodo
1014// ---------------------------------------------------------------------------
1015
1016async fn try_zenodo(
1017    client: &reqwest::Client,
1018    title: Option<&str>,
1019    endpoints: &Endpoints,
1020) -> Option<ResolvedPdf> {
1021    let title = title?;
1022    let resp = client
1023        .get(format!("{}/records", endpoints.zenodo))
1024        .query(&[("q", title), ("size", "3"), ("type", "publication")])
1025        .send()
1026        .await
1027        .ok()?;
1028
1029    if !resp.status().is_success() {
1030        return None;
1031    }
1032    let data: serde_json::Value = resp.json().await.ok()?;
1033
1034    for hit in data
1035        .pointer("/hits/hits")
1036        .and_then(|v| v.as_array())
1037        .unwrap_or(&vec![])
1038    {
1039        for file in hit
1040            .get("files")
1041            .and_then(|v| v.as_array())
1042            .unwrap_or(&vec![])
1043        {
1044            if file
1045                .get("key")
1046                .and_then(|v| v.as_str())
1047                .unwrap_or("")
1048                .to_lowercase()
1049                .ends_with(".pdf")
1050                && let Some(url) = file.pointer("/links/self").and_then(|v| v.as_str())
1051            {
1052                return Some(ResolvedPdf {
1053                    url: url.into(),
1054                    source: "zenodo".into(),
1055                    downloadable: true,
1056                });
1057            }
1058        }
1059    }
1060    None
1061}
1062
1063// ---------------------------------------------------------------------------
1064// SSRN
1065// ---------------------------------------------------------------------------
1066
1067async fn try_ssrn(
1068    client: &reqwest::Client,
1069    title: Option<&str>,
1070    endpoints: &Endpoints,
1071) -> Option<ResolvedPdf> {
1072    let title = title?;
1073    let resp = client
1074        .get(format!("{}/sol3/results.cfm", endpoints.ssrn))
1075        .query(&[("txtKey_Words", title), ("npage", "1")])
1076        .header("User-Agent", "Mozilla/5.0")
1077        .header("Accept", "text/html")
1078        .send()
1079        .await
1080        .ok()?;
1081
1082    if !resp.status().is_success() {
1083        return None;
1084    }
1085    let html = resp.text().await.ok()?;
1086
1087    if let Some(cap) = SSRN_RE.captures(&html) {
1088        return Some(ResolvedPdf {
1089            url: cap[1].to_string(),
1090            source: "ssrn".into(),
1091            downloadable: false,
1092        });
1093    }
1094    None
1095}
1096
1097// ---------------------------------------------------------------------------
1098// Semantic Scholar
1099// ---------------------------------------------------------------------------
1100
1101async fn try_semantic_scholar(
1102    client: &reqwest::Client,
1103    doi: Option<&str>,
1104    title: Option<&str>,
1105    endpoints: &Endpoints,
1106) -> Option<ResolvedPdf> {
1107    let resp = if let Some(doi) = doi {
1108        client
1109            .get(format!("{}/paper/DOI:{doi}", endpoints.semantic_scholar))
1110            .query(&[("fields", "openAccessPdf")])
1111            .send()
1112            .await
1113            .ok()?
1114    } else {
1115        let title = title?;
1116        client
1117            .get(format!("{}/paper/search", endpoints.semantic_scholar))
1118            .query(&[
1119                ("query", title),
1120                ("limit", "1"),
1121                ("fields", "openAccessPdf"),
1122            ])
1123            .send()
1124            .await
1125            .ok()?
1126    };
1127
1128    if !resp.status().is_success() {
1129        return None;
1130    }
1131    let data: serde_json::Value = resp.json().await.ok()?;
1132
1133    let work = if let Some(items) = data.get("data").and_then(|v| v.as_array()) {
1134        items.first()?
1135    } else {
1136        &data
1137    };
1138
1139    let oa = work.get("openAccessPdf")?;
1140    if let Some(url) = oa.get("url").and_then(|v| v.as_str()) {
1141        return Some(ResolvedPdf {
1142            url: url.into(),
1143            source: "semantic_scholar".into(),
1144            downloadable: is_downloadable(url),
1145        });
1146    }
1147    // Disclaimer fallback
1148    if let Some(disclaimer) = oa.get("disclaimer").and_then(|v| v.as_str()) {
1149        for m in URL_RE.find_iter(disclaimer) {
1150            let url = m.as_str();
1151            if url.contains("arxiv.org/abs/") {
1152                let pdf_url = url.replace("/abs/", "/pdf/");
1153                return Some(ResolvedPdf {
1154                    url: format!("{pdf_url}.pdf"),
1155                    source: "semantic_scholar".into(),
1156                    downloadable: true,
1157                });
1158            }
1159            if !url.contains("arxiv.org") || url.contains("/pdf/") {
1160                return Some(ResolvedPdf {
1161                    url: url.into(),
1162                    source: "semantic_scholar".into(),
1163                    downloadable: is_downloadable(url),
1164                });
1165            }
1166        }
1167    }
1168    None
1169}
1170
#[cfg(test)]
mod tests {
    //! Offline unit tests: arXiv id extraction, the instant (no-network)
    //! arXiv resolution path, and the blocked-domain check.

    use super::*;

    // The arXiv DataCite DOI prefix 10.48550/arXiv.<id> should map to <id>.
    #[test]
    fn doi_to_arxiv_id_valid() {
        assert_eq!(
            doi_to_arxiv_id("10.48550/arXiv.2105.15183"),
            Some("2105.15183".into())
        );
    }

    // Non-arXiv DOIs must not be misidentified.
    #[test]
    fn doi_to_arxiv_id_invalid() {
        assert_eq!(doi_to_arxiv_id("10.1234/other"), None);
    }

    // Abstract-page URL → bare id.
    #[test]
    fn url_to_arxiv_id_abs() {
        assert_eq!(
            url_to_arxiv_id("https://arxiv.org/abs/2105.15183"),
            Some("2105.15183".into())
        );
    }

    // Version suffixes (v2, …) on /pdf/ URLs are preserved in the id.
    #[test]
    fn url_to_arxiv_id_pdf_versioned() {
        assert_eq!(
            url_to_arxiv_id("https://arxiv.org/pdf/2105.15183v2"),
            Some("2105.15183v2".into())
        );
    }

    #[test]
    fn url_to_arxiv_id_non_arxiv() {
        assert_eq!(url_to_arxiv_id("https://example.com/paper"), None);
    }

    // An arXiv DOI resolves without any network call (priority-1 source).
    #[test]
    fn resolve_arxiv_doi_instant() {
        let result = resolve_pdf(Some("10.48550/arXiv.2105.15183"), None, None);
        let r = result.unwrap();
        assert_eq!(r.source, "arxiv");
        assert_eq!(r.url, "https://arxiv.org/pdf/2105.15183.pdf");
        assert!(r.downloadable);
    }

    // An arXiv abstract URL also resolves instantly.
    #[test]
    fn resolve_arxiv_url_instant() {
        let result = resolve_pdf(None, Some("https://arxiv.org/abs/2301.01234"), None);
        let r = result.unwrap();
        assert_eq!(r.source, "arxiv");
        assert!(r.url.contains("2301.01234"));
    }

    // Paywalled publisher domains are flagged as not directly downloadable.
    #[test]
    fn is_downloadable_blocked() {
        assert!(!is_downloadable("https://ieeexplore.ieee.org/doc/123.pdf"));
        assert!(!is_downloadable(
            "https://www.sciencedirect.com/article.pdf"
        ));
    }

    // Open repositories and academic hosts pass the check.
    #[test]
    fn is_downloadable_ok() {
        assert!(is_downloadable("https://arxiv.org/pdf/2105.15183.pdf"));
        assert!(is_downloadable("https://example.edu/paper.pdf"));
    }
}
1240
#[cfg(test)]
mod config_tests {
    //! Tests for `ResolverConfig`: source enable/disable flags and
    //! position-based priority.

    use super::*;

    // Default config lists all 9 sources, all enabled.
    #[test]
    fn default_config_has_all_sources_enabled() {
        let config = ResolverConfig::default();
        assert_eq!(config.sources.len(), 9);
        for source in &config.sources {
            assert!(source.enabled, "Source {} should be enabled", source.name);
        }
    }

    #[test]
    fn is_enabled_true_for_enabled_source() {
        let config = ResolverConfig::default();
        assert!(config.is_enabled("arxiv"));
        assert!(config.is_enabled("openalex"));
        assert!(config.is_enabled("semantic_scholar"));
    }

    // Disabling one source must not affect the others.
    #[test]
    fn is_enabled_false_for_disabled_source() {
        let mut config = ResolverConfig::default();
        config.sources[1].enabled = false; // disable openalex
        assert!(!config.is_enabled("openalex"));
        assert!(config.is_enabled("arxiv")); // others still enabled
    }

    // Unknown names are treated as disabled, not as an error.
    #[test]
    fn is_enabled_false_for_unknown_source() {
        let config = ResolverConfig::default();
        assert!(!config.is_enabled("nonexistent"));
    }

    // Priority is 1-based list position.
    #[test]
    fn priority_reflects_position() {
        let config = ResolverConfig::default();
        assert_eq!(config.priority("arxiv"), 1);
        assert_eq!(config.priority("openalex"), 2);
        assert_eq!(config.priority("semantic_scholar"), 9);
    }

    // Unknown sources sort last via the 99 sentinel.
    #[test]
    fn priority_returns_99_for_unknown() {
        let config = ResolverConfig::default();
        assert_eq!(config.priority("nonexistent"), 99);
    }

    // Reordering the sources list reorders priorities accordingly.
    #[test]
    fn custom_source_order_changes_priority() {
        let config = ResolverConfig {
            sources: vec![
                SourceEntry {
                    name: "unpaywall".into(),
                    enabled: true,
                },
                SourceEntry {
                    name: "arxiv".into(),
                    enabled: true,
                },
            ],
            ..Default::default()
        };
        assert_eq!(config.priority("unpaywall"), 1);
        assert_eq!(config.priority("arxiv"), 2);
    }

    // Disabling arxiv must bypass even the instant (no-network) fast path.
    #[test]
    fn resolve_with_arxiv_disabled_skips_arxiv() {
        let mut config = ResolverConfig::default();
        // Disable arxiv
        config.sources[0].enabled = false;
        // This DOI would normally resolve instantly via arxiv
        let result =
            resolve_pdf_with_config(Some("10.48550/arXiv.2105.15183"), None, None, &config);
        // With arxiv disabled and no network, should return None
        // (or a result from another source if network available)
        match result {
            None => {} // Expected without network
            Some(r) => assert_ne!(r.source, "arxiv", "Should not use disabled arxiv"),
        }
    }

    // With defaults, the arXiv fast path still wins.
    #[test]
    fn resolve_with_config_uses_arxiv_when_enabled() {
        let config = ResolverConfig::default();
        let result =
            resolve_pdf_with_config(Some("10.48550/arXiv.2105.15183"), None, None, &config);
        let r = result.unwrap();
        assert_eq!(r.source, "arxiv");
        assert!(r.downloadable);
    }
}
1335
1336#[cfg(test)]
1337mod mock_tests {
1338    use super::*;
1339    use wiremock::matchers::{method, path, path_regex};
1340    use wiremock::{Mock, MockServer, ResponseTemplate};
1341
1342    /// Build a config that enables only the given source, pointing all endpoints
1343    /// at the mock server.
1344    fn single_source_config(source_name: &str, base_uri: &str) -> ResolverConfig {
1345        let endpoints = Endpoints {
1346            openalex: base_uri.into(),
1347            core: base_uri.into(),
1348            google_scholar: base_uri.into(),
1349            unpaywall: base_uri.into(),
1350            crossref: base_uri.into(),
1351            zenodo: base_uri.into(),
1352            ssrn: base_uri.into(),
1353            semantic_scholar: base_uri.into(),
1354        };
1355        ResolverConfig {
1356            sources: vec![SourceEntry::new(source_name, true)],
1357            endpoints,
1358            ..Default::default()
1359        }
1360    }
1361
1362    // -----------------------------------------------------------------------
1363    // OpenAlex
1364    // -----------------------------------------------------------------------
1365
    // OpenAlex DOI lookup: best_oa_location.pdf_url is returned verbatim.
    #[tokio::test]
    async fn openalex_doi_happy_path() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works/doi:.*"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "best_oa_location": {
                    "pdf_url": "https://example.edu/paper.pdf"
                }
            })))
            .mount(&server)
            .await;

        let config = single_source_config("openalex", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "openalex");
        assert_eq!(r.url, "https://example.edu/paper.pdf");
        assert!(r.downloadable);
    }
1386
    // Without a DOI, OpenAlex falls back to title search (/works) and takes
    // the first result's best_oa_location.
    #[tokio::test]
    async fn openalex_title_search_happy_path() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works$"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "results": [{
                    "best_oa_location": {
                        "pdf_url": "https://example.edu/search-result.pdf"
                    }
                }]
            })))
            .mount(&server)
            .await;

        let config = single_source_config("openalex", &server.uri());
        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "openalex");
        assert_eq!(r.url, "https://example.edu/search-result.pdf");
    }
1408
    // When best_oa_location is absent, open_access.oa_url is used instead.
    #[tokio::test]
    async fn openalex_oa_url_fallback() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works/doi:.*"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "open_access": {
                    "oa_url": "https://example.edu/open.pdf"
                }
            })))
            .mount(&server)
            .await;

        let config = single_source_config("openalex", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.url, "https://example.edu/open.pdf");
    }
1427
    // Last OpenAlex fallback: scan the locations[] array for a pdf_url.
    #[tokio::test]
    async fn openalex_locations_fallback() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works/doi:.*"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "locations": [
                    { "pdf_url": "https://example.edu/loc.pdf" }
                ]
            })))
            .mount(&server)
            .await;

        let config = single_source_config("openalex", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.url, "https://example.edu/loc.pdf");
    }
1446
    // A non-2xx status from OpenAlex yields None rather than an error.
    #[tokio::test]
    async fn openalex_404_returns_none() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works/doi:.*"))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let config = single_source_config("openalex", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        assert!(result.is_none());
    }
1460
    // A PDF URL on a blocked (paywalled) domain is still returned, but
    // flagged downloadable = false.
    #[tokio::test]
    async fn openalex_blocked_domain_not_downloadable() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works/doi:.*"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "best_oa_location": {
                    "pdf_url": "https://www.sciencedirect.com/paper.pdf"
                }
            })))
            .mount(&server)
            .await;

        let config = single_source_config("openalex", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert!(!r.downloadable);
    }
1479
1480    // -----------------------------------------------------------------------
1481    // CORE
1482    // -----------------------------------------------------------------------
1483
    // CORE DOI search: first result's downloadUrl is returned verbatim.
    #[tokio::test]
    async fn core_doi_happy_path() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/search/works"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "results": [{
                    "downloadUrl": "https://core.ac.uk/download/pdf/123.pdf"
                }]
            })))
            .mount(&server)
            .await;

        let config = single_source_config("core", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "core");
        assert_eq!(r.url, "https://core.ac.uk/download/pdf/123.pdf");
        assert!(r.downloadable);
    }
1504
    // Without a DOI, CORE is queried by title against the same endpoint.
    #[tokio::test]
    async fn core_title_search_happy_path() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/search/works"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "results": [{
                    "downloadUrl": "https://core.ac.uk/download/pdf/456.pdf"
                }]
            })))
            .mount(&server)
            .await;

        let config = single_source_config("core", &server.uri());
        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "core");
    }
1523
    // A non-2xx status from CORE yields None.
    #[tokio::test]
    async fn core_404_returns_none() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/search/works"))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let config = single_source_config("core", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        assert!(result.is_none());
    }
1537
    // A well-formed but empty results array yields None.
    #[tokio::test]
    async fn core_empty_results_returns_none() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/search/works"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "results": []
            })))
            .mount(&server)
            .await;

        let config = single_source_config("core", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        assert!(result.is_none());
    }
1553
1554    // -----------------------------------------------------------------------
1555    // Google Scholar
1556    // -----------------------------------------------------------------------
1557
    // Scholar scraping: a PDF href on an academic host (.edu) is picked by
    // the preferred first pass and marked downloadable.
    #[tokio::test]
    async fn google_scholar_happy_path_academic_host() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/scholar"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"<html><body>
                <a href="https://cs.stanford.edu/paper.pdf">[PDF]</a>
                </body></html>"#,
            ))
            .mount(&server)
            .await;

        let config = single_source_config("google_scholar", &server.uri());
        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "google_scholar");
        assert_eq!(r.url, "https://cs.stanford.edu/paper.pdf");
        assert!(r.downloadable);
    }
1578
    // A non-academic but non-blocked PDF link is accepted by the second pass.
    #[tokio::test]
    async fn google_scholar_fallback_non_academic_pdf() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/scholar"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"<html><body>
                <a href="https://example.com/paper.pdf">PDF</a>
                </body></html>"#,
            ))
            .mount(&server)
            .await;

        let config = single_source_config("google_scholar", &server.uri());
        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "google_scholar");
        assert_eq!(r.url, "https://example.com/paper.pdf");
        assert!(r.downloadable);
    }
1599
    // When the only PDF links are on blocked domains, Scholar yields nothing.
    #[tokio::test]
    async fn google_scholar_blocked_pdf_skipped() {
        let server = MockServer::start().await;
        // Only blocked-domain PDFs — no downloadable ones
        Mock::given(method("GET"))
            .and(path_regex(r"/scholar"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"<html><body>
                <a href="https://www.sciencedirect.com/paper.pdf">PDF</a>
                </body></html>"#,
            ))
            .mount(&server)
            .await;

        let config = single_source_config("google_scholar", &server.uri());
        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
        // The google_scholar handler skips blocked domains internally
        assert!(result.is_none());
    }
1619
    // Scholar is title-only: a DOI alone short-circuits to None (no request).
    #[tokio::test]
    async fn google_scholar_no_title_returns_none() {
        let config = single_source_config("google_scholar", "http://unused");
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        assert!(result.is_none());
    }
1626
    // A non-2xx status from Scholar yields None.
    #[tokio::test]
    async fn google_scholar_404_returns_none() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/scholar"))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let config = single_source_config("google_scholar", &server.uri());
        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
        assert!(result.is_none());
    }
1640
1641    // -----------------------------------------------------------------------
1642    // Unpaywall
1643    // -----------------------------------------------------------------------
1644
    // Unpaywall: best_oa_location.url_for_pdf is preferred and returned as-is.
    #[tokio::test]
    async fn unpaywall_happy_path() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/10\.1234/test"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "best_oa_location": {
                    "url_for_pdf": "https://europepmc.org/paper.pdf"
                }
            })))
            .mount(&server)
            .await;

        let config = single_source_config("unpaywall", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "unpaywall");
        assert_eq!(r.url, "https://europepmc.org/paper.pdf");
        assert!(r.downloadable);
    }
1665
    // Without best_oa_location, the oa_locations[] array is scanned.
    #[tokio::test]
    async fn unpaywall_oa_locations_fallback() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/10\.1234/test"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "oa_locations": [
                    { "url_for_pdf": "https://repo.edu/fallback.pdf" }
                ]
            })))
            .mount(&server)
            .await;

        let config = single_source_config("unpaywall", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.url, "https://repo.edu/fallback.pdf");
    }
1684
    // Unpaywall is DOI-only: a title alone short-circuits to None (no request).
    #[tokio::test]
    async fn unpaywall_no_doi_returns_none() {
        let config = single_source_config("unpaywall", "http://unused");
        let result = resolve_pdf_async(None, None, Some("title"), &config).await;
        assert!(result.is_none());
    }
1691
    // A non-2xx status from Unpaywall yields None.
    #[tokio::test]
    async fn unpaywall_404_returns_none() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/10\.1234/test"))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let config = single_source_config("unpaywall", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        assert!(result.is_none());
    }
1705
    // A PDF on a blocked publisher domain is returned but flagged
    // downloadable = false.
    #[tokio::test]
    async fn unpaywall_blocked_domain_not_downloadable() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/10\.1234/test"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "best_oa_location": {
                    "url_for_pdf": "https://link.springer.com/paper.pdf"
                }
            })))
            .mount(&server)
            .await;

        let config = single_source_config("unpaywall", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert!(!r.downloadable);
    }
1724
1725    // -----------------------------------------------------------------------
1726    // Crossref
1727    // -----------------------------------------------------------------------
1728
    // Crossref: message.resource.primary.URL ending in .pdf is accepted.
    #[tokio::test]
    async fn crossref_primary_url_happy_path() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path_regex(r"/works/10\.1234/test"))
            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
                "message": {
                    "resource": {
                        "primary": {
                            "URL": "https://publisher.org/article.pdf"
                        }
                    }
                }
            })))
            .mount(&server)
            .await;

        let config = single_source_config("crossref", &server.uri());
        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
        let r = result.unwrap();
        assert_eq!(r.source, "crossref");
        assert_eq!(r.url, "https://publisher.org/article.pdf");
        assert!(r.downloadable);
    }
1753
1754    #[tokio::test]
1755    async fn crossref_link_array_fallback() {
1756        let server = MockServer::start().await;
1757        Mock::given(method("GET"))
1758            .and(path_regex(r"/works/10\.1234/test"))
1759            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1760                "message": {
1761                    "link": [
1762                        {
1763                            "URL": "https://publisher.org/full.pdf",
1764                            "content-type": "application/pdf"
1765                        }
1766                    ]
1767                }
1768            })))
1769            .mount(&server)
1770            .await;
1771
1772        let config = single_source_config("crossref", &server.uri());
1773        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1774        let r = result.unwrap();
1775        assert_eq!(r.source, "crossref");
1776        assert_eq!(r.url, "https://publisher.org/full.pdf");
1777    }
1778
1779    #[tokio::test]
1780    async fn crossref_no_doi_returns_none() {
1781        let config = single_source_config("crossref", "http://unused");
1782        let result = resolve_pdf_async(None, None, Some("title"), &config).await;
1783        assert!(result.is_none());
1784    }
1785
1786    #[tokio::test]
1787    async fn crossref_404_returns_none() {
1788        let server = MockServer::start().await;
1789        Mock::given(method("GET"))
1790            .and(path_regex(r"/works/10\.1234/test"))
1791            .respond_with(ResponseTemplate::new(404))
1792            .mount(&server)
1793            .await;
1794
1795        let config = single_source_config("crossref", &server.uri());
1796        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1797        assert!(result.is_none());
1798    }
1799
1800    // -----------------------------------------------------------------------
1801    // Zenodo
1802    // -----------------------------------------------------------------------
1803
1804    #[tokio::test]
1805    async fn zenodo_happy_path() {
1806        let server = MockServer::start().await;
1807        Mock::given(method("GET"))
1808            .and(path_regex(r"/records"))
1809            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1810                "hits": {
1811                    "hits": [{
1812                        "files": [{
1813                            "key": "paper.pdf",
1814                            "links": {
1815                                "self": "https://zenodo.org/records/123/files/paper.pdf"
1816                            }
1817                        }]
1818                    }]
1819                }
1820            })))
1821            .mount(&server)
1822            .await;
1823
1824        let config = single_source_config("zenodo", &server.uri());
1825        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1826        let r = result.unwrap();
1827        assert_eq!(r.source, "zenodo");
1828        assert_eq!(r.url, "https://zenodo.org/records/123/files/paper.pdf");
1829        assert!(r.downloadable);
1830    }
1831
1832    #[tokio::test]
1833    async fn zenodo_no_pdf_files_returns_none() {
1834        let server = MockServer::start().await;
1835        Mock::given(method("GET"))
1836            .and(path_regex(r"/records"))
1837            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1838                "hits": {
1839                    "hits": [{
1840                        "files": [{
1841                            "key": "data.csv",
1842                            "links": {
1843                                "self": "https://zenodo.org/records/123/files/data.csv"
1844                            }
1845                        }]
1846                    }]
1847                }
1848            })))
1849            .mount(&server)
1850            .await;
1851
1852        let config = single_source_config("zenodo", &server.uri());
1853        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1854        assert!(result.is_none());
1855    }
1856
1857    #[tokio::test]
1858    async fn zenodo_no_title_returns_none() {
1859        let config = single_source_config("zenodo", "http://unused");
1860        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1861        assert!(result.is_none());
1862    }
1863
1864    #[tokio::test]
1865    async fn zenodo_404_returns_none() {
1866        let server = MockServer::start().await;
1867        Mock::given(method("GET"))
1868            .and(path_regex(r"/records"))
1869            .respond_with(ResponseTemplate::new(404))
1870            .mount(&server)
1871            .await;
1872
1873        let config = single_source_config("zenodo", &server.uri());
1874        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1875        assert!(result.is_none());
1876    }
1877
1878    // -----------------------------------------------------------------------
1879    // SSRN
1880    // -----------------------------------------------------------------------
1881
1882    #[tokio::test]
1883    async fn ssrn_happy_path() {
1884        let server = MockServer::start().await;
1885        Mock::given(method("GET"))
1886            .and(path_regex(r"/sol3/results\.cfm"))
1887            .respond_with(ResponseTemplate::new(200).set_body_string(
1888                r#"<html><body>
1889                <a href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1234567">Paper</a>
1890                </body></html>"#,
1891            ))
1892            .mount(&server)
1893            .await;
1894
1895        let config = single_source_config("ssrn", &server.uri());
1896        let result = resolve_pdf_async(None, None, Some("volatility modeling"), &config).await;
1897        let r = result.unwrap();
1898        assert_eq!(r.source, "ssrn");
1899        assert_eq!(
1900            r.url,
1901            "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1234567"
1902        );
1903        // SSRN never serves direct PDFs
1904        assert!(!r.downloadable);
1905    }
1906
1907    #[tokio::test]
1908    async fn ssrn_no_match_returns_none() {
1909        let server = MockServer::start().await;
1910        Mock::given(method("GET"))
1911            .and(path_regex(r"/sol3/results\.cfm"))
1912            .respond_with(
1913                ResponseTemplate::new(200)
1914                    .set_body_string(r#"<html><body>No results found.</body></html>"#),
1915            )
1916            .mount(&server)
1917            .await;
1918
1919        let config = single_source_config("ssrn", &server.uri());
1920        let result = resolve_pdf_async(None, None, Some("nonexistent paper"), &config).await;
1921        assert!(result.is_none());
1922    }
1923
1924    #[tokio::test]
1925    async fn ssrn_no_title_returns_none() {
1926        let config = single_source_config("ssrn", "http://unused");
1927        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1928        assert!(result.is_none());
1929    }
1930
1931    // -----------------------------------------------------------------------
1932    // Semantic Scholar
1933    // -----------------------------------------------------------------------
1934
1935    #[tokio::test]
1936    async fn semantic_scholar_doi_happy_path() {
1937        let server = MockServer::start().await;
1938        Mock::given(method("GET"))
1939            .and(path_regex(r"/paper/DOI:.*"))
1940            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1941                "openAccessPdf": {
1942                    "url": "https://example.edu/s2paper.pdf"
1943                }
1944            })))
1945            .mount(&server)
1946            .await;
1947
1948        let config = single_source_config("semantic_scholar", &server.uri());
1949        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1950        let r = result.unwrap();
1951        assert_eq!(r.source, "semantic_scholar");
1952        assert_eq!(r.url, "https://example.edu/s2paper.pdf");
1953        assert!(r.downloadable);
1954    }
1955
1956    #[tokio::test]
1957    async fn semantic_scholar_title_search_happy_path() {
1958        let server = MockServer::start().await;
1959        Mock::given(method("GET"))
1960            .and(path_regex(r"/paper/search"))
1961            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1962                "data": [{
1963                    "openAccessPdf": {
1964                        "url": "https://example.edu/s2search.pdf"
1965                    }
1966                }]
1967            })))
1968            .mount(&server)
1969            .await;
1970
1971        let config = single_source_config("semantic_scholar", &server.uri());
1972        let result = resolve_pdf_async(None, None, Some("mutation testing"), &config).await;
1973        let r = result.unwrap();
1974        assert_eq!(r.source, "semantic_scholar");
1975        assert_eq!(r.url, "https://example.edu/s2search.pdf");
1976    }
1977
1978    #[tokio::test]
1979    async fn semantic_scholar_disclaimer_arxiv_fallback() {
1980        let server = MockServer::start().await;
1981        Mock::given(method("GET"))
1982            .and(path_regex(r"/paper/DOI:.*"))
1983            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
1984                "openAccessPdf": {
1985                    "disclaimer": "See https://arxiv.org/abs/2105.15183 for the open access version."
1986                }
1987            })))
1988            .mount(&server)
1989            .await;
1990
1991        let config = single_source_config("semantic_scholar", &server.uri());
1992        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
1993        let r = result.unwrap();
1994        assert_eq!(r.source, "semantic_scholar");
1995        assert_eq!(r.url, "https://arxiv.org/pdf/2105.15183.pdf");
1996        assert!(r.downloadable);
1997    }
1998
1999    #[tokio::test]
2000    async fn semantic_scholar_disclaimer_non_arxiv_url() {
2001        let server = MockServer::start().await;
2002        Mock::given(method("GET"))
2003            .and(path_regex(r"/paper/DOI:.*"))
2004            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2005                "openAccessPdf": {
2006                    "disclaimer": "Available at https://example.edu/paper.pdf for download."
2007                }
2008            })))
2009            .mount(&server)
2010            .await;
2011
2012        let config = single_source_config("semantic_scholar", &server.uri());
2013        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2014        let r = result.unwrap();
2015        assert_eq!(r.source, "semantic_scholar");
2016        assert!(r.url.contains("example.edu"));
2017    }
2018
2019    #[tokio::test]
2020    async fn semantic_scholar_404_returns_none() {
2021        let server = MockServer::start().await;
2022        Mock::given(method("GET"))
2023            .and(path_regex(r"/paper/DOI:.*"))
2024            .respond_with(ResponseTemplate::new(404))
2025            .mount(&server)
2026            .await;
2027
2028        let config = single_source_config("semantic_scholar", &server.uri());
2029        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2030        assert!(result.is_none());
2031    }
2032
2033    #[tokio::test]
2034    async fn semantic_scholar_no_oa_pdf_returns_none() {
2035        let server = MockServer::start().await;
2036        Mock::given(method("GET"))
2037            .and(path_regex(r"/paper/DOI:.*"))
2038            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2039                "title": "Some paper"
2040            })))
2041            .mount(&server)
2042            .await;
2043
2044        let config = single_source_config("semantic_scholar", &server.uri());
2045        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2046        assert!(result.is_none());
2047    }
2048
2049    // -----------------------------------------------------------------------
2050    // ResolveReport tests
2051    // -----------------------------------------------------------------------
2052
2053    #[test]
2054    fn report_arxiv_doi_has_outcome() {
2055        let report = resolve_pdf_with_report(
2056            Some("10.48550/arXiv.2105.15183"),
2057            None,
2058            None,
2059            &ResolverConfig::default(),
2060        );
2061        assert!(report.pdf.is_some());
2062        assert_eq!(report.pdf.as_ref().unwrap().source, "arxiv");
2063        assert!(!report.outcomes.is_empty());
2064        assert!(report.outcomes[0].1.is_ok());
2065    }
2066
2067    #[test]
2068    fn report_summary_no_pdf_shows_sources() {
2069        let report = ResolveReport {
2070            pdf: None,
2071            outcomes: vec![
2072                ("openalex".into(), Err("closed access".into())),
2073                ("unpaywall".into(), Err("no email configured".into())),
2074            ],
2075        };
2076        let s = report.summary();
2077        assert!(s.contains("No downloadable PDF found"));
2078        assert!(s.contains("openalex: closed access"));
2079        assert!(s.contains("unpaywall: no email configured"));
2080    }
2081
2082    #[test]
2083    fn report_summary_with_pdf_shows_url() {
2084        let report = ResolveReport {
2085            pdf: Some(ResolvedPdf {
2086                url: "https://example.edu/paper.pdf".into(),
2087                source: "google_scholar".into(),
2088                downloadable: true,
2089            }),
2090            outcomes: vec![(
2091                "google_scholar".into(),
2092                Ok("https://example.edu/paper.pdf".into()),
2093            )],
2094        };
2095        let s = report.summary();
2096        assert!(s.contains("PDF found via google_scholar"));
2097        assert!(s.contains("example.edu/paper.pdf"));
2098    }
2099
2100    #[tokio::test]
2101    async fn report_unpaywall_skipped_with_placeholder_email() {
2102        let server = MockServer::start().await;
2103        // No mock needed — unpaywall should be skipped entirely
2104
2105        let config = ResolverConfig {
2106            sources: vec![SourceEntry::new("unpaywall", true)],
2107            email: "biblion@example.com".into(),
2108            endpoints: Endpoints {
2109                unpaywall: server.uri(),
2110                ..Default::default()
2111            },
2112            ..Default::default()
2113        };
2114        let report = resolve_pdf_async_with_report(None, None, Some("test"), &config).await;
2115        assert!(report.pdf.is_none());
2116        let unpaywall_outcome = report.outcomes.iter().find(|(n, _)| n == "unpaywall");
2117        assert!(unpaywall_outcome.is_some());
2118        assert!(
2119            unpaywall_outcome
2120                .unwrap()
2121                .1
2122                .as_ref()
2123                .err()
2124                .unwrap()
2125                .contains("email")
2126        );
2127    }
2128
2129    #[tokio::test]
2130    async fn report_google_scholar_429_reports_rate_limit() {
2131        let server = MockServer::start().await;
2132        Mock::given(method("GET"))
2133            .and(path("/scholar"))
2134            .respond_with(ResponseTemplate::new(429))
2135            .mount(&server)
2136            .await;
2137
2138        let config = single_source_config("google_scholar", &server.uri());
2139        let report =
2140            resolve_pdf_async_with_report(None, None, Some("mutation testing"), &config).await;
2141        assert!(report.pdf.is_none());
2142        let gs_outcome = report.outcomes.iter().find(|(n, _)| n == "google_scholar");
2143        assert!(gs_outcome.is_some());
2144        assert!(
2145            gs_outcome
2146                .unwrap()
2147                .1
2148                .as_ref()
2149                .err()
2150                .unwrap()
2151                .contains("429")
2152        );
2153    }
2154
2155    #[tokio::test]
2156    async fn report_openalex_closed_access_reports_reason() {
2157        let server = MockServer::start().await;
2158        Mock::given(method("GET"))
2159            .and(path_regex(r"/works/doi:.*"))
2160            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2161                "open_access": { "is_oa": false, "oa_status": "closed" }
2162            })))
2163            .mount(&server)
2164            .await;
2165
2166        let config = single_source_config("openalex", &server.uri());
2167        let report = resolve_pdf_async_with_report(Some("10.1234/test"), None, None, &config).await;
2168        assert!(report.pdf.is_none());
2169        let oa_outcome = report.outcomes.iter().find(|(n, _)| n == "openalex");
2170        assert!(oa_outcome.is_some());
2171        assert!(oa_outcome.unwrap().1.is_err());
2172    }
2173
2174    #[tokio::test]
2175    async fn report_multiple_sources_collects_all_outcomes() {
2176        let server = MockServer::start().await;
2177        // openalex: 404, core: 404
2178        Mock::given(method("GET"))
2179            .respond_with(ResponseTemplate::new(404))
2180            .mount(&server)
2181            .await;
2182
2183        let config = ResolverConfig {
2184            sources: vec![
2185                SourceEntry::new("openalex", true),
2186                SourceEntry::new("core", true),
2187            ],
2188            endpoints: Endpoints {
2189                openalex: server.uri(),
2190                core: server.uri(),
2191                ..Default::default()
2192            },
2193            ..Default::default()
2194        };
2195        let report =
2196            resolve_pdf_async_with_report(Some("10.1234/test"), None, Some("test"), &config).await;
2197        assert!(report.pdf.is_none());
2198        assert_eq!(report.outcomes.len(), 2);
2199    }
2200
2201    // -----------------------------------------------------------------------
2202    // Existing tests below
2203    // -----------------------------------------------------------------------
2204
2205    #[tokio::test]
2206    async fn semantic_scholar_blocked_domain_not_downloadable() {
2207        let server = MockServer::start().await;
2208        Mock::given(method("GET"))
2209            .and(path_regex(r"/paper/DOI:.*"))
2210            .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({
2211                "openAccessPdf": {
2212                    "url": "https://ieeexplore.ieee.org/paper.pdf"
2213                }
2214            })))
2215            .mount(&server)
2216            .await;
2217
2218        let config = single_source_config("semantic_scholar", &server.uri());
2219        let result = resolve_pdf_async(Some("10.1234/test"), None, None, &config).await;
2220        let r = result.unwrap();
2221        assert!(!r.downloadable);
2222    }
2223}