Source code for pysyrev.bibdata

import warnings
from rapidfuzz import fuzz

from pysyrev.core.api import OpenAlexClient, WosClient
from pysyrev.core.bib import (fetch_citations, generate_bib, generate_oa_bib,
                               extract_documents, check_bib_dataset)
from pysyrev.core.mappers import from_openalex_result, from_wos_result
from pysyrev.core.references import (resolve_references as _resolve_references,
                                     flag_shared_unresolved_references as _flag_unresolved)
from pysyrev.core.config import BibConfig, OpenAlexSourceConfig, WosSourceConfig
from pysyrev.core.merge_bibs import merge_bibs
from pysyrev.core.clean import clean_doi, clean_abstracts
from typing import Iterable, List

import pandas as pd

# Lookup table translating the scorer names used in configuration objects
# (e.g. ``config.merge.scorer``) into the matching rapidfuzz ``fuzz`` callable.
_SCORER_MAP = {
    scorer_name: getattr(fuzz, scorer_name)
    for scorer_name in (
        "partial_token_sort_ratio",
        "token_set_ratio",
        "partial_ratio",
        "WRatio",
        "ratio",
    )
}



[docs]
class BibDataset:
    """Bibliographic dataset wrapped around a single table.

    The table is stored in ``_bib_dataset`` (documented in ``__init__`` as a
    pandas DataFrame) and exposed through the ``dataset`` property. Methods
    either update the instance and return it, or build and return a new
    instance of the same class.
    """

    # Database identifier forwarded as ``db`` to the module-level
    # ``generate_bib``; ``None`` here, overridden by subclasses.
    _db = None
    # Underlying table; assigned by ``__init__`` / ``generate_bib``.
    _bib_dataset = None
    _cross_id_map: dict  # {dropped_id: kept_id} built during merge

    def __init__(self, bibfile=None, bib_dataset=None):
        """Create a dataset from a raw bib source or from prepared data.

        Parameters
        ----------
        bibfile: pandas.DataFrame OR str
            path to bib file (.csv, .bib, etc.)
            or Pandas DataFrame; handed to ``generate_bib``.
            Takes precedence when both arguments are given.
        bib_dataset: pandas.DataFrame
            Already processed bib dataset; passed through
            ``check_bib_dataset`` before being stored.

        Raises
        ------
        ValueError
            If both ``bibfile`` and ``bib_dataset`` are None.
        """
        self._cross_id_map = {}

        # Nothing to build the dataset from
        if bibfile is None and bib_dataset is None:
            raise ValueError("Either bibfile or bib_dataset must be provided.")

        if bibfile is not None:
            self.generate_bib(bibfile)
            return

        self._bib_dataset = check_bib_dataset(bib_dataset)



[docs]
    def clean_and_drop(self,
                       min_signals_to_reject: int = 2,
                       extra_garbage_phrases: Iterable[str] = (),
                       use_langdetect: bool = False,
                       ):
        """ Clean DOI and abstract columns, then drop rows without abstract

        The underlying table is modified in place and its index is reset.

        Parameters
        ----------
        min_signals_to_reject: int
            Forwarded unchanged to ``clean_abstracts``.
        extra_garbage_phrases: Iterable[str]
            Forwarded unchanged to ``clean_abstracts``.
        use_langdetect: bool
            Forwarded unchanged to ``clean_abstracts``.

        Returns
        -------
        BibDataset
            This instance, so calls can be chained.

        """
        table = self._bib_dataset

        # Normalize both columns before deciding which rows to keep
        table.doi = table.doi.apply(clean_doi)
        table.abstract = clean_abstracts(table.abstract,
                                         min_signals_to_reject,
                                         extra_garbage_phrases,
                                         use_langdetect)

        # Remove every row whose abstract is missing once cleaned
        rows_without_abstract = table.index[table.abstract.isna()]
        table.drop(index=rows_without_abstract, inplace=True)

        # Renumber the rows without keeping the old index as a column
        table.reset_index(drop=True, inplace=True)

        return self



[docs]
    def extract_documents(self, include_document_type=None, year=1900,
                          nb_citations=0,
                          language="english",
                          scorer=fuzz.partial_token_sort_ratio,
                          score_cutoff=90,
                          exclude_document_type=None):
        """ Build a sub-dataset by filtering on document metadata

        The filtering itself is delegated to the module-level
        ``extract_documents`` helper.

        Parameters
        ----------
        include_document_type: str or list[str] or None
            document types to include (fuzzy-matched); None keeps all
        year: int or float
            min publication year
        nb_citations: int or float
            min citation count
        language: str or list[str] or None
            None means no language filter (keep all)
        scorer: callable
            scorer forwarded to the helper (a rapidfuzz scorer by default)
        score_cutoff: int
            cutoff forwarded to the helper together with ``scorer``
        exclude_document_type: str or list[str] or None
            document types to exclude (fuzzy-matched); takes priority over inclusion

        Returns
        -------
        new instance of the same class, sharing this dataset's cross-ID map

        """
        subset = extract_documents(self._bib_dataset,
                                   include_document_type,
                                   language,
                                   year,
                                   nb_citations,
                                   scorer,
                                   score_cutoff,
                                   exclude_doc_type=exclude_document_type)
        new_instance = self.__class__(bib_dataset=subset)

        return self._propagate_to(new_instance)



[docs]
    def flag_shared_unresolved_references(self):
        """Add a 'shared_unresolved_references' column.

        For every document the column lists its unresolved references that
        also occur in at least one other document of the dataset, which makes
        them usable as edges for co-citation network analysis.
        resolve_references() must have been called beforehand.

        Returns
        -------
        This instance, holding the table returned by the flagging helper.
        """
        flagged = _flag_unresolved(self._bib_dataset)
        self._bib_dataset = flagged

        return self



[docs]
    def fetch_abstracts(self):
        """ Use online APIs to retrieve abstracts (not implemented yet)

        Returns
        -------
        None
            Placeholder: the method currently does nothing.

        """
        # TODO: implement abstract retrieval
        return None



[docs]
    def fetch_citations(self):
        """ Fetch citation count through
        Semantic Scholar or CrossRef

        The DOI column of the dataset is handed to the module-level
        ``fetch_citations`` helper.

        Returns
        -------
        Whatever the ``fetch_citations`` helper returns for the DOI column.

        Raises
        ------
        ValueError
            If the DOI column cannot be read from the dataset (for instance
            because no bibliography has been generated yet).

        """
        # Guard only the DOI lookup: an AttributeError raised inside the
        # remote-fetch helper is a different problem and must not be reported
        # as "bibliography not generated".
        try:
            dois = self.doi
        except AttributeError as err:
            raise ValueError("Bibliography has not been generated yet: no DOI available") from err

        return fetch_citations(dois)



[docs]
    def generate_bib(self, bibfile, del_duplicated=True, verbose=True):
        """ Generate bib using a custom version of pbx_probe

        The work is delegated to the module-level ``generate_bib`` helper,
        which also receives this class's ``_db`` identifier.

        Parameters
        ----------
        bibfile: str or bytes or os.PathLike
        del_duplicated: bool
            forwarded as ``del_duplicated`` to the helper
        verbose:
            print command outputs

        Returns
        -------
        This instance, with its table replaced by the helper's result.

        """
        parse_options = {
            "db": self._db,
            "del_duplicated": del_duplicated,
            "print_log": verbose,
        }
        self._bib_dataset = generate_bib(bibfile, **parse_options)

        return self



[docs]
    def merge(self,
              others,
              title_similarity: int = 98,
              ngram_size: int = 3,
              max_candidates_per_row: int = 200,
              scorer = fuzz.token_set_ratio):
        """ Merge dataset with other(s) and remove duplicates

        Parameters
        ----------
        others: BibDataset or Iterable[BibDataset]
            Dataset(s) to merge with this one. A single BibDataset is
            accepted and treated as a one-element list.
        title_similarity: int
            Title similarity threshold, passed to ``merge_bibs`` as
            ``title_similarity_threshold`` and evaluated with ``scorer``.
        ngram_size : int
            Word n-gram size for the blocking index. Larger = fewer but stricter
            candidates (3 is a reasonable choice for scientific titles).
        max_candidates_per_row : int
            Upper bound on the shortlist size per query. Prevents pathological
            cases where very common n-grams pull in thousands of candidates.
        scorer : callable
            rapidfuzz scorer used to compare shortlisted candidates
            (e.g. ``rapidfuzz.fuzz.token_set_ratio`` or ``fuzz.WRatio``).

        Returns
        -------
        New instance of the same class as this dataset, holding the merged
        table and the cross-ID map produced by ``merge_bibs``.

        """
        # A lone dataset is not iterable: wrap it so both call styles work
        if isinstance(others, BibDataset):
            others = [others]

        # This dataset always comes first in the list given to merge_bibs
        tables = [self._bib_dataset]
        tables.extend(other.dataset for other in others)

        merged_df, cross_id_map = merge_bibs(tables,
                                             title_similarity_threshold=title_similarity,
                                             ngram_size=ngram_size,
                                             max_candidates_per_row=max_candidates_per_row,
                                             scorer=scorer)

        # NOTE(review): the result only carries the map of this merge; any map
        # already stored on the inputs is not carried over. Confirm intended.
        instance = self.__class__(bib_dataset=merged_df)
        instance._cross_id_map = cross_id_map
        return instance



[docs]
    def resolve_references(self,
                           fuzzy_score_cutoff: int = 90,
                           ngram_size: int = 3,
                           max_candidates: int = 50,
                           scorer=fuzz.token_set_ratio):
        """Resolve raw references to internal document IDs.

        The dataset gains two columns:

        ``reference_ids``
            Internal doc IDs of resolved references ('; '-joined), or None.
        ``unresolved_references``
            Raw reference strings that found no match ('; '-joined), or None.

        The cross-ID map stored on this instance is passed along to the
        resolver.

        Parameters
        ----------
        fuzzy_score_cutoff : int
            Minimum rapidfuzz score (0-100) to accept a fuzzy title match.
            Pass 100 to disable fuzzy matching entirely.
        ngram_size : int
            Word n-gram size for the blocking index.
        max_candidates : int
            Maximum candidates per query in the blocking phase.
        scorer : callable
            rapidfuzz scorer for fuzzy title comparison.

        Returns
        -------
        This instance, with its table replaced by the resolver's output.

        """
        resolver_options = dict(
            cross_id_map=self._cross_id_map,
            fuzzy_score_cutoff=fuzzy_score_cutoff,
            ngram_size=ngram_size,
            max_candidates=max_candidates,
            scorer=scorer,
        )
        self._bib_dataset = _resolve_references(self._bib_dataset,
                                                **resolver_options)
        return self



[docs]
    def sample(self, size=100, random_state=None):
        """ Draw a random sample of rows from the dataset

        Parameters
        ----------
        size: int
            Sample size
        random_state: int
            Seed for random number generator

        Returns
        -------
        new instance of BibDataset
            holds the sampled rows (index renumbered) and shares this
            dataset's cross-ID map

        """
        drawn_rows = self._bib_dataset.sample(n=size,
                                              random_state=random_state,
                                              axis=0,
                                              ignore_index=True)
        sampled = self.__class__(bib_dataset=drawn_rows)

        return self._propagate_to(sampled)



[docs]
    def to_csv(self,
               file_name,
               sep=",",
               index=False):
        """ Write the dataset table to a csv file

        Parameters
        ----------
        file_name: str
            Destination handed to the table's own ``to_csv``
        sep: str
            Field separator
        index: bool
            Write row names

        Returns
        -------
        None

        """
        table = self.dataset
        table.to_csv(file_name, sep=sep, index=index)


    # ---- Protected methods -------------------------------------------------
    def _propagate_to(self, instance):
        """Hand this dataset's cross-ID map to ``instance`` and return it.

        The mapping object itself is assigned (no copy is made).
        """
        shared_map = self._cross_id_map
        instance._cross_id_map = shared_map
        return instance

    # ---- bridge from configuration -----------------------------------------


[docs]
    @classmethod
    def from_config(cls, config: BibConfig) -> 'BibDataset':
        """Build a BibDataset from all sources declared in a BibConfig.

        Pipeline, in the order executed below: load every configured source
        (wos, open_alex, scopus, pubmed) → merge them when there is more than
        one → clean and drop rows without abstract → resolve references (only
        if ``config.resolve_references.enabled``), optionally flagging shared
        unresolved references → extract documents (always run) → export to
        CSV (only if ``config.export`` is truthy).

        All parameters are driven by the config; see CleanConfig,
        ExtractConfig, MergeConfig, and ResolveReferencesConfig for defaults.

        Parameters
        ----------
        config : BibConfig
            Configuration describing the sources and each pipeline step.

        Returns
        -------
        BibDataset
            The processed dataset. It is derived from the first loaded
            source, not built through ``cls`` directly.

        Raises
        ------
        ValueError
            If none of wos, open_alex, scopus or pubmed is configured.
        KeyError
            If a configured scorer name is not a key of ``_SCORER_MAP``.

        Warns
        -----
        UserWarning
            If reference resolution is enabled while wos or open_alex is
            configured with ``source == 'api'``.
        """
        datasets: List[BibDataset] = []

        # Load each configured source, in a fixed order
        if config.wos:
            datasets.append(WosDataset.from_config(config.wos))
        if config.open_alex:
            datasets.append(OpenAlexDataset.from_config(config.open_alex))
        if config.scopus:
            datasets.append(ScopusDataset(bibfile=config.scopus))
        if config.pubmed:
            datasets.append(PubmedDataset(bibfile=config.pubmed))

        if not datasets:
            raise ValueError("No bib source is configured — set at least one of "
                             "wos, open_alex, scopus, or pubmed in the config.")

        # A single source is used as is; the merge settings (including the
        # scorer lookup) are only evaluated when there are several sources.
        cfg_merge = config.merge
        merged = (
            datasets[0] if len(datasets) == 1
            else datasets[0].merge(
                datasets[1:],
                title_similarity      = cfg_merge.title_similarity,
                ngram_size            = cfg_merge.ngram_size,
                max_candidates_per_row= cfg_merge.max_candidates_per_row,
                scorer                = _SCORER_MAP[cfg_merge.scorer],
            )
        )

        cfg_clean = config.clean
        merged = merged.clean_and_drop(
            min_signals_to_reject = cfg_clean.min_signals_to_reject,
            # A falsy value (e.g. None) falls back to an empty tuple
            extra_garbage_phrases = cfg_clean.extra_garbage_phrases or (),
            use_langdetect        = cfg_clean.use_langdetect,
        )

        # resolve_references runs before extract_documents so that
        # references are resolved against the full cleaned dataset,
        # maximizing the number of resolvable targets.
        cfg_rr = config.resolve_references
        if cfg_rr.enabled:
            # Names of the wos / open_alex sources that are set to 'api'
            api_sources = [
                name for name, src in (('wos', config.wos), ('open_alex', config.open_alex))
                if isinstance(src, (WosSourceConfig, OpenAlexSourceConfig))
                and src.source == 'api'
            ]
            if api_sources:
                # stacklevel=2 attributes the warning to the caller of
                # from_config rather than to this line.
                warnings.warn(
                    f"resolve_references is enabled but the following sources use "
                    f"the API ({', '.join(api_sources)}), which does not return "
                    f"references inline. The 'references' column will be empty for "
                    f"those records and resolution will produce no matches. "
                    f"Switch to source: file to get references.",
                    UserWarning,
                    stacklevel=2,
                )
            merged = merged.resolve_references(
                fuzzy_score_cutoff = cfg_rr.fuzzy_score_cutoff,
                ngram_size         = cfg_rr.ngram_size,
                max_candidates     = cfg_rr.max_candidates,
                scorer             = _SCORER_MAP[cfg_rr.scorer],
            )

            # Flagging is only possible once references have been resolved
            if cfg_rr.flag_unresolved:
                merged = merged.flag_shared_unresolved_references()

        # Metadata filtering is applied unconditionally
        cfg_extract = config.extract
        merged = merged.extract_documents(
            cfg_extract.include_doc_type,
            year                  = cfg_extract.year,
            nb_citations          = cfg_extract.nb_citations,
            language              = cfg_extract.language,
            scorer                = _SCORER_MAP[cfg_extract.scorer],
            score_cutoff          = cfg_extract.score_cutoff,
            exclude_document_type = cfg_extract.exclude_doc_type,
        )

        # Optional CSV export; resolve() is called before the target path
        # is read (presumably it finalizes the path — TODO confirm).
        if config.export:
            config.export.resolve()
            merged.to_csv(config.export.dataset)

        return merged


    @classmethod
    def _from_source_config(cls, config) -> 'BibDataset':
        """Template: pick the loading path from ``config.source``.

        * ``source: file`` — the regular constructor receives ``config.file``.
        * anything else (``source: api``) — :meth:`_from_api_config`
          (subclass hook) turns ``config.api`` into a DEFAULT_FIELDS
          DataFrame, which is then wrapped by the constructor.
        """
        if config.source != 'file':
            api_frame = cls._from_api_config(config.api)
            return cls(bib_dataset=api_frame)

        return cls(bibfile=config.file)

    @classmethod
    def _from_api_config(cls, api_config) -> pd.DataFrame:
        """Hook: query the source API and return a DEFAULT_FIELDS DataFrame.

        Subclasses supporting ``source: api`` must override it; this default
        implementation always raises.

        Raises
        ------
        NotImplementedError
            Always, naming the class the hook was called on.
        """
        message = (
            f"{cls.__name__} does not support API source. "
            "Override _from_api_config or set `source: file` in the config."
        )
        raise NotImplementedError(message)

    # ---- properties --------------------------------------------------------

    @property
    def doi(self):
        """``doi`` column of the underlying table."""
        table = self._bib_dataset
        return table.doi

    @property
    def citation_count(self):
        """``cited_by`` column of the underlying table."""
        table = self._bib_dataset
        return table.cited_by

    @property
    def dataset(self):
        """The underlying table itself (not a copy)."""
        table = self._bib_dataset
        return table




[docs]
class WosDataset(BibDataset):
    """BibDataset variant whose database identifier is ``"wos"``."""

    # Forwarded as ``db`` when a bib file is parsed; presumably stands for
    # Web of Science — TODO confirm against the parser's accepted values.
    _db = "wos"


[docs]
    @classmethod
    def from_config(cls, config: WosSourceConfig) -> 'WosDataset':
        """Build the dataset from a WosSourceConfig (file or API source)."""
        built = cls._from_source_config(config)
        return built


    @classmethod
    def _from_api_config(cls, api_config) -> pd.DataFrame:
        """Run the configured query through ``WosClient`` and map the result.

        A session file ``<cache_dir>/session.json`` is handed to the client
        only when ``api_config.cache_dir`` is set; otherwise ``None`` is used.
        """
        cache_dir = api_config.cache_dir
        session_file = f"{cache_dir}/session.json" if cache_dir else None

        client = WosClient(api_key=api_config.api_key,
                           session_file=session_file)
        raw_result = client.search(query=api_config.query)

        return from_wos_result(raw_result)




[docs]
class OpenAlexDataset(BibDataset):
    """BibDataset variant for OpenAlex sources."""

    # NOTE(review): the identifier is "scopus", not an OpenAlex-specific
    # value. Presumably the file produced by ``generate_oa_bib`` is meant to
    # be parsed as a Scopus export — confirm.
    _db = "scopus"


[docs]
    @classmethod
    def from_config(cls, config: OpenAlexSourceConfig) -> 'OpenAlexDataset':
        """Build the dataset from an OpenAlexSourceConfig (file or API source)."""
        built = cls._from_source_config(config)
        return built


    @classmethod
    def _from_api_config(cls, api_config) -> pd.DataFrame:
        """Run the configured query through ``OpenAlexClient`` and map the result.

        A session file ``<cache_dir>/session.json`` is handed to the client
        only when ``api_config.cache_dir`` is set; otherwise ``None`` is used.
        """
        cache_dir = api_config.cache_dir
        session_file = f"{cache_dir}/session.json" if cache_dir else None

        client = OpenAlexClient(api_key=api_config.api_key,
                                email=api_config.email,
                                session_file=session_file)
        raw_result = client.search(query=api_config.query,
                                   filters=api_config.filters)

        return from_openalex_result(raw_result)


[docs]
    def generate_bib(self, bibfile, **kwargs):
        """Pre-process ``bibfile`` with ``generate_oa_bib`` and then load it.

        Parameters
        ----------
        bibfile:
            Source file handed to ``generate_oa_bib``.
        **kwargs:
            Forwarded unchanged to ``BibDataset.generate_bib``
            (``del_duplicated``, ``verbose``).

        Returns
        -------
        OpenAlexDataset
            This instance, as returned by the parent implementation.

        """
        new_bib_file = generate_oa_bib(bibfile)

        # Return the parent's result (the instance itself) so this override
        # stays chainable like ``BibDataset.generate_bib``.
        return super().generate_bib(new_bib_file, **kwargs)





[docs]
class ScopusDataset(BibDataset):
    """BibDataset variant whose database identifier is ``"scopus"``."""

    # Forwarded as ``db`` when a bib file is parsed
    _db = "scopus"




[docs]
class PubmedDataset(BibDataset):
    """BibDataset variant whose database identifier is ``"pubmed"``."""

    # Forwarded as ``db`` when a bib file is parsed
    _db = "pubmed"