Source code for pysyrev.pipeline
"""
High-level Pipeline class for running the pysyrev literature review pipeline
from Python or a Jupyter notebook.

Usage
-----
::

    from pysyrev import Pipeline

    # Full pipeline
    pipeline = Pipeline.from_config("config.yaml")
    pipeline.run()

    # Selected stages (in canonical order: bib → review → bib-network → topic-model → topic-report)
    pipeline.run(stages=["bib", "review"])

    # Results are available as attributes
    pipeline.bib.dataset            # pd.DataFrame
    pipeline.review.included_docs   # pd.DataFrame
    pipeline.network                # BibNetwork
    pipeline.topic                  # TopicModel
    pipeline.report                 # TopicReport

Stage results persist on the instance between run() calls, so data is passed
in memory when stages are chained::

    pipeline.run(stages=["bib"])
    pipeline.run(stages=["review"])  # uses pipeline.bib.dataset automatically
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, List, Optional, Union
if TYPE_CHECKING:
from pysyrev.bibdata import BibDataset
from pysyrev.core.config import Config
from pysyrev.network import BibNetwork
from pysyrev.review import LLMReview
from pysyrev.topic_model import TopicModel
from pysyrev.topic_report import TopicReport
# Every stage name the pipeline knows about, listed in canonical execution
# order. Pipeline.run() validates requested stages against this list and
# executes them in this order.
ALL_STAGES = [
    'bib',
    'review',
    'bib-network',
    'topic-model',
    'topic-report',
]
@dataclass
class Pipeline:
    """Literature review pipeline.

    Instantiate with :meth:`from_config`, then call :meth:`run`.

    The only constructor argument is ``config``. The remaining fields hold
    the stage objects created by :meth:`run`; they start out as ``None`` and
    are kept on the instance between ``run()`` calls so later stages can
    reuse earlier results in memory.
    """

    # Loaded configuration; its per-stage sections drive every stage.
    config: 'Config'
    # Stage results. Excluded from __init__ (init=False) and None until the
    # corresponding stage has been executed by run().
    bib: Optional['BibDataset'] = field(default=None, init=False)
    review: Optional['LLMReview'] = field(default=None, init=False)
    network: Optional['BibNetwork'] = field(default=None, init=False)
    topic: Optional['TopicModel'] = field(default=None, init=False)
    report: Optional['TopicReport'] = field(default=None, init=False)

    @classmethod
    def from_config(cls, config: Union[str, 'Config']) -> 'Pipeline':
        """Create a Pipeline from a YAML config path or a Config object.

        Parameters
        ----------
        config : str or Config
            An already-built ``Config`` is used as is; anything else is
            handed to ``Config.load``.

        Returns
        -------
        Pipeline
            A new pipeline with no stage results yet.
        """
        # Imported lazily so importing this module does not pull in the
        # config machinery.
        from pysyrev.core.config import Config

        if not isinstance(config, Config):
            config = Config.load(config)
        return cls(config=config)

    def run(self, stages: Optional[Union[str, List[str]]] = None) -> 'Pipeline':
        """Run one or more pipeline stages.

        Parameters
        ----------
        stages : str or list of str, optional
            Subset of stages to run. Defaults to all configured stages in order.
            Valid values: 'bib', 'review', 'bib-network', 'topic-model', 'topic-report'.
            Stages are always executed in canonical order regardless of the
            order they appear in the list.
            A single stage name may be given as a plain string.
            When None, only stages that have a corresponding section in the
            config are executed.

        Returns
        -------
        self
            The Pipeline instance (for chaining).

        Raises
        ------
        ValueError
            If any requested stage name is not in ``ALL_STAGES``. Validation
            happens before any stage is executed.
        """
        if stages is None:
            stages = self._configured_stages()
        elif isinstance(stages, str):
            # A bare string would otherwise be iterated character by
            # character and rejected as a set of unknown one-letter stages.
            stages = [stages]

        # Materialise once: the request is needed both for validation and
        # for ordering, and a one-shot iterable would be exhausted by the
        # first pass.
        requested = set(stages)
        unknown = requested - set(ALL_STAGES)
        if unknown:
            raise ValueError(
                f"Unknown stage(s) {unknown}. Valid: {ALL_STAGES}"
            )
        ordered = [s for s in ALL_STAGES if s in requested]

        if 'bib' in ordered:
            from pysyrev.bibdata import BibDataset

            # Building the dataset object from its config section is the
            # whole stage; nothing else is called on it here.
            self.bib = BibDataset.from_config(self.config.bib)

        if 'review' in ordered:
            from pysyrev.review import LLMReview

            self.review = LLMReview.from_config(self.config.review)
            # Hand over the in-memory bib dataset when a bib stage result is
            # held on this instance; otherwise pass None.
            dataset = self.bib.dataset if self.bib is not None else None
            self.review.run(dataset).save()

        if 'bib-network' in ordered:
            from pysyrev.network import BibNetwork

            self.network = BibNetwork.from_config(self.config.bib_network)
            # Use the documents kept by the review stage when available.
            dataset = self.review.included_docs if self.review is not None else None
            self.network.run(dataset)
            # Persist only when the bib_network section has an export entry.
            if self.config.bib_network.export is not None:
                self.network.save()

        if 'topic-model' in ordered:
            from pysyrev.topic_model import TopicModel

            # This stage receives the whole config, not just its own section.
            self.topic = TopicModel.from_config(self.config)
            dataset = self.review.included_docs if self.review is not None else None
            self.topic.run(dataset)

        if 'topic-report' in ordered:
            from pysyrev.topic_report import TopicReport

            # Also built from the whole config; no in-memory dataset is
            # passed to the report.
            self.report = TopicReport.from_config(self.config)
            self.report.generate_report()

        return self

    def _configured_stages(self) -> List[str]:
        """Return stages that have a section in the config, in canonical order."""
        # A stage counts as configured when its config attribute is not None.
        # Note the attribute names use underscores where stage names use
        # hyphens.
        present = {
            'bib': self.config.bib is not None,
            'review': self.config.review is not None,
            'bib-network': self.config.bib_network is not None,
            'topic-model': self.config.topic_model is not None,
            'topic-report': self.config.topic_report is not None,
        }
        return [s for s in ALL_STAGES if present[s]]