# soleprint/station/tools/tester/gherkin/parser.py

"""
Parse Gherkin .feature files.

Simple parser without external dependencies - parses the subset we need.
For full Gherkin support, could use gherkin-python package later.
"""

import re
from pathlib import Path
from typing import Optional
from dataclasses import dataclass, field


@dataclass
class GherkinScenario:
    """One scenario (or scenario outline) read from a feature file."""

    # Text that follows the scenario keyword and its colon.
    name: str
    # Free-form description text for the scenario.
    description: str
    # Tag strings attached to this scenario; a fresh list per instance.
    tags: list[str] = field(default_factory=list)
    # Step lines belonging to the scenario; a fresh list per instance.
    steps: list[str] = field(default_factory=list)
    # Examples data for the scenario; defaults to an empty dict.
    examples: dict = field(default_factory=dict)
    # Keyword that introduced the scenario: "Scenario", or
    # "Scenario Outline" / "Esquema del escenario".
    scenario_type: str = "Scenario"


@dataclass
class GherkinFeature:
    """The contents of one parsed Gherkin feature file."""

    # Text that follows the feature keyword and its colon.
    name: str
    # Free-form description text for the feature.
    description: str
    # Location of the source file, stored as a string.
    file_path: str
    # Keyword language of the file: "en", or "es".
    language: str = "en"
    # Tag strings attached to the feature itself; a fresh list per instance.
    tags: list[str] = field(default_factory=list)
    # Background data, or None when there is none.
    background: Optional[dict] = None
    # Scenarios belonging to the feature; a fresh list per instance.
    scenarios: list[GherkinScenario] = field(default_factory=list)


def parse_feature_file(file_path: Path) -> Optional[GherkinFeature]:
    """
    Parse a Gherkin .feature file.

    Supports both English and Spanish keywords; Spanish is selected when the
    file contains a ``# language: es`` comment.
    Extracts: Feature name and description, feature tags, and for every
    scenario / scenario outline its name, tags and step lines.
    Background sections and Examples tables are skipped, not parsed.

    Args:
        file_path: Path of the .feature file to read (UTF-8, with or
            without a byte-order mark).

    Returns:
        A GherkinFeature, or None when the file does not exist, cannot be
        read or decoded, or has no named Feature line.
    """
    if not file_path.exists():
        return None

    try:
        # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading BOM that
        # would otherwise stop a first-line "Feature:" from matching.
        content = file_path.read_text(encoding='utf-8-sig')
    except (OSError, ValueError):
        # OSError: unreadable file; ValueError: includes UnicodeDecodeError.
        return None

    # Detect language
    language = "en"
    if re.search(r"#\s*language:\s*es", content):
        language = "es"

    # Keywords by language
    if language == "es":
        feature_kw = r"Característica"
        scenario_kw = r"Escenario"
        outline_kw = r"Esquema del escenario"
        background_kw = r"Antecedentes"
        examples_kw = r"Ejemplos"
        given_kw = r"Dado"
        when_kw = r"Cuando"
        then_kw = r"Entonces"
        and_kw = r"Y"
        but_kw = r"Pero"
    else:
        feature_kw = r"Feature"
        scenario_kw = r"Scenario"
        outline_kw = r"Scenario Outline"
        background_kw = r"Background"
        examples_kw = r"Examples"
        given_kw = r"Given"
        when_kw = r"When"
        then_kw = r"Then"
        and_kw = r"And"
        but_kw = r"But"

    # Build each pattern once instead of re-formatting it for every line.
    feature_re = re.compile(rf"^{feature_kw}:\s*(.+)")
    scenario_re = re.compile(rf"^({scenario_kw}|{outline_kw}):\s*(.+)")
    description_end_re = re.compile(
        rf"^({scenario_kw}|{outline_kw}|{background_kw}):"
    )
    steps_end_re = re.compile(rf"^({scenario_kw}|{outline_kw}|{examples_kw}):")
    examples_re = re.compile(rf"^{examples_kw}:")
    step_re = re.compile(
        rf"^({given_kw}|{when_kw}|{then_kw}|{and_kw}|{but_kw})\s+"
    )
    tag_re = re.compile(r'@[\w-]+')

    lines = content.split('\n')

    feature_name = None
    feature_desc = []
    feature_tags = []
    scenarios = []
    current_scenario = None
    # Tags seen since the last element that consumed them.
    current_tags = []

    i = 0
    while i < len(lines):
        line = lines[i].strip()

        # Skip comments and empty lines
        if not line or line.startswith('#'):
            i += 1
            continue

        # Tags: held until the next Feature / Scenario line claims them
        if line.startswith('@'):
            current_tags.extend(tag_re.findall(line))
            i += 1
            continue

        # Feature
        feature_match = feature_re.match(line)
        if feature_match:
            feature_name = feature_match.group(1).strip()
            feature_tags = current_tags.copy()
            current_tags = []

            # Read feature description
            i += 1
            while i < len(lines):
                line = lines[i].strip()
                if not line or line.startswith('#'):
                    i += 1
                    continue
                # Stop at scenario or background, and also at a tag line:
                # tags here belong to the first scenario, not to the
                # description text.
                if line.startswith('@') or description_end_re.match(line):
                    break
                feature_desc.append(line)
                i += 1
            # No increment: the outer loop re-examines the line we stopped on.
            continue

        # Scenario
        scenario_match = scenario_re.match(line)
        if scenario_match:
            # Save previous scenario
            if current_scenario is not None:
                scenarios.append(current_scenario)

            current_scenario = GherkinScenario(
                name=scenario_match.group(2).strip(),
                description="",
                tags=current_tags.copy(),
                steps=[],
                scenario_type=scenario_match.group(1),
            )
            current_tags = []

            # Read scenario steps
            i += 1
            while i < len(lines):
                line = lines[i].strip()

                # Empty or comment
                if not line or line.startswith('#'):
                    i += 1
                    continue

                # Next scenario or an Examples block ends the step list
                if steps_end_re.match(line):
                    break

                # Tags (start of next scenario)
                if line.startswith('@'):
                    break

                # Keep step lines; anything else is skipped
                if step_re.match(line):
                    current_scenario.steps.append(line)

                i += 1
            # No increment: the outer loop re-examines the line we stopped on.
            continue

        # Examples: tags placed above an Examples block describe that block,
        # so drop them rather than letting them attach to the next scenario.
        if examples_re.match(line):
            current_tags = []
            i += 1
            continue

        # Anything else (Background header and steps, table rows, ...) is
        # outside the supported subset and skipped.
        i += 1

    # Add last scenario
    if current_scenario is not None:
        scenarios.append(current_scenario)

    if not feature_name:
        return None

    return GherkinFeature(
        name=feature_name,
        description=" ".join(feature_desc),
        file_path=str(file_path),
        language=language,
        tags=feature_tags,
        scenarios=scenarios,
    )


def discover_features(features_dir: Path) -> list[GherkinFeature]:
    """
    Recursively collect every parsable .feature file under a directory.

    Each ``*.feature`` path found beneath ``features_dir`` is handed to
    ``parse_feature_file``; files for which it returns a falsy result are
    left out. A directory that does not exist yields an empty list.
    """
    if not features_dir.exists():
        return []

    # Lazily parse each file in the order rglob yields it, then keep the hits.
    candidates = (
        parse_feature_file(path) for path in features_dir.rglob("*.feature")
    )
    return [feature for feature in candidates if feature]


def extract_tags_from_features(features: list[GherkinFeature]) -> set[str]:
    """Return the set of distinct tags used by the features and their scenarios."""
    collected = set()

    for feature in features:
        # Feature-level tags first, then the tags of each of its scenarios.
        collected.update(feature.tags)
        collected.update(
            tag for scenario in feature.scenarios for tag in scenario.tags
        )

    return collected


def get_feature_names(features: list[GherkinFeature]) -> list[str]:
    """Return the name of every feature, in the order given."""
    return [feature.name for feature in features]


def get_scenario_names(features: list[GherkinFeature]) -> list[str]:
    """Return every scenario name, feature by feature, in the order given."""
    return [
        scenario.name
        for feature in features
        for scenario in feature.scenarios
    ]