Team

`TeamDataSchema`

Bases: DataFrameModel

A pandera.DataFrameModel for team information.

Provider-specific team identifier fields are added before validation during TeamSyncableContent.validate_data_schema().

competition_id and season_id must be provided when using TeamSyncEngine.use_competition_context.

Source code in src/glass_onion/team.py

class TeamDataSchema(pa.DataFrameModel):
    """
    A pandera.DataFrameModel for team information.

    Provider-specific team identifier fields are added before validation during [TeamSyncableContent.validate_data_schema()][glass_onion.team.TeamSyncableContent.validate_data_schema].

    `competition_id` and `season_id` must be provided when using `TeamSyncEngine.use_competition_context`.
    """

    team_name: Series[str] = Field(nullable=False)
    """
    The name of the team.
    """
    # NOTE(review): the two columns below are annotated `Optional[...]`, which in
    # pandera presumably makes them non-required by default -- confirm against the
    # pandera docs. TeamSyncEngine re-marks both as required (and non-nullable)
    # when `use_competition_context` is set.
    competition_id: Optional[Series[str]] = Field(nullable=False, coerce=True)
    """
    The competition the team is competing in. This is assumed to be universally unique across the [TeamSyncableContent][glass_onion.team.TeamSyncableContent] objects provided to [TeamSyncEngine][glass_onion.team.TeamSyncEngine].
    """
    season_id: Optional[Series[str]] = Field(nullable=False, coerce=True)
    """
    The season of the competition that the team is competing in. This is assumed to be universally unique across the [TeamSyncableContent][glass_onion.team.TeamSyncableContent] objects provided to [TeamSyncEngine][glass_onion.team.TeamSyncEngine].
    """

`competition_id = Field(nullable=False, coerce=True)` `class-attribute` `instance-attribute`

The competition the team is competing in. This is assumed to be universally unique across the TeamSyncableContent objects provided to TeamSyncEngine.

`season_id = Field(nullable=False, coerce=True)` `class-attribute` `instance-attribute`

The season of the competition that the team is competing in. This is assumed to be universally unique across the TeamSyncableContent objects provided to TeamSyncEngine.

`team_name = Field(nullable=False)` `class-attribute` `instance-attribute`

The name of the team.

`TeamSyncEngine`

Bases: SyncEngine

A subclass of SyncEngine to use for team objects.

See synchronize_pair() for methodology details.

Source code in src/glass_onion/team.py

class TeamSyncEngine(SyncEngine):
    """
    A subclass of SyncEngine to use for team objects.

    See [`synchronize_pair()`][glass_onion.team.TeamSyncEngine.synchronize_pair] for methodology details.
    """

    def __init__(
        self,
        content: list[SyncableContent],
        use_competition_context: bool = False,
        verbose: bool = False,
    ):
        """
        Creates a new TeamSyncEngine object. Setting `use_competition_context` adds `competition_id` and `season_id` (assumed to be universal across all data providers) to `join_columns`.

        Args:
            content (list[SyncableContent]): a list of SyncableContent objects.
            use_competition_context (bool): should the competition context (IE: columns `competition_id` and `season_id`) be used to synchronize/aggregate team identifiers?
            verbose (bool): a flag to enable verbose logging. This will be `extremely` verbose, allowing new SyncEngine developers and those integrating SyncEngine into their workflows to see the interactions between different logical layers during synchronization.

        Raises:
            pandera.errors.SchemaError: if `use_competition_context` is set and the `data` of any content object fails validation against the schema with `competition_id` and `season_id` marked required and non-nullable.
        """
        join_columns = ["team_name"]
        if use_competition_context:
            join_columns += ["competition_id", "season_id"]

        super().__init__("team", content, join_columns, verbose)

        if use_competition_context:
            # The base schema declares these columns as Optional; competition
            # context makes them mandatory.
            comp_schema = TeamDataSchema.to_schema().update_columns(
                {
                    "competition_id": {"required": True, "nullable": False},
                    "season_id": {"required": True, "nullable": False},
                }
            )
            # Validate with a plain loop instead of `assert [...]`: an assert is
            # stripped under `python -O` (silently skipping validation) and an
            # empty content list would make `assert []` fail spuriously.
            for item in self.content:
                comp_schema.validate(item.data)

    def synchronize_pair(
        self, input1: SyncableContent, input2: SyncableContent
    ) -> SyncableContent:
        """
        Synchronizes two TeamSyncableContent objects.

        Methodology:
            1. Attempt to join pair on an exact match of the engine's `join_columns` (`team_name`, plus `competition_id` and `season_id` when competition context is enabled).
            2. With remaining records, attempt to match via cosine similarity on `team_name` using the default minimum threshold (documented as 75% similarity).
            3. For any remaining records, attempt to match via cosine similarity on `team_name` using no minimum similarity threshold.

        Args:
            input1 (glass_onion.SyncableContent): a TeamSyncableContent object from `TeamSyncEngine.content`
            input2 (glass_onion.SyncableContent): a TeamSyncableContent object from `TeamSyncEngine.content`

        Returns:
            If `input1`'s underlying `data` dataframe is empty, returns `input2` with a column in `input2.data` for `input1.id_field`.
            If `input2`'s underlying `data` dataframe is empty, returns `input1` with a column in `input1.data` for `input2.id_field`.
            Otherwise, returns a new TeamSyncableContent object with synchronized identifiers from `input1` and `input2`.
        """

        def unmatched(content, matched):
            # Rows of `content` whose identifier does not yet appear in `matched`.
            key = content.id_field
            seen = content.data[key].isin(matched[key])
            return TeamSyncableContent(content.provider, content.data[~seen])

        def absorb(current, matches):
            # Left-join newly found identifier pairs onto the running result.
            merged = pd.merge(current, matches, on=input1.id_field, how="left")
            # Called for its effect on `merged`; the return value is unused.
            dataframe_coalesce(merged, [input2.id_field])
            return merged

        left_empty = len(input1.data) == 0
        right_empty = len(input2.data) == 0

        # One-sided input: nothing to match, so hand back the populated side
        # with an all-NA column for the other provider's identifier.
        # NOTE: this writes the new column into the caller's dataframe.
        if left_empty and not right_empty:
            input2.data[input1.id_field] = pd.NA
            return input2
        if right_empty and not left_empty:
            input1.data[input2.id_field] = pd.NA
            return input1

        id_columns = [input1.id_field, input2.id_field]

        # first pass: exact match on the join columns
        self.verbose_log(
            f"Attempting pair synchronization for inputs {input1.provider} (length {len(input1.data)}) and {input2.provider} (length {len(input2.data)})"
        )
        sync_result = pd.merge(
            input1.data, input2.data, on=self.join_columns, how="left"
        )
        synced = sync_result.dropna(subset=id_columns)

        # second pass: cosine similarity with the helper's default threshold
        remaining_1 = unmatched(input1, synced)
        remaining_2 = unmatched(input2, synced)
        if len(remaining_1.data) > 0 and len(remaining_2.data) > 0:
            cosine_results = self.synchronize_with_cosine_similarity(
                remaining_1,
                remaining_2,
                fields=("team_name", "team_name"),
            )

            if len(cosine_results) > 0:
                # NOTE(review): unlike the final pass, every column returned by
                # the cosine helper is merged here, not just the two identifier
                # columns -- confirm that is intended.
                attempt_syncs = cosine_results.dropna().drop_duplicates()
                self.verbose_log(
                    f"Via cosine-similarity pair synchronization for inputs, found {len(attempt_syncs)} new rows"
                )

                if len(attempt_syncs) > 0:
                    sync_result = absorb(sync_result, attempt_syncs)
                    # refresh the matched set so the final pass only sees leftovers
                    synced = sync_result.dropna(subset=id_columns)

        # third pass: cosine similarity but we don't care about the threshold
        remaining_1 = unmatched(input1, synced)
        remaining_2 = unmatched(input2, synced)
        if len(remaining_1.data) > 0 and len(remaining_2.data) > 0:
            self.verbose_log(
                f"Attempting less-stringent cosine-similarity pair synchronization for inputs {remaining_1.provider} (length {len(remaining_1.data)}) and {remaining_2.provider} (length {len(remaining_2.data)})"
            )
            cosine_results = self.synchronize_with_cosine_similarity(
                remaining_1,
                remaining_2,
                fields=("team_name", "team_name"),
                threshold=0.0,
            )

            if len(cosine_results) > 0:
                attempt_syncs = cosine_results.dropna().drop_duplicates()
                # keep only the identifier pair before merging
                attempt_syncs = attempt_syncs[
                    [remaining_1.id_field, remaining_2.id_field]
                ]
                self.verbose_log(
                    f"Via less-stringent cosine-similarity pair synchronization for inputs, found {len(attempt_syncs)} new rows"
                )

                if len(attempt_syncs) > 0:
                    sync_result = absorb(sync_result, attempt_syncs)

        return TeamSyncableContent(input1.provider, data=sync_result)

`__init__(content, use_competition_context=False, verbose=False)`

Creates a new TeamSyncEngine object. Setting use_competition_context adds competition_id and season_id (assumed to be universal across all data providers) to join_columns.

Parameters:

Name	Type	Description	Default
`content`	`list[SyncableContent]`	a list of SyncableContent objects.	required
`use_competition_context`	`bool`	should the competition context (IE: columns `competition_id` and `season_id`) be used to synchronize/aggregate team identifiers?	`False`
`verbose`	`bool`	a flag to enable verbose logging. This will be `extremely` verbose, allowing new SyncEngine developers and those integrating SyncEngine into their workflows to see the interactions between different logical layers during synchronization.	`False`

Source code in src/glass_onion/team.py

def __init__(
    self,
    content: list[SyncableContent],
    use_competition_context: bool = False,
    verbose: bool = False,
):
    """
    Creates a new TeamSyncEngine object. Setting `use_competition_context` adds `competition_id` and `season_id` (assumed to be universal across all data providers) to `join_columns`.

    Args:
        content (list[SyncableContent]): a list of SyncableContent objects.
        use_competition_context (bool): should the competition context (IE: columns `competition_id` and `season_id`) be used to synchronize/aggregate team identifiers?
        verbose (bool): a flag to enable verbose logging. This will be `extremely` verbose, allowing new SyncEngine developers and those integrating SyncEngine into their workflows to see the interactions between different logical layers during synchronization.

    Raises:
        pandera.errors.SchemaError: if `use_competition_context` is set and the `data` of any content object fails validation against the schema with `competition_id` and `season_id` marked required and non-nullable.
    """
    join_columns = ["team_name"]
    if use_competition_context:
        join_columns += ["competition_id", "season_id"]

    super().__init__("team", content, join_columns, verbose)

    if use_competition_context:
        # The base schema declares these columns as Optional; competition
        # context makes them mandatory.
        comp_schema = TeamDataSchema.to_schema().update_columns(
            {
                "competition_id": {"required": True, "nullable": False},
                "season_id": {"required": True, "nullable": False},
            }
        )
        # Validate with a plain loop instead of `assert [...]`: an assert is
        # stripped under `python -O` (silently skipping validation) and an
        # empty content list would make `assert []` fail spuriously.
        for item in self.content:
            comp_schema.validate(item.data)

`synchronize_pair(input1, input2)`

Synchronizes two TeamSyncableContent objects.

Methodology

Attempt to join pair simply on team_name (plus competition_id and season_id when use_competition_context is enabled).
With remaining records, attempt to match via cosine similarity using a minimum threshold of 75% similarity.
For any remaining records, attempt to match via cosine similarity using no minimum similarity threshold.

Parameters:

Name	Type	Description	Default
`input1`	`SyncableContent`	a TeamSyncableContent object from `TeamSyncEngine.content`	required
`input2`	`SyncableContent`	a TeamSyncableContent object from `TeamSyncEngine.content`	required

Returns:

Type	Description
`SyncableContent`	If `input1`'s underlying `data` dataframe is empty, returns `input2` with a column in `input2.data` for `input1.id_field`.
`SyncableContent`	If `input2`'s underlying `data` dataframe is empty, returns `input1` with a column in `input1.data` for `input2.id_field`.
`SyncableContent`	If both dataframes are non-empty, returns a new TeamSyncableContent object with synchronized identifiers from `input1` and `input2`.

Source code in src/glass_onion/team.py

def synchronize_pair(
    self, input1: SyncableContent, input2: SyncableContent
) -> SyncableContent:
    """
    Synchronizes two TeamSyncableContent objects.

    Methodology:
        1. Attempt to join pair on an exact match of the engine's `join_columns`.
        2. With remaining records, attempt to match via cosine similarity on `team_name` using the default minimum threshold (documented as 75% similarity).
        3. For any remaining records, attempt to match via cosine similarity on `team_name` using no minimum similarity threshold.

    Args:
        input1 (glass_onion.SyncableContent): a TeamSyncableContent object from `TeamSyncEngine.content`
        input2 (glass_onion.SyncableContent): a TeamSyncableContent object from `TeamSyncEngine.content`

    Returns:
        If `input1`'s underlying `data` dataframe is empty, returns `input2` with a column in `input2.data` for `input1.id_field`.
        If `input2`'s underlying `data` dataframe is empty, returns `input1` with a column in `input1.data` for `input2.id_field`.
        Otherwise, returns a new TeamSyncableContent object with synchronized identifiers from `input1` and `input2`.
    """

    def unmatched(content, matched):
        # Rows of `content` whose identifier does not yet appear in `matched`.
        key = content.id_field
        seen = content.data[key].isin(matched[key])
        return TeamSyncableContent(content.provider, content.data[~seen])

    def absorb(current, matches):
        # Left-join newly found identifier pairs onto the running result.
        merged = pd.merge(current, matches, on=input1.id_field, how="left")
        # Called for its effect on `merged`; the return value is unused.
        dataframe_coalesce(merged, [input2.id_field])
        return merged

    left_empty = len(input1.data) == 0
    right_empty = len(input2.data) == 0

    # One-sided input: nothing to match, so hand back the populated side
    # with an all-NA column for the other provider's identifier.
    # NOTE: this writes the new column into the caller's dataframe.
    if left_empty and not right_empty:
        input2.data[input1.id_field] = pd.NA
        return input2
    if right_empty and not left_empty:
        input1.data[input2.id_field] = pd.NA
        return input1

    id_columns = [input1.id_field, input2.id_field]

    # first pass: exact match on the join columns
    self.verbose_log(
        f"Attempting pair synchronization for inputs {input1.provider} (length {len(input1.data)}) and {input2.provider} (length {len(input2.data)})"
    )
    sync_result = pd.merge(
        input1.data, input2.data, on=self.join_columns, how="left"
    )
    synced = sync_result.dropna(subset=id_columns)

    # second pass: cosine similarity with the helper's default threshold
    remaining_1 = unmatched(input1, synced)
    remaining_2 = unmatched(input2, synced)
    if len(remaining_1.data) > 0 and len(remaining_2.data) > 0:
        cosine_results = self.synchronize_with_cosine_similarity(
            remaining_1,
            remaining_2,
            fields=("team_name", "team_name"),
        )

        if len(cosine_results) > 0:
            # NOTE(review): unlike the final pass, every column returned by
            # the cosine helper is merged here, not just the two identifier
            # columns -- confirm that is intended.
            attempt_syncs = cosine_results.dropna().drop_duplicates()
            self.verbose_log(
                f"Via cosine-similarity pair synchronization for inputs, found {len(attempt_syncs)} new rows"
            )

            if len(attempt_syncs) > 0:
                sync_result = absorb(sync_result, attempt_syncs)
                # refresh the matched set so the final pass only sees leftovers
                synced = sync_result.dropna(subset=id_columns)

    # third pass: cosine similarity but we don't care about the threshold
    remaining_1 = unmatched(input1, synced)
    remaining_2 = unmatched(input2, synced)
    if len(remaining_1.data) > 0 and len(remaining_2.data) > 0:
        self.verbose_log(
            f"Attempting less-stringent cosine-similarity pair synchronization for inputs {remaining_1.provider} (length {len(remaining_1.data)}) and {remaining_2.provider} (length {len(remaining_2.data)})"
        )
        cosine_results = self.synchronize_with_cosine_similarity(
            remaining_1,
            remaining_2,
            fields=("team_name", "team_name"),
            threshold=0.0,
        )

        if len(cosine_results) > 0:
            attempt_syncs = cosine_results.dropna().drop_duplicates()
            # keep only the identifier pair before merging
            attempt_syncs = attempt_syncs[
                [remaining_1.id_field, remaining_2.id_field]
            ]
            self.verbose_log(
                f"Via less-stringent cosine-similarity pair synchronization for inputs, found {len(attempt_syncs)} new rows"
            )

            if len(attempt_syncs) > 0:
                sync_result = absorb(sync_result, attempt_syncs)

    return TeamSyncableContent(input1.provider, data=sync_result)

`TeamSyncableContent`

Bases: SyncableContent

A subclass of SyncableContent to use for team objects.

Source code in src/glass_onion/team.py

class TeamSyncableContent(SyncableContent):
    """
    A subclass of SyncableContent to use for team objects.
    """

    def __init__(self, provider: str, data: pd.DataFrame):
        """
        Creates a team content object, fixing the object type passed to SyncableContent to `"team"`.

        Args:
            provider (str): forwarded to SyncableContent as the provider.
            data (pd.DataFrame): forwarded to SyncableContent as the underlying dataframe.
        """
        super().__init__("team", provider, data)

    def validate_data_schema(self) -> bool:
        """
        Checks if this object's `data` meets the schema requirements for this object type. See [TeamDataSchema][glass_onion.team.TeamDataSchema] for more details.

        The provider-specific identifier column (`id_field`) is added to the schema as a required, non-nullable string column before validating.

        Raises:
            pandera.errors.SchemaError: if `data` does not conform to the schema.

        Returns:
            True, if `data` is formatted properly.
        """
        base_schema = TeamDataSchema.to_schema()
        id_column_name = f"{self.id_field}"
        id_column = Column(str, required=True, nullable=False, coerce=True)

        # The validated frame is discarded; only a raised error matters here.
        team_schema = base_schema.add_columns({id_column_name: id_column})
        team_schema.validate(self.data)

        # Defer to the parent class for the remaining checks and the result.
        return super().validate_data_schema()

`validate_data_schema()`

Checks if this object's data meets the schema requirements for this object type. See TeamDataSchema for more details.

Raises:

Type	Description
`SchemaError`	if `data` does not conform to the schema.

Returns:

Type	Description
`bool`	True, if `data` is formatted properly.

Source code in src/glass_onion/team.py

def validate_data_schema(self) -> bool:
    """
    Checks if this object's `data` meets the schema requirements for this object type. See [TeamDataSchema][glass_onion.team.TeamDataSchema] for more details.

    The provider-specific identifier column (`id_field`) is added to the schema as a required, non-nullable string column before validating.

    Raises:
        pandera.errors.SchemaError: if `data` does not conform to the schema.

    Returns:
        True, if `data` is formatted properly.
    """
    base_schema = TeamDataSchema.to_schema()
    id_column_name = f"{self.id_field}"
    id_column = Column(str, required=True, nullable=False, coerce=True)

    # The validated frame is discarded; only a raised error matters here.
    team_schema = base_schema.add_columns({id_column_name: id_column})
    team_schema.validate(self.data)

    # Defer to the parent class for the remaining checks and the result.
    return super().validate_data_schema()

Team

TeamDataSchema

competition_id = Field(nullable=False, coerce=True) class-attribute instance-attribute

season_id = Field(nullable=False, coerce=True) class-attribute instance-attribute

team_name = Field(nullable=False) class-attribute instance-attribute

TeamSyncEngine

__init__(content, use_competition_context=False, verbose=False)

synchronize_pair(input1, input2)

TeamSyncableContent

validate_data_schema()

`TeamDataSchema`

`competition_id = Field(nullable=False, coerce=True)` `class-attribute` `instance-attribute`

`season_id = Field(nullable=False, coerce=True)` `class-attribute` `instance-attribute`

`team_name = Field(nullable=False)` `class-attribute` `instance-attribute`

`TeamSyncEngine`

`__init__(content, use_competition_context=False, verbose=False)`

`synchronize_pair(input1, input2)`

`TeamSyncableContent`

`validate_data_schema()`