Source code for bobleesj.utils.parsers.formula

import re

from bobleesj.utils.data.element import Element as E



[docs]
class Formula:
    """A class to parse and manipulate chemical formulas. This class
    provides methods to sort, filter, and analyze chemical formulas.

    Examples
    --------
    >>> formula = Formula("NdSi2")
    >>> formula.parsed_formula
    [('Nd', 1.0), ('Si', 2.0)]
    >>> formula.element_count
    2
    >>> formula.get_normalized_formula()
    'Nd0.333333Si0.666667'
    >>> formula.get_normalized_parsed_formula()
    [('Nd', 0.333333), ('Si', 0.666667)]
    >>> formula.get_normalized_indices()
    [0.333333, 0.666667]
    """

    def __init__(self, formula: str, validate=True):
        self.formula = formula
        self.parsed_formula = self._parse_formula(formula)
        valid_elements = E.all_symbols()
        if validate:
            if not all(
                element in valid_elements for element, _ in self.parsed_formula
            ):
                raise ValueError(
                    f"No elements found in the perdiodic table for {formula}."
                )


[docs]
    @staticmethod
    def order_by_alphabetical(formulas: list[str], reverse=False) -> list[str]:
        """Sort formulas alphabetically.

        Examples
        --------
        >>> formulas = ["AB2", "AB", "BC2D2", "BBC2"]
        >>> Formula.order_by_alphabetical(formulas)
        ["AB", "AB2", "BBC2", "BC2D2"]
        """
        return sorted(formulas, reverse=reverse)



[docs]
    @staticmethod
    def count(formulas: list[str]) -> int:
        """Count the number of formulas in a list.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count(formulas)
        4
        """
        return len(formulas)



[docs]
    @staticmethod
    def count_unique(
        formulas: list[str],
    ) -> int:
        """Count the number of unique formulas in a list.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count_unique(formulas)
        4
        """
        return len(set(formulas))



[docs]
    @staticmethod
    def count_individual(
        formulas: list[str],
    ) -> dict[str, int]:
        """Count the number of occurrences of each formula in a list of
        formulas.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count_formulas(formulas)
        {"NdSi2": 1, "ThOs": 1, "NdSi2Th2": 1, "YNdThSi2": 1}
        """
        return {formula: formulas.count(formula) for formula in formulas}



[docs]
    @staticmethod
    def count_by_composition(
        formulas: list[str],
    ) -> dict[int, int]:
        """Count the number of formulas in each composition category.

        Examples
        --------
        >>>
        formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count_formulas_by_composition(formulas)
        {2: 2, 3: 1, 4: 1}
        """
        sorted_formulas = Formula.filter_by_composition(formulas)
        return {k: len(v) for k, v in sorted_formulas.items()}



[docs]
    @staticmethod
    def get_unique_formulas(
        formulas: list[str],
    ) -> set[str]:
        """Get unique formulas from a list of formulas.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "ThOs"]
        >>> Formula.get_unique_formulas(formulas)
        {"NdSi2", "ThOs"}
        """
        return set(formulas)



[docs]
    @staticmethod
    def get_unique_elements(formulas: list[str]) -> set[str]:
        """Get unique elements from a list of formulas.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.get_unique_elements(formulas)
        {"Nd", "Si", "Th", "Os", "Y"}
        """
        elements = set()
        for formula in formulas:
            parsed_formula = Formula(formula).parsed_formula
            for element, _ in parsed_formula:
                elements.add(element)
        return elements



[docs]
    @staticmethod
    def get_element_count(formulas: list[str]) -> dict[str, int]:
        """Get the count of each element in a list of formulas. Do not
        consider the stoichiometric value.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.get_element_count(formulas)
        {"Nd": 3, "Si": 3, "Th": 3, "Os": 1, "Y": 1}
        """
        element_count = {}
        for formula in formulas:
            parsed_formula = Formula(formula).parsed_formula
            for element, _ in parsed_formula:
                if element not in element_count:
                    element_count[element] = 0
                element_count[element] += 1
        return element_count


    @staticmethod
    def _parse_formula(formula: str) -> list[tuple[str, float]]:
        pattern = r"([A-Z][a-z]*)(\d*\.?\d*)"
        parsed = re.findall(pattern, formula)
        return [
            (element, float(index) if index else 1.0)
            for element, index in parsed
        ]


[docs]
    @staticmethod
    def build_formula_from_parsed(
        parsed_formula: list[tuple[str, float]]
    ) -> str:
        """Convert the parsed formula into a string. If the index can be
        converted to 1 (int), it will be removed.

        Examples
        --------
        >>> parsed_formula = [("Nd", 1.0), ("Si", 2.0)]
        >>> Formula.build_formula_from_parsed(parsed_formula)
        "NdSi2"
        """
        formula_string = ""
        for element, index in parsed_formula:
            if index.is_integer() and int(index) != 1:
                formula_string += f"{element}{int(index)}"
            elif index.is_integer() and int(index) == 1:
                formula_string += f"{element}"
            else:
                formula_string += f"{element}{index}"
        return formula_string



[docs]
    @staticmethod
    def filter_by_composition(
        formulas: list[str],
    ) -> dict[int, list[str]]:
        """Sort formulas into categories based on their composition.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.filter_by_composition(formulas)
        {2: ["NdSi2", "ThOs"], 3: ["NdSi2Th2"], 4: ["YNdThSi2"]}
        """
        sorted_formulas = {}
        for formula in formulas:
            element_count = Formula(formula).element_count
            if element_count not in sorted_formulas:
                sorted_formulas[element_count] = []
            sorted_formulas[element_count].append(formula)
        return sorted_formulas



[docs]
    @staticmethod
    def filter_by_single_composition(
        formulas: list[str], composition_type: int
    ) -> list[str]:
        """Filter formulas by the given composition type.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.filter_by_single_composition(formulas, 2)
        ["NdSi2", "ThOs"]
        """
        return [
            formula
            for formula in formulas
            if Formula(formula).element_count == composition_type
        ]



[docs]
    @staticmethod
    def filter_by_elements_containing(
        formulas: list[str], elements: list[str]
    ) -> list[str]:
        """Filter formulas by a list of elements.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> elements = ["Nd", "Si"]
        >>> Formula.filter_by_elements(formulas, elements)
        ["NdSi2", "NdSi2Th2", "YNdThSi2"]
        """
        filtered_formulas = []
        for formula in formulas:
            parsed_formula = Formula(formula).parsed_formula
            if all(element in dict(parsed_formula) for element in elements):
                filtered_formulas.append(formula)
        return filtered_formulas



[docs]
    @staticmethod
    def filter_by_elements_matching(
        formulas: list[str], elements: list[str]
    ) -> list[str]:
        """Filter formulas by a list of elements but the specified
        elements should be only contained.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> elements = ["Nd", "Si"]
        >>> filter_by_elements(formulas, elements)
        ["NdSi2"]
        """
        filtered_formulas = []
        for formula in formulas:
            parsed_formula = Formula(formula).parsed_formula
            if all(element in dict(parsed_formula) for element in elements):
                if len(parsed_formula) == len(elements):
                    filtered_formulas.append(formula)
        return filtered_formulas



[docs]
    @staticmethod
    def count_duplicates(formulas: list[str]) -> dict[str, int]:
        """Count the number of duplicates in a list of formulas.

        Examples
        --------
        >>> formulas = ["NdSi2", "NdSi2", "NdSi2Th2", "NdSi2Th2", "ThOs"]
        >>> Formula.count_duplicates(formulas)
        {"NdSi2": 2, "NdSi2Th2": 2}
        """
        duplicates = {}
        for formula in formulas:
            if formula not in duplicates:
                duplicates[formula] = 0
            duplicates[formula] += 1
        return {k: v for k, v in duplicates.items() if v > 1}



[docs]
    @staticmethod
    def count_by_formula(formulas: list[str], formula_to_count: str) -> int:
        """Count the number of occurrences of a specific formula in a
        list of formulas.

        Examples
        --------
        >>> formulas = ["NdSi2", "NdSi2", "NdSi2Th2", "NdSi2Th2", "ThOs"]
        >>> Formula.count_by_formula(formulas, "NdSi2")
        2
        """
        return formulas.count(formula_to_count)


    @staticmethod
    def _convert_custom_labels_to_order_map(custom_labels: dict) -> dict:
        """Convert a nested custom_labels dictionary into an element
        order mapping.

        This function is used for the sorting of elements in the sort formula
        as a part of the sorted function above.

        Parameters
        ----------
        custom_labels : dict
            The dictionary mapping element counts to label mappings. Each label
            mapping is a dictionary where keys are label names and values are
            lists of element symbols or comma-separated strings of element
            symbols.

        Returns
        -------
        label_order_map : dict[int, dict[str, int]]
            The dictionary mapping element counts to dictionaries of element
            symbol to order index.

        Examples
        --------
        >>> custom_labels = {
        ...     2: {
        ...         "A": ["Li", "Er"],
        ...         "B": ["B", "In"],
        ...     },
        ...     3: {
        ...         "R": ["Er"],
        ...         "M": ["Co"],
        ...         "X": ["In"],
        ...     },
        ...     4: {
        ...         "A": ["Er"],
        ...         "B": ["Co"],
        ...         "C": ["In"],
        ...         "D": ["U"],
        ...     },
        ... }
        >>> convert_custom_labels_to_order_map(custom_labels)
        {
            2: {'Li': 0, 'Er': 0, 'B': 1, 'In': 1},
            3: {'Er': 0, 'Co': 1, 'In': 2},
            4: {'Er': 0, 'Co': 1, 'In': 2, 'U': 3}
        }
        """
        label_order_map = {}
        for element_count, label_mapping in custom_labels.items():
            order_map = {}
            for idx, elements in enumerate(label_mapping.values()):
                for element in elements:
                    order_map[element] = idx
            label_order_map[element_count] = order_map
        return label_order_map

    def _normalized(self, decimals: int = 6) -> str:
        index_sum = sum(self.indices)
        normalized = [
            (element, count / index_sum)
            for element, count in self.parsed_formula
        ]
        return "".join(
            f"{element}{format(index, f'.{decimals}f')}"
            for element, index in normalized
        )

    @property
    def elements(self) -> list[str]:
        """Get the list of elements in the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.elements
        ["Nd", "Si"]
        """
        return [element for element, _ in self.parsed_formula]

    @property
    def indices(self) -> list[float]:
        """Get the list of indices in the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.indices
        [1.0, 2.0]
        """
        return [index for _, index in self.parsed_formula]

    @property
    def element_count(self) -> int:
        """Get the number of unique elements in the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.element_count
        2
        """
        return len(self.parsed_formula)

    @property
    def max_min_avg_index(self) -> tuple[float, float, float]:
        """Get the max, min, and avg index of the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.max_min_avg_index
        (2.0, 1.0, 1.5)
        """
        indices = self.indices
        return max(indices), min(indices), sum(indices) / len(indices)


[docs]
    def get_normalized_indices(self, decimals=6) -> list[float]:
        """Get the normalized indices of the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.get_normalized_indices()
        [0.333333, 0.666667]
        >>> formula.get_normalized_indices(2)
        [0.33, 0.67]
        """
        total = sum(self.indices)
        return [round(index / total, decimals) for index in self.indices]



[docs]
    def get_normalized_formula(self, decimals=6) -> str:
        """Get the normalized formula of the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.get_normalized_formula()
        "Nd0.333333Si0.666667"
        >>> formula.get_normalized_formula(2)
        "Nd0.33Si0.67"
        """
        return self._normalized(decimals=decimals)



[docs]
    def get_normalized_parsed_formula(
        self, decimals=6
    ) -> list[tuple[str, float]]:
        """Get the normalized parsed formula of the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.get_normalized_parsed_formula()
        [("Nd", 0.333333), ("Si", 0.666667)]
        >>> formula.get_normalized_parsed_formula(2)
        [("Nd", 0.33), ("Si", 0.67)]
        """
        normalized = self._normalized(decimals=decimals)
        return self._parse_formula(normalized)



[docs]
    def sort_by_custom_label(
        self, custom_labels: dict[int : dict[str : list[str]]], normalize=False
    ) -> str:
        """Sort elements in a chemical formula using a precomputed
        element order map.

        Parameters
        ----------
        formula : str
            The chemical formula to be sorted.
        element_order : dict[int, dict[str, int]]
            The mapping from element symbols to their desired sort index.
        normalize : bool, optional
            Whether to normalize the parsed formula, by default False.

        Returns
        -------
        str
            The sorted formula string.
        Examples
        --------
        >>> formula = "BLi"
        >>> custom_labels = {
        ...     2: {
        ...         "A": ["Li", "Er"],
        ...         "B": ["B", "In"],
        ...     },
        ...     3: {
        ...         "R": ["Er"],
        ...         "M": ["Co"],
        ...         "X": ["In"],
        ...     },
        ...     4: {
        ...         "A": ["Er"],
        ...         "B": ["Co"],
        ...         "C": ["In"],
        ...         "D": ["U"],
        ...     },
        ... }
        >>> sorted_formula = sort(formula, custom_labels)
        >>> print(sorted_formula)
        LiB
        """
        formula_parsed = (
            self.get_normalized_parsed_formula()
            if normalize
            else self.parsed_formula
        )
        label_order_map = self._convert_custom_labels_to_order_map(
            custom_labels
        )
        element_order = label_order_map.get(self.element_count, {})
        formula_sorted = sorted(
            formula_parsed, key=lambda x: element_order.get(x[0], float("inf"))
        )
        return Formula.build_formula_from_parsed(formula_sorted)



[docs]
    def sort_by_elemental_property(
        self,
        property_data: dict[str, float],
        ascending=True,
        normalize=False,
    ) -> str:
        """Sort the elements in a chemical formula based on a specified
        CAF property.

        Parameters
        ----------
        formula : str
            The chemical formula to be sorted.
        property_data: dict[str, float]
            The dictionary that contains the single value for each element of
            the given formula.
        ascending : bool, optional
            Whether to sort in ascending order. Defaults to True.
        normalize : bool, optional
            Whether to normalize the formula before sorting. Defaults to False.

        Returns
        -------
        str
            The formula string with elements sorted according to the specified
            property.

        Examples
        --------
        #FIXME: Double check this example
        >>> from bobleesj.utils.sources.oliynyk import Oliynyk
        >>> from bobleesj.utils.sources.oliynyk import Property as P
        >>> formula = "LiFe"
        >>> oliynyk = Oliynyk()
        >>> prop_data = oliynyk.get_property_data_for_formula(formula, P.AW)
        >>> Formula(formula).sort("LiFe", prop_data)
        "LiFe"
        #FIXME: TEST THIS EXAMPLES
        """
        formula_parsed = (
            self.get_normalized_parsed_formula()
            if normalize
            else self.parsed_formula
        )
        formula_sorted = sorted(
            formula_parsed,
            key=lambda x: property_data.get(x[0], 0),
            reverse=not ascending,
        )
        return Formula.build_formula_from_parsed(formula_sorted)



[docs]
    def sort_by_stoichiometry(
        self, property_data: dict[str:float], ascending=True, normalize=False
    ) -> str:
        """Sort the elements in the chemical formula based on their
        composition.

        When there are more than one element with the same compsition, the
        Mendeleev number is used to break the tie. During the tie, the
        Mendeleev number is always sorted in ascending order.

        Parameters
        ----------
        property_data: dict[str: float]
            The data to sort with when when stoichiometric raito is the same.
            The default value is optional that if no data provided, then we
            will simply sort alphabetically from a to z.
        ascending : bool, optional
            Whether to sort in ascending order. Defaults to True.
        normalize : bool, optional
            Whether to normalize the formula before sorting. Defaults to False.

        Returns
        -------
        str
            The formula string with elements sorted according to the specified
            property.

        Examples
        --------
        >>> sort("LiNa2B", db)
        "LiBNa2"
        """
        formula_parsed = (
            self.get_normalized_parsed_formula()
            if normalize
            else self.parsed_formula
        )
        reverse = not ascending
        formula_sorted = sorted(
            formula_parsed,
            key=lambda x: (
                # 1st sort, reverse sort if descending (reversed)
                -x[1] if reverse else x[1],
                # 2nd sort for the same compoposition. Always ascending sort.
                property_data[x[0]],
            ),
        )
        return Formula.build_formula_from_parsed(formula_sorted)