# Source code for bobleesj.utils.parsers.formula
import re
from collections import Counter

from bobleesj.utils.data.element import Element as E
# [docs]
class Formula:
    """Parse and manipulate a chemical formula string.

    The formula is split into ``(element, index)`` pairs on construction.
    Instance methods normalize or re-order those pairs; the static methods
    sort, filter, and count over lists of formula strings.

    Parameters
    ----------
    formula : str
        The chemical formula, e.g. ``"NdSi2"``.
    validate : bool, optional
        When True (default), every parsed element symbol must be contained
        in ``E.all_symbols()``; otherwise a ``ValueError`` is raised.

    Raises
    ------
    ValueError
        If ``validate`` is True and a parsed symbol is not a known element.

    Examples
    --------
    >>> formula = Formula("NdSi2")
    >>> formula.parsed_formula
    [('Nd', 1.0), ('Si', 2.0)]
    >>> formula.element_count
    2
    >>> formula.get_normalized_formula()
    'Nd0.333333Si0.666667'
    >>> formula.get_normalized_parsed_formula()
    [('Nd', 0.333333), ('Si', 0.666667)]
    >>> formula.get_normalized_indices()
    [0.333333, 0.666667]
    """

    # An element symbol (one capital letter plus optional lowercase letters)
    # followed by an optional index. The index must contain at least one
    # digit so that a stray "." is never captured and passed to float().
    _FORMULA_PATTERN = re.compile(r"([A-Z][a-z]*)(\d+\.?\d*|\.\d+)?")

    def __init__(self, formula: str, validate=True):
        self.formula = formula
        self.parsed_formula = self._parse_formula(formula)
        if validate:
            # Only look up the element table when validation is requested.
            valid_elements = E.all_symbols()
            if not all(
                element in valid_elements
                for element, _ in self.parsed_formula
            ):
                raise ValueError(
                    f"No elements found in the periodic table for {formula}."
                )

    @staticmethod
    def order_by_alphabetical(
        formulas: list[str], reverse=False
    ) -> list[str]:
        """Sort formulas alphabetically.

        Parameters
        ----------
        formulas : list[str]
            The formula strings to sort.
        reverse : bool, optional
            Sort in descending order when True. Defaults to False.

        Returns
        -------
        list[str]
            A new sorted list; the input list is not modified.

        Examples
        --------
        >>> formulas = ["AB2", "AB", "BC2D2", "BBC2"]
        >>> Formula.order_by_alphabetical(formulas)
        ['AB', 'AB2', 'BBC2', 'BC2D2']
        """
        return sorted(formulas, reverse=reverse)

    @staticmethod
    def count(formulas: list[str]) -> int:
        """Count the number of formulas in a list.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count(formulas)
        4
        """
        return len(formulas)

    @staticmethod
    def count_unique(
        formulas: list[str],
    ) -> int:
        """Count the number of distinct formula strings in a list.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count_unique(formulas)
        4
        """
        return len(set(formulas))

    @staticmethod
    def count_individual(
        formulas: list[str],
    ) -> dict[str, int]:
        """Count the number of occurrences of each formula string.

        Returns
        -------
        dict[str, int]
            Mapping of formula string to its number of occurrences, keyed
            in order of first appearance.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count_individual(formulas)
        {'NdSi2': 1, 'ThOs': 1, 'NdSi2Th2': 1, 'YNdThSi2': 1}
        """
        # Counter makes a single pass instead of calling list.count per item.
        return dict(Counter(formulas))

    @staticmethod
    def count_by_composition(
        formulas: list[str],
    ) -> dict[int, int]:
        """Count the number of formulas in each composition category.

        The category of a formula is its ``element_count``.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.count_by_composition(formulas)
        {2: 2, 3: 1, 4: 1}
        """
        grouped = Formula.filter_by_composition(formulas)
        return {
            element_count: len(members)
            for element_count, members in grouped.items()
        }

    @staticmethod
    def get_unique_formulas(
        formulas: list[str],
    ) -> set[str]:
        """Get the distinct formula strings from a list of formulas.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "ThOs"]
        >>> sorted(Formula.get_unique_formulas(formulas))
        ['NdSi2', 'ThOs']
        """
        return set(formulas)

    @staticmethod
    def get_unique_elements(formulas: list[str]) -> set[str]:
        """Get the distinct element symbols used across a list of formulas.

        Each formula is parsed through ``Formula(formula)`` with validation
        enabled, so an unknown element symbol raises ``ValueError``.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> sorted(Formula.get_unique_elements(formulas))
        ['Nd', 'Os', 'Si', 'Th', 'Y']
        """
        return {
            element
            for formula in formulas
            for element, _ in Formula(formula).parsed_formula
        }

    @staticmethod
    def get_element_count(formulas: list[str]) -> dict[str, int]:
        """Count how often each element appears across a list of formulas.

        The stoichiometric index is ignored; every parsed entry counts once.
        Each formula is parsed through ``Formula(formula)`` with validation
        enabled, so an unknown element symbol raises ``ValueError``.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.get_element_count(formulas)
        {'Nd': 3, 'Si': 3, 'Th': 3, 'Os': 1, 'Y': 1}
        """
        counts = Counter()
        for formula in formulas:
            counts.update(
                element for element, _ in Formula(formula).parsed_formula
            )
        return dict(counts)

    @staticmethod
    def _parse_formula(formula: str) -> list[tuple[str, float]]:
        """Split a formula string into ``(element, index)`` pairs.

        An element without an explicit index gets the index ``1.0``. Text
        that does not match the element pattern is skipped.

        Examples
        --------
        >>> Formula._parse_formula("NdSi2")
        [('Nd', 1.0), ('Si', 2.0)]
        >>> Formula._parse_formula("Nd0.5Si1.5")
        [('Nd', 0.5), ('Si', 1.5)]
        """
        return [
            (element, float(index) if index else 1.0)
            for element, index in Formula._FORMULA_PATTERN.findall(formula)
        ]

    @staticmethod
    def build_formula_from_parsed(
        parsed_formula: list[tuple[str, float]]
    ) -> str:
        """Convert a parsed formula back into a string.

        Whole-number indices are written as integers, and an index equal to
        1 is omitted entirely. Other indices are written as given.

        Examples
        --------
        >>> parsed_formula = [("Nd", 1.0), ("Si", 2.0)]
        >>> Formula.build_formula_from_parsed(parsed_formula)
        'NdSi2'
        """
        parts = []
        for element, index in parsed_formula:
            # float() lets plain int indices through; int.is_integer() only
            # exists on newer Python versions.
            if float(index).is_integer():
                whole = int(index)
                parts.append(element if whole == 1 else f"{element}{whole}")
            else:
                parts.append(f"{element}{index}")
        return "".join(parts)

    @staticmethod
    def filter_by_composition(
        formulas: list[str],
    ) -> dict[int, list[str]]:
        """Group formulas by their ``element_count``.

        Each formula is parsed through ``Formula(formula)`` with validation
        enabled, so an unknown element symbol raises ``ValueError``.

        Returns
        -------
        dict[int, list[str]]
            Mapping of element count to the formulas having that count, in
            input order.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.filter_by_composition(formulas)
        {2: ['NdSi2', 'ThOs'], 3: ['NdSi2Th2'], 4: ['YNdThSi2']}
        """
        grouped = {}
        for formula in formulas:
            element_count = Formula(formula).element_count
            grouped.setdefault(element_count, []).append(formula)
        return grouped

    @staticmethod
    def filter_by_single_composition(
        formulas: list[str], composition_type: int
    ) -> list[str]:
        """Keep only the formulas whose ``element_count`` matches.

        Parameters
        ----------
        formulas : list[str]
            The formula strings to filter.
        composition_type : int
            The required element count (2 for binary, 3 for ternary, ...).

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> Formula.filter_by_single_composition(formulas, 2)
        ['NdSi2', 'ThOs']
        """
        return [
            formula
            for formula in formulas
            if Formula(formula).element_count == composition_type
        ]

    @staticmethod
    def filter_by_elements_containing(
        formulas: list[str], elements: list[str]
    ) -> list[str]:
        """Keep the formulas that contain all of the given elements.

        Formulas may contain further elements besides the requested ones.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> elements = ["Nd", "Si"]
        >>> Formula.filter_by_elements_containing(formulas, elements)
        ['NdSi2', 'NdSi2Th2', 'YNdThSi2']
        """
        wanted = set(elements)
        matches = []
        for formula in formulas:
            # Build the symbol set once per formula, not once per element.
            present = {
                element for element, _ in Formula(formula).parsed_formula
            }
            if wanted <= present:
                matches.append(formula)
        return matches

    @staticmethod
    def filter_by_elements_matching(
        formulas: list[str], elements: list[str]
    ) -> list[str]:
        """Keep the formulas that consist of exactly the given elements.

        Repeated symbols in ``elements`` are treated as a single element.

        Examples
        --------
        >>> formulas = ["NdSi2", "ThOs", "NdSi2Th2", "YNdThSi2"]
        >>> elements = ["Nd", "Si"]
        >>> Formula.filter_by_elements_matching(formulas, elements)
        ['NdSi2']
        """
        # De-duplicate first: comparing against len(elements) would let
        # ["Nd", "Nd"] match a two-element formula such as "NdSi2".
        wanted = set(elements)
        matches = []
        for formula in formulas:
            parsed_formula = Formula(formula).parsed_formula
            present = {element for element, _ in parsed_formula}
            if wanted <= present and len(parsed_formula) == len(wanted):
                matches.append(formula)
        return matches

    @staticmethod
    def count_duplicates(formulas: list[str]) -> dict[str, int]:
        """Count the formulas that occur more than once in a list.

        Returns
        -------
        dict[str, int]
            Mapping of formula string to its number of occurrences, limited
            to formulas that occur at least twice.

        Examples
        --------
        >>> formulas = ["NdSi2", "NdSi2", "NdSi2Th2", "NdSi2Th2", "ThOs"]
        >>> Formula.count_duplicates(formulas)
        {'NdSi2': 2, 'NdSi2Th2': 2}
        """
        return {
            formula: occurrences
            for formula, occurrences in Counter(formulas).items()
            if occurrences > 1
        }

    @staticmethod
    def count_by_formula(formulas: list[str], formula_to_count: str) -> int:
        """Count the occurrences of one specific formula string in a list.

        Examples
        --------
        >>> formulas = ["NdSi2", "NdSi2", "NdSi2Th2", "NdSi2Th2", "ThOs"]
        >>> Formula.count_by_formula(formulas, "NdSi2")
        2
        """
        return formulas.count(formula_to_count)

    @staticmethod
    def _convert_custom_labels_to_order_map(custom_labels: dict) -> dict:
        """Convert a nested custom_labels dictionary into an element order
        mapping.

        Used by ``sort_by_custom_label`` to look up the sort position of
        each element. The position of an element is the position of its
        label within the label mapping.

        Parameters
        ----------
        custom_labels : dict
            The dictionary mapping element counts to label mappings. Each
            label mapping is a dictionary where keys are label names and
            values are lists of element symbols or comma-separated strings
            of element symbols.

        Returns
        -------
        label_order_map : dict[int, dict[str, int]]
            The dictionary mapping element counts to dictionaries of element
            symbol to order index.

        Examples
        --------
        >>> custom_labels = {
        ...     2: {
        ...         "A": ["Li", "Er"],
        ...         "B": "B, In",
        ...     },
        ...     3: {
        ...         "R": ["Er"],
        ...         "M": ["Co"],
        ...         "X": ["In"],
        ...     },
        ... }
        >>> Formula._convert_custom_labels_to_order_map(custom_labels)
        {2: {'Li': 0, 'Er': 0, 'B': 1, 'In': 1}, 3: {'Er': 0, 'Co': 1, 'In': 2}}
        """
        label_order_map = {}
        for element_count, label_mapping in custom_labels.items():
            order_map = {}
            for idx, elements in enumerate(label_mapping.values()):
                if isinstance(elements, str):
                    # Iterating a string would yield single characters, so
                    # split comma-separated symbols explicitly.
                    elements = [
                        symbol.strip()
                        for symbol in elements.split(",")
                        if symbol.strip()
                    ]
                for element in elements:
                    order_map[element] = idx
            label_order_map[element_count] = order_map
        return label_order_map

    def _normalized(self, decimals: int = 6) -> str:
        """Build the formula string with indices scaled to sum to one.

        Each index is divided by the sum of all indices and written in
        fixed-point notation with ``decimals`` digits after the point.
        """
        index_sum = sum(self.indices)
        return "".join(
            f"{element}{format(index / index_sum, f'.{decimals}f')}"
            for element, index in self.parsed_formula
        )

    @property
    def elements(self) -> list[str]:
        """Get the list of elements in the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.elements
        ['Nd', 'Si']
        """
        return [element for element, _ in self.parsed_formula]

    @property
    def indices(self) -> list[float]:
        """Get the list of indices in the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.indices
        [1.0, 2.0]
        """
        return [index for _, index in self.parsed_formula]

    @property
    def element_count(self) -> int:
        """Get the number of element entries in the parsed formula.

        A symbol that is written more than once in the formula string is
        counted once per appearance.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.element_count
        2
        """
        return len(self.parsed_formula)

    @property
    def max_min_avg_index(self) -> tuple[float, float, float]:
        """Get the max, min, and avg index of the formula.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.max_min_avg_index
        (2.0, 1.0, 1.5)
        """
        indices = self.indices
        return max(indices), min(indices), sum(indices) / len(indices)

    def get_normalized_indices(self, decimals=6) -> list[float]:
        """Get the indices of the formula scaled so that they sum to one.

        Parameters
        ----------
        decimals : int, optional
            Number of decimal places to round to. Defaults to 6.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.get_normalized_indices()
        [0.333333, 0.666667]
        >>> formula.get_normalized_indices(2)
        [0.33, 0.67]
        """
        indices = self.indices
        total = sum(indices)
        return [round(index / total, decimals) for index in indices]

    def get_normalized_formula(self, decimals=6) -> str:
        """Get the formula string with indices scaled to sum to one.

        Parameters
        ----------
        decimals : int, optional
            Number of digits written after the decimal point. Defaults to 6.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.get_normalized_formula()
        'Nd0.333333Si0.666667'
        >>> formula.get_normalized_formula(2)
        'Nd0.33Si0.67'
        """
        return self._normalized(decimals=decimals)

    def get_normalized_parsed_formula(
        self, decimals=6
    ) -> list[tuple[str, float]]:
        """Get the parsed formula with indices scaled to sum to one.

        The result is obtained by parsing the normalized formula string, so
        the indices carry ``decimals`` digits of precision.

        Examples
        --------
        >>> formula = Formula("NdSi2")
        >>> formula.get_normalized_parsed_formula()
        [('Nd', 0.333333), ('Si', 0.666667)]
        >>> formula.get_normalized_parsed_formula(2)
        [('Nd', 0.33), ('Si', 0.67)]
        """
        return self._parse_formula(self._normalized(decimals=decimals))

    def sort_by_custom_label(
        self, custom_labels: dict[int, dict[str, list[str]]], normalize=False
    ) -> str:
        """Sort the elements of the formula by user-defined label order.

        The label mapping registered under this formula's ``element_count``
        is used. Elements that are not listed there are placed last, keeping
        their relative order.

        Parameters
        ----------
        custom_labels : dict[int, dict[str, list[str]]]
            Mapping of element count to a label mapping; each label maps to
            the element symbols that belong to it. The order of the labels
            defines the sort order.
        normalize : bool, optional
            Whether to normalize the parsed formula, by default False.

        Returns
        -------
        str
            The sorted formula string.

        Examples
        --------
        >>> custom_labels = {
        ...     2: {
        ...         "A": ["Li", "Er"],
        ...         "B": ["B", "In"],
        ...     },
        ...     3: {
        ...         "R": ["Er"],
        ...         "M": ["Co"],
        ...         "X": ["In"],
        ...     },
        ... }
        >>> Formula("BLi").sort_by_custom_label(custom_labels)
        'LiB'
        """
        formula_parsed = (
            self.get_normalized_parsed_formula()
            if normalize
            else self.parsed_formula
        )
        label_order_map = self._convert_custom_labels_to_order_map(
            custom_labels
        )
        element_order = label_order_map.get(self.element_count, {})
        formula_sorted = sorted(
            formula_parsed,
            # Unlisted elements sort after every labelled element.
            key=lambda pair: element_order.get(pair[0], float("inf")),
        )
        return Formula.build_formula_from_parsed(formula_sorted)

    def sort_by_elemental_property(
        self,
        property_data: dict[str, float],
        ascending=True,
        normalize=False,
    ) -> str:
        """Sort the elements of the formula by a per-element property value.

        Parameters
        ----------
        property_data : dict[str, float]
            Mapping of element symbol to the property value to sort by. An
            element missing from the mapping is sorted with the value 0.
        ascending : bool, optional
            Whether to sort in ascending order. Defaults to True.
        normalize : bool, optional
            Whether to normalize the formula before sorting. Defaults to
            False.

        Returns
        -------
        str
            The formula string with elements sorted according to the given
            property values.

        Examples
        --------
        >>> prop_data = {"Li": 6.94, "Fe": 55.85}
        >>> Formula("FeLi").sort_by_elemental_property(prop_data)
        'LiFe'
        >>> Formula("LiFe").sort_by_elemental_property(
        ...     prop_data, ascending=False
        ... )
        'FeLi'
        """
        formula_parsed = (
            self.get_normalized_parsed_formula()
            if normalize
            else self.parsed_formula
        )
        formula_sorted = sorted(
            formula_parsed,
            key=lambda pair: property_data.get(pair[0], 0),
            reverse=not ascending,
        )
        return Formula.build_formula_from_parsed(formula_sorted)

    def sort_by_stoichiometry(
        self,
        property_data: "dict[str, float] | None" = None,
        ascending=True,
        normalize=False,
    ) -> str:
        """Sort the elements of the formula by their stoichiometric index.

        Elements with the same index are ordered by their value in
        ``property_data``. That tie-break is always ascending, regardless of
        ``ascending``.

        Parameters
        ----------
        property_data : dict[str, float], optional
            Mapping of element symbol to the value used to break ties
            between equal indices. When omitted, ties are broken
            alphabetically by element symbol.
        ascending : bool, optional
            Whether to sort the indices in ascending order. Defaults to
            True.
        normalize : bool, optional
            Whether to normalize the formula before sorting. Defaults to
            False.

        Returns
        -------
        str
            The formula string with elements sorted by index.

        Raises
        ------
        KeyError
            If ``property_data`` is given but lacks an element of the
            formula.

        Examples
        --------
        >>> tie_break = {"Li": 1, "Na": 2, "B": 72}
        >>> Formula("LiNa2B").sort_by_stoichiometry(tie_break)
        'LiBNa2'
        >>> Formula("LiNa2B").sort_by_stoichiometry()
        'BLiNa2'
        """
        formula_parsed = (
            self.get_normalized_parsed_formula()
            if normalize
            else self.parsed_formula
        )
        descending = not ascending

        def sort_key(pair):
            element, index = pair
            # Negate rather than pass reverse=True so that only the primary
            # key flips; the tie-break stays ascending.
            primary = -index if descending else index
            if property_data is None:
                return primary, element
            return primary, property_data[element]

        formula_sorted = sorted(formula_parsed, key=sort_key)
        return Formula.build_formula_from_parsed(formula_sorted)