Source code for surfaxe.vasp_data

# Pymatgen  
from pymatgen.core import Structure
from pymatgen.io.vasp.outputs import Locpot, Outcar, Vasprun
from pymatgen.analysis.local_env import CrystalNN

# Misc
import os
import pandas as pd 
import numpy as np 
import warnings 

# surfaxe 
from surfaxe.generation import oxidation_states
from surfaxe.io import _custom_formatwarning, slab_from_file, _instantiate_structure

[docs]def process_data(bulk_per_atom, parse_hkl=True, path_to_fols=None, hkl_dict=None,
parse_core_energy=False, core_atom=None, bulk_nn=None, parse_vacuum=False, 
save_csv=True, csv_fname='data.csv', **kwargs): 
    """
    Parses the folders to collect all final data on relevant input and output 
    parameters, and optionally core and vacuum level energies. 

    If you are processing data for folder structures generated with 
    `generation` make sure you use `convergence.parse_fols` function. This 
    function is for parsing full sets of information from the output of 
    production run calculations. 

    The folder structure for parsing of data is fairly flexible and can be: 
    1. automatically parsed if `parse_hkl=True` - the function searches for 
    folders with names three digits long in cwd (default)
        e.g. it finds folders cwd/100 and cwd/010 that correspond to Miller 
        indices (1,0,0) and (0,1,0)
    2. automatically parsed from a specific working directory if `path_to_fols`
    is specified
    3. manually specified using `hkl_dict`, where the Miller index is mapped 
    directly to the path to where the files are. If you are only interested in 
    the specified folders, do not forget to change `parse_hkl=False`.
        e.g. hkl_dict = {(0,1,1): 'path/to/001/files/', 
                         (2,0,1): 'path/to/201/files/'}
    4. automatically parsed from cwd or a specific working directory in addition 
    to a defined `hkl_dict`

    Each of the folders must contain POSCAR and vasprun.xml files and optionally 
    LOCPOT (or potential.csv) and OUTCAR files if vacuum or core energy are 
    parsed. 

    The function returns None by default and saves the DataFrame to a csv file. 
    Optionally, it can return the DataFrame. 

    Args:
        bulk_per_atom (`float`): Bulk energy per atom in eV per atom. 
        parse_hkl (`bool`, optional): If ``True`` the script parses the names   
            of the folders to get the Miller indices. Defaults to ``True``.
        path_to_fols (`str`, optional): Path to where surfaxe should look for 
            the hkl folders are. Defaults to None which searches in cwd.     
        hkl_dict (`dict`, optional): dictionary of tuples of Miller indices 
            and paths to the folders the relevant outputs. Defaults to ``None``. 
            E.g. If the outputs of the calculations on the (1,-1,2) slab are in 
            folder ``path/to/folder/112``, the ``hkl_dict`` would be: 
            {(1,-1,2): 'path/to/folder/112'}
        parse_core_energy (`bool`, optional): If True the scripts attempts to 
            parse core energies from a supplied OUTCAR. Defaults to ``False``. 
        core_atom (`str`, optional): The symbol of atom the core state energy 
            level should be parsed from. Defaults to ``None``. 
        bulk_nn (`list`, optional): The symbols of the nearest neighbours of the 
            `core_atom`. Defaults to ``None``. 
        parse_vacuum (`bool`, optional): if ``True`` the script attempts 
            to parse LOCPOT using analysis.electrostatic_potential to use the 
            maximum value of planar potential as the vacuum energy level. 
            Defaults to ``True``. 
        save_csv (`bool`, optional): If ``True``, it writes data to a csv file.
            Defaults to ``True``.
        csv_fname (`str`, optional): The filename of the csv. Defaults to 
            data.csv 
    Returns: 
        DataFrame
    """
    # Check if hkl_dict is correctly set up 
    if hkl_dict: 
        for key, value in hkl_dict.items(): 
            if not isinstance(key, tuple): 
                raise TypeError('The keys supplied to hkl_dict are not tuples.')
            if not isinstance(value, str): 
                raise TypeError('The values supplied to hkl_dict are not strings.')
    
    cwd = os.getcwd()
    if path_to_fols: 
        cwd = path_to_fols 

    # Get the Miller indices as tuples and strings from folders in root dir
    if parse_hkl:
        if not hkl_dict: 
            hkl_dict = {}
        for fol in os.listdir(cwd):
            if os.path.isdir(os.path.join(cwd, fol)) and len(fol)==3 and\
                fol.isdigit():
                hkl_dict[tuple(map(int, fol))] = os.path.join(cwd, fol)

    # Set up additional arguments for get_core_energy 
    get_core_energy_kwargs = {'orbital': '1s', 'ox_states': None, 
    'nn_method': CrystalNN()}
    get_core_energy_kwargs.update(
        (k, kwargs[k]) for k in get_core_energy_kwargs.keys() & kwargs.keys()
    )
    get_core=False
    if parse_core_energy: 
        if core_atom is not None and bulk_nn is not None:
            get_core=True 
        else: 
            warnings.formatwarning = _custom_formatwarning
            warnings.warn(('Core atom or bulk nearest neighbours were not '
            'supplied. Core energy will not be parsed.'))

    # For each miller index, check if the folders specified are there and 
    # parse them for data
    df_list, electrostatic_list, core_energy_list = ([] for i in range(3))

    for hkl_tuple, path in hkl_dict.items(): 
        vsp_path = '{}/vasprun.xml'.format(path)
        psc_path = '{}/POSCAR'.format(path)
        vsp = Vasprun(vsp_path, parse_potcar_file=False)
        slab = slab_from_file(psc_path,hkl_tuple)
        vsp_dict = vsp.as_dict()
        
        df_list.append({
            'hkl': ''.join(map(str, hkl_tuple)), 
            'hkl_tuple': hkl_tuple,
            'area': slab.surface_area, 
            'atoms': vsp_dict['nsites'], 
            'functional': vsp_dict['run_type'], 
            'encut': vsp_dict['input']['incar']['ENCUT'], 
            'algo': vsp_dict['input']['incar']['ALGO'],
            'ismear': vsp_dict['input']['parameters']['ISMEAR'],
            'sigma': vsp_dict['input']['parameters']['SIGMA'],
            'kpoints': vsp_dict['input']['kpoints']['kpoints'],
            'bandgap': vsp_dict['output']['bandgap'],  
            'slab_energy': vsp_dict['output']['final_energy'],
            'slab_per_atom': vsp_dict['output']['final_energy_per_atom']
        })

        if parse_vacuum: 
            electrostatic_list.append(
                vacuum(path)
            )
                                    
        if get_core: 
            otc_path='{}/OUTCAR'.format(path)
            core_energy_list.append(
                core_energy(core_atom, bulk_nn, outcar=otc_path, 
                structure=psc_path, **get_core_energy_kwargs)
                )      
                        
    df = pd.DataFrame(df_list)
    df['surface_energy'] = (
        (df['slab_energy'] - bulk_per_atom * df['atoms'])/(2*df['area']) * 16.02
        )
    df['surface_energy_ev'] = (
        (df['slab_energy'] - bulk_per_atom * df['atoms'])/(2*df['area'])
    )

    if electrostatic_list: 
        df['vacuum_potential'] = electrostatic_list
    
    if core_energy_list: 
        df['core_energy'] = core_energy_list

    # Save to csv or return DataFrame
    if save_csv: 
        if not csv_fname.endswith('.csv'):
            csv_fname += '.csv'
        df.to_csv(csv_fname, header=True, index=False)
    else:
        return df

[docs]def vacuum(path=None): 
    '''
    Gets the energy of the vacuum level. It either parses potential.csv file if 
    available or tries to calculate planar potential from LOCPOT. If neither 
    file is available, function returns np.nan.  

    Args:
        path (`str`, optional): the path to potential.csv or LOCPOT files. 
            Can be the path to a directory in which either file is or you can 
            specify a path that must end in .csv or contain LOCPOT. Defaults to 
            looking for potential.csv or LOCPOT in cwd. 

    Returns:
        Maximum value of planar potential

    '''
    
    if type(path)==str and path.endswith('.csv'): 
        df = pd.read_csv(path)
        max_potential = df['planar'].max()
        max_potential = round(max_potential, 3)
    
    elif type(path)==str and 'LOCPOT' in path: 
        lpt = Locpot.from_file(path)
        planar = lpt.get_average_along_axis(2)
        max_potential = float(f"{np.max(planar): .3f}")
    
    else: 
        if path is None: 
            cwd = os.getcwd()
        else: 
            cwd = path 

        if os.path.isfile('{}/potential.csv'.format(cwd)): 
            df = pd.read_csv('{}/potential.csv'.format(cwd))
            max_potential = df['planar'].max()
            max_potential = round(max_potential, 3)

        elif os.path.isfile('{}/LOCPOT'.format(cwd)): 
            lpt = Locpot.from_file('{}/LOCPOT'.format(cwd))
            planar = lpt.get_average_along_axis(2)
            max_potential = float(f"{np.max(planar): .3f}")

        else: 
            max_potential = np.nan
            warnings.formatwarning = _custom_formatwarning
            warnings.warn('Vacuum electrostatic potential was not parsed from {} '
            'no LOCPOT or potential.csv files were provided.'.format(path))

    return max_potential
        

[docs]def core_energy(core_atom, bulk_nn, orbital='1s', ox_states=None, 
nn_method=CrystalNN(), outcar='OUTCAR', structure='POSCAR'): 
    """
    Parses the structure and OUTCAR files for the core level energy. Check the 
    validity of nearest neighbour method on the bulk structure before using it 
    on slabs.

    Args: 
        core_atom (`str`, optional): The symbol of atom the core state energy 
            level should be parsed from.  
        bulk_nn (`list`, optional): The symbols of the nearest neighbours of the 
            `core_atom`.   
        orbital (`str`, optional): The orbital of core state. Defaults to 1s.
        ox_states (``None``, `list` or  `dict`, optional): Add oxidation states 
            to the structure. Different types of oxidation states specified will 
            result in different pymatgen functions used. The options are: 
            
            * if supplied as ``list``: The oxidation states are added by site 
                    
                    e.g. ``[3, 2, 2, 1, -2, -2, -2, -2]``
            
            * if supplied as ``dict``: The oxidation states are added by element
                    
                    e.g. ``{'Fe': 3, 'O':-2}``
            
            * if ``None``: The oxidation states are added by guess.  

            Defaults to ``None``.
        nn_method (`class` instance, optional): The coordination number 
            algorithm used. Because the ``nn_method`` is a class, the class 
            needs to be imported from pymatgen.analysis.local_env before it 
            can be instantiated here. Defaults to ``CrystalNN()``.
        outcar (`str`, optional): Path to the OUTCAR file. Defaults to
             ``./OUTCAR``. 
        structure (`str`, optional): Path to the structure file in any format 
            supported by pymatgen. Defaults to ``./POSCAR``. Can also accept a
            pymaten.core.Structure object directly.  

    Returns: 
        Core state energy 
    """
    struc = _instantiate_structure(structure)
    struc = oxidation_states(struc, ox_states)
    bonded_struc = nn_method.get_bonded_structure(struc)
    bulk_nn.sort()
    bulk_nn_str = ' '.join(bulk_nn)
    
    # Get the nearest neighbours info, the c-coordinate and index number 
    # for each atom
    list_of_dicts = [] 
    for n, pos in enumerate(struc): 
        if pos.specie.symbol == core_atom: 
            nn_info = bonded_struc.get_connected_sites(n)
            slab_nn_list = []
            for d in nn_info: 
                nn = d.site.specie.symbol
                slab_nn_list.append(nn)
            slab_nn_list.sort()
            slab_nn = ' '.join(slab_nn_list)
            list_of_dicts.append({
                'atom': n, 
                'nn': slab_nn, 
                'c_coord': pos.c
            })

    # Make pandas Dataframe, query the interquartile range of fractional 
    # coordinates of atoms whose nearest neighbour environment is same as the
    # bulk nearest neighbours provided, get the atom that is the nearest to the 
    # median of the interquartile range 
    df = pd.DataFrame(list_of_dicts)
    low, high = df['c_coord'].quantile([0.25,0.75])
    df = df.query('@low<c_coord<@high and nn==@bulk_nn_str')
    atom = df['atom'].quantile(interpolation='nearest')  

    # Check if the df from query isn't empty - if it is it returns
    # a nan as core energy, otherwise it attempts to extract from OUTCAR 
    if type(atom) is np.float64: 
        core_energy = np.nan 
    else:       
        # Read OUTCAR, get the core state energy 
        otc = Outcar(outcar)
        core_energy_dict = otc.read_core_state_eigen()
        try: 
            core_energy = core_energy_dict[atom][orbital][-1]
        except IndexError: 
            core_energy = np.nan



    return core_energy