Source code for fivpy.srs

"""Module to perform forest inventories using Simple Random Sampling
"""

import math
import logging
import pandas as pd
from scipy import stats


logging.basicConfig(format='%(levelname)s - %(message)s', level=logging.DEBUG)


[docs]class RandomSampling:
    """Class to create a forest inventory object
    which uses Simple Random Sampling to estimate total wood volume"""

    def __init__(self,
                 dataframe,
                 unit_area,
                 sampling_area,
                 significance=95,
                 sampling_error=20,
                 deg_free=None):

        self.dataframe = dataframe
        self.unit_area = unit_area
        self.sampling_area = sampling_area
        self.significance = significance
        self.sampling_error = sampling_error
        self.deg_free = deg_free
        self.n_sampled_units = self.dataframe.groupby("units").count().shape[0]
        self._add_vol()
        self._get_deg_free()
        self.mean = self.dataframe.groupby("units").sum().mean()[["volume"]]
        self.std = self.dataframe.groupby("units").sum().std()[["volume"]]
        self.var_coef = self.std / self.mean * 100
        self.total_units = self.sampling_area / self.unit_area
        self.t_value = stats.t.ppf(1 - (1 - (self.significance/100)) / 2, self.deg_free)


[docs]    def _add_vol(self):

        try:
            self.get_vol()
            logging.warning('Adding colum with Volume.')
        except KeyError:
            if 'volume' in self.dataframe.columns:
                logging.warning('Volume column already exists.')
                return self.dataframe

            logging.warning('Adding columns with DBH and Volume.')
            self.dataframe["dbh"] = self.dataframe["cbh"] / math.pi
            self.get_vol()

        return self.dataframe

[docs]    def get_vol(self, beta_0=0.000074, beta_1=1.707348, beta_2=1.16873):
        """Add a column with volume info to the dataframe

        Args:
            beta_0 (float, optional): first parameter. Defaults to 0.000074.
            beta_1 (float, optional): second parameter. Defaults to 1.707348.
            beta_2 (float, optional): third parameter. Defaults to 1.16873.

        Returns:
            dataframe (pd.DataFrame): A DataFrame containing volume info.

        Examples
        --------
        >>> instance.get_vol()
        ### To change the equation used to calculate volume
        >>> instance.get_vol(beta_0=0.000025, beta_1=1.4568, beta_2=1.2563)

        """
        self.dataframe["volume"] = beta_0 * (self.dataframe["dbh"] ** beta_1)\
            * (self.dataframe["height"] ** beta_2)
        return self.dataframe

[docs]    def _get_deg_free(self):
        # Get the degrees of freedom
        if self.deg_free is None:
            self.deg_free = self.n_sampled_units - 1
            logging.warning("Degrees of freedom not informed."
                            "Using default value: %s", self.deg_free)
        return self.deg_free

[docs]    def get_sample_size(self):
        """Returns the estimated required sample size to perform the
        definitive inventory

        Returns:
            sample size (int): the required sample size

        Examples
        --------
        >>> instance.get_sample_size()
        """
        dividend_sample_size = ((self.t_value ** 2) * (self.var_coef ** 2))
        divisor_sample_size = (self.sampling_error ** 2) + (((self.t_value\
            ** 2) * (self.var_coef ** 2)) / self.total_units)
        sample_size = dividend_sample_size / divisor_sample_size
        return math.ceil(sample_size)

[docs]    def srs_inventory(self):
        """Performs the Simple Random Sampling inventory

        Raises:
            ValueError: If the required sample size is greater than the number
            of units sampled, it is not possible to perform the inventory.

        Returns:
            dataframe (pd.DataFrame): A DataFrame containing the inventory
            results.

        Examples
        --------
        >>> instance.srs_inventory()
        """
        if self.get_sample_size() > self.n_sampled_units:
            raise ValueError("Required sample size is greater than "
                             "the number of sampled units "
                             f"{self.get_sample_size()}. "
                             "It is necessary to obtain more data")

        logging.info("Calculating final inventory.")
        var = self.dataframe.groupby("units").sum().var()[["volume"]]

        # standardized error of mean - sem
        sem = math.sqrt(((var / self.n_sampled_units) * (1 -\
            (self.n_sampled_units / self.total_units))))
        total_vol = self.total_units * self.mean
        sample_error = sem * self.t_value
        sample_error_perc = (sample_error / self.mean) * 100

        # CI true mean
        ic_up = self.mean + sample_error
        ic_lo = self.mean - sample_error

        # CI prod by ha
        mean_ha = self.mean * (1 / self.unit_area)
        ic_pop_up = (self.mean + sample_error) * (1 / self.unit_area)
        ic_pop_lo = (self.mean - sample_error) * (1 / self.unit_area)

        # CI vol total pop
        ic_pop_total = self.total_units * self.mean
        ic_pop_up_total = (self.total_units * self.mean) + \
            (self.total_units * sample_error)
        ic_pop_lo_total = (self.total_units * self.mean) - \
            (self.total_units * sample_error)

        t_value_uni = stats.t.ppf(1 - ((1 - (self.significance/100)) * 2) \
            / 2, self.deg_free)

        # Minimum Reliable Estimate - mre
        mre = self.mean - (t_value_uni * sem)
        mre_population = (self.mean - (t_value_uni * sem)) * self.total_units

        results = {"Variables": ["Ym", "StdDev", "CV", "S²", "N samples",
                                 "N Total", "MeanErr", "Total V",
                                "Err", "E(%)", "CI Supe", "CI Infe", "Ym/ha",
                                "CI Supe ha", "CI Infe ha",
                                "Ym/total", "CI Supe total",
                                "CI Infe total", "MRE", "MRE Pop"],
                "Data": [self.mean.item(), self.std.item(),
                          self.var_coef.item(), var.item(), self.deg_free,
                          self.total_units,
                            sem, total_vol.item(), sample_error,
                            sample_error_perc.item(), ic_up.item(),
                            ic_lo.item(), mean_ha.item(), ic_pop_up.item(),
                            ic_pop_lo.item(), ic_pop_total.item(),
                            ic_pop_up_total.item(), ic_pop_lo_total.item(),
                            mre.item(), mre_population.item()]}

        return pd.DataFrame(data=results)