# Source code for ssp.ml.transformer.ssp_labeller

#!/usr/bin/env python

__author__ = "Mageswaran Dhandapani"
__copyright__ = "Copyright 2020, The Spark Structured Playground Project"
__credits__ = []
__license__ = "Apache License"
__version__ = "2.0"
__maintainer__ = "Mageswaran Dhandapani"
__email__ = "mageswaran1989@gmail.com"
__status__ = "Education Purpose"

import pandas as pd
from pyspark.sql.types import IntegerType
from sklearn.base import BaseEstimator, TransformerMixin
from pyspark.sql.functions import udf
from ssp.utils.ai_key_words import AIKeyWords


def labelme(text, keywords=None):
    """Return 1 if ``text`` contains any keyword as a space-delimited phrase.

    ``#`` and ``@`` characters are removed from the text before matching and
    the comparison is case-insensitive. A keyword only matches when it is
    bounded by spaces or by the start/end of the text.

    :param text: Text to label. Non-string values (for example ``None``
        coming from a null column) are labelled 0 instead of raising.
    :param keywords: Iterable of keyword phrases, or a single
        ``"|"``-separated string of phrases. Defaults to
        ``AIKeyWords.POSITIVE`` split on ``"|"``.
    :return: 1 when at least one keyword is found, otherwise 0.
    """
    if not isinstance(text, str):
        return 0
    if keywords is None:
        # Resolved lazily so the default is not a shared list built at
        # definition time.
        keywords = AIKeyWords.POSITIVE
    if isinstance(keywords, str):
        # Iterating a raw string would test single characters, not phrases.
        keywords = keywords.split("|")

    cleaned = text.replace("#", "").replace("@", "").lower()
    # Pad with spaces so phrases at the very start or end can still match.
    padded = f' {cleaned} '
    return int(any(f' {keyword.lower()} ' in padded for keyword in keywords))

# Spark UDF wrapping ``labelme``; its result is declared as an integer column.
labelme_udf = udf(labelme, IntegerType())

class SSPTextLabeler(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that labels text with ``labelme``.

    Text is labelled 1 when it contains one of the ``AIKeyWords.POSITIVE``
    phrases (split on ``"|"``) and 0 otherwise.

    :param input_col: Name of the DataFrame column holding the text. Required
        when a ``pandas.DataFrame`` is transformed.
    :param output_col: Name of the DataFrame column that receives the labels.
    """

    def __init__(self, input_col=None, output_col="label"):
        # Stored under the constructor parameter names so that
        # BaseEstimator.get_params()/set_params()/clone() can find them.
        self.input_col = input_col
        self.output_col = output_col

    @property
    def _input_col(self):
        """Alias kept for code that reads the former private attribute."""
        return self.input_col

    @property
    def _output_col(self):
        """Alias kept for code that reads the former private attribute."""
        return self.output_col

    def fit(self, X, y=None):
        """Return ``self``; there is nothing to learn from the data."""
        return self

    def transform(self, X, y=None):
        """Label the given text(s).

        :param X: A ``pandas.DataFrame`` (labels are written in place to
            ``output_col`` and the same frame is returned), a list of strings
            (a list of labels is returned) or a single string (one label is
            returned).
        :param y: Ignored; present for scikit-learn API compatibility.
        :return: The labelled DataFrame, a list of ints, or a single int.
        :raises ValueError: If ``X`` is a DataFrame and ``input_col`` is unset.
        :raises TypeError: If ``X`` is not a DataFrame, list or str.
        """
        keywords = AIKeyWords.POSITIVE.split("|")

        if isinstance(X, pd.DataFrame):
            if not self.input_col:
                raise ValueError(
                    "input_col must be set to transform a pandas DataFrame")
            column = X[self.input_col]
            # Use the swifter accessor when it has been registered (i.e.
            # swifter was imported somewhere), otherwise plain pandas apply.
            applier = getattr(column, "swifter", column)
            X[self.output_col] = applier.apply(
                lambda text: labelme(text, keywords))
            print(X[self.output_col].value_counts())
            return X

        if isinstance(X, list):
            return [labelme(text, keywords) for text in X]

        if isinstance(X, str):
            return labelme(X, keywords)

        raise TypeError(
            f"Unsupported input type for transform: {type(X).__name__}")