This post compares several Python implementations of term frequency-inverse document frequency (TF-IDF) for vectorizing word data.
TF-IDF is used in natural language processing (NLP) to measure how important a word is to a document within a collection of documents, also known as a corpus.
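For intuition, here is a minimal sketch of the technique on a toy three-document corpus; the sentences are invented for illustration and scikit-learn's defaults are used throughout:

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus, invented purely for illustration
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "dogs make good pets",
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)   # sparse matrix: one row per document

print(vectorizer.vocabulary_)          # term -> column index mapping
print(X.toarray())                     # TF-IDF weight of each term per document
# Terms shared across documents ("the", "cat") get lower weights than
# terms concentrated in a single document ("mat", "pets").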
Several Python implementations of TF-IDF were tested to gauge how they perform against a large dataset: scikit-learn, Gensim, and PySpark.
The tests were executed on a virtual machine with 48 CPUs and 320 GB of RAM, running Oracle Linux 7 and Python 3.8.
The dataset contains 6,876,405 rows of text data, which was pre-cleaned by removing stop words, converting all characters to lower case, removing special characters, and so on.
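The cleaning code itself isn't part of this comparison, but a minimal sketch of that kind of pre-cleaning (the stop-word list and regex here are illustrative assumptions, not the actual pipeline) could look like:

import re

STOP_WORDS = {"the", "a", "an", "and", "or", "of", "to", "in"}  # tiny illustrative list

def clean(text: str) -> str:
    text = text.lower()                        # lower-case everything
    text = re.sub(r"[^a-z0-9\s]", " ", text)   # strip special characters
    return " ".join(t for t in text.split() if t not in STOP_WORDS)  # drop stop words

print(clean("The QUICK brown fox, jumping over the lazy dog!"))
# -> "quick brown fox jumping over lazy dog"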
scikit-learn

Performance results
Time to load parquet 6.176868851063773
Time to TfidfVectorizer 1420.4231280069798
Time total 1426.6006411050912
Code used
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from timeit import default_timer as timer

if __name__ == "__main__":
    toptimer = timer()

    starttime = timer()
    df = pd.read_parquet('/u01/loader/cleanpanda.parquet')
    print("Time to load parquet", timer() - starttime)

    # Vectorize unigrams and bigrams, keeping at most the 100,000 most
    # frequent terms that appear in at least 5 documents
    starttime = timer()
    tfidf = TfidfVectorizer(sublinear_tf=True, max_features=100000, min_df=5,
                            norm='l2', encoding='latin-1', ngram_range=(1, 2),
                            stop_words='english')
    features = tfidf.fit_transform(df.description)
    print("Time to TfidfVectorizer", timer() - starttime)

    print("Time total", timer() - toptimer)
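The result is a SciPy sparse matrix. Continuing from the script above, a quick way to sanity-check the output is:

print(features.shape)           # (number of documents, number of kept features)
print(len(tfidf.vocabulary_))   # terms retained after max_features/min_df filtering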
Gensim

Performance results
Time to load parquet 5.7981067900545895
Time to tokenize 651.322252145037
Time to create BoW 769.5724206231534
Time to fit model 96.3426871181
Time total 1523.036551590776
Code used
from gensim import corpora
from gensim import models
from gensim.utils import simple_preprocess
import pandas as pd
from timeit import default_timer as timer

if __name__ == "__main__":
    toptimer = timer()

    starttime = timer()
    df = pd.read_parquet('/u01/loader/cleanpanda.parquet')
    print("Time to load parquet", timer() - starttime)

    # Tokenize each document into a list of lower-cased word tokens
    starttime = timer()
    doc_tokenized = [simple_preprocess(doc) for doc in df.description]
    print("Time to tokenize", timer() - starttime)

    # Build the dictionary and the bag-of-words corpus in one pass
    starttime = timer()
    dictionary = corpora.Dictionary()
    BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]
    print("Time to create BoW", timer() - starttime)

    starttime = timer()
    model = models.TfidfModel(BoW_corpus)   # fit model
    vector = model[BoW_corpus[0]]           # apply model to the first corpus document
    print("Time to fit model", timer() - starttime)

    print("Time total", timer() - toptimer)
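One thing to note when reading the Gensim timings: TfidfModel computes document frequencies when it is constructed, but the per-document weights are only computed lazily as documents pass through the model, and the script above only transforms the first document. Continuing from that script, a sketch of transforming the full corpus:

tfidf_corpus = model[BoW_corpus]   # lazy wrapper over the whole corpus
for doc in tfidf_corpus:           # weights are computed as you iterate
    pass                           # each doc is a list of (token_id, weight) pairs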
PySpark

Performance results
Time to startup spark 3.516299287090078
Time to load parquet 3.8542269258759916
Time to tokenize 0.28877926408313215
Time to CountVectorizer 28.51735320384614
Time to IDF 24.151005786843598
Time total 60.32788718002848
Code used
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, Tokenizer
from timeit import default_timer as timer

if __name__ == "__main__":
    toptimer = timer()

    # Start a local Spark session using all available cores
    starttime = timer()
    conf = (SparkConf()
            .setMaster("local[*]")
            .setAppName("SparkVect")
            .set('spark.driver.memory', '300G')
            .set('spark.driver.maxResultSize', '20G')
            .set('spark.network.timeout', '7200s')
            .set('spark.local.dir', '/u01/tmp'))
    sc = SparkContext(conf=conf)
    # sc.setLogLevel("ERROR")
    spark = SparkSession(sc)
    print(sc._conf.getAll())  # check context settings
    print("Time to startup spark", timer() - starttime)

    starttime = timer()
    sentenceData = spark.read.parquet("/u01/loader/cleanpanda.parquet")
    print("Time to load parquet", timer() - starttime)

    starttime = timer()
    tokenizer = Tokenizer(inputCol="description", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    print("Time to tokenize", timer() - starttime)

    # HashingTF can also be used to get term frequency vectors:
    # hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    # featurizedData = hashingTF.transform(wordsData)

    # Count term frequencies, keeping a vocabulary of at most 100,000 terms
    # that appear in at least 5 documents
    starttime = timer()
    countVectors = CountVectorizer(inputCol="words", outputCol="rawFeatures",
                                   vocabSize=100000, minDF=5)
    model = countVectors.fit(wordsData)
    result = model.transform(wordsData)
    print("Time to CountVectorizer", timer() - starttime)

    # Rescale the raw term frequencies by inverse document frequency
    starttime = timer()
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(result)
    rescaledData = idfModel.transform(result)
    print("Time to IDF", timer() - starttime)

    print("Time total", timer() - toptimer)
    spark.stop()
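A caveat when reading the PySpark timings: Spark DataFrame transformations are lazy, so the tokenize step and the final IDF transform mostly build the execution plan, while the fit() calls trigger the actual jobs. Continuing from the script above, to force at least part of the TF-IDF output to be materialized you could add an action such as:

rescaledData.select("features").show(1, truncate=False)  # computes and prints one row of TF-IDF vectors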