from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
tokenizer=RegexpTokenizer('r\w+')
eng_stopwords=set(stopwords.words('english'))
ps=PorterStemmer()
def StemmedReview(review):
    review=review.lower()
    tokens=tokenizer.tokenize(review)
    new_tokens=[token for token in tokens if token not in en_stopwords]
    stemmed_tokens=[ps.stem(new_tokens) for token in new_tokens]
    cleaned_review=''.join(stemmed_tokens)
    return cleaned_review
x=["This was a awesome movie.",
"Great movie!I liked it alot.",
"Happy Ending!Awesome acting by the hero!!!",
"Loved it, truly great",
"bad movie,waste of money ",
"could have been better",
"Surely a Disappointing movie"]
y=[1,1,1,1,0,0,0]
x_clean=[StemmedReview(i) for i in x]
This is generating an error:
TypeError                                 Traceback (most recent call last)
<ipython-input-...> in <module>
----> 1 x_clean=[StemmedReview(i) for i in x]

<ipython-input-...> in <listcomp>(.0)
----> 1 x_clean=[StemmedReview(i) for i in x]

<ipython-input-...> in StemmedReview(review)
      1 def StemmedReview(review):
      2     review=review.lower
----> 3     tokens=tokenizer.tokenize(review)
      4     new_tokens=[token for token in tokens if token not in eng_stopwords]
      5     stemmed_tokens=[ps.stem(new_tokens) for token in new_tokens]

~\Anaconda3\lib\site-packages\nltk\tokenize\regexp.py in tokenize(self, text)
    134         # If our regexp matches tokens, use re.findall:
    135         else:
--> 136             return self._regexp.findall(text)
    137
    138     def span_tokenize(self, text):

TypeError: expected string or bytes-like object
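
The immediate cause is visible in the traceback: inside StemmedReview, line 2 is review=review.lower (without parentheses), so review gets rebound to the method object itself rather than the lowercased string, and tokenizer.tokenize(review) then receives a method instead of a string, which is exactly what "TypeError: expected string or bytes-like object" means. Note that the traceback also differs from the code as posted (review.lower() vs review.lower, en_stopwords vs eng_stopwords), so the notebook cell that actually ran is out of sync with the post; re-run the cell after editing it.

A few more bugs will surface once that TypeError is gone. Below is a minimal corrected sketch, keeping your names, with each fix commented inline; the intended regex (word characters) and space-joined output are my assumptions about what you meant:

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# raw string r'\w+' matches runs of word characters;
# in 'r\w+' the r is a literal character inside the pattern
tokenizer = RegexpTokenizer(r'\w+')
eng_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

def StemmedReview(review):
    review = review.lower()  # call the method; review.lower alone is the method object
    tokens = tokenizer.tokenize(review)
    # use the name defined above: eng_stopwords, not en_stopwords
    new_tokens = [token for token in tokens if token not in eng_stopwords]
    # stem each individual token, not the whole list
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    # join with a space so the tokens stay separated
    return ' '.join(stemmed_tokens)

x_clean = [StemmedReview(i) for i in x]

With your sample x this should produce strings like 'awesom movi' for the first review; the truncated words are normal Porter-stemmer output, not a bug.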