-
Notifications
You must be signed in to change notification settings - Fork 158
/
Copy pathapp.py
56 lines (39 loc) · 1.71 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re
from enum import Enum
from typing import Set
import nltk
from pydantic import BaseModel, Field
def _load_english_stop_words():
    """Return the NLTK English stop-word list as a set.

    If reading the corpus raises ``LookupError``, the "stopwords" corpus is
    downloaded through ``nltk.download`` and the read is attempted once more.
    """
    try:
        words = nltk.corpus.stopwords.words("english")
    except LookupError:
        # The first read failed: fetch the corpus, then read it again.
        nltk.download("stopwords")
        words = nltk.corpus.stopwords.words("english")
    return set(words)


# Built once at import time; a set so membership tests are cheap.
EN_STOP_WORDS = _load_english_stop_words()
# Names of the clean-up steps that preprocess_text can apply.
# Mixing in ``str`` makes every member compare equal to its string value.
class PreprocessingStep(str, Enum):
    # Replace every non-word character with a space.
    REMOVE_SPECIAL_CHARACTERS = "remove-special-characters"
    # Drop single ASCII letters that stand alone between whitespace.
    REMOVE_SINGLE_CHARACTERS = "remove-single-characters"
    # Collapse each run of whitespace into one space.
    CLEAN_MULTIPLE_SPACES = "clean-multiple-spaces"
    # Convert the text to lower case.
    LOWERCASE = "lowercase"
    # Drop whitespace-separated tokens found in EN_STOP_WORDS.
    REMOVE_STOP_WORDS = "remove-stop-words"
# Input model for preprocess_text: the raw text plus the steps to run on it.
class TextPreprocessingInput(BaseModel):
    # The text to clean up.
    text: str
    # Steps to apply. Declared as a set, so duplicates collapse and the
    # requested order is not kept. The ``...`` default marks the field as
    # required in pydantic.
    preprocessing_steps: Set[PreprocessingStep] = Field(
        ..., description="Preprocessing steps to apply on the text."
    )
# Output model returned by preprocess_text.
class TextProcessingOutput(BaseModel):
    # The text after the selected steps have been applied (required field).
    preprocessed_text: str = Field(...)
def preprocess_text(input: TextPreprocessingInput) -> TextProcessingOutput:
    """Clean up text data based on selected preprocessing steps.

    The selected steps always run in the fixed order below, whatever order
    they were requested in:

    1. remove-special-characters: every non-word character becomes a space.
    2. remove-single-characters: single ASCII letters standing alone between
       whitespace are dropped, including several in a row. A letter at the
       very start or end of the text (no whitespace on one side) is kept.
    3. clean-multiple-spaces: each whitespace run collapses to one space.
    4. lowercase: the text is lower-cased.
    5. remove-stop-words: the text is split on whitespace, tokens found in
       ``EN_STOP_WORDS`` are dropped, and the rest are joined with single
       spaces. The comparison is exact, so it is case-sensitive.

    Args:
        input: The text and the set of steps to apply to it.

    Returns:
        A ``TextProcessingOutput`` holding the processed text. With no steps
        selected the text is returned unchanged.
    """
    text = input.text
    steps = input.preprocessing_steps

    if PreprocessingStep.REMOVE_SPECIAL_CHARACTERS in steps:
        text = re.sub(r"\W", " ", text)

    if PreprocessingStep.REMOVE_SINGLE_CHARACTERS in steps:
        # Match the whole run of lone letters at once. Matching one letter
        # together with the whitespace on both sides would consume the
        # separator the next letter needs, so every second letter in a run
        # such as "x a b c y" would survive.
        text = re.sub(r"(?:\s+[a-zA-Z])+\s+", " ", text)

    if PreprocessingStep.CLEAN_MULTIPLE_SPACES in steps:
        text = re.sub(r"\s+", " ", text)

    if PreprocessingStep.LOWERCASE in steps:
        text = text.lower()

    if PreprocessingStep.REMOVE_STOP_WORDS in steps:
        text = " ".join(
            word for word in text.split() if word not in EN_STOP_WORDS
        )

    return TextProcessingOutput(preprocessed_text=text)