1. Split a paragraph into sentences (Punkt sentence tokenizer)
import nltk.data

def splitSentence(paragraph):
    # Load the pre-trained Punkt model for English
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Split the paragraph at sentence boundaries
    sentences = tokenizer.tokenize(paragraph)
    return sentences

if __name__ == '__main__':
    print(splitSentence("My name is Tom. I am a boy. I like soccer!"))
The result is ['My name is Tom.', 'I am a boy.', 'I like soccer!']
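If the pickle path feels verbose, NLTK also ships the convenience wrapper nltk.sent_tokenize, which uses the same Punkt model under the hood. A minimal sketch, assuming the Punkt data has been fetched once with nltk.download (the download is a no-op if the data is already present):

import nltk

# One-time fetch of the Punkt model data; safe to call repeatedly
nltk.download('punkt')

# sent_tokenize wraps the same Punkt tokenizer loaded explicitly above
print(nltk.sent_tokenize("My name is Tom. I am a boy. I like soccer!"))
# ['My name is Tom.', 'I am a boy.', 'I like soccer!']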
2. Split a sentence into word tokens
from nltk.tokenize import WordPunctTokenizer

def wordtokenizer(sentence):
    # Split the sentence into word and punctuation tokens
    words = WordPunctTokenizer().tokenize(sentence)
    return words

if __name__ == '__main__':
    print(wordtokenizer("My name is Tom."))
The result is ['My', 'name', 'is', 'Tom', '.']
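Note that WordPunctTokenizer splits on every run of punctuation, which can break contractions apart. A small comparison sketch against nltk.word_tokenize (the Treebank-style tokenizer, which also needs the Punkt data); the example sentence here is my own, not from the original post:

import nltk
from nltk.tokenize import WordPunctTokenizer

sentence = "Don't stop me now."

# WordPunctTokenizer severs the contraction at the apostrophe
print(WordPunctTokenizer().tokenize(sentence))
# ['Don', "'", 't', 'stop', 'me', 'now', '.']

# word_tokenize keeps Treebank-style pieces instead
print(nltk.word_tokenize(sentence))
# ['Do', "n't", 'stop', 'me', 'now', '.']

For plain sentences like "My name is Tom." the two tokenizers agree; they diverge mainly on contractions and clitics, so pick whichever convention your downstream tools expect.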