TM03 POS Tagging#
Install NLTK POS Tagger#
# !pip install nltk
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
[nltk_data] Downloading package punkt to /Users/jirlong/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /Users/jirlong/nltk_data...
[nltk_data] Package averaged_perceptron_tagger is already up-to-
[nltk_data] date!
Test#
text = "The dog eats the big hotdog."
tokens = word_tokenize(text)
print(nltk.pos_tag(tokens))
[('The', 'DT'), ('dog', 'NN'), ('eats', 'VBZ'), ('the', 'DT'), ('big', 'JJ'), ('hotdog', 'NN'), ('.', '.')]
print(nltk.pos_tag(word_tokenize("The book is written by my father.")))
[('The', 'DT'), ('book', 'NN'), ('is', 'VBZ'), ('written', 'VBN'), ('by', 'IN'), ('my', 'PRP$'), ('father', 'NN'), ('.', '.')]
print(nltk.pos_tag(word_tokenize("My father has written more than ten books.")))
[('My', 'PRP$'), ('father', 'NN'), ('has', 'VBZ'), ('written', 'VBN'), ('more', 'JJR'), ('than', 'IN'), ('ten', 'JJ'), ('books', 'NNS'), ('.', '.')]
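If a tag is unfamiliar, NLTK can print its definition. This lookup is optional and assumes the tagsets data has been downloaded:

# nltk.download('tagsets')
nltk.help.upenn_tagset('JJR')  # adjective, comparative, e.g. "more"
nltk.help.upenn_tagset('VBN')  # verb, past participle, e.g. "written"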
Perform POS tagging for all tokens in the corpus.#
with open("data/corpus02.txt", encoding="utf8") as fin:
    text = fin.read()
print("Number of characters: %d" % len(text))
Number of characters: 75346
tokens = word_tokenize(text)
tokens_with_tag = nltk.pos_tag(tokens)
Find the most frequent nouns#
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'NN':
        noun_counts[word] += 1
for k, v in noun_counts.most_common(20):
    print(k, '\t', v)
class 104
bourgeoisie 89
society 72
bourgeois 69
proletariat 62
property 55
production 52
labor 30
existence 30
development 28
industry 27
capital 22
form 21
movement 19
struggle 17
character 17
country 15
abolition 15
time 14
man 14
Convert words to lower case, except for proper nouns.#
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'NN':
        noun_counts[word.lower()] += 1
print(noun_counts.most_common(20))
[('class', 104), ('bourgeoisie', 89), ('society', 73), ('bourgeois', 69), ('proletariat', 62), ('property', 56), ('production', 52), ('labor', 30), ('existence', 30), ('development', 28), ('industry', 27), ('capital', 24), ('form', 21), ('abolition', 20), ('movement', 19), ('struggle', 17), ('character', 17), ('country', 15), ('time', 14), ('man', 14)]
Explore other kinds of part-of-speech tags#
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'VB':
        noun_counts[word.lower()] += 1
print(noun_counts.most_common(20))
[('be', 41), ('have', 8), ('do', 8), ('introduce', 5), ('increase', 5), ('lose', 5), ('thus', 4), ('attain', 4), ('let', 4), ('bring', 3), ('become', 3), ('use', 3), ('form', 3), ('abolish', 3), ('acquire', 3), ('vanish', 3), ('take', 3), ('pass', 2), ('work', 2), ('say', 2)]
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag == 'NNP' or tag == 'NNPS':
        noun_counts[word] += 1
print(noun_counts.most_common(20))
[('Communists', 23), ('Socialism', 21), ('Germany', 13), ('Communism', 12), ('France', 12), ('State', 11), ('England', 9), ('Communist', 7), ('Socialist', 6), ('America', 5), ('_i.e._', 5), ('AND', 4), ('THE', 3), ('II', 3), ('I.', 3), ('Modern', 3), ('Hence', 3), ('Communistic', 3), ('Bourgeois', 3), ('SOCIALISM', 3)]
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'V':
        noun_counts[word.lower()] += 1
print(noun_counts.most_common(20))
[('is', 138), ('has', 68), ('are', 59), ('have', 45), ('be', 41), ('was', 29), ('been', 18), ('do', 18), ('existing', 15), ('were', 14), ('had', 12), ('being', 11), ('working', 10), ('become', 9), ('does', 9), ('made', 8), ('developed', 7), ('see', 7), ('becomes', 7), ('created', 7)]
Use lemmatization to better handle different forms of verbs.#
GOAL - Before counting occurrences, use lemmatize() to restore verbs and nouns to their base forms.
METHOD - Load the WordNet lemmatizer provided by NLTK.
EXAMPLE
‘is’, ‘are’, ‘were’ -> ‘be’
‘has’, ‘have’ -> ‘have’
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'V':
        noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'v')] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
print(noun_counts.most_common(20))
---------------------------------------------------------------------------
LookupError Traceback (most recent call last)
File ~/anaconda3/lib/python3.10/site-packages/nltk/corpus/util.py:84, in LazyCorpusLoader.__load(self)
83 try:
---> 84 root = nltk.data.find(f"{self.subdir}/{zip_name}")
85 except LookupError:
File ~/anaconda3/lib/python3.10/site-packages/nltk/data.py:583, in find(resource_name, paths)
582 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n"
--> 583 raise LookupError(resource_not_found)
LookupError:
**********************************************************************
Resource wordnet not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('wordnet')
For more information see: https://www.nltk.org/data.html
Attempted to load corpora/wordnet.zip/wordnet/
Searched in:
- '/Users/jirlong/nltk_data'
- '/Users/jirlong/anaconda3/nltk_data'
- '/Users/jirlong/anaconda3/share/nltk_data'
- '/Users/jirlong/anaconda3/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
During handling of the above exception, another exception occurred:
LookupError Traceback (most recent call last)
Cell In[13], line 8
6 for word, tag in tokens_with_tag:
7 if tag[0] == 'V':
----> 8 noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'v')] += 1
9 # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
11 print(noun_counts.most_common(20))
File ~/anaconda3/lib/python3.10/site-packages/nltk/stem/wordnet.py:45, in WordNetLemmatizer.lemmatize(self, word, pos)
33 def lemmatize(self, word: str, pos: str = "n") -> str:
34 """Lemmatize `word` using WordNet's built-in morphy function.
35 Returns the input word unchanged if it cannot be found in WordNet.
36
(...)
43 :return: The lemma of `word`, for the given `pos`.
44 """
---> 45 lemmas = wn._morphy(word, pos)
46 return min(lemmas, key=len) if lemmas else word
File ~/anaconda3/lib/python3.10/site-packages/nltk/corpus/util.py:121, in LazyCorpusLoader.__getattr__(self, attr)
118 if attr == "__bases__":
119 raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
--> 121 self.__load()
122 # This looks circular, but its not, since __load() changes our
123 # __class__ to something new:
124 return getattr(self, attr)
File ~/anaconda3/lib/python3.10/site-packages/nltk/corpus/util.py:86, in LazyCorpusLoader.__load(self)
84 root = nltk.data.find(f"{self.subdir}/{zip_name}")
85 except LookupError:
---> 86 raise e
88 # Load the corpus.
89 corpus = self.__reader_cls(root, *self.__args, **self.__kwargs)
File ~/anaconda3/lib/python3.10/site-packages/nltk/corpus/util.py:81, in LazyCorpusLoader.__load(self)
79 else:
80 try:
---> 81 root = nltk.data.find(f"{self.subdir}/{self.__name}")
82 except LookupError as e:
83 try:
File ~/anaconda3/lib/python3.10/site-packages/nltk/data.py:583, in find(resource_name, paths)
581 sep = "*" * 70
582 resource_not_found = f"\n{sep}\n{msg}\n{sep}\n"
--> 583 raise LookupError(resource_not_found)
LookupError:
**********************************************************************
Resource wordnet not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('wordnet')
For more information see: https://www.nltk.org/data.html
Attempted to load corpora/wordnet
Searched in:
- '/Users/jirlong/nltk_data'
- '/Users/jirlong/anaconda3/nltk_data'
- '/Users/jirlong/anaconda3/share/nltk_data'
- '/Users/jirlong/anaconda3/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
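As the error message says, the lemmatizer needs the WordNet data. A minimal fix is to run the downloads once and sanity-check the EXAMPLE mappings above before re-running the counting cell:

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
# Sanity check: inflected verb forms should map to their lemma.
for w in ['is', 'are', 'were', 'has', 'have']:
    print(w, '->', wordnet_lemmatizer.lemmatize(w, 'v'))
# Expected: is/are/were -> be, has/have -> have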
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'N':
        noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'n')] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
print(noun_counts.most_common(20))
[('class', 124), ('bourgeoisie', 91), ('society', 76), ('bourgeois', 75), ('proletariat', 64), ('condition', 59), ('property', 56), ('production', 53), ('industry', 35), ('communist', 34), ('relation', 32), ('mean', 30), ('labor', 30), ('existence', 30), ('form', 28), ('development', 28), ('country', 28), ('socialism', 28), ('capital', 24), ('state', 24)]
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'R':
        noun_counts[word.lower()] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
print(noun_counts.most_common(20))
[('not', 55), ('more', 32), ('only', 29), ('so', 27), ('up', 25), ('therefore', 21), ('most', 17), ('away', 13), ('also', 13), ('longer', 12), ('then', 11), ('no', 10), ('even', 10), ('now', 9), ('everywhere', 9), ('generally', 9), ('ever', 9), ('just', 9), ('out', 9), ('already', 8)]
from collections import Counter
noun_counts = Counter()
for word, tag in tokens_with_tag:
    if tag[0] == 'R':
        noun_counts[wordnet_lemmatizer.lemmatize(word.lower(), 'r')] += 1
        # ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
print(noun_counts.most_common(20))
[('not', 55), ('more', 32), ('only', 29), ('so', 27), ('up', 25), ('therefore', 21), ('most', 17), ('away', 13), ('also', 13), ('longer', 12), ('far', 11), ('then', 11), ('no', 10), ('even', 10), ('now', 9), ('everywhere', 9), ('generally', 9), ('ever', 9), ('just', 9), ('out', 9)]
Mining Specific Distant Collocations#
Back to last week's approach of counting word pairs within a window.
window_size = 9
word_pair_counts = Counter()
word_pair_distance_counts = Counter()
for i in range(len(tokens) - 1):
    for distance in range(1, window_size):
        if i + distance < len(tokens):
            w1 = tokens[i]
            w2 = tokens[i + distance]
            word_pair_distance_counts[(w1, w2, distance)] += 1
            word_pair_counts[(w1, w2)] += 1
for (w1, w2, distance), c in word_pair_distance_counts.most_common(20):
    print("%s\t%s\t%d\t%d" % (w1, w2, distance, c))
the of 2 279
of the 1 242
the the 3 154
, the 2 118
, and 1 111
, , 2 109
the of 3 105
the , 5 101
. The 1 100
, , 7 98
, , 4 97
of , 3 97
the the 7 95
the , 8 94
the the 6 92
, , 6 92
the the 4 92
, the 4 92
of , 2 91
the , 2 91
Detect all verb-noun collocations#
window_size = 9
word_pair_counts = Counter()
word_pair_distance_counts = Counter()
for i in range(len(tokens_with_tag) - 1):
    w1, t1 = tokens_with_tag[i]
    if t1[0] != 'V':
        continue
    w1 = wordnet_lemmatizer.lemmatize(w1.lower(), 'v')
    for distance in range(1, window_size):
        if i + distance < len(tokens_with_tag):
            w2, t2 = tokens_with_tag[i + distance]
            if t2[0] == 'N':
                w2 = wordnet_lemmatizer.lemmatize(w2.lower(), 'n')
                word_pair_distance_counts[(w1, w2, distance)] += 1
                word_pair_counts[(w1, w2)] += 1
for (w1, w2, distance), c in word_pair_distance_counts.most_common(20):
    print("%s\t%s\t%d\t%d" % (w1, w2, distance, c))
work class 1 10
be class 4 6
exist society 1 5
be class 6 5
be class 7 4
rule class 1 4
be class 3 4
be society 5 4
work party 2 4
be struggle 5 3
have society 6 3
pave way 2 3
put end 2 3
be bourgeoisie 8 3
lose character 3 3
be bourgeois 5 3
be hand 4 3
be condition 6 3
fight bourgeoisie 3 3
have nothing 1 3
Compute the mean distance of each verb-noun pair.#
pair_mean_distances = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if word_pair_counts[(w1, w2)] > 1:
        pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])
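The accumulation above is the frequency-weighted mean distance: each observed distance contributes in proportion to how often the pair occurs at that distance. A minimal check for a single pair ('work'/'class' is only an illustrative choice; any pair counted more than once works):

pair = ('work', 'class')  # illustrative pair
mean_d = sum(d * c for (a, b, d), c in word_pair_distance_counts.items()
             if (a, b) == pair) / word_pair_counts[pair]
print(mean_d, pair_mean_distances[pair])  # the two values should match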
Show the longest, middle, and shortest pairs.#
Longest part#
### longest part
for (w1, w2), distance in pair_mean_distances.most_common(20):
    print("%s\t%s\t%f\t%d" % (w1, w2, distance, word_pair_counts[(w1, w2)]))
be case 8.000000 2
introduce bourgeoisie 8.000000 2
be communism 7.500000 2
have communism 7.500000 2
do population 7.500000 2
have force 7.500000 2
have master 7.500000 2
have ruling 7.500000 2
be interest 7.500000 2
be order 7.500000 2
be moment 7.500000 2
have man 7.000000 2
have industry 7.000000 2
see proletariat 7.000000 2
have dissolution 7.000000 2
be relation 7.000000 3
be action 7.000000 2
be modern 7.000000 2
leave man 7.000000 2
see production 7.000000 2
Middle part#
### Middle part
num_pairs = len(pair_mean_distances)
mid = num_pairs // 2
for (w1, w2), distance in pair_mean_distances.most_common()[mid-20:mid+20]:
    print("%s\t%s\t%f\t%d" % (w1, w2, distance, word_pair_counts[(w1, w2)]))
be mean 5.333333 6
have feudal 5.333333 3
have character 5.333333 3
be laborer 5.333333 3
be property 5.333333 9
be capitalist 5.333333 3
be class 5.238095 21
be capital 5.166667 6
be struggle 5.166667 6
be condition 5.100000 10
be hand 5.000000 4
have part 5.000000 2
compel proletariat 5.000000 2
be time 5.000000 5
do property 5.000000 5
be advance 5.000000 2
have population 5.000000 3
have dependent 5.000000 2
be way 5.000000 3
be slave 5.000000 2
see antagonism 5.000000 2
be bare 5.000000 2
be idea 5.000000 4
abolish property 5.000000 3
create property 5.000000 2
convert property 5.000000 2
keep laborer 5.000000 2
increase labor 5.000000 2
be family 5.000000 2
exist bourgeoisie 5.000000 2
replace education 5.000000 2
do history 5.000000 2
deaden class 5.000000 2
be man 4.750000 4
have mean 4.750000 4
be character 4.750000 4
have hand 4.600000 5
be party 4.500000 2
create force 4.500000 2
abolish appropriation 4.500000 2
Shortest part#
### Shortest part
for (w1, w2), distance in pair_mean_distances.most_common()[-20:]:
    print("%s\t%s\t%f\t%d" % (w1, w2, distance, word_pair_counts[(w1, w2)]))
be asunder 2.000000 2
supply proletariat 2.000000 2
be name 2.000000 2
have meaning 2.000000 2
keep pace 2.000000 2
stand face 2.000000 2
continue existence 2.000000 2
exist property 2.000000 2
go hand 2.000000 2
appropriate product 1.600000 5
take place 1.500000 2
increase capital 1.500000 2
have individuality 1.500000 2
create condition 1.500000 2
exist society 1.333333 6
rule class 1.000000 4
rise bourgeoisie 1.000000 2
bourgeois society 1.000000 2
introduce community 1.000000 2
lose sight 1.000000 2
Find meaningful verb-noun pairs with the distance deviation.#
The deviation is the sample standard deviation of a pair's observed distances, so pairs that always co-occur at the same offset end up with a deviation of 0. Inspect pair_deviations.most_common()[-20:] to filter in the low-deviation part.
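Concretely, for a pair with total count $n$, count $c_d$ at distance $d$, and mean distance $\bar{d}$ (computed above), the cell below accumulates the squared deviations and then takes the sample standard deviation:

$$ s = \sqrt{\frac{\sum_{d} c_d\,(d - \bar{d})^2}{n - 1}} $$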
pair_deviations = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if word_pair_counts[(w1, w2)] > 1:
        pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
for (w1, w2), dev_tmp in pair_deviations.most_common():
    s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
    pair_deviations[(w1, w2)] = s_2 ** 0.5
for (w1, w2), dev in pair_deviations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
compel proletariat 5.000000 0.000000 2
see proletariat 7.000000 0.000000 2
supply proletariat 2.000000 0.000000 2
have dissolution 7.000000 0.000000 2
be name 2.000000 0.000000 2
be antagonism 4.000000 0.000000 2
base antagonism 3.000000 0.000000 2
transform property 3.000000 0.000000 2
dominate society 6.000000 0.000000 2
have meaning 2.000000 0.000000 2
determine condition 4.000000 0.000000 2
admit case 3.000000 0.000000 2
be case 8.000000 0.000000 2
introduce community 1.000000 0.000000 2
introduce woman 3.000000 0.000000 2
introduce bourgeoisie 8.000000 0.000000 2
keep pace 2.000000 0.000000 2
organize class 4.000000 0.000000 2
lose sight 1.000000 0.000000 2
be action 7.000000 0.000000 2
Filter out the stopwords.#
Again inspect pair_deviations.most_common()[-20:] to filter in the low-deviation part, this time skipping pairs whose first word is a stopword.
from nltk.corpus import stopwords
# nltk.download('stopwords')  # the stopwords corpus must be downloaded once
stopword_list = stopwords.words('english')
pair_deviations = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if w1 in stopword_list:
        continue
    if word_pair_counts[(w1, w2)] > 1:
        pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
for (w1, w2), dev_tmp in pair_deviations.most_common():
    s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
    pair_deviations[(w1, w2)] = s_2 ** 0.5
for (w1, w2), dev in pair_deviations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
pave way 2.000000 0.000000 3
put end 2.000000 0.000000 3
lose character 3.000000 0.000000 3
rise bourgeoisie 1.000000 0.000000 2
get hand 3.000000 0.000000 2
bourgeois society 1.000000 0.000000 2
compel proletariat 5.000000 0.000000 2
see proletariat 7.000000 0.000000 2
supply proletariat 2.000000 0.000000 2
base antagonism 3.000000 0.000000 2
transform property 3.000000 0.000000 2
dominate society 6.000000 0.000000 2
determine condition 4.000000 0.000000 2
admit case 3.000000 0.000000 2
introduce community 1.000000 0.000000 2
introduce woman 3.000000 0.000000 2
introduce bourgeoisie 8.000000 0.000000 2
keep pace 2.000000 0.000000 2
organize class 4.000000 0.000000 2
lose sight 1.000000 0.000000 2
Filter by deviation: further filter out low-frequency pairs#
pair_deviations = Counter()
for (w1, w2, distance), c in word_pair_distance_counts.most_common():
    if w1 in stopword_list:
        continue
    if word_pair_counts[(w1, w2)] > 2:
        pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
for (w1, w2), dev_tmp in pair_deviations.most_common():
    s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
    pair_deviations[(w1, w2)] = s_2 ** 0.5
for (w1, w2), dev in pair_deviations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
find work 2.666667 2.886751 3
abolish property 5.000000 2.645751 3
join class 5.666667 2.516611 3
fight bourgeoisie 3.800000 2.387467 5
work class 2.000000 2.374103 12
represent interest 3.666667 2.081666 3
mean bourgeois 4.333333 2.081666 3
exist thing 4.000000 1.732051 3
revolutionize production 3.333333 1.154701 3
intend property 6.666667 1.154701 3
concentrate hand 3.666667 1.154701 3
attain end 3.000000 1.000000 3
exist society 1.333333 0.816497 6
produce product 3.500000 0.577350 4
appropriate product 1.600000 0.547723 5
rule class 1.000000 0.000000 4
work party 2.000000 0.000000 4
pave way 2.000000 0.000000 3
put end 2.000000 0.000000 3
lose character 3.000000 0.000000 3
General method for distant collocation mining.#
# A handy lemmatizer
# WordNet style: ADJ (a), ADJ_SAT (s), ADV (r), NOUN (n) or VERB (v)
# Penn style: ADJ (J*), ADJ_SAT (J*), ADV (R*), NOUN (N*), or VERB (V*)
def lemmatize_verbose(word, pos):
    if pos[0] == 'J':
        return wordnet_lemmatizer.lemmatize(word, 'a')
    elif pos[0] == 'R':
        return wordnet_lemmatizer.lemmatize(word, 'r')
    elif pos[0] == 'N':
        return wordnet_lemmatizer.lemmatize(word, 'n')
    elif pos[0] == 'V':
        return wordnet_lemmatizer.lemmatize(word, 'v')
    else:
        return word

def lemmatize_shorter(word, pos):
    if pos[0] == 'J':
        pos = 'a'
    elif pos[0] == 'R':
        pos = 'r'
    elif pos[0] == 'N':
        pos = 'n'
    elif pos[0] == 'V':
        pos = 'v'
    else:
        return word
    return wordnet_lemmatizer.lemmatize(word, pos)

def lemmatize_smarter(word, pos):
    if pos[0] in ['R', 'N', 'V']:
        pos = pos[0].lower()
    elif pos[0] == 'J':
        pos = 'a'
    else:
        return word
    return wordnet_lemmatizer.lemmatize(word, pos)

# Recommended implementation.
def lemmatize(word, pos):
    mapping = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    if pos[0] in mapping:
        return wordnet_lemmatizer.lemmatize(word, mapping[pos[0]])
    return word
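A quick check that the Penn-to-WordNet mapping behaves as intended (illustrative inputs only):

print(lemmatize('written', 'VBN'))  # -> write
print(lemmatize('books', 'NNS'))    # -> book
print(lemmatize('is', 'VBZ'))       # -> be
print(lemmatize(',', ','))          # unmapped tag: the word is returned unchanged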
Count all pairs.#
def distant_collocations(tokens_with_tag, pos1, pos2, min_cut=2, window_size=9):
    word_pair_counts = Counter()
    word_pair_distance_counts = Counter()
    for i in range(len(tokens_with_tag) - 1):
        w1, t1 = tokens_with_tag[i]
        if not t1.startswith(pos1):
            continue
        w1 = lemmatize(w1.lower(), t1)
        for distance in range(1, window_size):
            if i + distance < len(tokens_with_tag):
                w2, t2 = tokens_with_tag[i + distance]
                if t2.startswith(pos2):
                    w2 = lemmatize(w2.lower(), t2)
                    word_pair_distance_counts[(w1, w2, distance)] += 1
                    word_pair_counts[(w1, w2)] += 1
    pair_mean_distances = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if word_pair_counts[(w1, w2)] > 1:
            pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])
    pair_deviations = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if w1 in stopword_list:
            continue
        if word_pair_counts[(w1, w2)] > min_cut:
            pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    for (w1, w2), dev_tmp in pair_deviations.most_common():
        s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
        pair_deviations[(w1, w2)] = s_2 ** 0.5
    return pair_deviations
collocations = distant_collocations(tokens_with_tag, 'V', 'N')
for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
find work 2.666667 2.886751 3
abolish property 5.000000 2.645751 3
join class 5.666667 2.516611 3
fight bourgeoisie 3.800000 2.387467 5
work class 2.000000 2.374103 12
represent interest 3.666667 2.081666 3
mean bourgeois 4.333333 2.081666 3
exist thing 4.000000 1.732051 3
revolutionize production 3.333333 1.154701 3
intend property 6.666667 1.154701 3
concentrate hand 3.666667 1.154701 3
attain end 3.000000 1.000000 3
exist society 1.333333 0.816497 6
produce product 3.500000 0.577350 4
appropriate product 1.600000 0.547723 5
rule class 1.000000 0.000000 4
work party 2.000000 0.000000 4
pave way 2.000000 0.000000 3
put end 2.000000 0.000000 3
lose character 3.000000 0.000000 3
collocations = distant_collocations(tokens_with_tag, 'N', 'N')
# Note: the function only returns the deviations; pair_mean_distances and
# word_pair_counts below are still the globals from the earlier verb-noun run,
# so those two columns print as 0 for pairs that were not counted there.
for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
mode production 0.000000 0.000000 0
socialist communist 0.000000 0.000000 0
division labor 0.000000 0.000000 0
community woman 0.000000 0.000000 0
world market 0.000000 0.000000 0
condition life 0.000000 0.000000 0
mean subsistence 0.000000 0.000000 0
form society 0.000000 0.000000 0
form property 0.000000 0.000000 0
member society 0.000000 0.000000 0
state thing 0.000000 0.000000 0
disappearance class 0.000000 0.000000 0
benefit class 0.000000 0.000000 0
relation production 0.000000 0.000000 0
mean communication 0.000000 0.000000 0
portion bourgeoisie 0.000000 0.000000 0
section class 0.000000 0.000000 0
state society 0.000000 0.000000 0
hand state 0.000000 0.000000 0
bourgeois socialism 0.000000 0.000000 1
collocations = distant_collocations(tokens_with_tag, 'J', 'N')
for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
productive force 0.000000 0.000000 0
middle age 0.000000 0.000000 0
modern bourgeois 0.000000 0.000000 0
private property 0.000000 0.000000 0
feudal society 0.000000 0.000000 0
middle class 0.000000 0.000000 0
petty bourgeois 0.000000 0.000000 0
absolute monarchy 0.000000 0.000000 0
modern bourgeoisie 0.000000 0.000000 0
free trade 0.000000 0.000000 0
bourgeois production 0.000000 0.000000 1
political bourgeoisie 0.000000 0.000000 0
immense majority 0.000000 0.000000 0
french revolution 0.000000 0.000000 0
mere production 0.000000 0.000000 0
political supremacy 0.000000 0.000000 0
eighteenth century 0.000000 0.000000 0
historical development 0.000000 0.000000 0
eternal truth 0.000000 0.000000 0
undeveloped state 0.000000 0.000000 0
collocations = distant_collocations(tokens_with_tag, 'NNP', 'N')
for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
true socialism 0.000000 3.785939 1
communist party 0.000000 2.863564 0
communist communist 0.000000 2.645751 0
england france 0.000000 2.516611 0
socialist literature 0.000000 2.500000 0
communist literature 0.000000 2.500000 0
communism power 0.000000 1.527525 0
germany bourgeoisie 0.000000 0.577350 0
socialist communist 0.000000 0.000000 0
Implement a better lemmatizer for handling proper nouns (NNP / NNPS).#
def lemmatize(word, pos):
    if not pos.startswith('NNP'):
        word = word.lower()
    mapping = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}
    if pos[0] in mapping:
        return wordnet_lemmatizer.lemmatize(word, mapping[pos[0]])
    return word
Do not lower() the word in the main function anymore.#
def distant_collocations(tokens_with_tag, pos1, pos2, min_cut=2, window_size=9):
    word_pair_counts = Counter()
    word_pair_distance_counts = Counter()
    for i in range(len(tokens_with_tag) - 1):
        w1, t1 = tokens_with_tag[i]
        if not t1.startswith(pos1):
            continue
        w1 = lemmatize(w1, t1)
        for distance in range(1, window_size):
            if i + distance < len(tokens_with_tag):
                w2, t2 = tokens_with_tag[i + distance]
                if t2.startswith(pos2):
                    w2 = lemmatize(w2, t2)
                    word_pair_distance_counts[(w1, w2, distance)] += 1
                    word_pair_counts[(w1, w2)] += 1
    pair_mean_distances = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if word_pair_counts[(w1, w2)] > 1:
            pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])
    pair_deviations = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.most_common():
        if w1 in stopword_list:
            continue
        if word_pair_counts[(w1, w2)] > min_cut:
            pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    for (w1, w2), dev_tmp in pair_deviations.most_common():
        s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
        pair_deviations[(w1, w2)] = s_2 ** 0.5
    return pair_deviations
collocations = distant_collocations(tokens_with_tag, 'NNP', 'N')
for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
Communist literature 0.000000 2.886751 0
Socialist literature 0.000000 2.886751 0
England France 0.000000 2.516611 0
Communism power 0.000000 1.527525 0
Communists party 0.000000 1.154701 0
Germany bourgeoisie 0.000000 0.577350 0
Socialist Communist 0.000000 0.000000 0
collocations = distant_collocations(tokens_with_tag, 'V', 'N')
for (w1, w2), dev in collocations.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, pair_mean_distances[(w1, w2)], dev, word_pair_counts[(w1, w2)]))
find work 2.666667 2.886751 3
abolish property 5.000000 2.645751 3
join class 5.666667 2.516611 3
fight bourgeoisie 3.800000 2.387467 5
work class 2.000000 2.374103 12
represent interest 3.666667 2.081666 3
mean bourgeois 4.333333 2.081666 3
exist thing 4.000000 1.732051 3
revolutionize production 3.333333 1.154701 3
intend property 6.666667 1.154701 3
concentrate hand 3.666667 1.154701 3
attain end 3.000000 1.000000 3
exist society 1.333333 0.816497 6
produce product 3.500000 0.577350 4
appropriate product 1.600000 0.547723 5
rule class 1.000000 0.000000 4
work party 2.000000 0.000000 4
pave way 2.000000 0.000000 3
put end 2.000000 0.000000 3
lose character 3.000000 0.000000 3
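The zero mean-distance and frequency columns in the non verb-noun runs above come from printing the stale globals pair_mean_distances and word_pair_counts. A sketch of a variant (distant_collocations_full is a made-up name, not part of the original notebook) that returns all three structures so the printout stays consistent:

def distant_collocations_full(tokens_with_tag, pos1, pos2, min_cut=2, window_size=9):
    # Same counting logic as distant_collocations, but the mean distances and
    # pair counts are returned along with the deviations.
    word_pair_counts = Counter()
    word_pair_distance_counts = Counter()
    for i in range(len(tokens_with_tag) - 1):
        w1, t1 = tokens_with_tag[i]
        if not t1.startswith(pos1):
            continue
        w1 = lemmatize(w1, t1)
        for distance in range(1, window_size):
            if i + distance < len(tokens_with_tag):
                w2, t2 = tokens_with_tag[i + distance]
                if t2.startswith(pos2):
                    w2 = lemmatize(w2, t2)
                    word_pair_distance_counts[(w1, w2, distance)] += 1
                    word_pair_counts[(w1, w2)] += 1
    pair_mean_distances = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.items():
        if word_pair_counts[(w1, w2)] > 1:
            pair_mean_distances[(w1, w2)] += distance * (c / word_pair_counts[(w1, w2)])
    pair_deviations = Counter()
    for (w1, w2, distance), c in word_pair_distance_counts.items():
        if w1 in stopword_list:
            continue
        if word_pair_counts[(w1, w2)] > min_cut:
            pair_deviations[(w1, w2)] += c * ((distance - pair_mean_distances[(w1, w2)]) ** 2)
    for (w1, w2), dev_tmp in pair_deviations.most_common():
        s_2 = dev_tmp / (word_pair_counts[(w1, w2)] - 1)
        pair_deviations[(w1, w2)] = s_2 ** 0.5
    return pair_deviations, pair_mean_distances, word_pair_counts

devs, means, counts = distant_collocations_full(tokens_with_tag, 'NNP', 'N')
for (w1, w2), dev in devs.most_common()[-20:]:
    print("%s\t%s\t%f\t%f\t%d" % (w1, w2, means[(w1, w2)], dev, counts[(w1, w2)]))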
Try ChatGPT#
Detecting a pattern with POS#
import re
# Regular expression for the pattern to detect
pattern = r'年輕人.+越來越.+VV'
# Input data
input_data = [('現在', 'NR'), ('的', 'DEG'), ('年輕人', 'NN'), ('越來越', 'AD'), ('不', 'AD'), ('守禮教', 'VV'), ('約束', 'NN'), (',', 'PU'), ('讓', 'VV'), ('長輩', 'NN'), ('們', 'N'), ('感到', 'VV'), ('擔憂', 'NN'), ('。', 'PU')]
# Convert the data into a single "word/TAG" string
input_str = ''.join([word[0] + '/' + word[1] + ' ' for word in input_data])
print(input_str)
# Check whether the string matches the pattern
if re.search(pattern, input_str):
    print("Matches the pattern")
else:
    print("Does not match the pattern")
現在/NR 的/DEG 年輕人/NN 越來越/AD 不/AD 守禮教/VV 約束/NN ,/PU 讓/VV 長輩/NN 們/N 感到/VV 擔憂/NN 。/PU
Matches the pattern
Detecting multiple patterns#
import pandas as pd
import re
# Raw data
data = [('現在年輕人實在是越來越沒有教養',),
        ('現在不少青少年在面對未來社會時愈來愈讓人覺得沒信心',)]
# Define the regular expression
pattern = r'(.*)(年輕人|青少年)(.*)(越來越|愈來愈)(.*)'
# Adding ?: inside a group makes it non-capturing: the group still has to match,
# but it is not kept in the result, so that keyword is missing from the output.
# pattern = r'(.*)(?:年輕人|青少年)(.*)(越來越|愈來愈)(.*)'
# Convert the data to a DataFrame
df = pd.DataFrame(data, columns=['text'])
# Apply the pattern to each row and expand the captured groups into columns
df = df['text'].apply(lambda x: pd.Series(re.findall(pattern, x)[0])).rename(columns={0: 'prefix', 1: 'target1', 2: 'middle', 3: 'target2', 4: 'suffix'})
# Show the result
print(df)
prefix target1 middle target2 suffix
0 現在 年輕人 實在是 越來越 沒有教養
1 現在不少 青少年 在面對未來社會時 愈來愈 讓人覺得沒信心
from IPython.display import HTML
html_table = df.to_html()
HTML(html_table)
|   | prefix | target1 | middle | target2 | suffix |
|---|--------|---------|--------|---------|--------|
| 0 | 現在 | 年輕人 | 實在是 | 越來越 | 沒有教養 |
| 1 | 現在不少 | 青少年 | 在面對未來社會時 | 愈來愈 | 讓人覺得沒信心 |
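To see what the non-capturing (?:...) version changes, compare the two patterns directly on the first sentence (a small sketch reusing the strings from data above):

s = '現在年輕人實在是越來越沒有教養'
print(re.findall(r'(.*)(年輕人|青少年)(.*)(越來越|愈來愈)(.*)', s))
# [('現在', '年輕人', '實在是', '越來越', '沒有教養')]
print(re.findall(r'(.*)(?:年輕人|青少年)(.*)(越來越|愈來愈)(.*)', s))
# [('現在', '實在是', '越來越', '沒有教養')] -- the matched keyword is not kept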
With POS#
data = [('現在/NR 的/DEG 年輕人/NN 越來越/AD 沒有/VV 教養/NN ,/PU 讓/VV 長輩/NN 們/N 感到/VV 擔憂/NN 。/PU',),
('現在/NR 不少/AD 青少年/NN 在/P 面對/VV 未來/NN 社會/NN 時/NT 愈來愈/AD 讓/VD 人/NN 覺得/VV 沒/VE 信心/NN /PU',)]
df = pd.DataFrame(data, columns=['text'])
pattern = r'(.*)(年輕人|青少年)(.*)(越來越|愈來愈).*?\s+(.+?/VV)(.*)'
df = df['text'].apply(lambda x: pd.Series(re.findall(pattern, x)[0])).rename(columns={0: 'prefix', 1: 'target1', 2: 'middle', 3:'target2', 4: 'suffix'})
html_table = df.to_html()
HTML(html_table)
|   | prefix | target1 | middle | target2 | suffix | 5 |
|---|--------|---------|--------|---------|--------|---|
| 0 | 現在/NR 的/DEG | 年輕人 | /NN | 越來越 | 沒有/VV | 教養/NN ,/PU 讓/VV 長輩/NN 們/N 感到/VV 擔憂/NN 。/PU |
| 1 | 現在/NR 不少/AD | 青少年 | /NN 在/P 面對/VV 未來/NN 社會/NN 時/NT | 愈來愈 | 讓/VD 人/NN 覺得/VV | 沒/VE 信心/NN /PU |