-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
49 lines (43 loc) · 1.56 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import twitter
import string
import pickle
from secret import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
# Twitter client used by fetch(); all four credentials are imported from the
# local `secret` module above.
api = twitter.Api(
    consumer_key=CONSUMER_KEY,
    consumer_secret=CONSUMER_SECRET,
    access_token_key=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
)
def fetch():
    """Download the 'AyBe' timeline and pickle the formatted tweets.

    Repeatedly calls ``api.GetUserTimeline`` (up to 200 statuses per call,
    with retweets and replies excluded), each time asking only for statuses
    whose id is below the smallest id of the previous page, and stops as
    soon as a page contains no unseen status.

    Statuses whose text contains any entry of ``exclude_words`` are skipped;
    the rest are passed through ``format_tweet`` and collected. The resulting
    list is dumped with ``pickle`` to the module-level ``tweet_file``, which
    must already be open for binary writing when this function is called.

    Returns:
        A view of every distinct status fetched, keyed internally by status
        id. This includes statuses that were left out of the pickle because
        of ``exclude_words``.
    """
    data = {}  # status id -> status, for every status seen so far
    tweets = []  # formatted text of the statuses that were kept
    max_id = None  # None on the first request: start from the newest status
    total = 0
    exclude_words = ['My Top']  # words whose tweets should be excluded
    while True:
        statuses = api.GetUserTimeline(screen_name='AyBe', count=200,
                                       max_id=max_id, include_rts=False,
                                       exclude_replies=True)
        new_count = ignored_count = 0
        for s in statuses:
            if s.id in data:
                ignored_count += 1
                continue
            # Record the status so that the duplicate check above and the
            # return value reflect what has actually been fetched.
            data[s.id] = s
            # Counted even when the tweet is excluded below, so that a page
            # made up only of excluded tweets does not end paging early.
            new_count += 1
            if not any(word in s.text for word in exclude_words):
                tweets.append(format_tweet(s))
        total += new_count
        print("Fetched %d/%d/%d new/old/total." % (new_count, ignored_count, total))
        if new_count == 0:
            # Also reached when `statuses` is empty, so min() below never
            # sees an empty sequence.
            break
        # Next request: only statuses strictly older than this page.
        max_id = min(s.id for s in statuses) - 1
    pickle.dump(tweets, tweet_file)
    return data.values()
def format_tweet(tweet):
    """Return the tweet's text cleaned up and terminated with '|'.

    The HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` are decoded to the
    characters they stand for, every character listed in ``exclude_chars``
    is removed, and a ``'|'`` is appended.

    Args:
        tweet: An object exposing the tweet body as a ``text`` string
            attribute.

    Returns:
        The cleaned text followed by ``'|'``.
    """
    exclude_chars = ['"']  # characters to exclude
    # Decode '&amp;' last: decoding it first would turn an escaped entity
    # such as '&amp;lt;' into '&lt;' and then wrongly decode it a second time.
    text = tweet.text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')
    for ch in exclude_chars:
        text = text.replace(ch, '')
    return text + '|'
if __name__ == '__main__':
    # fetch() writes its pickle to the module-level name `tweet_file`, so the
    # file is bound to that name here. The with-block guarantees the file is
    # closed even if fetch() raises.
    with open('tweets.pickle', 'wb') as tweet_file:
        fetch()