forked from andisltn48/Pemrograman-Fungsional
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TwitterExtraction.py
38 lines (31 loc) · 2.04 KB
/
TwitterExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import tweepy
import csv
import os
from collections import namedtuple
import re
import concurrent.futures
# Twitter API credentials and the OAuth handler shared by every request.
#
# Each value may be supplied through an environment variable so that secrets
# do not have to live in the source tree; the literal defaults keep the script
# working exactly as before when the variables are not set.
# NOTE(review): the fallback literals are secrets committed to version
# control -- they should be revoked and removed from this file once the
# environment variables are configured.
key = namedtuple('key', ['API', 'API_KEY_SECRET', 'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET'])
API = key(
    os.environ.get('TWITTER_API_KEY', 'BmC1Bs84AS2IYha9InGMcAsou'),
    os.environ.get('TWITTER_API_KEY_SECRET', 'KmTkztacSjHVLq1EwXWzwMCslPvjEs2tEO9Jm3XBUN9tvB6sm2'),
    os.environ.get('TWITTER_ACCESS_TOKEN', '1292119959015833602-vOk0QiFz63m4Spfkeup4POAsDp0AC3'),
    os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'sewODxJHqRjOf17YzQIT7i6K33UlLgWhsrUmQXqtqPBAf'),
)
# `auth` is consumed by getData() when it builds the tweepy.API client.
auth = tweepy.OAuthHandler(API.API, API.API_KEY_SECRET)
auth.set_access_token(API.ACCESS_TOKEN, API.ACCESS_TOKEN_SECRET)
# Text-cleaning pipeline: cleanUrl -> cleanTwitterSym -> cleanDigitSym ->
# (cleanTags | cleanTweet).  Each stage calls the previous one, so callers
# only ever need to invoke the last stage they are interested in.
# Patterns are compiled once because every tweet runs through all of them.
_URL_RE = re.compile(r'http\S+')
# Mentions and hashtags may contain underscores; any other character that is
# not an ASCII letter or digit (punctuation, emoji, newlines, ...) is blanked.
_TWITTER_SYM_RE = re.compile(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+|[^0-9A-Za-z]')
# "rt" is only dropped as a standalone token so words such as "smart" survive.
_DIGIT_RT_RE = re.compile(r'\brt\b|[0-9]')
_TAG_RE = re.compile(r'</?.*?>')
_SPACES_RE = re.compile(r' +')


def cleanUrl(twitterResult: str) -> str:
    """Return *twitterResult* lower-cased with every ``http...`` token removed."""
    return _URL_RE.sub('', twitterResult.lower())


def cleanTwitterSym(twitterResult: str) -> str:
    """Replace mentions, hashtags and non-alphanumeric characters with a space.

    The input is first passed through :func:`cleanUrl`.  Each match is
    replaced by a single space, so runs of spaces may remain in the result.
    """
    return _TWITTER_SYM_RE.sub(' ', cleanUrl(twitterResult))


def cleanDigitSym(twitterResult: str) -> str:
    """Remove digits and the standalone retweet marker ``rt``.

    The input is first passed through :func:`cleanTwitterSym`, so at this
    point the text is already lower-case and free of punctuation.
    """
    return _DIGIT_RT_RE.sub('', cleanTwitterSym(twitterResult))


def cleanTags(twitterResult: str) -> str:
    """Replace markup-like ``<...>`` spans with ``<>`` after the earlier stages.

    Because :func:`cleanTwitterSym` has already blanked every ``<`` and ``>``,
    the substitution never matches in practice and the result equals that of
    :func:`cleanDigitSym`; the function is kept for backward compatibility.
    """
    return _TAG_RE.sub('<>', cleanDigitSym(twitterResult))


def cleanTweet(twitterResult: str) -> str:
    """Return the fully cleaned tweet text.

    Runs the whole pipeline, collapses runs of spaces into one and strips
    leading and trailing spaces.
    """
    return _SPACES_RE.sub(' ', cleanDigitSym(twitterResult)).strip(' ')
def getData(query, banyakTweet):
    """Search Twitter and return a generator of cleaned tweet records.

    The search request is issued immediately when this function is called;
    the returned generator then lazily converts each result into a dict with
    the keys ``created_at``, ``username`` and ``tweet`` (the tweet text is
    passed through ``cleanTweet``).

    Args:
        query: Search expression forwarded as the ``q`` parameter.
        banyakTweet: Requested number of tweets, forwarded as ``count``.

    NOTE(review): the standard search endpoint is documented to cap ``count``
    at 100 per request, and newer Tweepy releases rename ``search`` -- confirm
    against the Tweepy version this project pins.
    """
    client = tweepy.API(auth)
    statuses = client.search(
        q=query,
        include_rts=False,
        lang="id",
        tweet_mode="extended",
        count=banyakTweet,
    )
    return (
        {
            "created_at": status.created_at,
            "username": status.user.screen_name,
            "tweet": cleanTweet(status.full_text),
        }
        for status in statuses
    )
def extractTwitter(nameFile: str, query: str, banyakTweet: int) -> None:
    """Fetch tweets for *query* and append them to a CSV dataset file.

    The rows are written to ``data/datasetSource/tweet-dataset-<nameFile>.csv``
    with the columns ``created_at``, ``username`` and ``tweet``.  The header
    row is written only when the file is new or still empty, so repeated runs
    keep appending to the same dataset.

    Args:
        nameFile: Suffix used to build the dataset file name.
        query: Search expression passed to ``getData``.
        banyakTweet: Number of tweets requested from ``getData``.
    """
    filePath = f"data/datasetSource/tweet-dataset-{nameFile}.csv"
    # Make sure the target directory exists so a first run does not fail.
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    needs_header = not os.path.exists(filePath) or os.path.getsize(filePath) == 0
    # newline="" is what the csv module requires for files it writes to;
    # without it extra blank lines appear on platforms that translate "\n".
    with open(filePath, "a", newline="", encoding="utf-8") as file:
        writer_csv = csv.DictWriter(file, ["created_at", "username", "tweet"])
        if needs_header:
            writer_csv.writeheader()
        # Rows are written sequentially: pushing writerow through a thread
        # pool made the row order nondeterministic on a shared file handle and
        # discarded any exception raised while writing, because the result
        # iterator of Executor.map was never consumed.
        writer_csv.writerows(getData(query, banyakTweet))
if __name__ == "__main__":
    import time

    # Script entry point: collect tweets for a fixed COVID-related query into
    # the "covid4" dataset and report how long the extraction took.
    search_query = "COVID19 OR COVID-19 OR vaksin OR (varian AND baru AND covid) OR corona OR (virus AND covid)"
    started = time.perf_counter()
    extractTwitter("covid4", search_query, 200)
    elapsed = time.perf_counter() - started
    # Elapsed wall-clock time in (fractional) seconds.
    print(f"waktu : {elapsed}")