search_scraper.py
import time
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
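
# Logs into LinkedIn with the credentials stored in config.txt (username on the
# first line, password on the second), loads a search URL, walks every results
# page, and writes one CSV row per profile: name, about, location, profile URL.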
# Functions
def add_item(arr, soupi):
    # Append the tag's whitespace-normalised text, or None if the tag was not found.
    if soupi:
        arr.append(" ".join(soupi.get_text().split()))
    else:
        arr.append(None)
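
# The class names below match LinkedIn's search-result markup at the time of
# writing; if LinkedIn changes its markup, these selectors will stop matching
# and the affected fields will be recorded as None.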
def add_from_page(soupi):
    # Result cards on the current search page; retry for up to 30 seconds in case
    # the results have not finished loading, re-parsing the live page source each time.
    result_class = 'entity-result__content entity-result__divider pt3 pb3 t-12 t-black--light'
    boxes = soupi.find_all('div', class_=result_class)
    time_taken = 0
    while not boxes and time_taken < 30:
        time.sleep(5)
        time_taken += 5
        soupi = BeautifulSoup(driver.page_source, 'lxml')
        boxes = soupi.find_all('div', class_=result_class)
    for box in boxes:
        profile = []
        title_div = box.find('div', class_='t-roman t-sans')
        link_div = title_div.find('a') if title_div else None
        # names
        if link_div:
            name_span = link_div.find('span', attrs={'aria-hidden': 'true'})
            add_item(profile, name_span)
        else:
            profile.append(None)
            print("link_div not found - names None")
        # abouts
        about_div = box.find(
            'div', class_='entity-result__primary-subtitle t-14 t-black t-normal')
        add_item(profile, about_div)
        # locations
        loc_div = box.find(
            'div', class_='entity-result__secondary-subtitle t-14 t-normal')
        add_item(profile, loc_div)
        # href
        if link_div:
            profile.append(link_div['href'])
        else:
            profile.append(None)
            print("link_div not found - hrefs None")
        profiles.append(profile)
def log_in(usr, pwd):
    # Sign in to LinkedIn with the credentials read from config.txt.
    driver.get("https://www.linkedin.com/login/")
    elementID = driver.find_element(By.ID, 'username')
    elementID.send_keys(usr)
    elementID = driver.find_element(By.ID, 'password')
    elementID.send_keys(pwd)
    elementID.submit()
######## Main script ########
chrome_driver_path = input("Enter absolute path of chrome driver \n")
url = input("Enter the Search URL\n")
output_file = input("Enter the name of output file\n")
# Fall back to the default driver location when no path is given
# (raw string so the backslashes are not treated as escape sequences).
s = Service(chrome_driver_path) if chrome_driver_path else Service(
    r'D:\Aryan\Python\allumni\driver\chromedriver.exe')
driver = webdriver.Chrome(service=s)
# config.txt: username on the first line, password on the second.
with open("config.txt") as file:
    lines = file.readlines()
log_in(lines[0].strip(), lines[1].strip())
driver.get(url)
time.sleep(10)
src = driver.page_source
soup = BeautifulSoup(src, 'lxml')
# Header with the total result count; poll for up to 30 seconds while the page
# finishes loading, re-parsing the live page source each time.
total_res_div = soup.find('h2', class_='pb2 t-black--light t-14')
time_taken = 10
while not total_res_div and time_taken < 30:
    time.sleep(5)
    time_taken += 5
    soup = BeautifulSoup(driver.page_source, 'lxml')
    total_res_div = soup.find('h2', class_='pb2 t-black--light t-14')
total_res = int(total_res_div.get_text().split()[0])
total_page = (total_res + 9) // 10  # 10 results per page
print("Total number of results:", total_res)
print("Total number of pages:", total_page)
profiles = []
add_from_page(soup)
# Remaining pages: LinkedIn search paginates via the "page" query parameter.
for curr_page in range(2, total_page + 1):
    new_url = url + "&page=" + str(curr_page)
    driver.get(new_url)
    time.sleep(10)
    new_src = driver.page_source
    new_soup = BeautifulSoup(new_src, 'lxml')
    add_from_page(new_soup)
driver.quit()
print(len(profiles), "Profiles scraped successfully")
print(total_res - len(profiles), "Profiles failed")
# Each row follows the order built in add_from_page: name, about, location, profile URL.
with open(output_file, 'w', newline='', encoding="utf-8") as f:
    wr = csv.writer(f)
    wr.writerows(profiles)