import requests
import time

headers = {
    # headers
}

pages = [
    # url 1
    # url 2
]

for url in pages:
    page = requests.get(url, headers=headers)
    # scraping
    time.sleep(5)  # fixed 5-second pause between requests
import requests
import time
import random

headers = {
    # headers
}

pages = [
    # url 1
    # url 2
]

for url in pages:
    page = requests.get(url, headers=headers)
    # scraping
    time.sleep(random.uniform(5, 10))  # random 5-10 second pause between requests
import requests

page = requests.get('https://www.google.com/')
print(page.request.headers)
# {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate',
# 'Accept': '*/*', 'Connection': 'keep-alive'}

print(page.headers)
# {'Date': 'Tue, 18 Feb 2020 15:55:09 GMT', 'Content-Type': 'application/json',
# 'Content-Length': '51', 'Connection': 'keep-alive', 'Server': 'gunicorn/19.9.0',
# 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Credentials': 'true'}
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0), [email protected]'
}
page = requests.get('https://www.google.com/', headers=headers)
print(page.request.headers)
# {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0), [email protected]',
# 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0), [email protected]',
    'Referer': 'https://www.google.com/'
}
page = requests.get('https://en.wikipedia.org/wiki/Web_scraping', headers=headers)
print(page.request.headers)
# {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0), [email protected]',
# 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*',
# 'Connection': 'keep-alive', 'Referer': 'https://www.google.com/'}
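To confirm that the custom headers actually reach the server, one quick sanity check (a minimal sketch, not part of the original example) is to send the same headers to httpbin.org/headers, which simply echoes back the headers it received:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0), [email protected]',
    'Referer': 'https://www.google.com/'
}
# httpbin.org/headers returns the received request headers as JSON,
# i.e. exactly what a target server would see
page = requests.get('https://httpbin.org/headers', headers=headers)
print(page.json()['headers'])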
import requests
from bs4 import BeautifulSoup

url = 'https://free-proxy-list.net/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# the table has 8 columns, so every 8th <td> element is an IP
ips = [x.text for x in soup.find_all('td', limit=160)[::8]]
# skip the first <td> (the IP), then every 8th element is the matching port
ports = [x.text for x in soup.find_all('td', limit=160)[1::8]]
proxies = [":".join([a, b]) for a, b in zip(ips, ports)]

def new_proxies():
    url = 'https://free-proxy-list.net/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    ips = [x.text for x in soup.find_all('td', limit=160)[::8]]
    ports = [x.text for x in soup.find_all('td', limit=160)[1::8]]
    proxies = [":".join([a, b]) for a, b in zip(ips, ports)]
    return proxies
addresses = new_proxies()
url = 'https://httpbin.org/ip'

for nb, i in enumerate(addresses, 1):
    # route both http and https traffic through the current proxy
    proxies = {'http': 'http://' + i, 'https': 'http://' + i}
    print("Request Number %d" % nb)
    try:
        page = requests.get(url, proxies=proxies)
        print(page.json())
    except Exception:
        # dead or blocked proxies are simply skipped
        print('Connection failed. Skipping proxy.')
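The pieces above can be combined. The sketch below (an illustrative combination, not code from the original) reuses new_proxies() together with the header and random-delay ideas from the earlier snippets, picking a random proxy for each page; the headers and pages placeholders are assumptions to fill in:

import random
import time

import requests

headers = {
    # headers, including a realistic User-Agent as shown earlier
}
pages = [
    # url 1
    # url 2
]

addresses = new_proxies()  # proxy list scraped with the function above

for url in pages:
    proxy = random.choice(addresses)  # pick a different proxy at random each time
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        page = requests.get(url, headers=headers, proxies=proxies)
        # scraping
    except Exception:
        print('Connection failed. Skipping proxy.')
    time.sleep(random.uniform(5, 10))  # random pause between requests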
import requests
s = requests.Session()
# this is a cookie pre-set for testing purposes
s.get('http://httpbin.org/cookies/set/sessioncookie/1234')
# next request
page = s.get('http://httpbin.org/cookies')
print(page.request.headers)
# {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate',
# 'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'sessioncookie=1234'}
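A session can also carry default headers, so the User-Agent from the earlier examples only needs to be set once. This uses the standard requests Session API; the sketch below is an assumed wiring rather than code from the original:

import requests

s = requests.Session()
# headers set on the session are merged into every request it sends
s.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0)'
})
page = s.get('http://httpbin.org/headers')
print(page.request.headers)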