admin管理员组文章数量:1559072
1.最近经理被邮件烦到了,可能是邮件太多了,想着批量读取邮件并把需要的数据处理好,传到他的接口去。无奈腾讯的js又多、又难、又绕,看不懂(手动狗头),于是准备使用selenium模拟浏览器的方式来读取邮件。
2.由于需要每天定时爬取,所以这里使用的是scrapy框架。其实主要是为了定时方便才用的:之前已经有定时框架了,直接发布就行,所以采用scrapy。不需要定时的朋友直接用自己喜欢的方式就行了。
3.先导入一些库,好像有些没用到。。反正是灰色的就去掉或者注释就行了~
import json
import re
import time

import requests
import scrapy
from django.http import request, HttpResponse
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ChromeOptions, DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait
4.以下是爬虫代码了,嗐,别喷我,写在start_requests里面方便一点。。。别问我为什么。。本人很懒。。但是给你们注释好~
5.这里有个坑:注意下面代码里的browser.switch_to.frame("mainframe"),必须先切换到这个iframe,否则接下来的元素会找不到并报错;切换之后等待3秒钟也是必要的,不然同样找不到。
class temailspider(scrapy.Spider):
    """Read selected Tencent Exmail messages by driving Chrome through Selenium.

    All of the work happens inside ``start_requests``: the spider logs in,
    opens the inbox, optionally jumps to a given list page, then walks the
    list page by page.  Every mail whose title equals ``entrust_title`` or
    ``booking_title`` is opened, its body text is parsed into a dict, the
    dict is printed, and the browser returns to the list.

    NOTE(review): the element ids used in the XPath expressions, the frame
    name ``"mainframe"`` and the JSON keys of the result dicts are kept
    exactly as found in this file.  They may have lost their original
    capitalisation; verify them against the live page / receiving API.
    """

    name = 'temail'
    allowed_domains = []
    start_urls = []
    # Scrapy setting names are upper-case; a lower-case key is silently
    # ignored.  This limits concurrent requests (it is not a thread count).
    custom_settings = {'CONCURRENT_REQUESTS': '10'}

    # --- values the user is expected to replace -------------------------
    login_url = "https://exmail.qq.com/login"
    login_user = '这里输入你的账号'
    login_password = '这里输入你的密码'
    # Titles of the two kinds of mail to open.  Inspect the list with XPath
    # to find the real titles.  While both still hold the same placeholder,
    # the booking branch can never be reached.
    entrust_title = '邮件标题名'
    booking_title = '邮件标题名'
    # Endpoint that receives entrust data; only used for the first table
    # group.  The POST is optional and may be removed.
    post_url = '传入接口的url'
    # List page to jump to before crawling; set to '' / None to start from
    # the first page.
    start_page = '10'
    # Upper bound on the number of list pages visited.
    max_pages = 1000

    # Result keys, in the order of the body text nodes they are read from
    # (text node 1 -> first key, text node 2 -> second key, ...).
    _entrust_keys = ("customernumber", "shipmentnumber", "dischargingport", "boxremark")
    _booking_keys = ("customernumber", "shipmentnumber", "blno", "dischargingport",
                     "predistributionbox")
    # Characters removed from a body line before it is split on ':'.
    _strip_table = str.maketrans('', '', " \n\xa0'")

    _rows_xpath = '//*[@id="frm"]/div[{group}]/table'
    _title_xpath = ('//*[@id="frm"]/div[{group}]/table[{row}]'
                    '//tr/td[3]/table//tr/td[3]/div[1]/u')

    def chuliinfo(self, shuju):
        """Join the strings in *shuju* and drop every newline and space."""
        joined = ''.join(shuju)
        return joined.replace('\n', '').replace(' ', '')

    def start_requests(self):
        """Run the whole Selenium session and return no Scrapy requests.

        Returns:
            An empty list.  Scrapy iterates the return value of this method,
            so returning ``None`` would fail after the crawl has finished.
        """
        browser = self._build_browser()
        try:
            # Hide ``navigator.webdriver`` from pages loaded afterwards.
            browser.execute_cdp_cmd(
                'Page.addScriptToEvaluateOnNewDocument',
                {'source': 'Object.defineProperty(navigator, "webdriver", '
                           '{get: () => undefined})'})
            wait = WebDriverWait(browser, 30)
            self._login(browser, wait)
            # 6. The jump below is optional: it goes straight to the list
            # page you choose; without it the crawl starts from the first
            # page.  Worth trying out.
            if self.start_page:
                self._jump_to_page(browser, wait)
            # 7. Next comes the key part, parsing and looping.  The
            # time.sleep calls in here caused the author a lot of grief.
            self._crawl_pages(browser, wait)
        finally:
            # Always release the Chrome process, even when a wait times out.
            browser.quit()
        return []

    @staticmethod
    def _build_browser():
        """Create the Chrome driver with the automation hints switched off."""
        option = ChromeOptions()
        option.add_experimental_option('excludeSwitches', ['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        # Headless mode is noticeably faster; enable it if desired:
        # option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        # Image loading can be blocked for speed with the experimental
        # option 'prefs' = {'profile.default_content_setting_values':
        # {'images': 2}} (2 means "block").
        # Do not block on page load; explicit waits are used instead.  Set on
        # the options object so the shared DesiredCapabilities dict is not
        # mutated.
        option.set_capability('pageLoadStrategy', 'none')
        return webdriver.Chrome(options=option)

    @staticmethod
    def _locate(wait, xpath):
        """Wait until the element at *xpath* is present and return it."""
        return wait.until(ec.presence_of_element_located((By.XPATH, xpath)))

    def _login(self, browser, wait):
        """Log in, open the inbox and switch into the mail-list frame."""
        browser.get(self.login_url)
        # Switch the login form to account/password mode.
        self._locate(wait, '//*[@id="loginform"]/div[3]/div[3]/a[1]').click()
        self._locate(wait, '//*[@id="inputuin"]').send_keys(self.login_user)
        self._locate(wait, '//*[@id="pp"]').send_keys(self.login_password)
        self._locate(wait, '//*[@id="btlogin"]').click()
        # Open the inbox folder.
        self._locate(wait, '//*[@id="folder_1"]').click()
        # The list lives inside this iframe; without switching into it the
        # following lookups fail, and the pause is needed for it to render.
        browser.switch_to.frame("mainframe")
        time.sleep(3)

    def _jump_to_page(self, browser, wait):
        """Use the list's "jump to page" control to open ``start_page``."""
        self._locate(wait, '//*[@id="maillistjump"]').click()
        self._locate(wait, '//*[@id="list"]/div[1]//input').send_keys(self.start_page)
        self._locate(wait, '//*[@id="list"]/div[1]//a').click()
        # Re-enter the frame after the list has been replaced.
        browser.switch_to.default_content()
        browser.switch_to.frame("mainframe")
        time.sleep(3)

    def _crawl_pages(self, browser, wait):
        """Process up to ``max_pages`` list pages, following "next page"."""
        for page_no in range(self.max_pages):
            if page_no != 0:
                # The previous iteration left the frame after paging.
                browser.switch_to.frame("mainframe")
            time.sleep(3)
            # The list is rendered as tables under div[2]; some pages carry
            # a second group under div[3].
            first_rows = wait.until(ec.presence_of_all_elements_located(
                (By.XPATH, self._rows_xpath.format(group=2))))
            second_rows = browser.find_elements(
                By.XPATH, self._rows_xpath.format(group=3))

            self._process_group(browser, wait, 2, len(first_rows), post_entrust=True)
            # Iterate over the second group's own row count.
            self._process_group(browser, wait, 3, len(second_rows), post_entrust=False)

            try:
                next_link = wait.until(
                    ec.presence_of_element_located((By.ID, 'nextpage')))
            except TimeoutException:
                # No "next page" link appeared: treat this as the last page.
                break
            next_link.click()
            browser.switch_to.default_content()

    def _process_group(self, browser, wait, group, row_count, post_entrust):
        """Open and parse every matching mail in one table group of the list.

        Args:
            browser: the active Selenium driver.
            wait: the ``WebDriverWait`` bound to *browser*.
            group: index of the ``div`` under ``#frm`` holding the rows.
            row_count: number of row tables in that group.
            post_entrust: when true, entrust data is also POSTed to
                ``post_url``.
        """
        for row in range(1, row_count + 1):  # XPath positions are 1-based
            title_el = self._locate(
                wait, self._title_xpath.format(group=group, row=row))
            title = title_el.text
            if title == self.entrust_title:
                keys, is_entrust = self._entrust_keys, True
            elif title == self.booking_title:
                keys, is_entrust = self._booking_keys, False
            else:
                continue

            print(title)
            title_el.click()  # open the mail
            time.sleep(1)
            back_link = self._locate(wait, '//*[@id="mainmail"]/div[1]/div[2]/a[1]')
            tree = etree.HTML(browser.page_source)
            content = tree.xpath('//*[@id="mailcontentcontainer"]//text()')

            info = self._extract_fields(content, keys)
            if info is None:
                self.logger.warning(
                    "mail %r has only %d text nodes; skipped", title, len(content))
            else:
                print(info)
                if is_entrust and post_entrust:
                    requests.post(
                        self.post_url,
                        data=json.dumps(info),
                        headers={"content-type": "application/json"},
                        timeout=30,  # never hang the crawl on the endpoint
                    )
            back_link.click()  # return to the list
            time.sleep(1)

    @classmethod
    def _field_value(cls, text):
        """Return the part after the last ':' of *text*, whitespace removed."""
        return text.translate(cls._strip_table).split(':')[-1]

    @classmethod
    def _extract_fields(cls, content, keys):
        """Map body text nodes 1..len(keys) onto *keys*.

        Returns:
            The populated dict, or ``None`` when *content* has too few text
            nodes to fill every key.
        """
        if len(content) <= len(keys):
            return None
        return {key: cls._field_value(content[pos])
                for pos, key in enumerate(keys, start=1)}
8.如果您还有不懂的可以私信我hhhhhhhhhh在线接单??哈哈哈~
# Run this file directly to start the crawl.  The guard keeps the command
# from firing again when the module is merely imported.
if __name__ == "__main__":
    from scrapy import cmdline

    cmdline.execute("scrapy crawl temail".split())
九游会下载的版权声明:本文标题:python3 scrapy selenium腾讯企业邮箱的指定邮件读取功能 内容由热心网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:https://www.elefans.com/dongtai/1727422262a1113712.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论