开源软件名称:WinCrawler 开源软件地址:https://gitee.com/UnlimitedBladeWorks_123/WinCrawler 开源软件介绍:WinCrawler通用爬虫框架,设计参考八爪鱼、火车头等爬虫软件,采用“页面+接口”可以混用的方式进行数据采集。任务因中断而未采集完数据时,后续执行不会重复采集已采集过的数据。 特色
使用方法
步骤配置逻辑步骤
if(附加数据持久化){ "name":"if条件判断", "type":"ifGroup", "condition":"${(ret.body.success==true)?c}", "needStore": true, "storeKey": "age_${o.age}-sex_${o.sex}-payEndYear_${o.payEndYear}", "stepList":[ { "name":"打开登录页面", "type":"openWebPage", "url":"http://www.baidu.com/" } ], "delayTime": 100} for(附加数据持久化){ "name":"条件loop", "type":"forGroup", "listStr":"${descartes('age',range(30,70),'sex','[\"Sex1\",\"Sex2\"]','payEndYear','[2,3,4]')}", "list": [], "iterAlias":"o", "needStore": true, "storeKey": "age_${o.age}-sex_${o.sex}-payEndYear_${o.payEndYear}", "stepList":[{},{}] } while(附加数据持久化){ "name":"while循环", "type":"whileGroup", "breakCondition":"${(ret.body.success==true)?c}", "maxCount":3, "needStore": true, "storeKey": "age_${o.age}-sex_${o.sex}-payEndYear_${o.payEndYear}", "stepList":[]} ifSignal(同代码if逻辑, step_continue,step_break,step_throwe){ "name":"if条件判断", "type":"ifGroup", "condition":"${(ret.body.success==true)?c}", "signal": "step_continue", "signalMsg": "记录到数据库的信息"} setV { "name":"设置常量", "type":"setV", "values":{ "origin_ids":"${pid},<#list productList as p>${p.productId}<#if p?has_next>,</#if></#list>" }, "alias":"init" } 页面步骤
点击{ "name":"选择投保人性别", "type":"click", "frameXPathList":[ "//*[@id='page-content']", "//*[@id='WorkFrame']" ], "matchXPath":"//*[@id='appSex1']", "hasAlert": false} 输入文本{ "name":"输入用户名", "type":"inputText", "waitElementXPath":"//*[@id='frmInput']", "matchXPath":"//*[@id='username']", "value":"33011082001025"} 打开网页{ "name":"打开登录页面", "type":"openWebPage", "url":"http://ehome.e-chinalife.com/", "timeOut": 5000 ,"//":"(不配置,使用默认值)"} 验证码校验{ "name":"输入验证码登录", "type":"validateCode", "waitElementXPath":"//*[@id='frmInput']", "codeImageXPath":"//*[@id='validateNum']", "inputTextXPath":"//*[@id='inputValidateNum']", "ajaxFlag":true, "presentXPathOnSuccess":"//img[@id='rightId' and contains(@src,'tick_circle.png')]", "presentXPathOnFailure":"//img[@id='rightId' and contains(@src,'cross_circle.png')]", "submitXPath":"//*[@id='frmInput']/div/table/tbody/tr[4]/td/input[1]", "retryTimesOnFailure":3} 下拉框单选(select){ "name":"选择缴费方式", "type":"select", "matchXPath":"//select[@name='common[pay_period]' and @data-key='pay_period']", "selectConfig":{ "type":"byIndex", "value":"${o.payPeriod}" }} 下拉框多选(select){ "name":"选择缴费方式", "type":"multiSelect", "matchXPath":"//select[@name='common[pay_period]' and @data-key='pay_period']", "selectConfigList":[{ "type":"byIndex", "value":"${o.payPeriod}" }]} 下拉框选择(扩展,逻辑上){ "name":"选择新增计划书", "type":"selectEx", "waitElementXPath":"//*[@id='menu-content']", "matchXPath":"//*[@id='menu-flag']", "selectXPath":"//*[@id='menu-content']/div/ul/li[@menuid=508]"} 提取数据{ "name":"提取数据", "type":"extractData", "frameXPathList":[ "//*[@id='page-content']", "//*[@id='WorkFrame']" ], "matchXPath":"//*[@id='riskcheck']/tbody/tr[position()>3]", "ruleList":[ { "name":"c2", "rule":".//td[2]" }, { "name":"c3", "rule":".//td[3]" }, { "name":"c4", "rule":".//td[4]" }, { "name":"c5", "rule":".//td[5]" }, { "name":"c6", "rule":".//td[6]" }, { "name":"c7", "rule":".//td[7]" }, { "name":"c8", "rule":".//td[8]" } ]} 获取cookie{ "name":"获取cookies", "type":"fetchCookie", 
"matchXPath":"//*[@id='menu-flag']", "alias":"cookie"} 设置cookie{ "name":"设置cookies", "type":"setCookie", "cookies":"cookieValue"} AJAX步骤
http请求{ "name":"登录", "type":"http", "requestUrl":"https://insurance-tech-api.winbaoxian.cn/api/group/login", "requestParams":{ "mobile":"${a}", "validateCode":"${b}" }, "requestMethod":"POST", "requestContentType":"application/x-www-form-urlencoded", "alias":"ret"} 识别图片验证码{ "name":"识别验证码", "type":"ocr", "requestUrl":"http://ehome.e-chinalife.com/passport/validateNum.jsp", "requestHeader":{ "cookie":"${toJSONString(homePage.cookies)}" }, "alias":"ret"} 提取http请求数据{ "tpl":"${toJSONString(data)}", "requestUrl":"https://www.baiduc.com/video/getAudioUrlList?uuid=113f5651c1704ebda34fd5b88f86114903c", "name":"提取数据", "alias":"c", "type":"extractDataHttp"} 导出配置使用simple简单格式导出{ "type":"simple", "fileName":"template3.csv//可不设置,使用task的name", "titles":[ "年龄", "性别", "交费期间", "保单年度", "年末年龄", "年初保费", "累计保费", "年度最高身故保障", "生存给付", "满期给付", "退保金" ], "valueTplList":[ "${params.o.age}", "<#if params.o.sex=='Sex1'>男<#else>女</#if>", "<#if params.o.payEndYear==2>3<#elseif params.o.payEndYear==3>5<#elseif params.o.payEndYear==4>10<#else>0</#if>", "${result.c2}", "${result.c3}", "${result.c4}", "${result.c5}", "${result.c6}", "${result.c7}", "${result.c8}" ]} 使用tpl导出{ "type":"tpl", "fileName":"p2.csv//可不设置,使用task的name", "tpl":"年龄,性别,交费期间,保单年度,年末年龄,年初保费,累计保费,年度最高身故保障,生存给付,满期给付,退保金 <#list list as entity > <#if entity.result?default('')?trim?length gt 0> <#assign p = toJSON(entity.params)> <#assign result = toJSON(entity.result)> <#list result as ret> ${p.o.age},<#if p.o.sex=='Sex1'>男<#else>女</#if>,<#if p.o.payEndYear==2>3<#elseif p.o.payEndYear==3>5<#elseif p.o.payEndYear==4>10<#else>0</#if>,${ret.c2},${ret.c3},${ret.c4},${ret.c5},${ret.c6},${ret.c7},${ret.c8} </#list> <#else> ${entity.keyName},error </#if> </#list>"} 联系方式
|
请发表评论