Push-Pull
了解 Oxylabs 网页爬虫 API 的 Push-Pull 集成方法。提交任务后,可稍后使用 JSON 格式轮询结果端点。
Push-Pull 是我们推荐的集成方法,用于可靠地处理大量数据。
访问 Oxylabs 的 GitHub 仓库以获取一个完整的可运行示例,演示 Python 中的 Push-Pull 集成.
Push-Pull 是一种异步集成方法。在提交任务后,您会立即收到一个 JSON 响应,其中包含所有任务详细信息,包括任务参数、ID 以及用于下载结果和检查状态的 URL。任务处理完成后,如果您提供了回调 URL,我们会向您的服务器发送一个 JSON 有效负载来通知您。结果在任务完成后至少 24 小时内可供检索。
使用 Push-Pull,您可以将结果直接上传到您的 云存储 (Google Cloud Storage、AWS S3、Alibaba Cloud OSS 或其他兼容 S3 的存储)。
您还可以使用以下工具探索 Push-Pull 的工作原理: Postman.
单次任务
端点
此端点仅接受单个 query 或 url 值。
POST https://data.oxylabs.io/v1/queries输入
以 JSON 有效负载的形式提供任务参数,如下面的示例所示。Python 和 PHP 示例包含注释以便说明。
curl --user "user:pass1" \
'https://data.oxylabs.io/v1/queries' \
-H "Content-Type: application/json" \
-d '{"source": "ENTER_SOURCE_HERE", "url": "https://www.example.com", "geo_location": "United States", "callback_url": "https://your.callback.url", "storage_type": "s3", "storage_url": "s3://your.storage.bucket.url"}'import requests
from pprint import pprint
# Build the job payload that is sent as the JSON request body.
payload = {
"source": "ENTER_SOURCE_HERE", # Source you choose, e.g. "universal"
"url": "https://www.example.com", # Check whether the specific source expects "url" or "query"
"geo_location": "United States", # Some sources accept a zip code or coordinates
#"render" : "html", # Uncomment to render JavaScript within the page
#"render" : "png", # Uncomment to take a screenshot of the scraped page
#"parse" : True, # Check which sources support parsed data
# "callback_url" is required when using a callback listener.
"callback_url": "https://your.callback.url",
"storage_type": "s3",
"storage_url": "s3://your.storage.bucket.url"
}
# Submit the job and get the response.
response = requests.request(
'POST',
'https://data.oxylabs.io/v1/queries',
auth=('YOUR_USERNAME', 'YOUR_PASSWORD'), # Put your credentials here
json=payload,
)
# Print the prettified response to stdout.
pprint(response.json())<?php
// Job parameters; they are JSON-encoded and sent as the request body.
$params = array(
    'source' => 'ENTER_SOURCE_HERE', // Source you choose, e.g. "universal"
    'url' => 'https://www.example.com', // Check whether the specific source expects "url" or "query"
    'geo_location' => 'United States', // Some sources accept a zip code or coordinates
    //'render' => 'html', // Uncomment to render JavaScript within the page
    //'render' => 'png', // Uncomment to take a screenshot of the scraped page
    //'parse' => TRUE, // Check which sources support parsed data
    'callback_url' => 'https://your.callback.url', // Required when using a callback listener
    'storage_type' => 's3',
    'storage_url' => 's3://your.storage.bucket.url'
);
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://data.oxylabs.io/v1/queries");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($params));
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_USERPWD, "YOUR_USERNAME" . ":" . "YOUR_PASSWORD"); // 在此处填写您的凭据
$headers = array();
$headers[] = "Content-Type: application/json";
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
$result = curl_exec($ch);
echo $result;
if (curl_errno($ch)) {
echo 'Error:' . curl_error($ch);
}
curl_close ($ch);
?>using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Http.Json;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
static async Task Main()
{
const string Username = "YOUR_USERNAME";
const string Password = "YOUR_PASSWORD";
var parameters = new Dictionary<string, string>()
{
{ "source", "ENTER_SOURCE_HERE" },
{ "url", "https://example.com" },
{ "geo_location", "United States" },
{ "callback_url", "https://your.callback.url" },
};
var client = new HttpClient();
Uri baseUri = new Uri("https://data.oxylabs.io");
client.BaseAddress = baseUri;
var requestMessage = new HttpRequestMessage(HttpMethod.Post, "/v1/queries");
requestMessage.Content = JsonContent.Create(parameters);
var authenticationString = $"{Username}:{Password}";
var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.ASCIIEncoding.UTF8.GetBytes(authenticationString));
requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);
var response = await client.SendAsync(requestMessage);
var contents = await response.Content.ReadAsStringAsync();
Console.WriteLine(contents);
}
}
}
package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
)
// main submits a single scraping job to the /v1/queries endpoint using HTTP
// Basic authentication and prints the raw response body to stdout.
func main() {
	const Username = "YOUR_USERNAME"
	const Password = "YOUR_PASSWORD"

	// Job parameters, sent as the JSON request body.
	payload := map[string]string{
		"source":       "ENTER_SOURCE_HERE",
		"url":          "https://example.com",
		"geo_location": "United States",
		"callback_url": "https://your.callback.url",
	}

	jsonValue, err := json.Marshal(payload)
	if err != nil {
		panic(err)
	}

	request, err := http.NewRequest("POST",
		"https://data.oxylabs.io/v1/queries",
		bytes.NewBuffer(jsonValue),
	)
	if err != nil {
		panic(err)
	}
	request.Header.Add("Content-type", "application/json")
	request.SetBasicAuth(Username, Password)

	client := &http.Client{}
	response, err := client.Do(request)
	if err != nil {
		// response is nil on error, so it must not be dereferenced below.
		panic(err)
	}
	// Close the body so the underlying connection is released.
	defer response.Body.Close()

	responseText, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(responseText))
}
package org.example;
import okhttp3.*;
import org.json.JSONObject;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "YOUR_USERNAME";
public static final String PASSWORD = "YOUR_PASSWORD";
public void run() {
JSONObject jsonObject = new JSONObject();
jsonObject.put("source", "ENTER_SOURCE_HERE");
jsonObject.put("url", "https://example.com");
jsonObject.put("geo_location", "United States");
jsonObject.put("callback_url", "https://your.callback.url");
Authenticator authenticator = (route, response) -> {
String credential = Credentials.basic(USERNAME, PASSWORD);
return response
.request()
.newBuilder()
.header(AUTHORIZATION_HEADER, credential)
.build();
};
var client = new OkHttpClient.Builder()
.authenticator(authenticator)
.build();
var mediaType = MediaType.parse("application/json; charset=utf-8");
var body = RequestBody.create(jsonObject.toString(), mediaType);
var request = new Request.Builder()
.url("https://data.oxylabs.io/v1/queries")
.post(body)
.build();
try (var response = client.newCall(request).execute()) {
assert response.body() != null;
System.out.println(response.body().string());
} catch (Exception exception) {
System.out.println("Error: " + exception.getMessage());
}
System.exit(0);
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}import fetch from 'node-fetch';
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';
const body = {
source: 'ENTER_SOURCE_HERE',
url: 'https://www.example.com',
geo_location: 'United States',
callback_url: 'https://your.callback.url',
};
const response = await fetch('https://data.oxylabs.io/v1/queries', {
method: 'post',
body: JSON.stringify(body),
headers: {
'Content-Type': 'application/json',
'Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
}
});
console.log(await response.json());输出
该 API 将以包含任务信息的 JSON 响应,类似如下:
{
"callback_url": "https://your.callback.url",
"client_id": 5,
"context": [
{
"key": "results_language",
"value": null
},
{
"key": "safe_search",
"value": null
},
{
"key": "tbm",
"value": null
},
{
"key": "cr",
"value": null
},
{
"key": "filter",
"value": null
}
],
"created_at": "2024-06-26 00:00:01",
"domain": "com",
"geo_location": "United States",
"id": "12345678900987654321",
"limit": 10,
"locale": null,
"pages": 1,
"parse": false,
"render": null,
"url": "https://www.example.com",
"source": "universal",
"start_page": 1,
"status": "pending",
"storage_type": "s3",
"storage_url": "YOUR_BUCKET_NAME/12345678900987654321.json",
"subdomain": "www",
"updated_at": "2024-06-26 00:00:01",
"user_agent_type": "desktop",
"_links": [
{
"rel": "self",
"href": "http://data.oxylabs.io/v1/queries/12345678900987654321",
"method": "GET"
},
{
"rel": "results",
"href": "http://data.oxylabs.io/v1/queries/12345678900987654321/results",
"method": "GET"
}
]
}数据字典
有关任务输入参数的详细说明,请参阅下表或查阅您感兴趣的抓取器的具体文档页面。
created_at
任务创建的日期时间。
字符串
client_id
与发出请求的客户端用户名相关联的数字 ID。
字符串
client_notes
客户端在发送任务时提交的备注。
字符串
id
任务的唯一 ID。
字符串
status
任务的状态。 pending 表示任务仍在处理中。 done 表示我们已完成该任务。 faulted 表示我们在尝试完成该任务时遇到错误并放弃了。
字符串
subdomain
网站的子域名。
字符串
updated_at
任务最后更新的日期时间。对于已完成(status is done 或 faulted)的任务,此日期时间表示任务完成的时间。
字符串
_links
与提供的输入相关的链接列表。
JSON 数组
_links:rel
链接类型。 self URL 包含任务的元数据,而 results URL 包含任务结果。
字符串
_links:href
资源的 URL。
字符串
_links:method
与给定 URL 交互时应使用的 HTTP 方法。
字符串
回调
回调是一个 POST 请求,我们发送到您的机器,通知数据提取任务已完成并提供下载抓取内容的 URL。这意味着您无需 手动检查任务状态 。一旦数据可用,我们会通知您,您现在需要做的就是 检索它.
输入
# 这是一个简单的 Sanic Web 服务器,监听本地 8080 端口的回调路由。
# 它会将任务结果打印到 stdout。
import requests
from pprint import pprint
from sanic import Sanic, response
AUTH_TUPLE = ('user', 'pass1')
app = Sanic()
# Define the /job_listener endpoint, which accepts POST requests.
@app.route('/job_listener', methods=['POST'])
async def job_listener(request):
    """Handle a job-completion callback.

    Reads the JSON body of the callback, looks for the first ``_links`` entry
    whose ``rel`` is ``results``, downloads that URL with ``AUTH_TUPLE`` and
    pretty-prints the returned JSON. Any exception is printed rather than
    raised, and the handler always answers ``{'status': 'ok'}`` with HTTP 200.
    """
    try:
        res = request.json
        links = res.get('_links', [])
        for link in links:
            if link['rel'] == 'results':
                # Sanic is asynchronous but requests is synchronous; to take
                # full advantage of Sanic, use aiohttp.
                res_response = requests.request(
                    method='GET',
                    url=link['href'],
                    auth=AUTH_TUPLE,
                )
                pprint(res_response.json())
                break
    except Exception as e:
        print("Listener exception: {}".format(e))
    return response.json(status=200, body={'status': 'ok'})
if __name__ == '__main__':
app.run(host='0.0.0.0', port=8080)<?php
$stdout = fopen('php://stdout', 'w');
if (isset($_POST)) {
$result = array_merge($_POST, (array) json_decode(file_get_contents('php://input')));
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://data.oxylabs.io/v1/queries/".$result['id'].'/results');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "GET");
curl_setopt($ch, CURLOPT_USERPWD, "user" . ":" . "pass1");
$result = curl_exec($ch);
fwrite($stdout, $result);
if (curl_errno($ch)) {
echo 'Error:' . curl_error($ch);
}
curl_close ($ch);
}
?>using Microsoft.AspNetCore.Builder;
using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using System;
using System.Collections.Generic;
using System.Net.Http;
namespace OxyApiWeb
{
public class Callback
{
public Link[] _links { get; set; }
}
public class Link
{
public string rel { get; set; }
public string href { get; set; }
}
public class Startup
{
private const string USERNAME = "YOUR_USERNAME";
private const string PASSWORD = "YOUR_PASSWORD";
public Startup(IConfiguration configuration)
{
Configuration = configuration;
client = new HttpClient();
}
public IConfiguration Configuration { get; }
private HttpClient client;
public void ConfigureServices(IServiceCollection services)
{
services.AddControllers();
}
public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
{
if (env.IsDevelopment())
{
app.UseDeveloperExceptionPage();
}
app.UseRouting();
app.UseAuthorization();
app.UseEndpoints(endpoints =>
{
endpoints.MapPost("/job_listener", async context =>
{
var callback = await System.Text.Json.JsonSerializer.DeserializeAsync<Callback>(context.Request.Body);
foreach (var link in callback._links)
{
if (link.rel != "results")
{
continue;
}
var requestMessage = new HttpRequestMessage(HttpMethod.Get, new Uri(link.href));
var authenticationString = $"{USERNAME}:{PASSWORD}";
var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.ASCIIEncoding.UTF8.GetBytes(authenticationString));
requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);
var response = await client.SendAsync(requestMessage);
var contents = await response.Content.ReadAsStringAsync();
Console.WriteLine(contents);
}
var okMessage = new Dictionary<string, string>()
{
{ "message", "ok" }
};
await System.Text.Json.JsonSerializer.SerializeAsync(context.Response.Body, okMessage);
});
});
}
}
}
package main
import (
"fmt"
"github.com/labstack/echo/v4"
"io/ioutil"
"net/http"
)
const Username = "YOUR_USERNAME"
const Password = "YOUR_PASSWORD"
type Callback struct {
Links []Link `json:"_links"`
}
type Link struct {
Href string `json:"href"`
Method string `json:"method"`
Rel string `json:"rel"`
}
func main() {
echoServer := echo.New()
client := &http.Client{}
echoServer.POST("/job_listener", func(context echo.Context) error {
callback := new(Callback)
if err := context.Bind(callback); err != nil {
return err
}
for _, link := range callback.Links {
if link.Rel != "results" {
continue
}
request, _ := http.NewRequest("GET",
link.Href,
nil,
)
request.Header.Add("Content-type", "application/json")
request.SetBasicAuth(Username, Password)
response, _ := client.Do(request)
responseText, _ := ioutil.ReadAll(response.Body)
fmt.Println(string(responseText))
}
return context.JSON(http.StatusOK, map[string]string { "status": "ok" })
})
echoServer.Logger.Fatal(echoServer.Start(":8080"))
}package org.example;
import okhttp3.*;
import com.sun.net.httpserver.HttpServer;
import org.apache.commons.io.IOUtils;
import org.json.JSONArray;
import org.json.JSONObject;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Objects;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "YOUR_USERNAME";
public static final String PASSWORD = "YOUR_PASSWORD";
/**
 * Starts an HTTP server on port 8080 whose {@code /job_listener} context handles job callbacks.
 *
 * <p>For each callback the handler parses the JSON request body, fetches every {@code _links}
 * entry whose {@code rel} is {@code "results"}, prints the fetched body to stdout and then
 * replies with {@code {"status":"ok"}} and HTTP 200. If the server cannot be created, the
 * stack trace is printed and the process exits with status 1.
 */
@Override
public void run() {
    HttpServer server = null;
    try {
        server = HttpServer.create(new InetSocketAddress("0.0.0.0", 8080), 0);
    } catch (IOException exception) {
        exception.printStackTrace();
        System.exit(1);
    }

    // OkHttp calls the authenticator to attach HTTP Basic credentials to the request.
    Authenticator authenticator = (route, response) -> {
        String credential = Credentials.basic(USERNAME, PASSWORD);
        return response
                .request()
                .newBuilder()
                .header(AUTHORIZATION_HEADER, credential)
                .build();
    };
    var client = new OkHttpClient.Builder()
            .authenticator(authenticator)
            .build();

    server.createContext("/job_listener", exchange -> {
        var requestBody = IOUtils.toString(exchange.getRequestBody(), StandardCharsets.UTF_8);
        JSONObject requestJson = new JSONObject(requestBody);
        JSONArray links = requestJson.getJSONArray("_links");

        for (var link : links.toList()) {
            var linkMap = (Map<?, ?>) link;
            // Only the "results" link points at the scraped content.
            if (!Objects.equals(linkMap.get("rel"), "results")) {
                continue;
            }
            var request = new Request.Builder()
                    .url((String) linkMap.get("href"))
                    .get()
                    .build();
            try (var response = client.newCall(request).execute()) {
                assert response.body() != null;
                System.out.println(response.body().string());
            } catch (Exception exception) {
                System.out.println("Error: " + exception.getMessage());
            }
        }

        var responseJson = new JSONObject();
        responseJson.put("status", "ok");
        // Encode once with an explicit charset so the declared length is the
        // exact number of bytes written, independent of the platform default.
        byte[] responseBytes = responseJson.toString().getBytes(StandardCharsets.UTF_8);
        exchange.sendResponseHeaders(200, responseBytes.length);
        try (OutputStream responseBody = exchange.getResponseBody()) {
            responseBody.write(responseBytes);
            responseBody.flush();
        }
        exchange.close();
    });
    server.setExecutor(null);
    server.start();
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}import express from 'express'
import fetch from 'node-fetch';
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';
const app = express();
app.use(express.json());
app.post('/job_listener', async(request, response) => {
for (const index in request.body._links) {
const link = request.body._links[index];
if (link.rel !== 'results') {
continue;
}
const jobResultResponse = await fetch(link.href, {
method: 'get',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
}
});
console.log(await jobResultResponse.json());
}
response.send({status: 'ok'});
});
app.listen(8080);输出
{
"created_at":"2019-10-01 00:00:01",
"updated_at":"2019-10-01 00:00:15",
"locale":null,
"client_id":163,
"user_agent_type":"desktop",
"source":"google_shopping_search",
"pages":1,
"subdomain":"www",
"status":"done",
"start_page":1,
"parse":0,
"render":null,
"priority":0,
"ttl":0,
"origin":"api",
"persist":true,
"id":"12345678900987654321",
"callback_url":"http://your.callback.url/",
"query":"adidas",
"domain":"com",
"limit":10,
"geo_location":null,
{...}
"_links":[
{
"href":"https://data.oxylabs.io/v1/queries/12345678900987654321",
"method":"GET",
"rel":"self"
},
{
"href":"https://data.oxylabs.io/v1/queries/12345678900987654321/results",
"method":"GET",
"rel":"results"
}
],
}检查任务状态
如果您在提交任务时提供了有效的回调 URL,我们将在完成后通过向指定回调 URL 发送 JSON 有效负载来通知您。该有效负载将表明任务已完成并且其状态设置为 done.
但是,如果您提交任务时没有使用回调服务,您可以手动检查任务状态。从提交任务后收到的响应中,取出 rel:self 部分 href 字段里的 URL。检查任务状态的 URL 类似如下: http://data.oxylabs.io/v1/queries/12345678900987654321 。查询此 URL 将返回任务信息,包括其当前 status。
端点
GET https://data.oxylabs.io/v1/queries/{id}输入
curl --user "user:pass1" \
'http://data.oxylabs.io/v1/queries/12345678900987654321'import requests
from pprint import pprint
# 从统计端点获取响应。
response = requests.request(
method='GET',
url='http://data.oxylabs.io/v1/queries/12345678900987654321',
auth=('user', 'pass1'),
)
# 将美化后的 JSON 响应打印到 stdout。
pprint(response.json())<?php
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "http://data.oxylabs.io/v1/queries/12345678900987654321");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "GET");
curl_setopt($ch, CURLOPT_USERPWD, "user" . ":" . "pass1");
$result = curl_exec($ch);
echo $result;
if (curl_errno($ch)) {
echo 'Error:' . curl_error($ch);
}
curl_close ($ch);
?>using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Http.Json;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
static async Task Main()
{
const string JobId = "12345678900987654321";
const string Username = "YOUR_USERNAME";
const string Password = "YOUR_PASSWORD";
var client = new HttpClient();
Uri baseUri = new Uri("https://data.oxylabs.io");
client.BaseAddress = baseUri;
var requestMessage = new HttpRequestMessage(HttpMethod.Get, $"/v1/queries/{JobId}");
var authenticationString = $"{Username}:{Password}";
var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.ASCIIEncoding.UTF8.GetBytes(authenticationString));
requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);
var response = await client.SendAsync(requestMessage);
var contents = await response.Content.ReadAsStringAsync();
Console.WriteLine(contents);
}
}
}package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
const JobId = "12345678900987654321"
const Username = "YOUR_USERNAME"
const Password = "YOUR_PASSWORD"
client := &http.Client{}
request, _ := http.NewRequest("GET",
fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s", JobId),
nil,
)
request.Header.Add("Content-type", "application/json")
request.SetBasicAuth(Username, Password)
response, _ := client.Do(request)
responseText, _ := ioutil.ReadAll(response.Body)
fmt.Println(string(responseText))
}package org.example;
import okhttp3.*;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
private static final String JOB_ID = "12345678900987654321";
public static final String USERNAME = "YOUR_USERNAME";
public static final String PASSWORD = "YOUR_PASSWORD";
public void run() {
Authenticator authenticator = (route, response) -> {
String credential = Credentials.basic(USERNAME, PASSWORD);
return response
.request()
.newBuilder()
.header(AUTHORIZATION_HEADER, credential)
.build();
};
var client = new OkHttpClient.Builder()
.authenticator(authenticator)
.build();
var request = new Request.Builder()
.url(String.format("https://data.oxylabs.io/v1/queries/%s", JOB_ID))
.get()
.build();
try (var response = client.newCall(request).execute()) {
assert response.body() != null;
System.out.println(response.body().string());
} catch (Exception exception) {
System.out.println("Error: " + exception.getMessage());
}
System.exit(0);
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}import fetch from 'node-fetch';
const jobId = '12345678900987654321';
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';
const response = await fetch(`https://data.oxylabs.io/v1/queries/${jobId}`, {
method: 'get',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
}
});
console.log(await response.json());输出
在任务完成后,API 将以 JSON 格式返回查询信息。任务状态将更改为 done,表示任务已完成。您可以通过查询提供的某个链接来检索内容。此外,响应还将包含任务最后更新的时间戳,以便跟踪其处理时间。
{
"client_id": 5,
"context": [
{
"key": "results_language",
"value": null
},
{
"key": "safe_search",
"value": null
},
{
"key": "tbm",
"value": null
},
{
"key": "cr",
"value": null
},
{
"key": "filter",
"value": null
}
],
"created_at": "2019-10-01 00:00:01",
"domain": "com",
"geo_location": null,
"id": "7173957294344910849",
"limit": 10,
"locale": null,
"pages": 1,
"parse": false,
"render": null,
"query": "adidas",
"source": "google_shopping_search",
"start_page": 1,
"status": "done",
"subdomain": "www",
"updated_at": "2019-10-01 00:00:15",
"user_agent_type": "desktop",
"_links": [
{
"rel": "self",
"href": "http://data.oxylabs.io/v1/queries/7173957294344910849",
"method": "GET"
},
{
"rel": "results",
"href": "http://data.oxylabs.io/v1/queries/7173957294344910849/results",
"method": "GET"
},
{
"rel": "results-html",
"href": "http://data.oxylabs.io/v1/queries/7173957294344910849/results?type=raw",
"method": "GET"
},
{
"rel": "results-parsed",
"href": "http://data.oxylabs.io/v1/queries/7173957294344910849/results?type=parsed",
"method": "GET"
},
{
"rel": "results-png",
"href": "http://data.oxylabs.io/v1/queries/7173957294344910849/results?type=png",
"method": "GET"
}
]
}状态值
pending
任务仍在处理中,尚未完成。
done
任务已完成。您可以通过查询响应中 rel:results 部分 href 字段下的 URL 来检索结果,例如: http://data.oxylabs.io/v1/queries/12345678900987654321/results
faulted
该任务存在问题,无法完成。此类 faulted 任务不向您收费。
检索任务内容
一旦任务可供检索,您可以使用响应中在 rel:results 部分提供的 URL。该 URL 看起来像这样: http://data.oxylabs.io/v1/queries/7173957294344910849/results.
端点
您可以使用以下端点检索不同类型的结果:
GET https://data.oxylabs.io/v1/queries/{job_id}/results
GET https://data.oxylabs.io/v1/queries/{job_id}/results?type=raw
GET https://data.oxylabs.io/v1/queries/{job_id}/results?type=parsed
GET https://data.oxylabs.io/v1/queries/{job_id}/results?type=png
GET https://data.oxylabs.io/v1/queries/{job_id}/results?type=xhr
GET https://data.oxylabs.io/v1/queries/{job_id}/results?type=markdown
您也可以在单个响应中检索多种结果类型,例如:
GET https://data.oxylabs.io/v1/queries/{job_id}/results?type=parsed,raw输入
下面是演示如何使用该 /results 端点的代码示例:
curl --user "user:pass1" \
'http://data.oxylabs.io/v1/queries/12345678900987654321/results'import requests
from pprint import pprint
# 从统计端点获取响应。
response = requests.request(
method='GET',
url='http://data.oxylabs.io/v1/queries/12345678900987654321/results',
auth=('user', 'pass1'),
)
# 将美化后的 JSON 响应打印到 stdout。
pprint(response.json())<?php
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "http://data.oxylabs.io/v1/queries/12345678900987654321/results");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "GET");
curl_setopt($ch, CURLOPT_USERPWD, "user" . ":" . "pass1");
$result = curl_exec($ch);
echo $result;
if (curl_errno($ch)) {
echo 'Error:' . curl_error($ch);
}
curl_close ($ch);
?>using System;
using System.Net.Http;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
static async Task Main()
{
const string JobId = "12345678900987654321";
const string Username = "YOUR_USERNAME";
const string Password = "YOUR_PASSWORD";
var client = new HttpClient();
Uri baseUri = new Uri("https://data.oxylabs.io");
client.BaseAddress = baseUri;
var requestMessage = new HttpRequestMessage(HttpMethod.Get, $"/v1/queries/{JobId}/results");
var authenticationString = $"{Username}:{Password}";
var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.ASCIIEncoding.UTF8.GetBytes(authenticationString));
requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);
var response = await client.SendAsync(requestMessage);
var contents = await response.Content.ReadAsStringAsync();
Console.WriteLine(contents);
}
}
}
package main
import (
"fmt"
"io/ioutil"
"net/http"
)
func main() {
const JobId = "12345678900987654321"
const Username = "YOUR_USERNAME"
const Password = "YOUR_PASSWORD"
client := &http.Client{}
request, _ := http.NewRequest("GET",
fmt.Sprintf("https://data.oxylabs.io/v1/queries/%s/results", JobId),
nil,
)
request.Header.Add("Content-type", "application/json")
request.SetBasicAuth(Username, Password)
response, _ := client.Do(request)
responseText, _ := ioutil.ReadAll(response.Body)
fmt.Println(string(responseText))
}package org.example;
import okhttp3.*;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
private static final String JOB_ID = "12345678900987654321";
public static final String USERNAME = "YOUR_USERNAME";
public static final String PASSWORD = "YOUR_PASSWORD";
public void run() {
Authenticator authenticator = (route, response) -> {
String credential = Credentials.basic(USERNAME, PASSWORD);
return response
.request()
.newBuilder()
.header(AUTHORIZATION_HEADER, credential)
.build();
};
var client = new OkHttpClient.Builder()
.authenticator(authenticator)
.build();
var request = new Request.Builder()
.url(String.format("https://data.oxylabs.io/v1/queries/%s/results", JOB_ID))
.get()
.build();
try (var response = client.newCall(request).execute()) {
assert response.body() != null;
System.out.println(response.body().string());
} catch (Exception exception) {
System.out.println("Error: " + exception.getMessage());
}
System.exit(0);
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}import fetch from 'node-fetch';
const jobId = '12345678900987654321';
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';
const response = await fetch(`https://data.oxylabs.io/v1/queries/${jobId}/results`, {
method: 'get',
headers: {
'Content-Type': 'application/json',
'Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
}
});
console.log(await response.json());输出
下表说明了基于 API 请求有效负载中包含的头部,默认和其它可用的结果类型。
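| render 参数 | parse 参数 | xhr 参数 | 默认输出 | 可用输出 |
| --- | --- | --- | --- | --- |
| x | x | x | html | html |
| html | x | x | html | html |
| html | x | true | xhr | html, xhr |
| html | true | true | parsed | html, xhr, parsed |
| png | x | x | png | html, png |
| x | true | x | parsed | html, parsed |
| html | true | x | parsed | html, parsed |
| png | true | x | png | html, parsed, png |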
下面是 /results 端点的示例响应:
{
"results": [
{
"content": "<!doctype html><html>
内容
</html>",
"created_at": "2019-10-01 00:00:01",
"updated_at": "2019-10-01 00:00:15",
"page": 1,
"url": "https://www.google.com/search?q=adidas&hl=en&gl=US",
"job_id": "12345678900987654321",
"status_code": 200
}
]
}
通过设置回调服务,可以在不周期性检查任务状态的情况下自动检索结果。为此,在提交任务时指定一个能够接收传入 HTTP(S) 请求的服务器 URL。当我们的系统完成任务后,它会向所提供的 URL POST 一个 JSON 有效负载,回调服务随后将按照回调实现示例中的方式下载结果。
批量查询
Scraper APIs 支持在单个批量请求中提交最多 5,000 个 query 或 url 参数值。
目前 chatgpt 和 perplexity 来源不支持批量请求。
端点
POST https://data.oxylabs.io/v1/queries/batch系统会将每个 query 或 url 作为单独的任务处理。如果您提供回调 URL,则每个关键字会收到单独的回调。否则,我们的初始响应将包含所有关键字的任务 id信息。例如,如果您发送了 50 个关键字,我们将返回 50 个唯一的任务 id。
输入
您需要将查询参数作为 JSON 有效负载发送。下面是提交批量任务的方式:
curl --user "user:pass1" \
'https://data.oxylabs.io/v1/queries/batch' \
-H 'Content-Type: application/json' \
-d '@keywords.json'import requests
import json
from pprint import pprint
# Load the batch payload from a file.
with open('keywords.json', 'r') as f:
    payload = json.loads(f.read())

# Submit the whole batch in a single request.
response = requests.request(
    'POST',
    'https://data.oxylabs.io/v1/queries/batch',
    auth=('user', 'pass1'),
    json=payload,
)

# Print the prettified response.
pprint(response.json())<?php
$paramsFile = file_get_contents(realpath("keywords.json"));
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://data.oxylabs.io/v1/queries/batch");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $paramsFile);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_USERPWD, "user" . ":" . "pass1");
$headers = array();
$headers[] = "Content-Type: application/json";
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
$result = curl_exec($ch);
echo $result;
if (curl_errno($ch)) {
echo 'Error:' . curl_error($ch);
}
curl_close ($ch);
?>using System;
using System.IO;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
static async Task Main()
{
const string Username = "YOUR_USERNAME";
const string Password = "YOUR_PASSWORD";
var content = File.ReadAllText(@"C:\path\to\keywords.json");
var client = new HttpClient();
var requestMessage = new HttpRequestMessage(HttpMethod.Post, new Uri("https://data.oxylabs.io/v1/queries/batch"));
requestMessage.Content = new StringContent(content, Encoding.UTF8, "application/json");
var authenticationString = $"{Username}:{Password}";
var base64EncodedAuthenticationString = Convert.ToBase64String(ASCIIEncoding.UTF8.GetBytes(authenticationString));
requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);
var response = await client.SendAsync(requestMessage);
var contents = await response.Content.ReadAsStringAsync();
Console.WriteLine(contents);
}
}
}package main
import (
"bytes"
"fmt"
"io/ioutil"
"net/http"
"os"
)
func main() {
const Username = "YOUR_USERNAME"
const Password = "YOUR_PASSWORD"
content, err := os.ReadFile("keywords.json")
if err != nil {
panic(err)
}
client := &http.Client{}
request, _ := http.NewRequest("POST",
"https://data.oxylabs.io/v1/queries/batch",
bytes.NewBuffer(content),
)
request.Header.Add("Content-type", "application/json")
request.SetBasicAuth(Username, Password)
response, _ := client.Do(request)
responseText, _ := ioutil.ReadAll(response.Body)
fmt.Println(string(responseText))
}import okhttp3.*;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "YOUR_USERNAME";
public static final String PASSWORD = "YOUR_PASSWORD";
public void run() {
Path filePath = Path.of("/path/to/keywords.json");
String jsonContent = null;
try {
jsonContent = Files.readString(filePath);
} catch (IOException e) {
throw new RuntimeException(e);
}
Authenticator authenticator = (route, response) -> {
String credential = Credentials.basic(USERNAME, PASSWORD);
return response
.request()
.newBuilder()
.header(AUTHORIZATION_HEADER, credential)
.build();
};
var client = new OkHttpClient.Builder()
.authenticator(authenticator)
.build();
var mediaType = MediaType.parse("application/json; charset=utf-8");
var body = RequestBody.create(jsonContent, mediaType);
var request = new Request.Builder()
.url("https://data.oxylabs.io/v1/queries/batch")
.post(body)
.build();
try (var response = client.newCall(request).execute()) {
assert response.body() != null;
System.out.println(response.body().string());
} catch (Exception exception) {
System.out.println("Error: " + exception.getMessage());
}
System.exit(0);
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}import fetch from 'node-fetch';
import fs from 'fs'
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';
const payload = fs.readFileSync('keywords.json').toString();
const response = await fetch('https://data.oxylabs.io/v1/queries/batch', {
method: 'post',
body: payload,
headers: {
'Content-Type': 'application/json',
'Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
}
});
console.log(await response.json());您可能注意到上面的代码示例没有说明 JSON 有效负载应该如何格式化,并且引用了一个预先制作的 JSON 文件。下面是 keywords.json 文件的内容,其中包含多个 query 参数值:
{
"query":[
"adidas",
"nike",
"reebok"
],
"source": "google_shopping_search",
"domain": "com",
"callback_url": "https://your.callback.url"
}...下面是一个 keywords.json 批量输入文件,包含多个 URL:
{
"url":[
"https://example.com/url1.html",
"https://example.com/url2.html",
"https://example.com/url3.html"
],
"source": "universal",
"callback_url": "https://your.callback.url"
}输出
API 会返回一个 JSON 对象,其中包含每个已创建任务的信息。响应将类似于:
{
"queries": [
{
"callback_url": "https://your.callback.url",
{...}
"created_at": "2019-10-01 00:00:01",
"domain": "com",
"id": "12345678900987654321",
{...}
"query": "adidas",
"source": "google_shopping_search",
{...}
"rel": "results",
"href": "http://data.oxylabs.io/v1/queries/12345678900987654321/results",
"method": "GET"
}
]
},
{
"callback_url": "https://your.callback.url",
{...}
"created_at": "2019-10-01 00:00:01",
"domain": "com",
"id": "12345678901234567890",
{...}
"query": "nike",
"source": "google_shopping_search",
{...}
"rel": "results",
"href": "http://data.oxylabs.io/v1/queries/12345678901234567890/results",
"method": "GET"
}
]
},
{
"callback_url": "https://your.callback.url",
{...}
"created_at": "2019-10-01 00:00:01",
"domain": "com",
"id": "01234567899876543210",
{...}
"query": "reebok",
"source": "google_shopping_search",
{...}
"rel": "results",
"href": "http://data.oxylabs.io/v1/queries/01234567899876543210/results",
"method": "GET"
}
]
}
]
}获取通知者 IP 地址列表
您可能希望将向您发送回调消息的 IP 列入白名单,或为其他用途获取这些 IP 的列表。您可以通过向此端点发送 GET 请求来获取:
端点
GET https://data.oxylabs.io/v1/info/callbacker_ips输入
下面的代码示例展示了如何访问 /callbacker_ips 端点:
curl --user "user:pass1" \
'https://data.oxylabs.io/v1/info/callbacker_ips'import requests
from pprint import pprint
# Query the callback IP endpoint, authenticating with HTTP basic auth.
response = requests.request(
    'GET',
    'https://data.oxylabs.io/v1/info/callbacker_ips',
    auth=('user', 'pass1'),
)
# Pretty-print the decoded JSON response to stdout.
pprint(response.json())<?php
// Configure a GET request to the callback IP endpoint with HTTP basic auth.
$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL => "https://data.oxylabs.io/v1/info/callbacker_ips",
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_CUSTOMREQUEST => "GET",
    CURLOPT_USERPWD => "user" . ":" . "pass1",
));
// Execute the request and print whatever body was returned.
$result = curl_exec($ch);
echo $result;
// Report a cURL-level error, if one was recorded for this handle.
if (curl_errno($ch)) {
    echo 'Error:' . curl_error($ch);
}
curl_close($ch);
?>using System;
using System.Net.Http;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
    // Sends an authenticated GET request to the callbacker_ips endpoint
    // and writes the raw response body to the console.
    static async Task Main()
    {
        const string Username = "YOUR_USERNAME";
        const string Password = "YOUR_PASSWORD";

        // Relative request URIs below are resolved against this base address.
        var client = new HttpClient
        {
            BaseAddress = new Uri("https://data.oxylabs.io")
        };

        // Basic auth: base64("username:password") sent in the Authorization header.
        var credentialBytes = System.Text.Encoding.UTF8.GetBytes($"{Username}:{Password}");
        var encodedCredentials = Convert.ToBase64String(credentialBytes);

        var requestMessage = new HttpRequestMessage(HttpMethod.Get, "/v1/info/callbacker_ips");
        requestMessage.Headers.Add("Authorization", "Basic " + encodedCredentials);

        var response = await client.SendAsync(requestMessage);
        Console.WriteLine(await response.Content.ReadAsStringAsync());
    }
}
}package main
import (
"fmt"
"io/ioutil"
"net/http"
)
// main requests the callbacker_ips endpoint with HTTP basic authentication
// and prints the raw response body to stdout. Any failure while building the
// request, sending it, or reading the body is printed and ends the program
// early instead of being silently discarded.
func main() {
	const Username = "YOUR_USERNAME"
	const Password = "YOUR_PASSWORD"

	client := &http.Client{}
	request, err := http.NewRequest("GET",
		"https://data.oxylabs.io/v1/info/callbacker_ips",
		nil,
	)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	request.Header.Add("Content-type", "application/json")
	request.SetBasicAuth(Username, Password)

	response, err := client.Do(request)
	if err != nil {
		// On failure there is no usable response, so reading response.Body
		// without this check would dereference nil and panic.
		fmt.Println("Error:", err)
		return
	}
	// Close the body once it has been read so the connection is released.
	defer response.Body.Close()

	responseText, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	fmt.Println(string(responseText))
}package org.example;
import okhttp3.*;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "YOUR_USERNAME";
public static final String PASSWORD = "YOUR_PASSWORD";
/**
 * Requests the callbacker_ips endpoint and prints the response body to stdout.
 *
 * <p>Credentials are supplied through an OkHttp {@code Authenticator}, which is
 * consulted when the server challenges the request. Any exception raised while
 * executing the call is reported on stdout. The JVM is terminated with status 0
 * once the attempt has finished, whether it succeeded or not.
 */
@Override
public void run() {
    Authenticator authenticator = (route, response) -> {
        // If the rejected request already carried an Authorization header, the
        // credentials themselves were refused. Returning null stops the retry;
        // otherwise the same header would be re-sent on every challenge.
        if (response.request().header(AUTHORIZATION_HEADER) != null) {
            return null;
        }
        String credential = Credentials.basic(USERNAME, PASSWORD);
        return response
                .request()
                .newBuilder()
                .header(AUTHORIZATION_HEADER, credential)
                .build();
    };
    var client = new OkHttpClient.Builder()
            .authenticator(authenticator)
            .build();
    var request = new Request.Builder()
            .url("https://data.oxylabs.io/v1/info/callbacker_ips")
            .get()
            .build();
    // try-with-resources closes the response (and its body) after printing.
    try (var response = client.newCall(request).execute()) {
        assert response.body() != null;
        System.out.println(response.body().string());
    } catch (Exception exception) {
        System.out.println("Error: " + exception.getMessage());
    }
    System.exit(0);
}
/** Entry point: starts a new thread that executes a {@code Main} instance. */
public static void main(String[] args) {
    Runnable task = new Main();
    Thread worker = new Thread(task);
    worker.start();
}
}import fetch from 'node-fetch';
// Credentials used for HTTP basic authentication.
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';
// Base64-encode "username:password" for the Authorization header.
const basicToken = Buffer.from(`${username}:${password}`).toString('base64');
const requestOptions = {
  method: 'get',
  headers: {
    'Content-Type': 'application/json',
    'Authorization': 'Basic ' + basicToken,
  },
};
// Request the callbacker_ips endpoint.
const response = await fetch('https://data.oxylabs.io/v1/info/callbacker_ips', requestOptions);
console.log(await response.json());
输出
API 将返回向您的系统发起回调请求的 IP 列表:
{
"ips": [
"x.x.x.x",
"y.y.y.y"
]
}
调度器
调度器是一个可用于安排定期爬取任务的服务。
它扩展了 Push-Pull 集成的功能,最好与 云集成 功能一起使用。
访问此页面 以了解如何使用调度器功能:
任务管家 (Scheduler)
最后更新于
这有帮助吗?

