自定义解析器 (Custom Parser)
查看 Oxylabs 自定义解析器的快速入门步骤。
自定义解析器是网页爬虫API 的一项免费功能,可让你创建解析和数据处理逻辑,并在原始 HTML 结果上执行这些逻辑。你可以使用 AI 自动生成解析器,也可以在高级场景中手动编写解析器。
有关详细说明和示例,请参阅以下页面:
快速开始
1. 生成解析器
我们建议从我们由 AI 驱动的 OxyCopilot 工具开始,该工具允许你在不编写任何代码的情况下生成爬虫和解析器。
要访问 OxyCopilot,请登录 Oxylabs 仪表板,并在左侧菜单中选择 Scraper APIs Playground。
按照视频中展示的步骤来 生成解析器:
以下是视频中展示的相同步骤:
输入你想要抓取和解析的 URL(或多个 URL)
指定任何参数,例如 JavaScript 渲染
编写一个提示,描述你想要解析的内容
运行 OxyCopilot
当你对生成的解析器满意后,加载指令。
2. 将解析器保存为预设
你可以通过 OxyCopilot 轻松保存生成的解析器以便日后使用。参见以下步骤:
将预设分配 到特定的 API 用户
点击 保存 (Save)
输入预设名称 以及描述(可选)
保存预设后,你可以在 API 请求中使用它。
3. 在 API 请求中使用解析器
要在 网页爬虫API 中使用你的预设,请发送包含 parser_preset 参数且其值为你的预设名称的请求载荷。在下面的代码示例中,我们重用之前步骤中创建的 example_parser 预设。
curl 'https://realtime.oxylabs.io/v1/queries' \
--user 'USERNAME:PASSWORD' \
-H 'Content-Type: application/json' \
-d '{
"source": "universal",
"url": "https://example.com/",
"parse": true,
"parser_preset": "example_parser"
}'

import requests
from pprint import pprint
# Request payload: scrape the URL and parse it with the saved preset.
payload = dict(
    source='universal',
    url='https://example.com/',
    parse=True,
    parser_preset='example_parser',
)

# Send the job to the realtime endpoint using basic-auth credentials.
response = requests.request(
    method='POST',
    url='https://realtime.oxylabs.io/v1/queries',
    json=payload,
    auth=('USERNAME', 'PASSWORD'),
)
# 将美化后的响应打印到标准输出。
pprint(response.json())

const https = require("https");
// Credentials used to build the Basic authorization header.
const username = "USERNAME";
const password = "PASSWORD";

// Request payload: scrape the URL and parse it with the saved preset.
const body = {
  source: "universal",
  url: "https://example.com/",
  parse: true,
  parser_preset: "example_parser"
};

const options = {
  hostname: "realtime.oxylabs.io",
  path: "/v1/queries",
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization:
      "Basic " + Buffer.from(`${username}:${password}`).toString("base64"),
  },
};

const request = https.request(options, (response) => {
  // Decode at the stream level so a multi-byte UTF-8 character that is split
  // across two chunks is not corrupted by per-chunk string conversion.
  response.setEncoding("utf8");

  let data = "";
  response.on("data", (chunk) => {
    data += chunk;
  });

  response.on("end", () => {
    // The body may not be valid JSON (for example, a plain-text error page);
    // report that instead of throwing from inside the event handler.
    try {
      const responseData = JSON.parse(data);
      console.log(JSON.stringify(responseData, null, 2));
    } catch (error) {
      console.error("Error:", error);
    }
  });
});

request.on("error", (error) => {
  console.error("Error:", error);
});

request.write(JSON.stringify(body));
request.end();

# 您提交的整个字符串必须进行 URL 编码。
https://realtime.oxylabs.io/v1/queries?source=universal&url=https%3A%2F%2Fexample.com%2F&parse=true&parser_preset=example_parser&access_token=12345abcde

<?php
// Request payload: scrape the URL and parse it with the saved preset.
$params = [
    'source' => 'universal',
    'url' => 'https://example.com/',
    'parse' => true,
    'parser_preset' => 'example_parser',
];

// Configure the cURL handle in one call: JSON POST with basic-auth
// credentials, returning the response body instead of printing it directly.
$ch = curl_init();
curl_setopt_array($ch, [
    CURLOPT_URL => "https://realtime.oxylabs.io/v1/queries",
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_POSTFIELDS => json_encode($params),
    CURLOPT_POST => 1,
    CURLOPT_USERPWD => "USERNAME" . ":" . "PASSWORD",
    CURLOPT_HTTPHEADER => ["Content-Type: application/json"],
]);

// Run the request and print whatever came back.
$result = curl_exec($ch);
echo $result;

// Report a transport-level error, if cURL recorded one.
if (curl_errno($ch) !== 0) {
    echo 'Error:' . curl_error($ch);
}
curl_close($ch);

package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
)
// main posts a parsing job that uses the saved parser preset to the realtime
// endpoint and prints the raw response body to standard output. Any failure
// is reported on standard output and ends the program early.
func main() {
	const Username = "USERNAME"
	const Password = "PASSWORD"

	// Request payload: scrape the URL and parse it with the saved preset.
	payload := map[string]interface{}{
		"source":        "universal",
		"url":           "https://example.com/",
		"parse":         true,
		"parser_preset": "example_parser",
	}

	jsonValue, err := json.Marshal(payload)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	request, err := http.NewRequest("POST",
		"https://realtime.oxylabs.io/v1/queries",
		bytes.NewBuffer(jsonValue),
	)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}
	// The body is JSON, so declare it explicitly.
	request.Header.Set("Content-Type", "application/json")
	request.SetBasicAuth(Username, Password)

	client := &http.Client{}
	response, err := client.Do(request)
	if err != nil {
		// Without this check a failed request would be dereferenced below.
		fmt.Println("Error:", err)
		return
	}
	// Release the connection once the body has been read.
	defer response.Body.Close()

	responseText, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	fmt.Println(string(responseText))
}
using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Http.Json;
using System.Threading.Tasks;
namespace OxyApi
{
/// <summary>
/// Console sample that submits a parsing job using a saved parser preset
/// and prints the response body.
/// </summary>
class Program
{
    /// <summary>
    /// Posts the job payload as JSON to <c>/v1/queries</c> with a Basic
    /// authorization header and writes the response body to the console.
    /// </summary>
    /// <returns>A task that completes once the response has been printed.</returns>
    static async Task Main()
    {
        const string Username = "USERNAME";
        const string Password = "PASSWORD";

        // Request payload: scrape the URL and parse it with the saved preset.
        var parameters = new
        {
            source = "universal",
            url = "https://example.com/",
            parse = true,
            parser_preset = "example_parser"
        };

        // Dispose the client, request and response deterministically so the
        // underlying handler and connection are released.
        using var client = new HttpClient
        {
            BaseAddress = new Uri("https://realtime.oxylabs.io")
        };

        using var requestMessage = new HttpRequestMessage(HttpMethod.Post, "/v1/queries")
        {
            Content = JsonContent.Create(parameters)
        };

        // Basic auth: base64 of "username:password", encoded as UTF-8.
        var authenticationString = $"{Username}:{Password}";
        var base64EncodedAuthenticationString = Convert.ToBase64String(
            System.Text.Encoding.UTF8.GetBytes(authenticationString));
        requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);

        using var response = await client.SendAsync(requestMessage);
        var contents = await response.Content.ReadAsStringAsync();

        Console.WriteLine(contents);
    }
}
}

package org.example;
import okhttp3.*;
import org.json.JSONObject;
import java.util.concurrent.TimeUnit;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "USERNAME";
public static final String PASSWORD = "PASSWORD";
/**
 * Posts a parsing job that uses the saved parser preset and prints the
 * response body to standard output. Any exception is reported as
 * {@code "Error: <message>"}. The JVM is terminated with status 0 afterwards,
 * whether or not the request succeeded.
 */
public void run() {
    // Request payload: scrape the URL and parse it with the saved preset.
    JSONObject payload = new JSONObject()
            .put("source", "universal")
            .put("url", "https://example.com/")
            .put("parse", true)
            .put("parser_preset", "example_parser");

    // Authenticator that adds the Basic credentials header to the request
    // belonging to the response it is handed.
    Authenticator basicAuth = (route, challenge) -> challenge
            .request()
            .newBuilder()
            .header(AUTHORIZATION_HEADER, Credentials.basic(USERNAME, PASSWORD))
            .build();

    OkHttpClient.Builder clientBuilder = new OkHttpClient.Builder();
    clientBuilder.authenticator(basicAuth);
    clientBuilder.readTimeout(180, TimeUnit.SECONDS);
    OkHttpClient httpClient = clientBuilder.build();

    MediaType jsonType = MediaType.parse("application/json; charset=utf-8");
    RequestBody requestBody = RequestBody.create(payload.toString(), jsonType);

    Request job = new Request.Builder()
            .url("https://realtime.oxylabs.io/v1/queries")
            .post(requestBody)
            .build();

    try (Response reply = httpClient.newCall(job).execute()) {
        ResponseBody replyBody = reply.body();
        if (replyBody != null) {
            try (ResponseBody closing = replyBody) {
                System.out.println(closing.string());
            }
        }
    } catch (Exception failure) {
        System.out.println("Error: " + failure.getMessage());
    }

    System.exit(0);
}
/**
 * Entry point: runs a new {@code Main} instance on its own thread.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Runnable job = new Main();
    Thread worker = new Thread(job);
    worker.start();
}
}

{
"source": "universal",
"url": "https://example.com/",
"parse": true,
"parser_preset": "example_parser"
}

获取已解析任务的 HTML 内容
你也可以通过在结果检索 URL 末尾添加 ?type=raw 来检索原始 HTML 结果。更多信息请参阅此处。
最后更新于
这有帮助吗?

