新闻搜索
大规模抓取 Google News 结果并获取完全解析的数据。提取带标题、来源和发布日期的文章。
google_search 源(source)用于检索 Google 搜索结果(SERP)。本子页面专门介绍与 Google 新闻搜索相关的数据。要了解其他结果类型,请参阅:网页搜索、图片搜索。
要抓取 Google 新闻搜索,请包含 context:udm 参数并将其值设置为 12,或者包含 context:tbm 参数并将其值设置为 nws。
请求示例
在下面的示例中,我们发送请求,以获取 google.nl 域上搜索词 adidas 的新闻搜索结果。
udm
curl 'https://realtime.oxylabs.io/v1/queries' \
--user 'USERNAME:PASSWORD' \
-H 'Content-Type: application/json' \
-d '{
"source": "google_search",
"domain": "nl",
"query": "adidas",
"parse": true,
"context": [
{
"key": "udm",
"value": "12"
}
]
}'
import requests
from pprint import pprint
# 构建负载(payload)。
payload = {
'source': 'google_search',
'domain': 'nl',
'query': 'adidas',
'parse': True,
'context': [
{'key': 'udm', 'value': '12'},
],
}
# 获取响应。
response = requests.post(
'https://realtime.oxylabs.io/v1/queries',
auth=('USERNAME', 'PASSWORD'),
json=payload,
)
# 将美化后的响应打印到标准输出。
pprint(response.json())
const https = require("https");
// Oxylabs API credentials; replace the placeholders with your own.
const username = "USERNAME";
const password = "PASSWORD";

// Job parameters: parsed Google News results (context udm=12) for "adidas" on google.nl.
const body = {
  source: "google_search",
  domain: "nl",
  query: "adidas",
  parse: true,
  context: [
    { key: "udm", value: "12" },
  ],
};

// POST to the Realtime endpoint using HTTP Basic authentication.
const options = {
  hostname: "realtime.oxylabs.io",
  path: "/v1/queries",
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization:
      "Basic " + Buffer.from(`${username}:${password}`).toString("base64"),
  },
};

const request = https.request(options, (response) => {
  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is not corrupted by per-chunk string conversion.
  response.setEncoding("utf8");

  let data = "";
  response.on("data", (chunk) => {
    data += chunk;
  });
  response.on("end", () => {
    try {
      const responseData = JSON.parse(data);
      console.log(JSON.stringify(responseData, null, 2));
    } catch (error) {
      // The body was not valid JSON; show it raw together with the HTTP
      // status instead of throwing from inside the event handler.
      console.error(`Unexpected response (HTTP ${response.statusCode}):`, data);
    }
  });
});

request.on("error", (error) => {
  console.error("Error:", error);
});

request.write(JSON.stringify(body));
request.end();
source=google_search&domain=nl&query=adidas&parse=true&context[0][key]=udm&context[0][value]=12&access_token=12345abcde
<?php
// Job parameters: parsed Google News results (context udm=12) for "adidas" on google.nl.
$params = array(
    'source' => 'google_search',
    'domain' => 'nl',
    'query' => 'adidas',
    'parse' => true,
    'context' => [
        [
            'key' => 'udm',
            'value' => '12',
        ]
    ]
);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://realtime.oxylabs.io/v1/queries");
// Return the response body from curl_exec() instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($params));
curl_setopt($ch, CURLOPT_POST, 1);
// HTTP Basic authentication; replace the placeholders with your own credentials.
curl_setopt($ch, CURLOPT_USERPWD, "USERNAME" . ":" . "PASSWORD");

$headers = array();
$headers[] = "Content-Type: application/json";
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns false on failure,
// so check for that before printing anything.
$result = curl_exec($ch);
if ($result === false) {
    echo 'Error:' . curl_error($ch);
} else {
    echo $result;
}
curl_close($ch);
package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
)
func main() {
const Username = "USERNAME"
const Password = "PASSWORD"
payload := map[string]interface{}{
"source": "google_search",
"domain": "nl",
"query": "adidas",
"parse": true,
"context": []map[string]interface{}{
{"key": "udm", "value": "12"},
},
}
jsonValue, _ := json.Marshal(payload)
client := &http.Client{}
request, _ := http.NewRequest("POST",
"https://realtime.oxylabs.io/v1/queries",
bytes.NewBuffer(jsonValue),
)
request.SetBasicAuth(Username, Password)
response, _ := client.Do(request)
responseText, _ := ioutil.ReadAll(response.Body)
fmt.Println(string(responseText))
}using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Http.Json;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
// Sends a parsed Google News search job (context udm=12) for "adidas" on
// google.nl to the Realtime endpoint and writes the raw response body to
// standard output.
static async Task Main()
{
    const string Username = "USERNAME";
    const string Password = "PASSWORD";

    var parameters = new {
        source = "google_search",
        domain = "nl",
        query = "adidas",
        parse = true,
        context = new dynamic [] {
            new { key = "udm", value = "12" },
        }
    };

    // The client, request and response are all IDisposable; release them
    // deterministically when Main returns.
    using var client = new HttpClient();
    Uri baseUri = new Uri("https://realtime.oxylabs.io");
    client.BaseAddress = baseUri;

    using var requestMessage = new HttpRequestMessage(HttpMethod.Post, "/v1/queries");
    requestMessage.Content = JsonContent.Create(parameters);

    // HTTP Basic authentication: base64("username:password").
    var authenticationString = $"{Username}:{Password}";
    var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(authenticationString));
    requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);

    using var response = await client.SendAsync(requestMessage);
    var contents = await response.Content.ReadAsStringAsync();
    Console.WriteLine(contents);
}
}
}
package org.example;
import okhttp3.*;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.concurrent.TimeUnit;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "USERNAME";
public static final String PASSWORD = "PASSWORD";
/**
 * Builds a parsed Google News search job (context udm=12) for "adidas" on
 * google.nl, posts it to the Realtime endpoint and prints the response body.
 * Errors are printed to standard output; the JVM is then terminated with
 * status 0 in every case.
 */
public void run() {
    JSONObject jsonObject = new JSONObject();
    jsonObject.put("source", "google_search");
    jsonObject.put("domain", "nl");
    jsonObject.put("query", "adidas");
    jsonObject.put("parse", true);
    jsonObject.put("context", new JSONArray()
            .put(new JSONObject()
                    .put("key", "udm")
                    .put("value", "12"))
    );

    Authenticator authenticator = (route, response) -> {
        // If the rejected request already carried credentials, retrying with
        // the same ones cannot succeed; returning null tells OkHttp to give up.
        if (response.request().header(AUTHORIZATION_HEADER) != null) {
            return null;
        }
        String credential = Credentials.basic(USERNAME, PASSWORD);
        return response
                .request()
                .newBuilder()
                .header(AUTHORIZATION_HEADER, credential)
                .build();
    };

    var client = new OkHttpClient.Builder()
            .authenticator(authenticator)
            .readTimeout(180, TimeUnit.SECONDS)
            .build();

    var mediaType = MediaType.parse("application/json; charset=utf-8");
    var body = RequestBody.create(jsonObject.toString(), mediaType);
    var request = new Request.Builder()
            .url("https://realtime.oxylabs.io/v1/queries")
            .post(body)
            .build();

    // Closing the response also closes its body, so a single
    // try-with-resources is enough.
    try (var response = client.newCall(request).execute()) {
        var responseBody = response.body();
        if (responseBody != null) {
            System.out.println(responseBody.string());
        }
    } catch (Exception exception) {
        System.out.println("Error: " + exception.getMessage());
    }

    System.exit(0);
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}
{
"source": "google_search",
"domain": "nl",
"query": "adidas",
"parse": true,
"context": [
{
"key": "udm",
"value": "12"
}
]
}
tbm
curl 'https://realtime.oxylabs.io/v1/queries' \
--user 'USERNAME:PASSWORD' \
-H 'Content-Type: application/json' \
-d '{
"source": "google_search",
"domain": "nl",
"query": "adidas",
"parse": true,
"context": [
{
"key": "tbm",
"value": "nws"
}
]
}'
import requests
from pprint import pprint
# 构建负载(payload)。
payload = {
'source': 'google_search',
'domain': 'nl',
'query': 'adidas',
'parse': True,
'context': [
{'key': 'tbm', 'value': 'nws'},
],
}
# 获取响应。
response = requests.post(
'https://realtime.oxylabs.io/v1/queries',
auth=('USERNAME', 'PASSWORD'),
json=payload,
)
# 将美化后的响应打印到标准输出。
pprint(response.json())
const https = require("https");
// Oxylabs API credentials; replace the placeholders with your own.
const username = "USERNAME";
const password = "PASSWORD";

// Job parameters: parsed Google News results (context tbm=nws) for "adidas" on google.nl.
const body = {
  source: "google_search",
  domain: "nl",
  query: "adidas",
  parse: true,
  context: [
    { key: "tbm", value: "nws" },
  ],
};

// POST to the Realtime endpoint using HTTP Basic authentication.
const options = {
  hostname: "realtime.oxylabs.io",
  path: "/v1/queries",
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization:
      "Basic " + Buffer.from(`${username}:${password}`).toString("base64"),
  },
};

const request = https.request(options, (response) => {
  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is not corrupted by per-chunk string conversion.
  response.setEncoding("utf8");

  let data = "";
  response.on("data", (chunk) => {
    data += chunk;
  });
  response.on("end", () => {
    try {
      const responseData = JSON.parse(data);
      console.log(JSON.stringify(responseData, null, 2));
    } catch (error) {
      // The body was not valid JSON; show it raw together with the HTTP
      // status instead of throwing from inside the event handler.
      console.error(`Unexpected response (HTTP ${response.statusCode}):`, data);
    }
  });
});

request.on("error", (error) => {
  console.error("Error:", error);
});

request.write(JSON.stringify(body));
request.end();
source=google_search&domain=nl&query=adidas&parse=true&context[0][key]=tbm&context[0][value]=nws&access_token=12345abcde
<?php
// Job parameters: parsed Google News results (context tbm=nws) for "adidas" on google.nl.
$params = array(
    'source' => 'google_search',
    'domain' => 'nl',
    'query' => 'adidas',
    'parse' => true,
    'context' => [
        [
            'key' => 'tbm',
            'value' => 'nws',
        ]
    ]
);

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://realtime.oxylabs.io/v1/queries");
// Return the response body from curl_exec() instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($params));
curl_setopt($ch, CURLOPT_POST, 1);
// HTTP Basic authentication; replace the placeholders with your own credentials.
curl_setopt($ch, CURLOPT_USERPWD, "USERNAME" . ":" . "PASSWORD");

$headers = array();
$headers[] = "Content-Type: application/json";
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

// With CURLOPT_RETURNTRANSFER set, curl_exec() returns false on failure,
// so check for that before printing anything.
$result = curl_exec($ch);
if ($result === false) {
    echo 'Error:' . curl_error($ch);
} else {
    echo $result;
}
curl_close($ch);
package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
)
func main() {
const Username = "USERNAME"
const Password = "PASSWORD"
payload := map[string]interface{}{
"source": "google_search",
"domain": "nl",
"query": "adidas",
"parse": true,
"context": []map[string]interface{}{
{"key": "tbm", "value": "nws"},
},
}
jsonValue, _ := json.Marshal(payload)
client := &http.Client{}
request, _ := http.NewRequest("POST",
"https://realtime.oxylabs.io/v1/queries",
bytes.NewBuffer(jsonValue),
)
request.SetBasicAuth(Username, Password)
response, _ := client.Do(request)
responseText, _ := ioutil.ReadAll(response.Body)
fmt.Println(string(responseText))
}using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Net.Http.Json;
using System.Threading.Tasks;
namespace OxyApi
{
class Program
{
// Sends a parsed Google News search job (context tbm=nws) for "adidas" on
// google.nl to the Realtime endpoint and writes the raw response body to
// standard output.
static async Task Main()
{
    const string Username = "USERNAME";
    const string Password = "PASSWORD";

    var parameters = new {
        source = "google_search",
        domain = "nl",
        query = "adidas",
        parse = true,
        context = new dynamic [] {
            new { key = "tbm", value = "nws" },
        }
    };

    // The client, request and response are all IDisposable; release them
    // deterministically when Main returns.
    using var client = new HttpClient();
    Uri baseUri = new Uri("https://realtime.oxylabs.io");
    client.BaseAddress = baseUri;

    using var requestMessage = new HttpRequestMessage(HttpMethod.Post, "/v1/queries");
    requestMessage.Content = JsonContent.Create(parameters);

    // HTTP Basic authentication: base64("username:password").
    var authenticationString = $"{Username}:{Password}";
    var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(authenticationString));
    requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);

    using var response = await client.SendAsync(requestMessage);
    var contents = await response.Content.ReadAsStringAsync();
    Console.WriteLine(contents);
}
}
}
package org.example;
import okhttp3.*;
import org.json.JSONArray;
import org.json.JSONObject;
import java.util.concurrent.TimeUnit;
public class Main implements Runnable {
private static final String AUTHORIZATION_HEADER = "Authorization";
public static final String USERNAME = "USERNAME";
public static final String PASSWORD = "PASSWORD";
/**
 * Builds a parsed Google News search job (context tbm=nws) for "adidas" on
 * google.nl, posts it to the Realtime endpoint and prints the response body.
 * Errors are printed to standard output; the JVM is then terminated with
 * status 0 in every case.
 */
public void run() {
    JSONObject jsonObject = new JSONObject();
    jsonObject.put("source", "google_search");
    jsonObject.put("domain", "nl");
    jsonObject.put("query", "adidas");
    jsonObject.put("parse", true);
    jsonObject.put("context", new JSONArray()
            .put(new JSONObject()
                    .put("key", "tbm")
                    .put("value", "nws"))
    );

    Authenticator authenticator = (route, response) -> {
        // If the rejected request already carried credentials, retrying with
        // the same ones cannot succeed; returning null tells OkHttp to give up.
        if (response.request().header(AUTHORIZATION_HEADER) != null) {
            return null;
        }
        String credential = Credentials.basic(USERNAME, PASSWORD);
        return response
                .request()
                .newBuilder()
                .header(AUTHORIZATION_HEADER, credential)
                .build();
    };

    var client = new OkHttpClient.Builder()
            .authenticator(authenticator)
            .readTimeout(180, TimeUnit.SECONDS)
            .build();

    var mediaType = MediaType.parse("application/json; charset=utf-8");
    var body = RequestBody.create(jsonObject.toString(), mediaType);
    var request = new Request.Builder()
            .url("https://realtime.oxylabs.io/v1/queries")
            .post(body)
            .build();

    // Closing the response also closes its body, so a single
    // try-with-resources is enough.
    try (var response = client.newCall(request).execute()) {
        var responseBody = response.body();
        if (responseBody != null) {
            System.out.println(responseBody.string());
        }
    } catch (Exception exception) {
        System.out.println("Error: " + exception.getMessage());
    }

    System.exit(0);
}
public static void main(String[] args) {
new Thread(new Main()).start();
}
}
{
"source": "google_search",
"domain": "nl",
"query": "adidas",
"parse": true,
"context": [
{
"key": "tbm",
"value": "nws"
}
]
}
在我们的示例中,我们使用同步的 Realtime 集成方法。如果您想使用 Proxy Endpoint 或异步的 Push-Pull 集成,请参阅“集成方法”部分。
请求参数值
通用
抓取 Google 新闻搜索结果的基本设置和自定义选项。
source
设置抓取器。
google_search
query
要搜索的关键字或短语。
-
context: tbm
要获取新闻搜索结果,请将 value 设置为 nws。其他接受的值包括: app, blg, bks, dsc, isch, pts, plcs, rcp, lcl
-
- 必填参数
- udm 和 tbm context 参数不能在单个抓取请求中一起使用; 请从中选择一个。 同时使用两者可能导致冲突或意外行为。
Google 高级搜索运算符
在抓取时,将 Google 高级搜索运算符与查询结合使用可能很有用。它使您能够自定义搜索范围,确保结果更相关、更聚焦。可在此处和此处了解这些特殊命令。参见下面的示例。
{
"source": "google_search",
"query": "iphone 15 launch inurl:apple"
}
本地化
将搜索结果适配到特定的地理位置、域和语言。
分页
用于管理分页和检索搜索结果的控制项。
start_page
起始页码。
1
pages
要检索的页数。
1
limit
每页要检索的结果数量。
10
context:
limit_per_page
如果要使用相同 IP 抓取多页,请包含一个 JSON 数组并使用 page 键指定页码。你还必须通过添加一个 limit 键来指明每页的有机结果数量。 参见示例.
-
每页限制
要使用此功能,请在 JSON 数组中包含包含以下数据的 JSON 对象:
page
您想要抓取的页码。任何大于 0 的整数值均可
1
limit
该页上的结果数量。任何介于 1 和 100 (含)之间的整数值均可。
90
请求示例
{
"source": "google_search",
"query": "adidas",
"parse": true,
"context": [
{
"key": "limit_per_page",
"value": [
{"page": 1, "limit": 10},
{"page": 2, "limit": 90}
]
}]
}
过滤
根据各种条件筛选和细化搜索结果的选项。
context:safe_search
安全搜索。设置为 true 以启用它。
false
context:
tbs
tbs 参数。该参数类似于一个容器,用于容纳较为冷门的 Google 参数,例如按日期限制/排序结果以及其他一些过滤器,其中有些取决于 tbm 参数(例如 tbs=app_os:1 仅在 tbm 的值为 app 时可用)。更多信息见此处。
-
其他
用于专门需求的附加高级设置和控制。
context:
nfpr
设置为 true 将关闭拼写自动更正。
false
上下文参数
所有上下文参数应作为对象添加到 context 数组中,包含 key 和 value 键值对,例如:
...
"context": [
{
"key": "filter",
"value": "0"
}
]
...
结构化数据
SERP 爬虫 API 能够提取包含 Google 搜索结果的 HTML 或 JSON 对象,为结果页面的各个元素提供结构化数据。
输出数据字典
HTML 示例

JSON 结构
Google 新闻搜索的结构化输出包含 URL、page、results 等字段。下表列出了我们解析的每个 SERP 功能的详细清单、其描述和数据类型。表中还包含一些元数据。
url
Google 搜索页面的 URL。
字符串
results
包含搜索结果的字典。
数组
results.main
一份未付费新闻结果的列表,包含各自的详细信息。
数组
results.additional
一份趋势文章的列表,包含各自的详细信息。
数组
results.total_results_count
针对搜索查询找到的结果总数。
整数
created_at
抓取任务创建的时间戳。
时间戳
updated_at
抓取任务完成的时间戳。
时间戳
page
相对于 Google SERP 分页的页码。
整数
job_id
与抓取任务关联的作业 ID。
字符串
Main
显示未付费新闻结果的列表,为每篇文章提供相关详情。

...
"main": [
{
"url": "https://www.yahoo.com/lifestyle/tiger-woods-nikes-epic-partnership-015311819.html",
"desc": "曾经可能存在一个 Tiger Woods 没有被 Nike 赞助的世界,这似乎...",
"title": "Tiger Woods 与 Nike 那段史诗般合作关系如何破裂",
"source": "Yahoo",
"pos_overall": 1,
"relative_publish_date": "1 day ago"
},
...
],
...
url
完整文章的 URL。
字符串
desc
文章全文的简短摘录。
字符串
title
文章的标题。
字符串
source
发布文章的网站名称。
字符串
pos_overall
指示该结果在新闻 SERP 主结果中的总体位置。
整数
relative_publish_date
描述文章发布的时间距离现在的时长。
字符串
Additional
显示一份趋势文章的列表,并附带相关详情。

...
"additional": [
{
"items": [
{
"pos": 1,
"url": "https://www.complex.com/sneakers/a/brendan-dunne/nike-book-1-colorways-haven-hike-rattlesnake",
"title": "Nike Book 1 色彩款 Haven Hike Rattlesnake",
"source": "Complex",
"relative_publish_date": "1 day ago"
},
...
],
"pos_overall": 2,
"section_title": "Devin Booker 确认 Nike Book 1 发布存在问题"
}
...
items
包含各自详细信息的文章列表。
数组
items.pos
表示文章在列表中位置的唯一标识符。
整数
items.url
完整文章的 URL。
字符串
items.title
文章的标题。
字符串
items.source
发布文章的网站名称。
字符串
items.relative_publish_date
描述文章发布的时间距离现在的时长。
字符串
pos_overall
指示该结果在新闻 SERP 附加结果中的总体位置。
整数
section_title
附加部分的名称。
字符串
最后更新于
这有帮助吗?

