Push-Pull is our recommended integration method for reliably handling large amounts of data.
Push-Pull is an asynchronous integration method. Upon job submission, you will promptly receive a JSON
response containing all job details, including job parameters, ID, and URLs for result download and status checking. Once your job is processed, we will update you via a JSON
payload sent to your server, if you provided a callback URL. Results remain available for retrieval for at least 24 hours after completion.
With Push-Pull, you can upload your results directly to your cloud storage (AWS S3 or Google Cloud Storage).
If you prefer not to set up a service for incoming callback notifications, you can simply retrieve your results periodically (polling).
You can also explore how Push-Pull works using Postman.
Batch Query
Scraper APIs support submitting up to 5,000 query
or url
parameter values within a single batch request.
Endpoint
Copy POST https://data.oxylabs.io/v1/queries/batch
The system will handle every query
or url
submitted as a separate job. If you provide a callback URL, you will get a separate call for each keyword. Otherwise, our initial response will contain job IDs for all keywords. For example, if you sent 50 keywords, we will return 50 unique job IDs.
IMPORTANT: With /batch
endpoint, you can only submit lists of query
or url
parameter values (depending on the source
you use). All other parameters should have singular values.
Input
You need to post query parameters as a JSON payload. Here is how you submit a batch job:
cURL Python PHP C# Golang Java Node.js
# Submit the batch job described in keywords.json as the JSON request body.
curl 'https://data.oxylabs.io/v1/queries/batch' \
  -u 'user:pass1' \
  --header 'Content-Type: application/json' \
  --data '@keywords.json'
Copy import requests
import json
from pprint import pprint
# Load the batch job definition from keywords.json.
with open('keywords.json', 'r') as payload_file:
    payload = json.load(payload_file)

# POST the payload to the batch endpoint using HTTP Basic auth.
response = requests.post(
    'https://data.oxylabs.io/v1/queries/batch',
    auth=('user', 'pass1'),
    json=payload,
)

# Pretty-print the decoded JSON response.
pprint(response.json())
Copy <? php
// Read the batch job definition that is sent as the request body.
// Stop early when the file cannot be read instead of posting an empty body.
$paramsFile = file_get_contents("keywords.json");
if ($paramsFile === false) {
    echo 'Error: unable to read keywords.json';
    exit(1);
}

$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL => "https://data.oxylabs.io/v1/queries/batch",
    // Return the response from curl_exec() instead of printing it directly.
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_POST => 1,
    CURLOPT_POSTFIELDS => $paramsFile,
    CURLOPT_USERPWD => "user" . ":" . "pass1",
    CURLOPT_HTTPHEADER => array("Content-Type: application/json"),
));

$result = curl_exec($ch);

// curl_exec() returns false on transport failure; report the cURL error in that case.
if ($result === false) {
    echo 'Error:' . curl_error($ch);
} else {
    echo $result;
}
curl_close($ch);
?>
Copy using System ;
using System . IO ;
using System . Net . Http ;
using System . Text ;
using System . Threading . Tasks ;
namespace OxyApi
{
    /// <summary>
    /// Console sample that posts the contents of keywords.json to the batch endpoint
    /// and writes the response body to the console.
    /// </summary>
    class Program
    {
        static async Task Main()
        {
            const string Username = "YOUR_USERNAME";
            const string Password = "YOUR_PASSWORD";

            // The file holds the JSON payload that is sent as the request body.
            var content = File.ReadAllText(@"C:\path\to\keywords.json");

            // HTTP Basic auth value: base64("username:password").
            var authenticationString = $"{Username}:{Password}";
            var base64EncodedAuthenticationString = Convert.ToBase64String(Encoding.UTF8.GetBytes(authenticationString));

            // Dispose the client, request and response once the call has finished.
            using (var client = new HttpClient())
            using (var requestMessage = new HttpRequestMessage(HttpMethod.Post, new Uri("https://data.oxylabs.io/v1/queries/batch")))
            {
                requestMessage.Content = new StringContent(content, Encoding.UTF8, "application/json");
                requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);

                using (var response = await client.SendAsync(requestMessage))
                {
                    var contents = await response.Content.ReadAsStringAsync();
                    Console.WriteLine(contents);
                }
            }
        }
    }
}
Copy package main
import (
"bytes"
"fmt"
"io/ioutil"
"net/http"
"os"
)
// main posts the contents of keywords.json to the batch endpoint and prints
// the raw response body. Any failure along the way aborts with a panic.
func main() {
	const Username = "YOUR_USERNAME"
	const Password = "YOUR_PASSWORD"

	content, err := os.ReadFile("keywords.json")
	if err != nil {
		panic(err)
	}

	request, err := http.NewRequest("POST",
		"https://data.oxylabs.io/v1/queries/batch",
		bytes.NewBuffer(content),
	)
	if err != nil {
		panic(err)
	}
	request.Header.Add("Content-type", "application/json")
	request.SetBasicAuth(Username, Password)

	client := &http.Client{}
	response, err := client.Do(request)
	if err != nil {
		// response is nil when Do fails, so it must not be touched.
		panic(err)
	}
	// Close the body so the underlying connection is released.
	defer response.Body.Close()

	responseText, err := ioutil.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(responseText))
}
Copy import okhttp3 . * ;
import java . io . IOException ;
import java . nio . file . Files ;
import java . nio . file . Path ;
/**
 * Posts the contents of keywords.json to the batch endpoint and prints the
 * response body. The work runs on a separate thread started from {@code main}.
 */
public class Main implements Runnable {
    private static final String AUTHORIZATION_HEADER = "Authorization";
    public static final String USERNAME = "YOUR_USERNAME";
    public static final String PASSWORD = "YOUR_PASSWORD";

    public static void main(String[] args) {
        new Thread(new Main()).start();
    }

    public void run() {
        // Load the JSON payload; a read failure is rethrown unchecked.
        String jsonContent;
        try {
            jsonContent = Files.readString(Path.of("/path/to/keywords.json"));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        // Rebuilds the request handed to the authenticator with a Basic credential header.
        Authenticator authenticator = (route, response) ->
                response.request()
                        .newBuilder()
                        .header(AUTHORIZATION_HEADER, Credentials.basic(USERNAME, PASSWORD))
                        .build();

        var client = new OkHttpClient.Builder()
                .authenticator(authenticator)
                .build();

        var mediaType = MediaType.parse("application/json; charset=utf-8");
        var request = new Request.Builder()
                .url("https://data.oxylabs.io/v1/queries/batch")
                .post(RequestBody.create(jsonContent, mediaType))
                .build();

        // Print the response body, or the failure message if the call throws.
        try (var response = client.newCall(request).execute()) {
            assert response.body() != null;
            System.out.println(response.body().string());
        } catch (Exception exception) {
            System.out.println("Error: " + exception.getMessage());
        }

        System.exit(0);
    }
}
Copy import fetch from 'node-fetch' ;
import fs from 'fs'
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';

// HTTP Basic auth token: base64("username:password").
const basicToken = Buffer.from(`${username}:${password}`).toString('base64');

// keywords.json holds the batch payload; it is sent as the request body as-is.
const payload = fs.readFileSync('keywords.json', 'utf8');

const response = await fetch('https://data.oxylabs.io/v1/queries/batch', {
  method: 'post',
  headers: {
    'Content-Type': 'application/json',
    'Authorization': 'Basic ' + basicToken,
  },
  body: payload,
});

// Log the decoded JSON response.
console.log(await response.json());
You may notice that the code examples above don't explain how the JSON payload should be formatted and instead point to a pre-made JSON file. Below is the content of keywords.json
file, containing multiple query
parameter values:
Copy {
"query" : [
"adidas" ,
"nike" ,
"reebok"
] ,
"source" : "google_search" ,
"domain" : "com" ,
"callback_url" : "https://your.callback.url"
}
...and here is a keywords.json
batch input file, containing multiple URLs:
Copy {
"url" : [
"https://example.com/url1.html" ,
"https://example.com/url2.html" ,
"https://example.com/url3.html"
] ,
"source" : "google_search" ,
"callback_url" : "https://your.callback.url"
}
Output
The API will respond with a JSON object, containing the job information for each job created. The response will be similar to this:
Copy {
"queries" : [
{
"callback_url" : "https://your.callback.url" ,
{...}
"created_at" : "2019-10-01 00:00:01" ,
"domain" : "com" ,
"id" : "12345678900987654321" ,
{...}
"query" : "adidas" ,
"source" : "google_search" ,
{...}
"rel" : "results" ,
"href" : "http://data.oxylabs.io/v1/queries/12345678900987654321/results" ,
"method" : "GET"
}
]
},
{
"callback_url" : "https://your.callback.url" ,
{...}
"created_at" : "2019-10-01 00:00:01" ,
"domain" : "com" ,
"id" : "12345678901234567890" ,
{...}
"query" : "nike" ,
"source" : "google_search" ,
{...}
"rel" : "results" ,
"href" : "http://data.oxylabs.io/v1/queries/12345678901234567890/results" ,
"method" : "GET"
}
]
},
{
"callback_url" : "https://your.callback.url" ,
{...}
"created_at" : "2019-10-01 00:00:01" ,
"domain" : "com" ,
"id" : "01234567899876543210" ,
{...}
"query" : "reebok" ,
"source" : "google_search" ,
{...}
"rel" : "results" ,
"href" : "http://data.oxylabs.io/v1/queries/01234567899876543210/results" ,
"method" : "GET"
}
]
}
]
}
Data dictionary
For detailed descriptions of the job input parameters, please consult the table below or refer to the specific documentation pages for the scrapers you are interested in.
Callback
The callback is a POST
request we send to your machine, informing you that the data extraction task is completed and providing a URL to download scraped content. This means that you don't need to check job status manually. Once the data is here, we will let you know, and all you need to do now is to retrieve it.
Input
Python PHP C# Golang Java Node.js
Copy # This is a simple Sanic web server with a route listening for callbacks on localhost:8080.
# It will print job results to stdout.
import requests
from pprint import pprint
from sanic import Sanic , response
AUTH_TUPLE = ('user', 'pass1')

# Recent Sanic releases refuse to create an application without a name.
app = Sanic('job_listener_app')


# Define /job_listener endpoint that accepts POST requests.
@app.route('/job_listener', methods=['POST'])
async def job_listener(request):
    """Handle a job callback: download the job results and print them.

    Looks for the first entry in the payload's `_links` list whose `rel` is
    'results', fetches its `href` with Basic auth and pretty-prints the JSON.
    Any exception is printed and swallowed, so the handler always answers
    with HTTP 200 and {'status': 'ok'}.
    """
    try:
        res = request.json
        links = res.get('_links', [])
        for link in links:
            if link['rel'] == 'results':
                # Sanic is async, but requests are synchronous, to fully take
                # advantage of Sanic, use aiohttp.
                res_response = requests.request(
                    method='GET',
                    url=link['href'],
                    auth=AUTH_TUPLE,
                )
                pprint(res_response.json())
                # Only the first 'results' link is fetched.
                break
    except Exception as e:
        print("Listener exception: {} ".format(e))
    return response.json({'status': 'ok'}, status=200)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
Copy <? php
$stdout = fopen('php://stdout', 'w');

// $_POST is always set, so check the request method to detect an actual callback.
if (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST') {
    // Merge form-encoded fields with the top-level keys of the JSON request body.
    $payload = json_decode(file_get_contents('php://input'), true);
    $result = array_merge($_POST, is_array($payload) ? $payload : array());

    if (!isset($result['id']) || !is_scalar($result['id'])) {
        echo 'Error: callback payload has no job id';
    } else {
        // The id comes from the request, so escape it before building the URL.
        $jobId = rawurlencode((string) $result['id']);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, "https://data.oxylabs.io/v1/queries/" . $jobId . '/results');
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "GET");
        curl_setopt($ch, CURLOPT_USERPWD, "user" . ":" . "pass1");

        $response = curl_exec($ch);

        // curl_exec() returns false on transport failure; only write real responses to stdout.
        if ($response === false) {
            echo 'Error:' . curl_error($ch);
        } else {
            fwrite($stdout, $response);
        }
        curl_close($ch);
    }
}
?>
Copy using Microsoft . AspNetCore . Builder ;
using Microsoft . AspNetCore . Hosting ;
using Microsoft . Extensions . Configuration ;
using Microsoft . Extensions . DependencyInjection ;
using Microsoft . Extensions . Hosting ;
using System ;
using System . Collections . Generic ;
using System . Net . Http ;
namespace OxyApiWeb
{
    /// <summary>
    /// Callback payload as read by the listener; only the <c>_links</c> array is used.
    /// Property names match the JSON field names.
    /// </summary>
    public class Callback
    {
        public Link[] _links { get; set; }
    }

    /// <summary>One entry of the payload's <c>_links</c> array.</summary>
    public class Link
    {
        public string rel { get; set; }
        public string href { get; set; }
    }

    /// <summary>
    /// Registers a POST /job_listener endpoint that downloads the results of each
    /// job announced by a callback and writes them to the console.
    /// </summary>
    public class Startup
    {
        private const string USERNAME = "YOUR_USERNAME";
        private const string PASSWORD = "YOUR_PASSWORD";

        // One HttpClient instance is reused for every callback.
        private readonly HttpClient client;

        public Startup(IConfiguration configuration)
        {
            Configuration = configuration;
            client = new HttpClient();
        }

        public IConfiguration Configuration { get; }

        public void ConfigureServices(IServiceCollection services)
        {
            services.AddControllers();
        }

        public void Configure(IApplicationBuilder app, IWebHostEnvironment env)
        {
            if (env.IsDevelopment())
            {
                app.UseDeveloperExceptionPage();
            }

            app.UseRouting();
            app.UseAuthorization();
            app.UseEndpoints(endpoints =>
            {
                endpoints.MapPost("/job_listener", async context =>
                {
                    var callback = await System.Text.Json.JsonSerializer.DeserializeAsync<Callback>(context.Request.Body);

                    // A JSON null body or a payload without "_links" leaves these null; treat both as "no links".
                    var links = callback?._links ?? Array.Empty<Link>();

                    foreach (var link in links)
                    {
                        if (link?.rel != "results")
                        {
                            continue;
                        }

                        // HTTP Basic auth value: base64("username:password").
                        var authenticationString = $"{USERNAME}:{PASSWORD}";
                        var base64EncodedAuthenticationString = Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(authenticationString));

                        using (var requestMessage = new HttpRequestMessage(HttpMethod.Get, new Uri(link.href)))
                        {
                            requestMessage.Headers.Add("Authorization", "Basic " + base64EncodedAuthenticationString);

                            using (var response = await client.SendAsync(requestMessage))
                            {
                                var contents = await response.Content.ReadAsStringAsync();
                                Console.WriteLine(contents);
                            }
                        }
                    }

                    // Acknowledge the callback.
                    var okMessage = new Dictionary<string, string>()
                    {
                        { "message", "ok" }
                    };
                    await System.Text.Json.JsonSerializer.SerializeAsync(context.Response.Body, okMessage);
                });
            });
        }
    }
}
Copy package main
import (
"fmt"
"github.com/labstack/echo/v4"
"io/ioutil"
"net/http"
)
const (
	Username = "YOUR_USERNAME"
	Password = "YOUR_PASSWORD"
)

// Callback is the part of the callback payload this listener reads.
type Callback struct {
	Links []Link `json:"_links"`
}

// Link is one entry of the payload's "_links" array.
type Link struct {
	Href   string `json:"href"`
	Method string `json:"method"`
	Rel    string `json:"rel"`
}

// fetchResults sends a GET request with Basic auth to href and returns the raw
// response body. The body is closed before returning.
func fetchResults(client *http.Client, href string) ([]byte, error) {
	request, err := http.NewRequest("GET", href, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Add("Content-type", "application/json")
	request.SetBasicAuth(Username, Password)

	response, err := client.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	return ioutil.ReadAll(response.Body)
}

// main starts an echo server on :8080 whose POST /job_listener route prints
// the results of every "results" link found in the callback payload.
func main() {
	echoServer := echo.New()
	client := &http.Client{}

	echoServer.POST("/job_listener", func(context echo.Context) error {
		callback := new(Callback)
		if err := context.Bind(callback); err != nil {
			return err
		}

		for _, link := range callback.Links {
			if link.Rel != "results" {
				continue
			}

			responseText, err := fetchResults(client, link.Href)
			if err != nil {
				// Report the failure and move on to the next link.
				fmt.Println("Error:", err)
				continue
			}
			fmt.Println(string(responseText))
		}

		return context.JSON(http.StatusOK, map[string]string{"status": "ok"})
	})

	echoServer.Logger.Fatal(echoServer.Start(":8080"))
}
Copy package org . example ;
import okhttp3 . * ;
import com . sun . net . httpserver . HttpServer ;
import org . apache . commons . io . IOUtils ;
import org . json . JSONArray ;
import org . json . JSONObject ;
import java . io . IOException ;
import java . io . OutputStream ;
import java . net . InetSocketAddress ;
import java . nio . charset . StandardCharsets ;
import java . util . Map ;
import java . util . Objects ;
/**
 * Starts an HTTP server on 0.0.0.0:8080 whose /job_listener context downloads
 * and prints the results of every "results" link found in a callback payload,
 * then answers with {"status":"ok"}.
 */
public class Main implements Runnable {
    private static final String AUTHORIZATION_HEADER = "Authorization";
    public static final String USERNAME = "YOUR_USERNAME";
    public static final String PASSWORD = "YOUR_PASSWORD";

    public void run() {
        HttpServer server = null;
        try {
            server = HttpServer.create(new InetSocketAddress("0.0.0.0", 8080), 0);
        } catch (IOException exception) {
            exception.printStackTrace();
            System.exit(1);
        }

        // Rebuilds the request handed to the authenticator with a Basic credential header.
        Authenticator authenticator = (route, response) -> {
            String credential = Credentials.basic(USERNAME, PASSWORD);
            return response
                    .request()
                    .newBuilder()
                    .header(AUTHORIZATION_HEADER, credential)
                    .build();
        };

        var client = new OkHttpClient.Builder()
                .authenticator(authenticator)
                .build();

        server.createContext("/job_listener", exchange -> {
            var requestBody = IOUtils.toString(exchange.getRequestBody(), StandardCharsets.UTF_8);
            JSONObject requestJson = new JSONObject(requestBody);
            JSONArray links = requestJson.getJSONArray("_links");

            for (var link : links.toList()) {
                var linkMap = (Map<?, ?>) link;
                if (!Objects.equals(linkMap.get("rel"), "results")) {
                    continue;
                }

                var request = new Request.Builder()
                        .url((String) linkMap.get("href"))
                        .get()
                        .build();

                // A failed download is reported and does not stop the remaining links.
                try (var response = client.newCall(request).execute()) {
                    assert response.body() != null;
                    System.out.println(response.body().string());
                } catch (Exception exception) {
                    System.out.println("Error: " + exception.getMessage());
                }
            }

            var responseJson = new JSONObject();
            responseJson.put("status", "ok");

            // Encode once with an explicit charset so the announced length matches the bytes written.
            byte[] responseBytes = responseJson.toString().getBytes(StandardCharsets.UTF_8);
            exchange.sendResponseHeaders(200, responseBytes.length);

            // try-with-resources closes the stream even if the write fails.
            try (OutputStream responseBody = exchange.getResponseBody()) {
                responseBody.write(responseBytes);
                responseBody.flush();
            }
            exchange.close();
        });

        server.setExecutor(null);
        server.start();
    }

    public static void main(String[] args) {
        new Thread(new Main()).start();
    }
}
Copy import express from 'express'
import fetch from 'node-fetch' ;
const username = 'YOUR_USERNAME';
const password = 'YOUR_PASSWORD';

const app = express();
app.use(express.json());

// Handles job callbacks: downloads the results behind every "results" link in
// the payload, logs them, and acknowledges the callback with {status: 'ok'}.
app.post('/job_listener', async (request, response) => {
  // A payload without a `_links` array is treated as having no links.
  const payloadLinks = request.body?._links;
  const links = Array.isArray(payloadLinks) ? payloadLinks : [];

  for (const link of links) {
    if (link.rel !== 'results') {
      continue;
    }

    // A failed download is logged so that the callback still gets a response.
    try {
      const jobResultResponse = await fetch(link.href, {
        method: 'get',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
        },
      });
      console.log(await jobResultResponse.json());
    } catch (error) {
      console.log('Error: ' + error.message);
    }
  }

  response.send({ status: 'ok' });
});

app.listen(8080);
Output
Copy {
"created_at" : "2019-10-01 00:00:01" ,
"updated_at" : "2019-10-01 00:00:15" ,
"locale" : null ,
"client_id" : 163 ,
"user_agent_type" : "desktop" ,
"source" : "google_search" ,
"pages" : 1 ,
"subdomain" : "www" ,
"status" : "done" ,
"start_page" : 1 ,
"parse" : 0 ,
"render" : null ,
"priority" : 0 ,
"ttl" : 0 ,
"origin" : "api" ,
"persist" : true ,
"id" : "12345678900987654321" ,
"callback_url" : "http://your.callback.url/" ,
"query" : "adidas" ,
"domain" : "com" ,
"limit" : 10 ,
"geo_location" : null ,
{...}
"_links" :[
{
"href" : "https://data.oxylabs.io/v1/queries/12345678900987654321" ,
"method" : "GET" ,
"rel" : "self"
} ,
{
"href" : "https://data.oxylabs.io/v1/queries/12345678900987654321/results" ,
"method" : "GET" ,
"rel" : "results"
}
]
}
Check Job Status
If you provided a valid callback URL when submitting your job, we will notify you upon completion by sending a JSON
payload to the specified callback URL. This payload will indicate that the job has been completed and its status set to done
.
However, if you submitted a job without using the callback service, you can check the job status manually. Retrieve the URL from the href
field in the rel:self
section of the response message received after job submission. The URL for checking the job status will resemble the following: http://data.oxylabs.io/v1/queries/12345678900987654321
. Querying this URL will return the job information, including its current status
.
Endpoint
Copy GET https://data.oxylabs.io/v1/queries/{id}
Input
cURL Python PHP C# Golang Java Node.js
# Query the job status endpoint; HTTPS keeps the Basic-auth credentials encrypted in transit.
curl --user "user:pass1" \
  'https://data.oxylabs.io/v1/queries/12345678900987654321'
Copy import requests
from pprint import pprint
# Get the job information from the job status endpoint.
# HTTPS keeps the Basic-auth credentials encrypted in transit.
response = requests.request(
    method='GET',
    url='https://data.oxylabs.io/v1/queries/12345678900987654321',
    auth=('user', 'pass1'),
)

# Print prettified JSON response to stdout.
pprint(response.json())
Copy <? php
// Query the job status endpoint; HTTPS keeps the Basic-auth credentials encrypted in transit.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "https://data.oxylabs.io/v1/queries/12345678900987654321");
// Return the response from curl_exec() instead of printing it directly.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "GET");
curl_setopt($ch, CURLOPT_USERPWD, "user" . ":" . "pass1");

$result = curl_exec($ch);

// curl_exec() returns false on transport failure; report the cURL error in that case.
if ($result === false) {
    echo 'Error:' . curl_error($ch);
} else {
    echo $result;
}
curl_close($ch);
?>
Copy using System ;
using System . Collections . Generic ;
using System . Net . Http ;