Parsing function examples
Explore function examples for the Custom Parser in the Web Scraper API: pipelines for XPath, regex, conversions, arrays and more.
HTML processing
element_text
element_textSample HTML
<!DOCTYPE html>
<html>
<body>
<div id="product">
<div id="product-description">This is a nice product</div>
<div id="product-price"> 12 3
</div>
</div>
</body>
</html>Extract text from an HTML element and strip whitespace
{
"price": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//*[@id='product-price']"]
},
{
"_fn": "element_text"
}
]
}
}{
"price": "12 3"
}Given a string value as an input, do nothing
{
"price": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//*[@id='product-price']/text()"]
},
{
"_fn": "element_text"
}
]
}
}{
"price": " 12 3\n\n\n "
}xpath
xpathSample HTML
<body>
<div class="product" id="socks">
<div class="title">Socks</div>
<div class="price">123.12</div>
<div class="description">
<ul>
<li class="description-item">Very</li>
<li class="description-item">Nice</li>
<li class="description-item">Socks</li>
</ul>
</div>
</div>
</body>Get all description items
{
"description_items": {
"_fns": [
{
"_fn": "xpath",
"_args": ["//li[@class='description-item']/text()"]
}
]
}
}{
"description_items": ["Very", "Nice", "Socks"]
}Get the first description item
{
"first_description_item": {
"_fns": [
{
"_fn": "xpath",
"_args": ["(//li[@class='description-item'])[1]/text()"]
}
]
}
}{
"first_description_item": [
"Very"
]
}Check if the description section element exists
{
"description_section_exists": {
"_fns": [
{
"_fn": "xpath",
"_args": ["boolean(//div[@class='description'])"]
}
]
}
}{
"description_section_exists": true
}Get price as a number
{
"price": {
"_fns": [
{
"_fn": "xpath",
"_args": ["number(//div[@class='price'])"]
}
]
}
}{
"price": 123.12
}Multiple expressions to fall back on in case the preceding expression fails
{
"price": {
"_fns": [
{
"_fn": "xpath",
"_args": [
"//div[@class='product-price']/text()", <--- this does not find anything
"//div[@class='price']/text()" <--- this finds the target price
]
}
]
}
}{
"price": [
"123.12"
]
}XPath | operator to match with multiple expressions
{
"price_and_title": {
"_fns": [
{
"_fn": "xpath",
"_args": ["//div[@class='price']/text() | //div[@class='title']/text()"]
}
]
}
}{
"price_and_title": [
"Socks",
"123.12"
]
}xpath_one
xpath_oneSample HTML
<body>
<div class="product" id="socks">
<div class="title">Socks</div>
<div class="price">123.12</div>
<div class="description">
<ul>
<li class="description-item">Very</li>
<li class="description-item">Nice</li>
<li class="description-item">Socks</li>
</ul>
</div>
</div>
</body>Return the first match
{
"first_description_item": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//li/text()"]
}
]
}
}{
"first_description_item": "Very"
}Using XPath functions
{
"price": {
"_fns": [
{
"_fn": "xpath_one",
"_args": ["number(.//div[@class='price'])"]
}
]
}
}{
"price": 123.12
}String manipulation
amount_from_string
amount_from_stringSample HTML
<body>
<div class="product" id="socks">
<div class="title">Socks</div>
<div class="price">The price is: 123.12 pesos</div>
</div>
</body>Extract amount from string
{
"price": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//div[@class='price']/text()"]
},
{
"_fn": "amount_from_string"
}
]
}
}{
"price": 123.12
}amount_range_from_string
amount_range_from_stringSample HTML
<body>
<div class="product">
<div class="price">
The price is: 123.12 pesos;
The price is: 345.12 pesos;
The price is: 678.12 pesos
</div>
</div>
</body>Extract all amounts from string
{
"prices": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//div[@class='price']/text()"]
},
{
"_fn": "amount_range_from_string"
}
]
}
}{
"prices": [
123.12,
345.12,
678.12
]
}join
joinSample HTML
<body>
<div class="product">
<div class="price">
The price is: 123.12 pesos;
</div>
<div class="price">
The price is: 345.12 pesos;
</div>
<div class="price">
The price is: 678.12 pesos
</div>
</div>
</body>Join an array of strings into a single string
{
"price_variants": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//div[@class='price']"]
},
{ // If we call normalize-space() in first pipeline function,
// it will return only the first value.
"_fn": "xpath",
"_args": ["normalize-space(text())"]
},
{
"_fn": "join",
"_args": ""
}
]
}
}{
"price_variants": "The price is: 123.12 pesos;The price is: 345.12 pesos;The price is: 678.12 pesos"
}regex_find_all
regex_find_allSample HTML
<body>
<div class="product">
<div class="description">
[one description]
[two description]
[three description]
</div>
</div>
</body>Find all matches between two characters
{
"descriptions": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//div[@class='description']/text()"]
},
{
"_fn": "regex_find_all",
"_args": ["\\[(.*)\\]"]
}
]
}
}{
"descriptions": [
"one description",
"two description",
"three description"
]
}regex_search
regex_searchSample HTML
<body>
<div class="product">
<div class="description">
[one description]
[two description]
[three description]
{the one i need}
</div>
</div>
</body>Return description between two characters
{
"description": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//div[@class='description']/text()"]
},
{
"_fn": "regex_search",
"_args": ["{(.*)}", 1]
}
]
}
}{
"description": "the one i need"
}regex_substring
regex_substringSample HTML
<body>
<div class="product">
<div class="description">
* one description
* two description
* three description
* {this one i would like to get replaced}
</div>
</div>
</body>Replace a part of text with specified value
{
"descriptions": {
"_fns": [
{
"_fn": "xpath_one",
"_args": [".//div[@class='description']/text()"]
},
{
"_fn": "regex_substring",
"_args": ["{this one i would like to get replaced}", "four description"]
},
{
"_fn": "regex_find_all",
"_args": ["\\*\\s(.*)\n"]
}
]
}
}{
"descriptions": [
"one description",
"two description",
"three description",
"four description"
]
}Common functions
length
lengthSample HTML
<body>
<div class="product">
<div class="price">123</div>
<div class="price">124</div>
<div class="price">456</div>
<div class="price">421</div>
<div class="price">100</div>
</div>
</body>Get the count of price variants
{
"price_variants": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//div[@class='price']"]
},
{
"_fn": "length"
}
]
}
}{
"price_variants": 5
}Get the count of product variants in a multi-dimensional array
Sample HTML:
<body>
<div class="product">
<property class="colors">
<option class="color">Red</option>
<option class="color">Green</option>
<option class="color">Blue</option>
</property>
<property class="sizes">
<option class="size">S</option>
<option class="size">M</option>
<option class="size">L</option>
<option class="size">XL</option>
</property>
</div>
</body>{
"number_of_variants": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//property"]
},
{
"_fn": "xpath",
"_args": [".//option"]
},
{
"_fn": "length"
}
]
}
}{
"number_of_variants": [
3,
4
]
}select_nth
select_nthSample HTML
<body>
<div class="product" id="socks">
<div class="title">Socks</div>
<div class="price">123.12</div>
<div class="description">
<ul>
<li class="description-item">Very</li>
<li class="description-item">Nice</li>
<li class="description-item">Socks</li>
</ul>
</div>
</div>
</body>Select the first description item from the array
{
"price_and_title": {
"_fns": [
{
"_fn": "xpath",
"_args": ["//li[@class='description-item']/text()"]
},
{
"_fn": "select_nth",
"_args": 0
}
]
}
}{
"price_and_title": "Very"
}Select the last description item from the array
{
"price_and_title": {
"_fns": [
{
"_fn": "xpath",
"_args": ["//li[@class='description-item']/text()"]
},
{
"_fn": "select_nth",
"_args": -1
}
]
}
}{
"price_and_title": "Socks"
}Math functions
average
averageSample HTML
<body>
<div class="product">
<div class="price">123</div>
<div class="price">124</div>
<div class="price">456</div>
<div class="price">421</div>
<div class="price">100</div>
</div>
</body>Find the average of all listed prices
{
"price_average": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//div[@class='price']"]
},
{
"_fn": "xpath_one",
"_args": ["number(text())"]
},
{
"_fn": "average"
}
]
}
}{
"price_average": 244.8
}max
maxSample HTML
<body>
<div class="product">
<div class="price">123</div>
<div class="price">124</div>
<div class="price">456</div>
<div class="price">421</div>
<div class="price">100</div>
</div>
</body>Find the max of all listed prices
{
"price_max": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//div[@class='price']"]
},
{
"_fn": "xpath_one",
"_args": ["number(text())"]
},
{
"_fn": "max"
}
]
}
}{
"price_max": 456.0
}min
minSample HTML
<body>
<div class="product">
<div class="price">123</div>
<div class="price">124</div>
<div class="price">456</div>
<div class="price">421</div>
<div class="price">100</div>
</div>
</body>Find the min of all listed prices
{
"price_min": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//div[@class='price']"]
},
{
"_fn": "xpath_one",
"_args": ["number(text())"]
},
{
"_fn": "min"
}
]
}
}{
"price_min": 100.0
}product
productSample HTML
<body>
<div class="product">
<property class="colors">
<option class="color">Red</option>
<option class="color">Green</option>
<option class="color">Blue</option>
</property>
<property class="sizes">
<option class="size">S</option>
<option class="size">M</option>
<option class="size">L</option>
<option class="size">XL</option>
</property>
</div>
</body>Get the count of different product variants
{
"number_of_variants": {
"_fns": [
{
"_fn": "xpath",
"_args": [".//property"]
},
{
"_fn": "xpath",
"_args": [".//option"]
},
{
"_fn": "length"
},
{
"_fn": "product"
}
]
}
}{
"number_of_variants": 12
}Last updated
Was this helpful?

