Documentation has been updated: see help center and changelog in one place.

Parsing function examples

Practical examples of Custom Parser functions for HTML processing, string manipulation, mathematical operations, and common parsing tasks.

HTML processing

element_text

Sample HTML

<!DOCTYPE html>
<html>
<body>
    <div id="product">
        <div id="product-description">This is a nice product</div>
        <div id="product-price">    12  3


        </div>
    </div>
</body>
</html>

Extract text from an HTML element and strip leading and trailing whitespace

{
    "price": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//*[@id='product-price']"]
            },
            {
                "_fn": "element_text"
            }
        ]
    }
}
{
    "price": "12  3"
}

Given a string value (rather than an element) as input, element_text returns it unchanged

{
    "price": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//*[@id='product-price']/text()"]
            },
            {
                "_fn": "element_text"
            }
        ]
    }
}
{
    "price": "    12  3\n\n\n        "
}

xpath

Sample HTML

<body>
    <div class="product" id="socks">
        <div class="title">Socks</div>
        <div class="price">123.12</div>
        <div class="description">
            <ul>
                <li class="description-item">Very</li>
                <li class="description-item">Nice</li>
                <li class="description-item">Socks</li>
            </ul>
        </div>
    </div>
</body>

Get all description items

{
    "description_items": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["//li[@class='description-item']/text()"]
            }
        ]
    }
}
{
    "description_items": ["Very", "Nice", "Socks"]
}

Get the first description item

{
    "first_description_item": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["(//li[@class='description-item'])[1]/text()"]
            }
        ]
    }
}
{
    "first_description_item": [
        "Very"
    ]
}

Check if the description section element exists

{
    "description_section_exists": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["boolean(//div[@class='description'])"]
            }
        ]
    }
}
{
    "description_section_exists": true
}

Get price as a number

{
    "price": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["number(//div[@class='price'])"]
            }
        ]
    }
}
{
    "price": 123.12
}

Multiple expressions to fall back on in case the preceding expression fails

{
    "price": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [
                    "//div[@class='product-price']/text()", <--- this does not find anything
                    "//div[@class='price']/text()" <--- this finds the target price
                ]
            }
        ]
    }
}
{
    "price": [
        "123.12"
    ]
}

Use the XPath | (union) operator to match multiple expressions

{
    "price_and_title": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["//div[@class='price']/text() | //div[@class='title']/text()"]
            }
        ]
    }
}
{
    "price_and_title": [
        "Socks",
        "123.12"
    ]
}

xpath_one

Sample HTML

<body>
    <div class="product" id="socks">
        <div class="title">Socks</div>
        <div class="price">123.12</div>
        <div class="description">
            <ul>
                <li class="description-item">Very</li>
                <li class="description-item">Nice</li>
                <li class="description-item">Socks</li>
            </ul>
        </div>
    </div>
</body>

Return the first match

{
    "first_description_item": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//li/text()"]
            }
        ]
    }
}
{
    "first_description_item": "Very"
}

Using XPath functions

{
    "price": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": ["number(.//div[@class='price'])"]
            }
        ]
    }
}
{
    "price": 123.12
}

String manipulation

amount_from_string

Sample HTML

<body>
    <div class="product" id="socks">
        <div class="title">Socks</div>
        <div class="price">The price is: 123.12 pesos</div>
    </div>
</body>

Extract amount from string

{
    "price": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//div[@class='price']/text()"]
            },
            {
                "_fn": "amount_from_string"
            }
        ]
    }
}
{
    "price": 123.12
}

amount_range_from_string

Sample HTML

<body>
    <div class="product">
        <div class="price">
            The price is: 123.12 pesos;
            The price is: 345.12 pesos;
            The price is: 678.12 pesos
        </div>
    </div>
</body>

Extract all amounts from string

{
    "prices": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//div[@class='price']/text()"]
            },
            {
                "_fn": "amount_range_from_string"
            }
        ]
    }
}
{    
    "prices": [
        123.12,
        345.12,
        678.12
    ]
}

join

Sample HTML

<body>
    <div class="product">
        <div class="price">
            The price is: 123.12 pesos;
        </div>
        <div class="price">
            The price is: 345.12 pesos;
        </div>
        <div class="price">
            The price is: 678.12 pesos
        </div>
    </div>
</body>

Join an array of strings into a single string

{
    "price_variants": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//div[@class='price']"]
            },
            {  // If we call normalize-space() in first pipeline function, 
               // it will return only the first value.
                "_fn": "xpath",
                "_args": ["normalize-space(text())"]
            },  
            {
                "_fn": "join",
                "_args": ""
            }
        ]
    }
}
{
    "price_variants": "The price is: 123.12 pesos;The price is: 345.12 pesos;The price is: 678.12 pesos"
}

regex_find_all

Sample HTML

<body>
    <div class="product">
        <div class="description">
            [one description]
            [two description]
            [three description]
        </div>
    </div>
</body>

Find all matches between two characters

{
    "descriptions": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//div[@class='description']/text()"]
            },
            {
                "_fn": "regex_find_all",
                "_args": ["\\[(.*)\\]"]
            }
        ]
    }
}
{
    "descriptions": [
        "one description",
        "two description",
        "three description"
    ]
}

regex_search

Sample HTML

<body>
    <div class="product">
        <div class="description">
            [one description]
            [two description]
            [three description]
            {the one i need}
        </div>
    </div>
</body>

Return description between two characters

{
    "description": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//div[@class='description']/text()"]
            },
            {
                "_fn": "regex_search",
                "_args": ["{(.*)}", 1]
            }
        ]
    }
}
{
    "description": "the one i need"
}

regex_substring

Sample HTML

<body>
    <div class="product">
        <div class="description">
            * one description
            * two description
            * three description
            * {this one i would like to get replaced}
        </div>
    </div>
</body>

Replace a part of text with specified value

{
    "descriptions": {
        "_fns": [
            {
                "_fn": "xpath_one",
                "_args": [".//div[@class='description']/text()"]
            },
            {
                "_fn": "regex_substring",
                "_args": ["{this one i would like to get replaced}", "four description"]
            },
            {
                "_fn": "regex_find_all",
                "_args": ["\\*\\s(.*)\n"]
            }
        ]
    }
}
{
    "descriptions": [
        "one description",
        "two description",
        "three description",
        "four description"
    ]
}

Common functions

length

Sample HTML

<body>
    <div class="product">
        <div class="price">123</div>
        <div class="price">124</div>
        <div class="price">456</div>
        <div class="price">421</div>
        <div class="price">100</div>
    </div>
</body>

Get the count of price variants

{
    "price_variants": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//div[@class='price']"]
            },
            {
                "_fn": "length"
            }
        ]
    }
}
{
    "price_variants": 5
}

Get the count of options for each property in a multi-dimensional array

Sample HTML:

<body>
    <div class="product">
        <property class="colors">
            <option class="color">Red</option>
            <option class="color">Green</option>
            <option class="color">Blue</option>
        </property>
        <property class="sizes">
            <option class="size">S</option>
            <option class="size">M</option>
            <option class="size">L</option>
            <option class="size">XL</option>
        </property>
    </div>
</body>
{
    "number_of_variants": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//property"]
            },
            {
                "_fn": "xpath",
                "_args": [".//option"]
            },
            {
                "_fn": "length"
            }
        ]
    }
}
{
    "number_of_variants": [
        3,
        4
    ]
}

select_nth

Sample HTML

<body>
    <div class="product" id="socks">
        <div class="title">Socks</div>
        <div class="price">123.12</div>
        <div class="description">
            <ul>
                <li class="description-item">Very</li>
                <li class="description-item">Nice</li>
                <li class="description-item">Socks</li>
            </ul>
        </div>
    </div>
</body>

Select the first description item from the array

{
    "price_and_title": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["//li[@class='description-item']/text()"]
            },
            {
                "_fn": "select_nth",
                "_args": 0
            }
        ]
    }
}
{
    "price_and_title": "Very"
}

Select the last description item from the array

{
    "price_and_title": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": ["//li[@class='description-item']/text()"]
            },
            {
                "_fn": "select_nth",
                "_args": -1
            }
        ]
    }
}
{
    "price_and_title": "Socks"
}

Math functions

average

Sample HTML

<body>
    <div class="product">
        <div class="price">123</div>
        <div class="price">124</div>
        <div class="price">456</div>
        <div class="price">421</div>
        <div class="price">100</div>
    </div>
</body>

Find the average of all listed prices

{
    "price_average": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//div[@class='price']"]
            },
            {
                "_fn": "xpath_one",
                "_args": ["number(text())"]
            },
            {
                "_fn": "average"
            }
        ]
    }
}
{
    "price_average": 244.8
}

max

Sample HTML

<body>
    <div class="product">
        <div class="price">123</div>
        <div class="price">124</div>
        <div class="price">456</div>
        <div class="price">421</div>
        <div class="price">100</div>
    </div>
</body>

Find the max of all listed prices

{
    "price_max": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//div[@class='price']"]
            },
            {
                "_fn": "xpath_one",
                "_args": ["number(text())"]
            },
            {
                "_fn": "max"
            }
        ]
    }
}
{
    "price_max": 456.0
}

min

Sample HTML

<body>
    <div class="product">
        <div class="price">123</div>
        <div class="price">124</div>
        <div class="price">456</div>
        <div class="price">421</div>
        <div class="price">100</div>
    </div>
</body>

Find the min of all listed prices

{
    "price_min": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//div[@class='price']"]
            },
            {
                "_fn": "xpath_one",
                "_args": ["number(text())"]
            },
            {
                "_fn": "min"
            }
        ]
    }
}
{
    "price_min": 100.0
}

product

Sample HTML

<body>
    <div class="product">
        <property class="colors">
            <option class="color">Red</option>
            <option class="color">Green</option>
            <option class="color">Blue</option>
        </property>
        <property class="sizes">
            <option class="size">S</option>
            <option class="size">M</option>
            <option class="size">L</option>
            <option class="size">XL</option>
        </property>
    </div>
</body>

Get the count of different product variants

{
    "number_of_variants": {
        "_fns": [
            {
                "_fn": "xpath",
                "_args": [".//property"]
            },
            {
                "_fn": "xpath",
                "_args": [".//option"]
            },
            {
                "_fn": "length"
            },
            {
                "_fn": "product"
            }
        ]
    }
}
{
    "number_of_variants": 12
}

Last updated

Was this helpful?