* Fix #145
* Rename the `CharsetDetector` methods and variables to `DetectCharset`
* Add searchapi.io SERP support
This commit is contained in:
Namhyeon Go 2024-09-27 15:53:19 +09:00
parent ad6cb7e430
commit a60e7e5d9e
3 changed files with 79 additions and 20 deletions

View File

@ -1,6 +1,6 @@
{
"description": "WelsonJS test profile (test-misc.json)",
"released": "2024-09-25",
"released": "2024-09-27",
"dependencies": {
"welsonjs": "0.2.7"
},
@ -93,7 +93,12 @@
},
{
"id": "proxy_custom_provider",
"description": "HTTP proxy with a custom provider",
"description": "HTTP proxy with an web proxy provider",
"tags": ["Network", "HTTP"]
},
{
"id": "proxy_serp",
"description": "HTTP proxy with a SERP provider",
"tags": ["Network", "HTTP"]
}
]

View File

@ -29,6 +29,12 @@ var AVAILABLE_PROXIES = [
"url": "http://scrapeops:{api_key}@residential-proxy.scrapeops.io:8181",
"documentation": "https://scrapeops.io?fpr=namhyeon75"
},
{
"type": "serp",
"provider": "searchapi",
"url": "https://www.searchapi.io/api/v1/search?api_key={api_key}&engine={engine}&q={q}",
"documentation": "https://www.searchapi.io/?via=namhyeon"
},
{
"type": "stateless-jsonrpc2",
"provider": "gnh1201/caterpillar",
@ -99,7 +105,7 @@ var HTTPObject = function(engine) {
"host": "127.0.0.1",
"port": 80,
"credential": null, // { username: "user", password: "pass" }
"url": "" // stateless only
"url": null // stateless only
};
this.engine = (typeof(engine) !== "undefined" ? engine : "MSXML");
@ -140,7 +146,7 @@ var HTTPObject = function(engine) {
this.curlOptions = [];
this.charset = FILE.CdoCharset.CdoUTF_8;
this.isUseCharsetDetector = false;
this.isUseDetectCharset = false;
this.isVerifySSL = true;
this.isCompressedResponse = false;
@ -258,17 +264,12 @@ var HTTPObject = function(engine) {
});
if (typeof availableProxy !== "undefined") {
this.proxy.provider = proxy['provider'];
if (proxyType == "stateless") {
this.proxy.url = availableProxy.url;
} else {
this.proxy.protocol = proxy['protocol'] || this.proxy.protocol;
this.proxy.host = proxy['host'] || this.proxy.host;
this.proxy.port = proxy['port'] || this.proxy.port;
this.proxy.credential = proxy['credential'] || this.proxy.credential;
this.proxy.url = proxy['url'] || this.proxy.url;
}
this.proxy.provider = availableProxy['provider'];
this.proxy.protocol = availableProxy['protocol'] || this.proxy.protocol;
this.proxy.host = availableProxy['host'] || this.proxy.host;
this.proxy.port = availableProxy['port'] || this.proxy.port;
this.proxy.credential = availableProxy['credential'] || this.proxy.credential;
this.proxy.url = availableProxy['url'] || this.proxy.url;
console.info("Please check documentation:", availableProxy.documentation);
}
@ -480,6 +481,14 @@ var HTTPObject = function(engine) {
};
this.getProxiedURL = function(url) {
if (!this.proxy.enabled) return url;
if (this.proxy.type == "serp") {
var serp = this.parseSerpUrl(url);
this.setVariable("engine", serp.engine);
this.setVariable("q", encodeURIComponent(serp.keyword));
}
this.setVariable("url", encodeURIComponent(url));
url = this.evaluate(this.proxy.url);
@ -488,6 +497,32 @@ var HTTPObject = function(engine) {
return url;
};
/**
 * Extract the search engine and the search keyword from a search page URL.
 *
 * The engine is taken from the host name (google, youtube, bing, baidu or
 * amazon) and falls back to "google" when none of them is found. The keyword
 * is built from every `q`, `wd`, `query`, `search_query` or `k` query
 * parameter, joined with a single space.
 *
 * The keyword is returned DECODED (plain text): the caller is expected to
 * apply encodeURIComponent() exactly once when it builds the provider URL.
 *
 * @param {string} url - URL of the search page, e.g. "https://www.google.com/search?q=test".
 * @returns {{engine: string, keyword: string}} Detected engine and decoded keyword
 *          (empty string when no keyword parameter is present).
 */
this.parseSerpUrl = function(url) {
    // Tolerate a missing URL instead of throwing on `.match`/`.exec`.
    var target = (url === null || typeof url === "undefined") ? "" : String(url);

    // Query-string values are percent-encoded and may use "+" for a space.
    // Decode them here so the keyword is not encoded twice later on.
    var decodeValue = function(value) {
        var text = value.replace(/\+/g, " ");
        try {
            return decodeURIComponent(text);
        } catch (e) {
            // Malformed escape sequence (e.g. a lone "%"): keep the text as it is.
            return text;
        }
    };

    var getEngine = function(url) {
        var match = url.match(/(?:https?:\/\/)?(?:www\.)?(google|youtube|bing|baidu|amazon)\.\w+/);
        return match ? match[1] : "google";
    };

    var getKeyword = function(url) {
        // A fresh regex literal per call keeps `lastIndex` state local.
        // The value stops at "&" (next parameter) or "#" (fragment).
        var regex = /[?&](q|wd|query|search_query|k)=([^&#]*)/g;
        var match, keywords = [];
        while ((match = regex.exec(url)) !== null) {
            keywords.push(decodeValue(match[2]));
        }
        return keywords.join(' ');
    };

    return {
        "engine": getEngine(target),
        "keyword": getKeyword(target)
    };
};
this.open = function(method, url) {
var url = this.serializeParameters(url);
@ -646,7 +681,7 @@ var HTTPObject = function(engine) {
}
// Add proxy: <[protocol://][user:password@]proxyhost[:port]>
if (this.proxy != null && this.proxy.enabled && this.proxy.type != "stateless") {
if (this.proxy != null && this.proxy.enabled && this.proxy.type == "stateful") {
cmd.push("-x");
if (this.proxy.credential != null) {
cmd.push([
@ -703,7 +738,7 @@ var HTTPObject = function(engine) {
}
// If enabled the charset(text encoding) detector
if (this.isUseCharsetDetector) {
if (this.isUseDetectCharset) {
var detectedCharset = this.detectCharset(responseText);
console.log("Detected charset:", detectedCharset);
@ -1062,8 +1097,8 @@ var HTTPObject = function(engine) {
return this;
};
this.setIsUseCharsetDetector = function(flag) {
this.isUseCharsetDetector = flag;
this.setIsUseDetectCharset = function(flag) {
this.isUseDetectCharset = flag;
return this;
};
@ -1175,7 +1210,7 @@ exports.parseURL = parseURL;
exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
exports.defaultUserAgent = DEFAULT_USER_AGENT; // compatible
exports.VERSIONINFO = "HTTP REST Client (http.js) version 0.7.35";
exports.VERSIONINFO = "HTTP REST Client (http.js) version 0.7.36";
exports.AUTHOR = "abuse@catswords.net";
exports.global = global;
exports.require = global.require;

View File

@ -1023,6 +1023,25 @@ var test_implements = {
.open("GET", "https://example.org")
.send();
console.log("responseBody:", response.responseBody);
},
"proxy_serp": function() {
var HTTP = require("lib/http");
var response = HTTP.create("CURL")
.setVariables({
"api_key": "2DG3WQgeL2djLFvnQBw83J4y"
})
.setProxy({
"enabled": true,
"provider": "searchapi",
"type": "serp"
})
.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko")
.open("GET", "https://www.google.com/search?q=test")
.send();
console.log("responseBody:", response.responseBody);
}
};