<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en-GB">
	<id>https://wiki.ext-9.eprints-hosting.org/w/index.php?action=history&amp;feed=atom&amp;title=Bots</id>
	<title>Bots - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://wiki.ext-9.eprints-hosting.org/w/index.php?action=history&amp;feed=atom&amp;title=Bots"/>
	<link rel="alternate" type="text/html" href="https://wiki.ext-9.eprints-hosting.org/w/index.php?title=Bots&amp;action=history"/>
	<updated>2026-05-11T02:43:52Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.43.6</generator>
	<entry>
		<id>https://wiki.ext-9.eprints-hosting.org/w/index.php?title=Bots&amp;diff=16835&amp;oldid=prev</id>
		<title>Libjlrs: /* Analysing traffic */</title>
		<link rel="alternate" type="text/html" href="https://wiki.ext-9.eprints-hosting.org/w/index.php?title=Bots&amp;diff=16835&amp;oldid=prev"/>
		<updated>2025-07-23T17:34:16Z</updated>

		<summary type="html">&lt;p&gt;&lt;span class=&quot;autocomment&quot;&gt;Analysing traffic&lt;/span&gt;&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en-GB&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Revision as of 17:34, 23 July 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l10&quot;&gt;Line 10:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 10:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;== Analysing traffic ==&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;== Analysing traffic ==&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;You may be lucky enough to have your &lt;/del&gt;system logs &lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;available for &lt;/del&gt;analysis &lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;in a system like Splunk o&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* if &lt;/ins&gt;system logs &lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;are in an &lt;/ins&gt;analysis &lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;platform e.g. splunk&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;TODO (+ Thanks to members of the EP-Tech mailing list for some of the suggestions below.  &lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;+&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* &lt;/ins&gt;TODO (+ Thanks to members of the EP-Tech mailing list for some of the suggestions below.  &lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Apache logs ===&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;=== Apache logs ===&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>Libjlrs</name></author>
	</entry>
	<entry>
		<id>https://wiki.ext-9.eprints-hosting.org/w/index.php?title=Bots&amp;diff=16834&amp;oldid=prev</id>
		<title>Libjlrs: Created page with &quot;&lt;big&gt;&#039;&#039;&#039;NOTE: 2025-07-23 THIS PAGE IS CURRENTLY UNDER CONSTRUCTION&#039;&#039;&#039;&lt;/big&gt;  The GLAM (Galleries, Libraries, Archives and Museums) sector have seen increased activity that app...&quot;</title>
		<link rel="alternate" type="text/html" href="https://wiki.ext-9.eprints-hosting.org/w/index.php?title=Bots&amp;diff=16834&amp;oldid=prev"/>
		<updated>2025-07-23T17:33:05Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot;&amp;lt;big&amp;gt;&amp;#039;&amp;#039;&amp;#039;NOTE: 2025-07-23 THIS PAGE IS CURRENTLY UNDER CONSTRUCTION&amp;#039;&amp;#039;&amp;#039;&amp;lt;/big&amp;gt;  The GLAM (Galleries, Libraries, Archives and Museums) sector have seen increased activity that app...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;&amp;lt;big&amp;gt;&amp;#039;&amp;#039;&amp;#039;NOTE: 2025-07-23 THIS PAGE IS CURRENTLY UNDER CONSTRUCTION&amp;#039;&amp;#039;&amp;#039;&amp;lt;/big&amp;gt;&lt;br /&gt;
&lt;br /&gt;
The GLAM (Galleries, Libraries, Archives and Museums) sector have seen increased activity that appears to be automated, but doesn&amp;#039;t identify itself as a &amp;#039;robot&amp;#039; and doesn&amp;#039;t follow robots.txt rules. The traffic makes repeated requests to systems (including EPrints repositories) for the same search terms, but with requests coming from a wide spread of IP addresses. Often an IP address will only make a single request, so traditional approaches to block traffic based on the IP address may not be enough.&lt;br /&gt;
&lt;br /&gt;
Other attributes of these requests, e.g. the User-Agent, are variable and don&amp;#039;t help to distinguish when requests are part of the same &amp;#039;swarm&amp;#039; of requests.&lt;br /&gt;
&lt;br /&gt;
The problematic traffic consists of requests to the search interface. These are resource-intensive to process, and this can cause performance issues with the platform. This isn&amp;#039;t just an EPrints issue; other platforms experience the same issues.&lt;br /&gt;
&lt;br /&gt;
Below are some approaches that may help to limit the impact of these &amp;#039;swarms&amp;#039; of bots.&lt;br /&gt;
&lt;br /&gt;
== Analysing traffic ==&lt;br /&gt;
You may be lucky enough to have your system logs available for analysis in a system like Splunk o&lt;br /&gt;
TODO (+ Thanks to members of the EP-Tech mailing list for some of the suggestions below. &lt;br /&gt;
&lt;br /&gt;
=== Apache logs ===&lt;br /&gt;
TODO (detail from tech-list)&lt;br /&gt;
&lt;br /&gt;
=== Cachemap ===&lt;br /&gt;
When the internal EPrints search is used, a cache table is created in the database, and the details of the search are stored in the core &amp;lt;code&amp;gt;cachemap&amp;lt;/code&amp;gt; table.&lt;br /&gt;
&lt;br /&gt;
The pattern of &amp;#039;swarm&amp;#039; activity can result in many identical searches being run. We can see these using the following query:&lt;br /&gt;
&amp;lt;source lang=&amp;quot;sql&amp;quot;&amp;gt;&lt;br /&gt;
SET @threshold = 30;&lt;br /&gt;
SELECT COUNT(*) c, searchexp FROM cachemap GROUP BY searchexp HAVING c &amp;gt; @threshold ORDER BY c;&lt;br /&gt;
&amp;lt;/source&amp;gt;&lt;br /&gt;
&lt;br /&gt;
Analysis of the web logs using search expressions returned from above may identify some attributes of the &amp;#039;swarm&amp;#039;.&lt;br /&gt;
&lt;br /&gt;
NB If EPrints is using Xapian to process searches, the majority of searches will &amp;#039;&amp;#039;not&amp;#039;&amp;#039; create a cache table.&lt;br /&gt;
&lt;br /&gt;
== Blocking abusive search traffic ==&lt;br /&gt;
&lt;br /&gt;
=== Firewall ===&lt;br /&gt;
???&lt;br /&gt;
&lt;br /&gt;
=== fail2ban ===&lt;br /&gt;
??? - scanning logs for repeated cache=[x]&lt;br /&gt;
&lt;br /&gt;
=== Apache configuration - mod_security (WAF) ===&lt;br /&gt;
&amp;lt;source lang=&amp;quot;shell&amp;quot;&amp;gt;&lt;br /&gt;
# details from DRN 2025-07-23&lt;br /&gt;
&amp;lt;/source&amp;gt;&lt;br /&gt;
&lt;br /&gt;
=== Approach: EPrints configuration to block specific searches ===&lt;br /&gt;
The example below creates an EPrints trigger that is active when a request is being processed by EPrints. Any incoming requests that contain the terms in the &amp;lt;code&amp;gt;$bad_search&amp;lt;/code&amp;gt; configuration will not run a search, but will be presented with a &amp;#039;429 - too many requests&amp;#039; page.&lt;br /&gt;
&lt;br /&gt;
NB the core Apache2::Const module does not include a constant for a 429 response, so a numeric value is used instead of e.g. &amp;lt;code&amp;gt;OK&amp;lt;/code&amp;gt; or &amp;lt;code&amp;gt;FORBIDDEN&amp;lt;/code&amp;gt;.&lt;br /&gt;
&lt;br /&gt;
&amp;lt;source lang=&amp;quot;perl&amp;quot;&amp;gt;&lt;br /&gt;
# save in a cfg.d dir somewhere e.g. [EPRINTS_ROOT]/archives/[ARCHIVE_ID]/cfg/cfg.d/a_BOT_BLOCK.pl&lt;br /&gt;
use EPrints::Const;&lt;br /&gt;
&lt;br /&gt;
### UPDATE THESE WITH SEARCHES YOU WANT TO BLOCK!&lt;br /&gt;
my $bad_search = join &amp;quot;|&amp;quot;, map quotemeta, qw{&lt;br /&gt;
	IN:Habits|&lt;br /&gt;
	%3AHabits%7C&lt;br /&gt;
	ZEPLIN+COSINE+DRIFT+ADMX+LIGO+Kamiokande+SBND&lt;br /&gt;
};&lt;br /&gt;
$c-&amp;gt;{blocked_search_terms_re} = qr/$bad_search/;&lt;br /&gt;
&lt;br /&gt;
$c-&amp;gt;add_trigger( EP_TRIGGER_URL_REWRITE, sub {&lt;br /&gt;
	my( %args ) = @_;&lt;br /&gt;
&lt;br /&gt;
	# args passed are: request, lang, args, urlpath, cgipath, uri, secure, return_code&lt;br /&gt;
	my( $repository, $request, $return_code, $uri, $urlpath ) = @args{ qw( repository request return_code uri urlpath ) };&lt;br /&gt;
&lt;br /&gt;
	# Just interested in searches&lt;br /&gt;
	if( $uri =~ /^$urlpath\/cgi\/search/  )&lt;br /&gt;
	{&lt;br /&gt;
		my $r_args = $request-&amp;gt;args();&lt;br /&gt;
&lt;br /&gt;
		if( defined $r_args )&lt;br /&gt;
		{&lt;br /&gt;
			if( $r_args =~ /$c-&amp;gt;{blocked_search_terms_re}/ )&lt;br /&gt;
			{&lt;br /&gt;
				#NB Apache2::Const doesn&amp;#039;t define 429.&lt;br /&gt;
				$request-&amp;gt;custom_response( 429, $c-&amp;gt;{bot_429_page_html} );&lt;br /&gt;
				${$return_code} = 429;&lt;br /&gt;
				return EP_TRIGGER_DONE;&lt;br /&gt;
			}&lt;br /&gt;
		}&lt;br /&gt;
	}&lt;br /&gt;
});&lt;br /&gt;
&lt;br /&gt;
$c-&amp;gt;{bot_429_page_html} = &amp;#039;&amp;lt;!DOCTYPE HTML&amp;gt;&lt;br /&gt;
&amp;lt;html&amp;gt;&lt;br /&gt;
&amp;lt;head&amp;gt;&lt;br /&gt;
&amp;lt;meta http-equiv=&amp;quot;Content-Type&amp;quot; content=&amp;quot;text/html; charset=UTF-8&amp;quot; /&amp;gt;&lt;br /&gt;
&amp;lt;style&amp;gt;&lt;br /&gt;
body {&lt;br /&gt;
    font-family: sans-serif;&lt;br /&gt;
    margin: 3em;&lt;br /&gt;
}&lt;br /&gt;
footer {&lt;br /&gt;
    font-size: 80%;&lt;br /&gt;
    margin-top:2em;&lt;br /&gt;
}&lt;br /&gt;
&amp;lt;/style&amp;gt;&lt;br /&gt;
&amp;lt;title&amp;gt;Rate Limited&amp;lt;/title&amp;gt;&lt;br /&gt;
&amp;lt;/head&amp;gt;&lt;br /&gt;
&amp;lt;body&amp;gt;&lt;br /&gt;
&amp;lt;header&amp;gt;&lt;br /&gt;
&amp;lt;h1&amp;gt;429 Too Many Requests&amp;lt;/h1&amp;gt;&lt;br /&gt;
&amp;lt;/header&amp;gt;&lt;br /&gt;
&amp;lt;section&amp;gt;&lt;br /&gt;
&amp;lt;p&amp;gt;This search has been blocked due to abuse by automated activity.&amp;lt;/p&amp;gt;&lt;br /&gt;
&amp;lt;/section&amp;gt;&lt;br /&gt;
&amp;lt;footer&amp;gt;&lt;br /&gt;
&amp;lt;p&amp;gt;White Rose Libraries&amp;lt;/p&amp;gt;&lt;br /&gt;
&amp;lt;/footer&amp;gt;&lt;br /&gt;
&amp;lt;/body&amp;gt;&lt;br /&gt;
&amp;lt;/html&amp;gt;&amp;#039;;&lt;br /&gt;
&amp;lt;/source&amp;gt;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
=== Using 3rd party tools ===&lt;br /&gt;
* Cloudflare&lt;br /&gt;
* Anubis&lt;br /&gt;
&lt;br /&gt;
== Related resources ==&lt;br /&gt;
&lt;br /&gt;
* [https://wiki.code4lib.org/Blocking_Bots Code4Lib Blocking Bots] - there is also a useful channel on the Code4Lib Slack&lt;br /&gt;
* [https://www.glamelab.org/products/are-ai-bots-knocking-cultural-heritage-offline/ Are AI Bots Knocking Cultural Heritage Offline? GLAM-E Lab report]&lt;/div&gt;</summary>
		<author><name>Libjlrs</name></author>
	</entry>
</feed>