[Forensics-changes] [cewl] 01/02: Imported Upstream version 5.3
Joao Eriberto Mota Filho
eriberto at moszumanska.debian.org
Sun Dec 4 19:18:49 UTC 2016
This is an automated email from the git hooks/post-receive script.
eriberto pushed a commit to branch debian
in repository cewl.
commit 70994f533f81d72868b60a88a741a80572547496
Author: Joao Eriberto Mota Filho <eriberto at debian.org>
Date: Sun Dec 4 17:16:19 2016 -0200
Imported Upstream version 5.3
---
README | 30 +-
cewl.rb | 1709 ++++++++++++++++++++++++++++++++-------------------------------
2 files changed, 899 insertions(+), 840 deletions(-)
diff --git a/README b/README
index 917e9ef..1451433 100644
--- a/README
+++ b/README
@@ -27,6 +27,13 @@ GitHub: https://github.com/digininja/CeWL
Change Log
==========
+Version 5.3
+-----------
+
+Added the command line argument --header (-H) to allow custom headers to be passed in.
+Headers are specified as name:value pairs and multiple headers can be passed.
+
+
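For example, with this change extra headers could be supplied on the command line
like this (hypothetical values):

  cewl -H "Cookie: session=abc123" -H "X-Api-Version: 2" -w words.txt https://example.com/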
Version 5.2
-----------
@@ -172,6 +179,7 @@ Usage
=====
Usage: cewl [OPTION] ... URL
--help, -h: show help
+ --keep, -k: keep the downloaded file
--depth x, -d x: depth to spider to, default 2
--min_word_length, -m: minimum word length, default 3
--offsite, -o: let the spider visit other sites
@@ -179,11 +187,27 @@ Usage: cewl [OPTION] ... URL
--ua, -u user-agent: user agent to send
--no-words, -n: don't output the wordlist
--meta, -a include meta data
- --meta_file file: file for metadata output
+ --meta_file file: output file for meta data
--email, -e include email addresses
- --email_file file: file for email output
+ --email_file file: output file for email addresses
--meta-temp-dir directory: the temporary directory used by exiftool when parsing files, default /tmp
- -v: verbose
+ --count, -c: show the count for each word found
+
+ Authentication
+ --auth_type: digest or basic
+ --auth_user: authentication username
+ --auth_pass: authentication password
+
+ Proxy Support
+ --proxy_host: proxy host
+ --proxy_port: proxy port, default 8080
+ --proxy_username: username for proxy, if required
+ --proxy_password: password for proxy, if required
+
+ Headers
+ --header, -H: in format name:value - can pass multiple
+
+ --verbose, -v: verbose
URL: The site to spider.
diff --git a/cewl.rb b/cewl.rb
index 5800e03..836a40b 100755
--- a/cewl.rb
+++ b/cewl.rb
@@ -14,47 +14,47 @@
# cewl [OPTION] ... URL
#
# -h, --help:
-# show help
+# show help
#
# --depth x, -d x:
-# depth to spider to, default 2
+# depth to spider to, default 2
#
# --min_word_length, -m:
-# minimum word length, default 3
+# minimum word length, default 3
#
# --email file, -e
# --email_file file:
-# include any email addresses found during the spider, email_file is optional output file, if
-# not included the output is added to default output
+# include any email addresses found during the spider, email_file is optional output file, if
+# not included the output is added to default output
#
# --meta file, -a
# --meta_file file:
-# include any meta data found during the spider, meta_file is optional output file, if
-# not included the output is added to default output
+# include any meta data found during the spider, meta_file is optional output file, if
+# not included the output is added to default output
#
# --no-words, -n
-# don't output the wordlist
+# don't output the wordlist
#
# --offsite, -o:
-# let the spider visit other sites
+# let the spider visit other sites
#
# --write, -w file:
-# write the words to the file
+# write the words to the file
#
# --ua, -u user-agent:
-# useragent to send
+#	user agent to send
#
# --meta-temp-dir directory:
-# the temporary directory used by exiftool when parsing files, default /tmp
+# the temporary directory used by exiftool when parsing files, default /tmp
#
# --keep, -k:
-# keep the documents that are downloaded
+# keep the documents that are downloaded
#
# --count, -c:
-# show the count for each of the words found
+# show the count for each of the words found
#
# -v
-# verbose
+# verbose
#
# URL: The site to spider.
#
@@ -63,27 +63,27 @@
# Licence:: CC-BY-SA 2.0 or GPL-3+
#
-VERSION = "5.2 (Some Chaos)"
+VERSION = "5.3 (Heading Upwards)"
puts "CeWL #{VERSION} Robin Wood (robin at digi.ninja) (https://digi.ninja/)\n"
begin
- require 'getoptlong'
- require 'spider'
- require 'nokogiri'
- require 'net/http'
+ require 'getoptlong'
+ require 'spider'
+ require 'nokogiri'
+ require 'net/http'
rescue LoadError => e
- # Catch error and provide feedback on installing gem
- if e.to_s =~ /cannot load such file -- (.*)/
- missing_gem = $1
- puts "\nError: #{missing_gem} gem not installed\n"
- puts "\t Use: 'gem install #{missing_gem}' to install the required gem\n\n"
- exit 2
- else
- puts "There was an error loading the gems:\n"
- puts e.to_s
- exit 2
- end
+ # Catch error and provide feedback on installing gem
+ if e.to_s =~ /cannot load such file -- (.*)/
+ missing_gem = $1
+ puts "\nError: #{missing_gem} gem not installed\n"
+ puts "\t Use: 'gem install #{missing_gem}' to install the required gem\n\n"
+ exit 2
+ else
+ puts "There was an error loading the gems:\n"
+ puts e.to_s
+ exit 2
+ end
end
# Doesn't work for some reason, maybe
@@ -95,432 +95,459 @@ require './cewl_lib'
# Doing this so I can override the allowed? function which normally checks
# the robots.txt file
class MySpider<Spider
- @@proxy_host = nil
- @@proxy_port = nil
- @@proxy_username = nil
- @@proxy_password = nil
-
- @@auth_type = nil
- @@auth_user = nil
- @@auth_password = nil
- @@verbose = false
- @@debug = false
-
- def self.proxy (host, port = nil, username = nil, password = nil)
- @@proxy_host = host
- port = 8080 if port.nil?
- @@proxy_port = port
- @@proxy_username = username
- @@proxy_password = password
- end
-
- def self.auth_creds (type, user, password)
- @@auth_type = type
- @@auth_user = user
- @@auth_password = password
- end
-
- def self.verbose (val)
- @@verbose = val
- end
-
- def self.debug (val)
- @@debug = val
- end
-
- # Create an instance of MySpiderInstance rather than SpiderInstance
- def self.start_at(a_url, &block)
- rules = RobotRules.new('Ruby Spider 1.0')
- a_spider = MySpiderInstance.new({nil => a_url}, [], rules, [])
- a_spider.auth_type = @@auth_type
- a_spider.auth_user = @@auth_user
- a_spider.auth_password = @@auth_password
-
- a_spider.proxy_host = @@proxy_host
- a_spider.proxy_port = @@proxy_port
- a_spider.proxy_username = @@proxy_username
- a_spider.proxy_password = @@proxy_password
-
- a_spider.verbose = @@verbose
- a_spider.debug = @@debug
- block.call(a_spider)
- a_spider.start!
- end
+ @@proxy_host = nil
+ @@proxy_port = nil
+ @@proxy_username = nil
+ @@proxy_password = nil
+
+ @@headers = nil
+
+ @@auth_type = nil
+ @@auth_user = nil
+ @@auth_password = nil
+ @@verbose = false
+ @@debug = false
+
+ def self.proxy (host, port = nil, username = nil, password = nil)
+ @@proxy_host = host
+ port = 8080 if port.nil?
+ @@proxy_port = port
+ @@proxy_username = username
+ @@proxy_password = password
+ end
+
+ def self.headers (headers)
+ header_hash = {}
+ headers.each do |header|
+ header_split = header.split(":")
+ if (header_split.count == 2)
+ header_hash[header_split[0].strip] = header_split[1].strip
+ else
+ puts "Invalid header: " + header.inspect
+ end
+ end
+ @@headers = header_hash
+ end
+
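The parsing above splits each header on every colon and keeps it only when the
result is exactly two parts, so a value that itself contains a colon is reported
as invalid. A minimal standalone sketch of that behaviour (hypothetical header
values):

  headers = ["X-Api-Key: 12345", "Referer: http://example.com/"]
  headers.each do |header|
    parts = header.split(":")
    if parts.count == 2
      puts "accepted: #{parts[0].strip} => #{parts[1].strip}"   # X-Api-Key
    else
      puts "rejected: #{header.inspect}"   # Referer splits into three parts
    end
  end

Using header.split(":", 2) instead would keep colons in the value and accept the
Referer example.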
+ def self.auth_creds (type, user, password)
+ @@auth_type = type
+ @@auth_user = user
+ @@auth_password = password
+ end
+
+ def self.verbose (val)
+ @@verbose = val
+ end
+
+ def self.debug (val)
+ @@debug = val
+ end
+
+ # Create an instance of MySpiderInstance rather than SpiderInstance
+ def self.start_at(a_url, &block)
+ rules = RobotRules.new('Ruby Spider 1.0')
+ a_spider = MySpiderInstance.new({nil => a_url}, [], rules, [])
+
+ a_spider.headers = @@headers
+
+ a_spider.auth_type = @@auth_type
+ a_spider.auth_user = @@auth_user
+ a_spider.auth_password = @@auth_password
+
+ a_spider.proxy_host = @@proxy_host
+ a_spider.proxy_port = @@proxy_port
+ a_spider.proxy_username = @@proxy_username
+ a_spider.proxy_password = @@proxy_password
+
+ a_spider.verbose = @@verbose
+ a_spider.debug = @@debug
+ block.call(a_spider)
+ a_spider.start!
+ end
end
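The class methods above stash their settings in class variables so that start_at
can copy them onto the MySpiderInstance it builds. A minimal sketch of how the
class is driven, mirroring the real call near the end of the script (hypothetical
URL and credentials):

  MySpider.proxy("127.0.0.1", 8080)               # optional
  MySpider.auth_creds("basic", "user", "pass")    # optional
  MySpider.headers(["X-Test: 1"])
  MySpider.verbose(true)
  MySpider.start_at("http://example.com/") do |s|
    s.on :success do |a_url, resp, prior_url|
      puts "#{a_url} => #{resp.code}"
    end
  end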
# My version of the spider class which allows all files
# to be processed
class MySpiderInstance<SpiderInstance
- attr_writer :auth_type
- attr_writer :auth_user
- attr_writer :auth_password
-
- attr_writer :proxy_host
- attr_writer :proxy_port
- attr_writer :proxy_username
- attr_writer :proxy_password
-
- attr_writer :verbose
- attr_writer :debug
-
- # Force all files to be allowed
- # Normally the robots.txt file will be honoured
- def allowed?(a_url, parsed_url)
- true
- end
-
- def start! #:nodoc:
- interrupted = false
- trap("SIGINT") { interrupted = true }
- begin
- next_urls = @next_urls.pop
- #tmp_n_u = {}
- next_urls.each do |prior_url, urls|
- x = []
-
- urls.each_line do |a_url|
- x << [a_url, (URI.parse(a_url) rescue nil)]
- end
-
- y = []
- x.select do |a_url, parsed_url|
- y << [a_url, parsed_url] if allowable_url?(a_url, parsed_url)
- end
-
- y.each do |a_url, parsed_url|
- @setup.call(a_url) unless @setup.nil?
- get_page(parsed_url) do |response|
- do_callbacks(a_url, response, prior_url)
- #tmp_n_u[a_url] = generate_next_urls(a_url, response)
- #@next_urls.push tmp_n_u
- generate_next_urls(a_url, response).each do |a_next_url|
- puts "Pushing #{a_next_url}" if @debug
- @next_urls.push a_url => a_next_url
- end
- #exit if interrupted
- end
-
- @teardown.call(a_url) unless @teardown.nil?
- exit if interrupted
- end
- end
- end while !@next_urls.empty?
- end
-
- def get_page(uri, &block) #:nodoc:
- @seen << uri
-
- begin
- if @proxy_host.nil?
- http = Net::HTTP.new(uri.host, uri.port)
-
- if uri.scheme == 'https'
- http.use_ssl = true
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
- end
- else
- proxy = Net::HTTP::Proxy(@proxy_host, @proxy_port, @proxy_username, @proxy_password)
- begin
- if uri.scheme == 'https'
- http = proxy.start(uri.host, uri.port, :use_ssl => true, :verify_mode => OpenSSL::SSL::VERIFY_NONE)
- else
- http = proxy.start(uri.host, uri.port)
- end
- rescue => e
- puts "\nFailed to connect to the proxy (#{@proxy_host}:#{@proxy_port})\n\n"
- exit 2
- end
- end
-
- req = Net::HTTP::Get.new(uri.request_uri, @headers)
-
- if @auth_type
- case @auth_type
- when "digest"
- uri.user = @auth_user
- uri.password = @auth_password
-
- res = http.request req
-
- if res['www-authenticate']
- digest_auth = Net::HTTP::DigestAuth.new
- auth = digest_auth.auth_header uri, res['www-authenticate'], 'GET'
-
- req = Net::HTTP::Get.new uri.request_uri
- req.add_field 'Authorization', auth
- end
-
- when "basic"
- req.basic_auth @auth_user, @auth_password
- end
- end
-
- res = http.request(req)
-
- if res.redirect?
- puts "Redirect URL" if @debug
- base_url = uri.to_s[0, uri.to_s.rindex('/')]
- new_url = URI.parse(construct_complete_url(base_url, res['Location']))
-
- # If auth is used then a name:pass@ gets added, this messes the tree
- # up so easiest to just remove it
- current_uri = uri.to_s.gsub(/:\/\/[^:]*:[^@]*@/, "://")
- @next_urls.push current_uri => new_url.to_s
- elsif res.code == "401"
- puts "Authentication required, can't continue on this branch - #{uri}" if @verbose
- else
- block.call(res)
- end
- rescue SocketError, Errno::EHOSTUNREACH => e
- puts "Couldn't hit the site #{uri}, moving on"
- rescue NoMethodError => e
- if @verbose
- puts "Unable to process URL"
- puts "Message is #{e.to_s}"
- end
- rescue => e
- puts "\nUnable to connect to the site (#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.request_uri})"
-
- if @verbose
- puts "\nThe following error may help:"
- puts e.to_s
- puts e.backtrace
- puts "\nCaller"
- puts caller
- else
- puts "Run in verbose mode (-v) for more information"
- end
-
- puts "\n\n"
- end
- end
-
- # Overriding so that I can get it to ignore direct names - i.e. #name
- def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
- return nil if additional_url =~ /^#/
-
- parsed_additional_url ||= URI.parse(additional_url)
- case parsed_additional_url.scheme
- when nil
- u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
- if additional_url[0].chr == '/'
- "#{u.scheme}://#{u.host}#{additional_url}"
- elsif u.path.nil? || u.path == ''
- "#{u.scheme}://#{u.host}/#{additional_url}"
- elsif u.path[0].chr == '/'
- "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
- else
- "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
- end
- else
- additional_url
- end
- end
-
- # Overriding the original spider one as it doesn't find hrefs very well
- def generate_next_urls(a_url, resp) #:nodoc:
- web_page = resp.body
- if URI.parse(a_url).path.empty?
- base_url = a_url
- else
- base_url = a_url[0, a_url.rindex('/')]
- end
-
- doc = Nokogiri::HTML(web_page)
- links = doc.css('a').map { |a| a['href'] }
- links.map do |link|
- begin
- if link.nil?
- nil
- else
- begin
- parsed_link = URI.parse(link)
- parsed_link.fragment == '#' ? nil : construct_complete_url(base_url, link, parsed_link)
- rescue
- nil
- end
- end
- rescue => e
- puts "\nThere was an error generating URL list"
- puts "Error: #{e.inspect}"
- puts e.backtrace
- exit 2
- end
- end.compact
- end
+ attr_writer :auth_type
+ attr_writer :auth_user
+ attr_writer :auth_password
+
+ attr_writer :headers
+
+ attr_writer :proxy_host
+ attr_writer :proxy_port
+ attr_writer :proxy_username
+ attr_writer :proxy_password
+
+ attr_writer :verbose
+ attr_writer :debug
+
+ # Force all files to be allowed
+ # Normally the robots.txt file will be honoured
+ def allowed?(a_url, parsed_url)
+ true
+ end
+
+ def start! #:nodoc:
+ interrupted = false
+ trap("SIGINT") { interrupted = true }
+ begin
+ next_urls = @next_urls.pop
+ #tmp_n_u = {}
+ next_urls.each do |prior_url, urls|
+ x = []
+
+ urls.each_line do |a_url|
+ x << [a_url, (URI.parse(a_url) rescue nil)]
+ end
+
+ y = []
+ x.select do |a_url, parsed_url|
+ y << [a_url, parsed_url] if allowable_url?(a_url, parsed_url)
+ end
+
+ y.each do |a_url, parsed_url|
+ @setup.call(a_url) unless @setup.nil?
+ get_page(parsed_url) do |response|
+ do_callbacks(a_url, response, prior_url)
+ #tmp_n_u[a_url] = generate_next_urls(a_url, response)
+ #@next_urls.push tmp_n_u
+ generate_next_urls(a_url, response).each do |a_next_url|
+ puts "Pushing #{a_next_url}" if @debug
+ @next_urls.push a_url => a_next_url
+ end
+ #exit if interrupted
+ end
+
+ @teardown.call(a_url) unless @teardown.nil?
+ exit if interrupted
+ end
+ end
+ end while !@next_urls.empty?
+ end
+
+ def get_page(uri, &block) #:nodoc:
+ @seen << uri
+
+ begin
+ if @proxy_host.nil?
+ http = Net::HTTP.new(uri.host, uri.port)
+
+ if uri.scheme == 'https'
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ end
+ else
+ proxy = Net::HTTP::Proxy(@proxy_host, @proxy_port, @proxy_username, @proxy_password)
+ begin
+ if uri.scheme == 'https'
+ http = proxy.start(uri.host, uri.port, :use_ssl => true, :verify_mode => OpenSSL::SSL::VERIFY_NONE)
+ else
+ http = proxy.start(uri.host, uri.port)
+ end
+ rescue => e
+ puts "\nFailed to connect to the proxy (#{@proxy_host}:#{@proxy_port})\n\n"
+ exit 2
+ end
+ end
+
+ req = Net::HTTP::Get.new(uri.request_uri)
+ @headers.each_pair do |header, value|
+ req[header] = value
+ end
+
+ if @auth_type
+ case @auth_type
+ when "digest"
+ uri.user = @auth_user
+ uri.password = @auth_password
+
+ res = http.request req
+
+ if res['www-authenticate']
+ digest_auth = Net::HTTP::DigestAuth.new
+ auth = digest_auth.auth_header uri, res['www-authenticate'], 'GET'
+
+ req = Net::HTTP::Get.new uri.request_uri
+ req.add_field 'Authorization', auth
+ end
+
+ when "basic"
+ req.basic_auth @auth_user, @auth_password
+ end
+ end
+
+ res = http.request(req)
+
+ if res.redirect?
+ puts "Redirect URL" if @debug
+ base_url = uri.to_s[0, uri.to_s.rindex('/')]
+ new_url = URI.parse(construct_complete_url(base_url, res['Location']))
+
+ # If auth is used then a name:pass@ gets added, this messes the tree
+ # up so easiest to just remove it
+ current_uri = uri.to_s.gsub(/:\/\/[^:]*:[^@]*@/, "://")
+ @next_urls.push current_uri => new_url.to_s
+ elsif res.code == "401"
+ puts "Authentication required, can't continue on this branch - #{uri}" if @verbose
+ else
+ block.call(res)
+ end
+ rescue SocketError, Errno::EHOSTUNREACH => e
+ puts "Couldn't hit the site #{uri}, moving on"
+ rescue NoMethodError => e
+ if @verbose
+ puts "Unable to process URL"
+ puts "Message is #{e.to_s}"
+ end
+ rescue => e
+ puts "\nUnable to connect to the site (#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.request_uri})"
+
+ if @verbose
+ puts "\nThe following error may help:"
+ puts e.to_s
+ puts e.backtrace
+ puts "\nCaller"
+ puts caller
+ else
+ puts "Run in verbose mode (-v) for more information"
+ end
+
+ puts "\n\n"
+ end
+ end
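One detail worth noting in get_page: the custom headers are copied onto the
initial request, but when a digest challenge comes back the request is rebuilt
with Net::HTTP::Get.new uri.request_uri, so the --header values are not
re-applied on the authenticated retry. If that matters, the rebuild could pass
the header hash as the optional second argument, the form the previous version
used (untested sketch, reusing the in-scope variables):

  req = Net::HTTP::Get.new(uri.request_uri, @headers)
  req.add_field 'Authorization', auth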
+
+ # Overriding so that I can get it to ignore direct names - i.e. #name
+ def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
+ return nil if additional_url =~ /^#/
+
+ parsed_additional_url ||= URI.parse(additional_url)
+ case parsed_additional_url.scheme
+ when nil
+ u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
+ if additional_url[0].chr == '/'
+ "#{u.scheme}://#{u.host}#{additional_url}"
+ elsif u.path.nil? || u.path == ''
+ "#{u.scheme}://#{u.host}/#{additional_url}"
+ elsif u.path[0].chr == '/'
+ "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
+ else
+ "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
+ end
+ else
+ additional_url
+ end
+ end
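Given the branches above, relative link resolution behaves roughly like this
(hypothetical URLs):

  construct_complete_url("http://example.com/dir", "page2.html")
  # => "http://example.com/dir/page2.html"  (base path starts with '/')
  construct_complete_url("http://example.com/dir", "/about")
  # => "http://example.com/about"           (absolute path replaces base path)
  construct_complete_url("http://example.com/dir", "#section")
  # => nil                                  (fragment-only links are ignored)
  construct_complete_url("http://example.com/dir", "https://other.example/")
  # => "https://other.example/"             (already absolute, returned as-is)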
+
+ # Overriding the original spider one as it doesn't find hrefs very well
+ def generate_next_urls(a_url, resp) #:nodoc:
+ web_page = resp.body
+ if URI.parse(a_url).path.empty?
+ base_url = a_url
+ else
+ base_url = a_url[0, a_url.rindex('/')]
+ end
+
+ doc = Nokogiri::HTML(web_page)
+ links = doc.css('a').map { |a| a['href'] }
+ links.map do |link|
+ begin
+ if link.nil?
+ nil
+ else
+ begin
+ parsed_link = URI.parse(link)
+ parsed_link.fragment == '#' ? nil : construct_complete_url(base_url, link, parsed_link)
+ rescue
+ nil
+ end
+ end
+ rescue => e
+ puts "\nThere was an error generating URL list"
+ puts "Error: #{e.inspect}"
+ puts e.backtrace
+ exit 2
+ end
+ end.compact
+ end
end
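The href extraction at the heart of generate_next_urls is easy to see in
isolation (hypothetical HTML):

  require 'nokogiri'
  html = '<a href="/one">1</a> <a>no href</a> <a href="#top">top</a>'
  links = Nokogiri::HTML(html).css('a').map { |a| a['href'] }
  # => ["/one", nil, "#top"]
  # nil entries, and fragment links that construct_complete_url maps to nil,
  # are then dropped by the final .compact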
# A node for a tree
class TreeNode
- attr :value
- attr :depth
- attr :key
- attr :visited, true
-
- def initialize(key, value, depth)
- @key = key
- @value = value
- @depth = depth
- @visited = false
- end
-
- def to_s
- if key.nil?
- return "key=nil value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
- else
- return "key=#{@key} value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
- end
- end
-
- def to_url_hash
- return({@key => @value})
- end
+ attr :value
+ attr :depth
+ attr :key
+ attr :visited, true
+
+ def initialize(key, value, depth)
+ @key = key
+ @value = value
+ @depth = depth
+ @visited = false
+ end
+
+ def to_s
+ if key.nil?
+ return "key=nil value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
+ else
+ return "key=#{@key} value=#{@value} depth=#{@depth.to_s} visited=#{@visited.to_s}"
+ end
+ end
+
+ def to_url_hash
+ return({@key => @value})
+ end
end
# A tree structure
class Tree
- attr :data
- @max_depth
- @children
-
- # Get the maximum depth the tree can grow to
- def max_depth
- @max_depth
- end
-
- # Set the max depth the tree can grow to
- def max_depth=(val)
- @max_depth = Integer(val)
- end
-
- # As this is used to work out if there are any more nodes to process it isn't a true empty
- def empty?
- if !@data.visited
- return false
- else
- @children.each { |node|
- return false if !node.data.visited
- }
- end
- return true
- end
-
- # The constructor
- def initialize(key=nil, value=nil, depth=0)
- @data = TreeNode.new(key, value, depth)
- @children = []
- @max_depth = 2
- end
-
- # Itterator
- def each
- yield @data
- @children.each do |child_node|
- child_node.each { |e| yield e }
- end
- end
-
- # Remove an item from the tree
- def pop
- if !@data.visited
- @data.visited = true
- return @data.to_url_hash
- else
- @children.each { |node|
- if !node.data.visited
- node.data.visited = true
- return node.data.to_url_hash
- end
- }
- end
- return nil
- end
-
- # Push an item onto the tree
- def push(value)
- key = value.keys.first
- value = value.values_at(key).first
-
- if key.nil?
- @data = TreeNode.new(key, value, 0)
- else
- # If the depth is 0 then don't add anything to the tree
- return if @max_depth == 0
- if key == @data.value
- child = Tree.new(key, value, @data.depth + 1)
- @children << child
- else
- @children.each { |node|
- if node.data.value == key && node.data.depth<@max_depth
- child = Tree.new(key, value, node.data.depth + 1)
- @children << child
- end
- }
- end
- end
- end
+ attr :data
+ @max_depth
+ @children
+
+ # Get the maximum depth the tree can grow to
+ def max_depth
+ @max_depth
+ end
+
+ # Set the max depth the tree can grow to
+ def max_depth=(val)
+ @max_depth = Integer(val)
+ end
+
+	# As this is used to work out if there are any more nodes to process, it isn't a true empty check
+ def empty?
+ if !@data.visited
+ return false
+ else
+ @children.each { |node|
+ return false if !node.data.visited
+ }
+ end
+ return true
+ end
+
+ # The constructor
+ def initialize(key=nil, value=nil, depth=0)
+ @data = TreeNode.new(key, value, depth)
+ @children = []
+ @max_depth = 2
+ end
+
+	# Iterator
+ def each
+ yield @data
+ @children.each do |child_node|
+ child_node.each { |e| yield e }
+ end
+ end
+
+ # Remove an item from the tree
+ def pop
+ if !@data.visited
+ @data.visited = true
+ return @data.to_url_hash
+ else
+ @children.each { |node|
+ if !node.data.visited
+ node.data.visited = true
+ return node.data.to_url_hash
+ end
+ }
+ end
+ return nil
+ end
+
+ # Push an item onto the tree
+ def push(value)
+ key = value.keys.first
+ value = value.values_at(key).first
+
+ if key.nil?
+ @data = TreeNode.new(key, value, 0)
+ else
+ # If the depth is 0 then don't add anything to the tree
+ return if @max_depth == 0
+ if key == @data.value
+ child = Tree.new(key, value, @data.depth + 1)
+ @children << child
+ else
+ @children.each { |node|
+ if node.data.value == key && node.data.depth<@max_depth
+ child = Tree.new(key, value, node.data.depth + 1)
+ @children << child
+ end
+ }
+ end
+ end
+ end
end
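Tree and TreeNode together act as the spider's URL queue with depth limiting:
push files a URL under the node whose value matches its referring key, and pop
returns the next unvisited node as a {referrer => url} hash. A rough
walk-through (hypothetical URLs):

  tree = Tree.new
  tree.max_depth = 2
  tree.push(nil => "http://example.com/")                      # becomes the root
  tree.pop                                                     # => {nil=>"http://example.com/"}
  tree.push("http://example.com/" => "http://example.com/a")   # stored at depth 1
  tree.pop                                                     # => {"http://example.com/"=>"http://example.com/a"}
  tree.push("http://example.com/a" => "http://example.com/b")  # stored at depth 2
  tree.push("http://example.com/b" => "http://example.com/c")  # depth 2 == max_depth, dropped
  tree.empty?                                                  # => false until the depth-2 node is popped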
opts = GetoptLong.new(
- ['--help', '-h', GetoptLong::NO_ARGUMENT],
- ['--keep', '-k', GetoptLong::NO_ARGUMENT],
- ['--depth', '-d', GetoptLong::OPTIONAL_ARGUMENT],
- ['--min_word_length', "-m", GetoptLong::REQUIRED_ARGUMENT],
- ['--no-words', "-n", GetoptLong::NO_ARGUMENT],
- ['--offsite', "-o", GetoptLong::NO_ARGUMENT],
- ['--write', "-w", GetoptLong::REQUIRED_ARGUMENT],
- ['--ua', "-u", GetoptLong::REQUIRED_ARGUMENT],
- ['--meta-temp-dir', GetoptLong::REQUIRED_ARGUMENT],
- ['--meta_file', GetoptLong::REQUIRED_ARGUMENT],
- ['--email_file', GetoptLong::REQUIRED_ARGUMENT],
- ['--meta', "-a", GetoptLong::NO_ARGUMENT],
- ['--email', "-e", GetoptLong::NO_ARGUMENT],
- ['--count', '-c', GetoptLong::NO_ARGUMENT],
- ['--auth_user', GetoptLong::REQUIRED_ARGUMENT],
- ['--auth_pass', GetoptLong::REQUIRED_ARGUMENT],
- ['--auth_type', GetoptLong::REQUIRED_ARGUMENT],
- ['--proxy_host', GetoptLong::REQUIRED_ARGUMENT],
- ['--proxy_port', GetoptLong::REQUIRED_ARGUMENT],
- ['--proxy_username', GetoptLong::REQUIRED_ARGUMENT],
- ['--proxy_password', GetoptLong::REQUIRED_ARGUMENT],
- ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
+ ['--help', '-h', GetoptLong::NO_ARGUMENT],
+ ['--keep', '-k', GetoptLong::NO_ARGUMENT],
+ ['--depth', '-d', GetoptLong::OPTIONAL_ARGUMENT],
+ ['--min_word_length', "-m", GetoptLong::REQUIRED_ARGUMENT],
+ ['--no-words', "-n", GetoptLong::NO_ARGUMENT],
+ ['--offsite', "-o", GetoptLong::NO_ARGUMENT],
+ ['--write', "-w", GetoptLong::REQUIRED_ARGUMENT],
+ ['--ua', "-u", GetoptLong::REQUIRED_ARGUMENT],
+ ['--meta-temp-dir', GetoptLong::REQUIRED_ARGUMENT],
+ ['--meta_file', GetoptLong::REQUIRED_ARGUMENT],
+ ['--email_file', GetoptLong::REQUIRED_ARGUMENT],
+ ['--meta', "-a", GetoptLong::NO_ARGUMENT],
+ ['--email', "-e", GetoptLong::NO_ARGUMENT],
+ ['--count', '-c', GetoptLong::NO_ARGUMENT],
+ ['--auth_user', GetoptLong::REQUIRED_ARGUMENT],
+ ['--auth_pass', GetoptLong::REQUIRED_ARGUMENT],
+ ['--auth_type', GetoptLong::REQUIRED_ARGUMENT],
+ ['--header', "-H", GetoptLong::REQUIRED_ARGUMENT],
+ ['--proxy_host', GetoptLong::REQUIRED_ARGUMENT],
+ ['--proxy_port', GetoptLong::REQUIRED_ARGUMENT],
+ ['--proxy_username', GetoptLong::REQUIRED_ARGUMENT],
+ ['--proxy_password', GetoptLong::REQUIRED_ARGUMENT],
+ ["--verbose", "-v", GetoptLong::NO_ARGUMENT]
)
# Display the usage
def usage
- puts "Usage: cewl [OPTION] ... URL
- --help, -h: show help
- --keep, -k: keep the downloaded file
- --depth x, -d x: depth to spider to, default 2
- --min_word_length, -m: minimum word length, default 3
- --offsite, -o: let the spider visit other sites
- --write, -w file: write the output to the file
- --ua, -u user-agent: user agent to send
- --no-words, -n: don't output the wordlist
- --meta, -a include meta data
- --meta_file file: output file for meta data
- --email, -e include email addresses
- --email_file file: output file for email addresses
- --meta-temp-dir directory: the temporary directory used by exiftool when parsing files, default /tmp
- --count, -c: show the count for each word found
-
- Authentication
- --auth_type: digest or basic
- --auth_user: authentication username
- --auth_pass: authentication password
-
- Proxy Support
- --proxy_host: proxy host
- --proxy_port: proxy port, default 8080
- --proxy_username: username for proxy, if required
- --proxy_password: password for proxy, if required
-
- --verbose, -v: verbose
-
- URL: The site to spider.
+ puts "Usage: cewl [OPTION] ... URL
+ --help, -h: show help
+ --keep, -k: keep the downloaded file
+ --depth x, -d x: depth to spider to, default 2
+ --min_word_length, -m: minimum word length, default 3
+ --offsite, -o: let the spider visit other sites
+ --write, -w file: write the output to the file
+ --ua, -u user-agent: user agent to send
+ --no-words, -n: don't output the wordlist
+ --meta, -a include meta data
+ --meta_file file: output file for meta data
+ --email, -e include email addresses
+ --email_file file: output file for email addresses
+ --meta-temp-dir directory: the temporary directory used by exiftool when parsing files, default /tmp
+ --count, -c: show the count for each word found
+
+ Authentication
+ --auth_type: digest or basic
+ --auth_user: authentication username
+ --auth_pass: authentication password
+
+ Proxy Support
+ --proxy_host: proxy host
+ --proxy_port: proxy port, default 8080
+ --proxy_username: username for proxy, if required
+ --proxy_password: password for proxy, if required
+
+ Headers
+ --header, -H: in format name:value - can pass multiple
+
+ --verbose, -v: verbose
+
+ URL: The site to spider.
"
- exit 0
+ exit 0
end
debug = false
@@ -548,103 +575,109 @@ proxy_port = nil
proxy_username = nil
proxy_password = nil
+# headers will be passed in the format "header: value"
+# and there can be multiple
+headers = []
+
strip_css = true
strip_js = true
begin
- opts.each do |opt, arg|
- case opt
- when '--help'
- usage
- when "--count"
- show_count = true
- when "--meta-temp-dir"
- if !File.directory?(arg)
- puts "\nMeta temp directory is not a directory\n\n"
- exit 1
- end
-
- if !File.writable?(arg)
- puts "\nThe meta temp directory is not writable\n\n"
- exit 1
- end
-
- meta_temp_dir = arg
- meta_temp_dir += "/" if meta_temp_dir !~ /.*\/$/
- when "--keep"
- keep = true
- when "--no-words"
- wordlist = false
- when "--meta_file"
- meta_outfile = arg
- when "--meta"
- meta = true
- when "--email_file"
- email_outfile = arg
- when "--email"
- email = true
- when '--min_word_length'
- min_word_length = arg.to_i
- usage if min_word_length < 1
- when '--depth'
- depth = arg.to_i
- usage if depth < 0
- when '--offsite'
- offsite = true
- when '--ua'
- ua = arg
- when '--verbose'
- verbose = true
- when '--write'
- outfile = arg
- when "--proxy_password"
- proxy_password = arg
- when "--proxy_username"
- proxy_username = arg
- when "--proxy_host"
- proxy_host = arg
- when "--proxy_port"
- proxy_port = arg.to_i
- when "--auth_pass"
- auth_pass = arg
- when "--auth_user"
- auth_user = arg
- when "--auth_type"
- if arg =~ /(digest|basic)/i
- auth_type = $1.downcase
- if auth_type == "digest"
- begin
- require "net/http/digest_auth"
- rescue LoadError => e
- # Catch error and provide feedback on installing gem
- puts "\nError: To use digest auth you require the net-http-digest_auth gem\n"
- puts "\t Use: 'gem install net-http-digest_auth'\n\n"
- exit 2
- end
- end
- else
- puts "\nInvalid authentication type, please specify either basic or digest\n\n"
- exit 1
- end
- end
- end
+ opts.each do |opt, arg|
+ case opt
+ when '--help'
+ usage
+ when "--count"
+ show_count = true
+ when "--meta-temp-dir"
+ if !File.directory?(arg)
+ puts "\nMeta temp directory is not a directory\n\n"
+ exit 1
+ end
+
+ if !File.writable?(arg)
+ puts "\nThe meta temp directory is not writable\n\n"
+ exit 1
+ end
+
+ meta_temp_dir = arg
+ meta_temp_dir += "/" if meta_temp_dir !~ /.*\/$/
+ when "--keep"
+ keep = true
+ when "--no-words"
+ wordlist = false
+ when "--meta_file"
+ meta_outfile = arg
+ when "--meta"
+ meta = true
+ when "--email_file"
+ email_outfile = arg
+ when "--email"
+ email = true
+ when '--min_word_length'
+ min_word_length = arg.to_i
+ usage if min_word_length < 1
+ when '--depth'
+ depth = arg.to_i
+ usage if depth < 0
+ when '--offsite'
+ offsite = true
+ when '--ua'
+ ua = arg
+ when '--verbose'
+ verbose = true
+ when '--write'
+ outfile = arg
+ when "--header"
+ headers << arg
+ when "--proxy_password"
+ proxy_password = arg
+ when "--proxy_username"
+ proxy_username = arg
+ when "--proxy_host"
+ proxy_host = arg
+ when "--proxy_port"
+ proxy_port = arg.to_i
+ when "--auth_pass"
+ auth_pass = arg
+ when "--auth_user"
+ auth_user = arg
+ when "--auth_type"
+ if arg =~ /(digest|basic)/i
+ auth_type = $1.downcase
+ if auth_type == "digest"
+ begin
+ require "net/http/digest_auth"
+ rescue LoadError => e
+ # Catch error and provide feedback on installing gem
+ puts "\nError: To use digest auth you require the net-http-digest_auth gem\n"
+ puts "\t Use: 'gem install net-http-digest_auth'\n\n"
+ exit 2
+ end
+ end
+ else
+ puts "\nInvalid authentication type, please specify either basic or digest\n\n"
+ exit 1
+ end
+ end
+ end
rescue
- usage
+ usage
end
if auth_type && (auth_user.nil? || auth_pass.nil?)
- puts "\nIf using basic or digest auth you must provide a username and password\n\n"
- exit 1
+ puts "\nIf using basic or digest auth you must provide a username and password\n\n"
+ exit 1
end
if auth_type.nil? && (!auth_user.nil? || !auth_pass.nil?)
- puts "\nAuthentication details provided but no mention of basic or digest\n\n"
- exit 1
+ puts "\nAuthentication details provided but no mention of basic or digest\n\n"
+ exit 1
end
if ARGV.length != 1
- puts "\nMissing URL argument (try --help)\n\n"
- exit 1
+ puts "\nMissing URL argument (try --help)\n\n"
+ exit 1
end
url = ARGV.shift
@@ -654,8 +687,8 @@ url = "http://#{url}" if url !~ /^http(s)?:\/\//
# The spider doesn't work properly if there isn't a / on the end
if url !~ /\/$/
- # Commented out for Yori
- #url = "#{url}/"
+ # Commented out for Yori
+ #url = "#{url}/"
end
word_hash = {}
@@ -666,350 +699,352 @@ usernames = Array.new()
# Do the checks here so we don't do all the processing then find we can't open the file
if outfile
- begin
- outfile_file = File.new(outfile, "w")
- rescue
- puts "\nCouldn't open the output file for writing\n\n"
- exit 2
- end
+ begin
+ outfile_file = File.new(outfile, "w")
+ rescue
+ puts "\nCouldn't open the output file for writing\n\n"
+ exit 2
+ end
else
- outfile_file = $stdout
+ outfile_file = $stdout
end
if email_outfile && email
- begin
- email_outfile_file = File.new(email_outfile, "w")
- rescue
- puts "\nCouldn't open the email output file for writing\n\n"
- exit 2
- end
+ begin
+ email_outfile_file = File.new(email_outfile, "w")
+ rescue
+ puts "\nCouldn't open the email output file for writing\n\n"
+ exit 2
+ end
else
- email_outfile_file = outfile_file
+ email_outfile_file = outfile_file
end
if meta_outfile && meta
- begin
- meta_outfile_file = File.new(meta_outfile, "w")
- rescue
- puts "\nCouldn't open the metadata output file for writing\n\n"
- exit 2
- end
+ begin
+ meta_outfile_file = File.new(meta_outfile, "w")
+ rescue
+ puts "\nCouldn't open the metadata output file for writing\n\n"
+ exit 2
+ end
else
- meta_outfile_file = outfile_file
+ meta_outfile_file = outfile_file
end
catch :ctrl_c do
- begin
- puts "Starting at #{url}" if verbose
-
- MySpider.proxy(proxy_host, proxy_port, proxy_username, proxy_password) if proxy_host
- MySpider.auth_creds(auth_type, auth_user, auth_pass) if auth_type
- MySpider.verbose(verbose)
-
- MySpider.start_at(url) do |s|
- s.headers['User-Agent'] = ua if ua
-
- s.add_url_check do |a_url|
- puts "Checking page #{a_url}" if debug
- allow = true
-
- # Extensions to ignore
- if a_url =~ /(\.zip$|\.gz$|\.zip$|\.bz2$|\.png$|\.gif$|\.jpg$|^#)/
- puts "Ignoring internal link or graphic: #{a_url}" if verbose
- allow = false
- else
- if /^mailto:(.*)/i.match(a_url)
- if email
- email_arr << $1
- puts "Found #{$1} on page #{a_url}" if verbose
- end
- allow = false
- else
- if !offsite
- a_url_parsed = URI.parse(a_url)
- url_parsed = URI.parse(url)
- puts "Comparing #{a_url} with #{url}" if debug
-
- # Make sure the host, port and scheme matches (else its offsite)
- allow = (a_url_parsed.host == url_parsed.host) && (a_url_parsed.port == url_parsed.port) && (a_url_parsed.scheme == url_parsed.scheme) ? true : false
-
- puts "Offsite link, not following: #{a_url}" if !allow && verbose
- end
- end
- end
- allow
- end
-
- s.on :success do |a_url, resp, prior_url|
- if verbose
- if prior_url.nil?
- puts "Visiting: #{a_url}, got response code #{resp.code}"
- else
- puts "Visiting: #{a_url} referred from #{prior_url}, got response code #{resp.code}"
- end
- end
-
- # May want 0-9 in here as well in the future but for now limit it to a-z so
- # you can't sneak any nasty characters in
- if /.*\.([a-z]+)(\?.*$|$)/i.match(a_url)
- file_extension = $1
- else
- file_extension = ''
- end
-
- # Don't get words from these file types. Most will have been blocked by the url_check function but
- # some are let through, such as .css, so that they can be checked for email addresses
-
- # This is a bad way to do this but it is either white or black list extensions and
- # the list of either is quite long, may as well black list and let extra through
- # that can then be weeded out later than stop things that could be useful
-
- #if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|zip|bz2|css|png|gif|jpg|#)$/
- if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|zip|bz2|png|gif|jpg|#)$/
- if meta
- begin
- if keep && file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|zip|bz2)$/
- if /.*\/(.*)$/.match(a_url)
- output_filename = meta_temp_dir + $1
- puts "Keeping #{output_filename}" if verbose
- else
- # Shouldn't ever get here as the regex above should always be able to pull the filename out of the URL,
- # ...but just in case
-
- # Maybe look at doing this to make the temp name
- # require "tempfile"
- # Dir::Tmpname.make_tmpname "a", "b"
- # => "a20150707-8694-hrrxr4-b"
-
- output_filename = "#{meta_temp_dir}cewl_tmp"
- output_filename += ".#{file_extension}" unless file_extension.empty?
- end
- else
- output_filename = "#{meta_temp_dir}cewl_tmp"
- output_filename += ".#{file_extension}" unless file_extension.empty?
- end
-
- out = File.new(output_filename, "wb")
- out.print(resp.body)
- out.close
-
- meta_data = process_file(output_filename, verbose)
- usernames += meta_data if (meta_data != nil)
- rescue => e
- puts "\nCouldn't open the meta temp file for writing - #{e.inspect}\n\n"
- exit 2
- end
- end
- else
- html = resp.body.to_s.force_encoding("UTF-8")
- html.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
- html.encode!('UTF-8', 'UTF-16')
-
- dom = Nokogiri.HTML(html)
- dom.css('script').remove if strip_js
- dom.css('style').remove if strip_css
- body = dom.to_s
-
- # Get meta data
- if /.*<meta.*description.*content\s*=[\s'"]*(.*)/i.match(body)
- description = $1
- body += description.gsub(/[>"\/']*/, "")
- end
-
- if /.*<meta.*keywords.*content\s*=[\s'"]*(.*)/i.match(body)
- keywords = $1
- body += keywords.gsub(/[>"\/']*/, "")
- end
-
- puts body if debug
-
- # This bit will not normally fire as all JavaScript is stripped out
- # by the Nokogiri remove a few lines before this.
- #
- # The code isn't perfect but will do a rough job of working out
- # pages from relative location links
- while /(location.href\s*=\s*["']([^"']*)['"];)/i.match(body)
- full_match = $1
- j_url = $2
-
- puts "Javascript redirect found #{j_url}" if verbose
-
- re = Regexp.escape(full_match)
- body.gsub!(/#{re}/, "")
-
- if j_url !~ /https?:\/\//i
- parsed = URI.parse(a_url)
- protocol = parsed.scheme
- host = parsed.host
-
- domain = "#{protocol}://#{host}"
-
- j_url = domain + j_url
- j_url += $1 if j_url[0] == "/" && parsed.path =~ /(.*)\/.*/
-
- puts "Relative URL found, adding domain to make #{j_url}" if verbose
- end
-
- x = {a_url => j_url}
- url_stack.push x
- end
-
- # Strip comment tags
- body.gsub!(/<!--/, "")
- body.gsub!(/-->/, "")
-
- # If you want to add more attribute names to include, just add them to this array
- attribute_names = [
- "alt",
- "title",
- ]
-
- attribute_text = ''
-
- attribute_names.each { |attribute_name|
- body.gsub!(/#{attribute_name}="([^"]*)"/) { |attr| attribute_text += "#{$1} " }
- }
-
- if verbose and attribute_text
- puts "Attribute text found:"
- puts attribute_text
- puts
- end
-
- body += " #{attribute_text}"
-
- # Strip html tags
- words = body.gsub(/<\/?[^>]*>/, "")
-
- # Check if this is needed
- words.gsub!(/&[a-z]*;/, "")
-
- begin
- #if file_extension !~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|zip|bz2|css|png|gif|jpg|#)$/
- begin
- if email
- # Split the file down based on the email address regexp
- #words.gsub!(/\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i)
- #p words
-
- # If you want to pull email addresses from the contents of files found, such as word docs then move
- # this block outside the if statement
- # I've put it in here as some docs contain email addresses that have nothing to do with the target
- # so give false positive type results
- words.each_line do |word|
- while /\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i.match(word)
- puts "Found #{$1} on page #{a_url}" if verbose
- email_arr << $1
- word = word.gsub(/#{$1}/, "")
- end
- end
- end
- rescue => e
- puts "\nThere was a problem generating the email list"
- puts "Error: #{e.inspect}"
- puts e.backtrace
- end
-
- if wordlist
- # Remove any symbols
- words.gsub!(/[^[:alpha:]]/i, " ")
-
- # Add to the array
- words.split(" ").each do |word|
- if word.length >= min_word_length
- word_hash[word] = 0 if !word_hash.has_key?(word)
- word_hash[word] += 1
- end
- end
- end
- #end
- rescue => e
- puts "\nThere was a problem handling word generation"
- puts "Error: #{e.inspect}"
- puts e.backtrace
- end
- end
- end
- s.store_next_urls_with url_stack
- end
- rescue Errno::ENOENT
- puts "\nInvalid URL specified (#{url})\n\n"
- exit 2
- rescue => e
- puts "\nCouldn't access the site (#{url})\n"
- puts "Error: #{e.inspect}"
- exit 2
- end
+ begin
+ puts "Starting at #{url}" if verbose
+
+ MySpider.proxy(proxy_host, proxy_port, proxy_username, proxy_password) if proxy_host
+ MySpider.auth_creds(auth_type, auth_user, auth_pass) if auth_type
+ MySpider.headers(headers)
+ MySpider.verbose(verbose)
+ MySpider.debug(debug)
+
+ MySpider.start_at(url) do |s|
+ s.headers['User-Agent'] = ua if ua
+
+ s.add_url_check do |a_url|
+ puts "Checking page #{a_url}" if debug
+ allow = true
+
+ # Extensions to ignore
+				if a_url =~ /(\.zip$|\.gz$|\.bz2$|\.png$|\.gif$|\.jpg$|^#)/
+ puts "Ignoring internal link or graphic: #{a_url}" if verbose
+ allow = false
+ else
+ if /^mailto:(.*)/i.match(a_url)
+ if email
+ email_arr << $1
+ puts "Found #{$1} on page #{a_url}" if verbose
+ end
+ allow = false
+ else
+ if !offsite
+ a_url_parsed = URI.parse(a_url)
+ url_parsed = URI.parse(url)
+ puts "Comparing #{a_url} with #{url}" if debug
+
+							# Make sure the host, port and scheme match (else it's offsite)
+							allow = (a_url_parsed.host == url_parsed.host) && (a_url_parsed.port == url_parsed.port) && (a_url_parsed.scheme == url_parsed.scheme)
+
+ puts "Offsite link, not following: #{a_url}" if !allow && verbose
+ end
+ end
+ end
+ allow
+ end
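A side note on the offsite comparison above: URI.parse fills in the default port
for the scheme, so http://example.com and http://example.com:80 compare as the
same site even though the strings differ.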
+
+ s.on :success do |a_url, resp, prior_url|
+ if verbose
+ if prior_url.nil?
+ puts "Visiting: #{a_url}, got response code #{resp.code}"
+ else
+ puts "Visiting: #{a_url} referred from #{prior_url}, got response code #{resp.code}"
+ end
+ end
+
+ # May want 0-9 in here as well in the future but for now limit it to a-z so
+ # you can't sneak any nasty characters in
+ if /.*\.([a-z]+)(\?.*$|$)/i.match(a_url)
+ file_extension = $1
+ else
+ file_extension = ''
+ end
+
+ # Don't get words from these file types. Most will have been blocked by the url_check function but
+ # some are let through, such as .css, so that they can be checked for email addresses
+
+				# This is a bad way to do it, but the choice is between whitelisting and blacklisting
+				# extensions, and either list is quite long. It may as well blacklist and let extras
+				# through to be weeded out later, rather than block things that could be useful.
+
+ #if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|zip|bz2|css|png|gif|jpg|#)$/
+				if file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2|png|gif|jpg|#)$/
+ if meta
+ begin
+							if keep && file_extension =~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|bz2)$/
+ if /.*\/(.*)$/.match(a_url)
+ output_filename = meta_temp_dir + $1
+ puts "Keeping #{output_filename}" if verbose
+ else
+ # Shouldn't ever get here as the regex above should always be able to pull the filename out of the URL,
+ # ...but just in case
+
+ # Maybe look at doing this to make the temp name
+ # require "tempfile"
+ # Dir::Tmpname.make_tmpname "a", "b"
+ # => "a20150707-8694-hrrxr4-b"
+
+ output_filename = "#{meta_temp_dir}cewl_tmp"
+ output_filename += ".#{file_extension}" unless file_extension.empty?
+ end
+ else
+ output_filename = "#{meta_temp_dir}cewl_tmp"
+ output_filename += ".#{file_extension}" unless file_extension.empty?
+ end
+
+ out = File.new(output_filename, "wb")
+ out.print(resp.body)
+ out.close
+
+ meta_data = process_file(output_filename, verbose)
+ usernames += meta_data if (meta_data != nil)
+ rescue => e
+ puts "\nCouldn't open the meta temp file for writing - #{e.inspect}\n\n"
+ exit 2
+ end
+ end
+ else
+ html = resp.body.to_s.force_encoding("UTF-8")
+ html.encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
+ html.encode!('UTF-8', 'UTF-16')
+
+ dom = Nokogiri.HTML(html)
+ dom.css('script').remove if strip_js
+ dom.css('style').remove if strip_css
+ body = dom.to_s
+
+ # Get meta data
+ if /.*<meta.*description.*content\s*=[\s'"]*(.*)/i.match(body)
+ description = $1
+ body += description.gsub(/[>"\/']*/, "")
+ end
+
+ if /.*<meta.*keywords.*content\s*=[\s'"]*(.*)/i.match(body)
+ keywords = $1
+ body += keywords.gsub(/[>"\/']*/, "")
+ end
+
+ puts body if debug
+
+					# This bit will not normally fire as all JavaScript is stripped out
+					# by the Nokogiri remove call a few lines above.
+ #
+ # The code isn't perfect but will do a rough job of working out
+ # pages from relative location links
+ while /(location.href\s*=\s*["']([^"']*)['"];)/i.match(body)
+ full_match = $1
+ j_url = $2
+
+ puts "Javascript redirect found #{j_url}" if verbose
+
+ re = Regexp.escape(full_match)
+ body.gsub!(/#{re}/, "")
+
+ if j_url !~ /https?:\/\//i
+ parsed = URI.parse(a_url)
+ protocol = parsed.scheme
+ host = parsed.host
+
+ domain = "#{protocol}://#{host}"
+
+ j_url = domain + j_url
+ j_url += $1 if j_url[0] == "/" && parsed.path =~ /(.*)\/.*/
+
+ puts "Relative URL found, adding domain to make #{j_url}" if verbose
+ end
+
+ x = {a_url => j_url}
+ url_stack.push x
+ end
+
+ # Strip comment tags
+ body.gsub!(/<!--/, "")
+ body.gsub!(/-->/, "")
+
+ # If you want to add more attribute names to include, just add them to this array
+ attribute_names = [
+ "alt",
+ "title",
+ ]
+
+ attribute_text = ''
+
+ attribute_names.each { |attribute_name|
+ body.gsub!(/#{attribute_name}="([^"]*)"/) { |attr| attribute_text += "#{$1} " }
+ }
+
+					if verbose && !attribute_text.empty?
+ puts "Attribute text found:"
+ puts attribute_text
+ puts
+ end
+
+ body += " #{attribute_text}"
+
+ # Strip html tags
+ words = body.gsub(/<\/?[^>]*>/, "")
+
+ # Check if this is needed
+ words.gsub!(/&[a-z]*;/, "")
+
+ begin
+ #if file_extension !~ /^((doc|dot|ppt|pot|xls|xlt|pps)[xm]?)|(ppam|xlsb|xlam|pdf|zip|gz|zip|bz2|css|png|gif|jpg|#)$/
+ begin
+ if email
+ # Split the file down based on the email address regexp
+ #words.gsub!(/\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i)
+ #p words
+
+ # If you want to pull email addresses from the contents of files found, such as word docs then move
+ # this block outside the if statement
+ # I've put it in here as some docs contain email addresses that have nothing to do with the target
+ # so give false positive type results
+ words.each_line do |word|
+ while /\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i.match(word)
+ puts "Found #{$1} on page #{a_url}" if verbose
+ email_arr << $1
+ word = word.gsub(/#{$1}/, "")
+ end
+ end
+ end
+ rescue => e
+ puts "\nThere was a problem generating the email list"
+ puts "Error: #{e.inspect}"
+ puts e.backtrace
+ end
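The while/gsub loop above removes each address as it is found so the regex can
match the next one on the same line. The same extraction can be written more
directly with String#scan (minimal standalone sketch):

  text = "contact alice@example.com or bob@example.org"
  emails = text.scan(/\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})\b/i).flatten
  # => ["alice@example.com", "bob@example.org"]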
+
+ if wordlist
+ # Remove any symbols
+ words.gsub!(/[^[:alpha:]]/i, " ")
+
+ # Add to the array
+ words.split(" ").each do |word|
+ if word.length >= min_word_length
+ word_hash[word] = 0 if !word_hash.has_key?(word)
+ word_hash[word] += 1
+ end
+ end
+ end
+ #end
+ rescue => e
+ puts "\nThere was a problem handling word generation"
+ puts "Error: #{e.inspect}"
+ puts e.backtrace
+ end
+ end
+ end
+ s.store_next_urls_with url_stack
+ end
+ rescue Errno::ENOENT
+ puts "\nInvalid URL specified (#{url})\n\n"
+ exit 2
+ rescue => e
+ puts "\nCouldn't access the site (#{url})\n"
+ puts "Error: #{e.inspect}"
+ exit 2
+ end
end
puts "End of main loop" if debug
if wordlist
- if verbose
- if outfile.nil?
- puts "Words found\n"
- else
- puts "Writing words to file\n"
- end
- end
-
- sorted_wordlist = word_hash.sort_by do |word, count|
- -count
- end
-
- sorted_wordlist.each do |word, count|
- if show_count
- outfile_file.puts "#{word}, #{count.to_s}"
- else
- outfile_file.puts word
- end
- end
+ if verbose
+ if outfile.nil?
+ puts "Words found\n"
+ else
+ puts "Writing words to file\n"
+ end
+ end
+
+ sorted_wordlist = word_hash.sort_by do |word, count|
+ -count
+ end
+
+ sorted_wordlist.each do |word, count|
+ if show_count
+ outfile_file.puts "#{word}, #{count.to_s}"
+ else
+ outfile_file.puts word
+ end
+ end
end
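The word counting earlier seeds each key with 0 before incrementing; Hash.new(0)
does the same in one step, and the sort here could break count ties
alphabetically (a sketch of the equivalent idiom, reusing the variables above):

  word_hash = Hash.new(0)
  words.split(" ").each do |word|
    word_hash[word] += 1 if word.length >= min_word_length
  end
  sorted_wordlist = word_hash.sort_by { |word, count| [-count, word] }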
puts "End of wordlist loop" if debug
if email
- if email_arr.length == 0
- puts "No email addresses found" if verbose
- else
- puts "Dumping email addresses to file" if verbose
-
- email_arr.delete_if { |x| x.chomp.empty? }
- email_arr.uniq!
- email_arr.sort!
-
- outfile_file.puts if (wordlist || verbose) && email_outfile.nil?
-
- if email_outfile.nil?
- outfile_file.puts "Email addresses found"
- outfile_file.puts "---------------------"
- outfile_file.puts email_arr.join("\n")
- else
- email_outfile_file.puts email_arr.join("\n")
- end
- end
+ if email_arr.length == 0
+ puts "No email addresses found" if verbose
+ else
+ puts "Dumping email addresses to file" if verbose
+
+ email_arr.delete_if { |x| x.chomp.empty? }
+ email_arr.uniq!
+ email_arr.sort!
+
+ outfile_file.puts if (wordlist || verbose) && email_outfile.nil?
+
+ if email_outfile.nil?
+ outfile_file.puts "Email addresses found"
+ outfile_file.puts "---------------------"
+ outfile_file.puts email_arr.join("\n")
+ else
+ email_outfile_file.puts email_arr.join("\n")
+ end
+ end
end
puts "End of email loop" if debug
if meta
- if usernames.length == 0
- puts "No meta data found" if verbose
- else
- puts "Dumping meta data to file" if verbose
- usernames.delete_if { |x| x.chomp.empty? }
- usernames.uniq!
- usernames.sort!
-
- outfile_file.puts if (email||wordlist) && meta_outfile.nil?
- if meta_outfile.nil?
- outfile_file.puts "Meta data found"
- outfile_file.puts "---------------"
- outfile_file.puts usernames.join("\n")
- else
- meta_outfile_file.puts usernames.join("\n")
- end
- end
+ if usernames.length == 0
+ puts "No meta data found" if verbose
+ else
+ puts "Dumping meta data to file" if verbose
+ usernames.delete_if { |x| x.chomp.empty? }
+ usernames.uniq!
+ usernames.sort!
+
+ outfile_file.puts if (email||wordlist) && meta_outfile.nil?
+ if meta_outfile.nil?
+ outfile_file.puts "Meta data found"
+ outfile_file.puts "---------------"
+ outfile_file.puts usernames.join("\n")
+ else
+ meta_outfile_file.puts usernames.join("\n")
+ end
+ end
end
puts "End of meta loop" if debug
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/forensics/cewl.git