今天整理资料的时候看到了两个用 Python 写的 CMS 识别工具:一个基于 ico 和 robots.txt 进行识别,另一个基于页面和 robots.txt,再加上特殊 URL 的关键字匹配。放假之前就有写一个指纹识别工具的想法,正好有参考,动手!

简述

看了一下两份代码,感觉各有优缺点,索性把两种方法综合起来。两个工具都使用了多线程,正好练习一下,此外还参考了凤凰扫描器的代码写法。
关于web指纹识别技术还有这一篇文章  http://www.freebuf.com/articles/2555.html
后面使用的时候出现了一点小问题,把代码修改了一下

代码

import re
import requests
import threading
import queue
import os
import hashlib

class cmsRecognize(object):
    """Fingerprint the CMS behind a website.

    Two independent checks are run against ``url``:

    * favicon check -- the MD5 of ``/favicon.ico`` is looked up in
      ``ico.txt`` (one ``name#md5`` entry per line);
    * feature check -- every other file in ``filePath`` holds
      ``path------pattern------name`` entries; each path is requested and
      the response body is searched with the regular expression
      ``pattern``.

    All fingerprint files are read from ``filePath`` (``'Bin/'`` by default).
    """

    # Number of matching entries one CMS name needs before it is reported.
    featureThreshold = 3

    def __init__(self, url, threads=50):
        """Store the target and prepare the shared worker state.

        Args:
            url: Base URL of the site; fingerprint paths are appended to it.
            threads: Upper bound on the number of worker threads.
        """
        self.url = url
        self.filePath = 'Bin/'
        self.q = queue.Queue()
        self.threads = threads
        self.isKnown = False   # set once a CMS reaches featureThreshold
        self.knew = 0          # total number of feature entries that matched
        self._hits = {}        # CMS name -> number of matching entries
        self._lock = threading.Lock()  # guards knew, _hits and isKnown

    def _fetch(self, url):
        """Return the response for ``url``, or None on error / non-200."""
        try:
            r = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            # Timeout and MissingSchema are RequestException subclasses, so
            # one handler covers every case the tool reports.
            print(e)
            return None
        return r if r.status_code == 200 else None

    def request(self, url):
        """Return the decoded body of ``url``, or False if it is unavailable."""
        r = self._fetch(url)
        # Compare against None: a Response object has its own truthiness.
        return r.text if r is not None else False

    # Get the MD5 digest of the favicon.
    def getMd5Info(self, path='/favicon.ico'):
        """Return the hex MD5 of the resource at ``path``, or False.

        The digest is computed over the raw response bytes. Hashing the
        decoded text would change the bytes of a binary icon and therefore
        the digest.
        """
        r = self._fetch(self.url + path)
        if r is None or not r.content:
            return False
        return hashlib.md5(r.content).hexdigest()

    def readFile(self, filename):
        """Return the lines of ``filename`` located inside ``filePath``."""
        filename = os.path.join(self.filePath, filename)
        with open(filename, 'r') as f:
            return f.readlines()

    def compareIco(self):
        """Look the favicon digest up in ``ico.txt`` and print the verdict.

        Returns:
            True when an entry matches, False otherwise.
        """
        digest = self.getMd5Info()
        if digest:
            try:
                entries = self.readFile('ico.txt')
            except OSError as e:
                # A missing database must not abort the feature check.
                print(e)
                entries = []
            for line in entries:
                fields = line.strip().split('#')
                # Skip blank or malformed lines instead of raising IndexError.
                if len(fields) >= 2 and digest == fields[1].strip().lower():
                    print('[*]Based on favicon.ico:', fields[0])
                    return True
        print('[-]Based on favicon.ico: Unknown')
        return False

    def getFeature(self):
        """Queue every feature entry found in the fingerprint directory.

        The queue receives individual entries rather than file names, so
        the worker threads share the work per request. The first two lines
        of each file are skipped, as are lines that do not split into at
        least three ``------`` separated fields.
        """
        for name in sorted(os.listdir(self.filePath)):
            if name == 'ico.txt':
                continue
            if not os.path.isfile(os.path.join(self.filePath, name)):
                continue
            for line in self.readFile(name)[2:]:
                fields = line.strip().split('------')
                if len(fields) >= 3:
                    self.q.put(fields)

    def _checkEntry(self, content):
        """Probe one ``[path, pattern, name, ...]`` entry and record a hit."""
        if len(content) < 3:
            return
        path, pattern, name = content[0], content[1], content[2]
        body = self.request(self.url + path)
        if body is False:
            # A failed request must not be searched as the text "False".
            return
        try:
            matched = re.search(pattern, str(body)) is not None
        except re.error as e:
            print(e)
            return
        if not matched:
            return
        with self._lock:
            self.knew += 1
            hits = self._hits.get(name, 0) + 1
            self._hits[name] = hits
            # Hits are counted per CMS name so that matches belonging to
            # different systems cannot add up to a false positive.
            if hits >= self.featureThreshold and not self.isKnown:
                self.isKnown = True
                print('[*]Based on feature:', name)

    def compareFeature(self):
        """Worker loop: consume queued entries until the queue is empty.

        Every entry is acknowledged with ``task_done`` so that ``q.join()``
        in ``run`` can return. Once a CMS has been identified the remaining
        entries are drained without issuing further requests.
        """
        while True:
            try:
                # Non-blocking get: another worker may have emptied the
                # queue after an emptiness check.
                content = self.q.get_nowait()
            except queue.Empty:
                return
            try:
                if not self.isKnown:
                    self._checkEntry(content)
            finally:
                self.q.task_done()

    def run(self):
        """Run the favicon check, then the threaded feature check."""
        self.compareIco()

        self.getFeature()
        # Never start more workers than there are entries to process.
        workers = min(max(self.threads, 1), self.q.qsize())
        for _ in range(workers):
            t = threading.Thread(target=self.compareFeature)
            t.daemon = True
            t.start()

        self.q.join()

        if not self.isKnown:
            print('[-]Based on feature: Unknown')

if __name__ == '__main__':
    # Script entry point: fingerprint the demo site.
    cmsRecognize('http://www.seacms.net').run()

特征文件

一、特征文件(Bin/ 目录下除 ico.txt 以外的文件;代码按 ------ 切分每一行,并跳过每个文件的前两行范例)
#范例:链接——关键字——CMS别称
#范例:链接——正则表达式——匹配关键字——CMS别称
/dede/——dedecms——DedeCMS(织梦)
/data/admin/allowurl.txt——dedecms——DedeCMS(织梦)
/data/index.html——dedecms——DedeCMS(织梦)
……

二、ico.txt(每行格式:CMS名称#favicon.ico 的 MD5 值)
Discuz#528a222ab4cfa5f4ee19aac95a56c0ca
Phpcms#18fb0c67f6a7e5c7ad62fc37c5ab7637
dedecms#7ef1f0a0093460fe46bb691578c07c95
……

再扯两句。。

基于ico的识别成功率应该不是太高,现在的网站基本会换一个ico。。。说了半天ico到底是什么呢?
ico.png
看到左边的小标志了没~