Check the validity of an unlimited number of self-generated URLs and, if valid (HTTP 200), save the response

Published 2024-10-08 20:18:45


I want to check the validity of an unlimited number of self-generated URLs and, if a URL is valid, save the response body to a file. The URLs look like this: https://mydomain.com/ plus a random string (e.g. https://mydomain.com/ake3t). I want to generate them from the alphabet "abcdefghijklmnopqrstuvwxyz0123456789" and brute-force all possible combinations.

I wrote a script in Python, but since I am an absolute beginner it is very slow! Because I need something very fast, I also tried a different tool, since I thought it was built for exactly this kind of job.

The problem now is that I don't know how to generate the URLs dynamically; I cannot pre-generate them, because there is no fixed number of URLs.

Can anyone tell me how to do this, or recommend another tool or library better suited to this job?
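For the dynamic-generation part, a lazy generator may be all that is needed: it yields one candidate string at a time, so nothing has to be pre-generated and there is no fixed upper bound. A minimal sketch, assuming the alphabet from the question (the function name candidate_strings is my own):

import itertools

ALPHABET = 'abcdefghijklmnopqrstuvwxyz0123456789'

def candidate_strings():
    # yield every string over ALPHABET, shortest first: a, b, ..., 9, aa, ab, ...
    for length in itertools.count(1):       # 1, 2, 3, ... with no upper bound
        for chars in itertools.product(ALPHABET, repeat=length):
            yield ''.join(chars)

# nothing is pre-computed; items are produced on demand, e.g. the first five:
for part in itertools.islice(candidate_strings(), 5):
    print('https://mydomain.com/' + part)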

UPDATE: This is the script I used, but I find it very slow. What worries me most is that it becomes slower when I use more than one thread (specified in threadsNr).

import threading, os
import urllib.request, urllib.parse, urllib.error 

threadsNr  = 1
dumpFolder = '/tmp/urls/'
charSet    = 'abcdefghijklmnopqrstuvwxyz0123456789_-'
Url_pre    = 'http://vorratsraum.com/'
Url_post   = 'alwaysTheSameTail'

# class that generates the words
class wordGenerator:

    def __init__(self, word, charSet):
        self.currentWord = word
        self.charSet = charSet

    # generate the next word, set it as currentWord, and return it
    def nextWord (self):
        self.currentWord = self._incWord(self.currentWord)
        return self.currentWord

    # generate the next word
    def _incWord(self, word):
        word = str(word)                        # convert to string

        if word == '':                          # if word is empty 
            return self.charSet[0]              # return first char from the char set
        wordLastChar = word[len(word)-1]        # get the last char
        wordLeftSide = word[0:len(word)-1]      # get word without the last char
        lastCharPos  = self.charSet.find(wordLastChar)  # get position of last char in the char set

        if (lastCharPos+1) < len(self.charSet):         # if position of last char is not at the end of the char set
            wordLastChar = self.charSet[lastCharPos+1]  # get next char from the char set

        else:                                           # it is the last char
            wordLastChar = self.charSet[0]              # reset last char to have first character from the char set
            wordLeftSide = self._incWord(wordLeftSide)  # send left site to be increased

        return wordLeftSide + wordLastChar      # return the next word


class newThread(threading.Thread):
    def run(self):
        global exitThread
        global wordsTried
        global newWord
        global hashList

        while exitThread == False:
            part = newWord.nextWord()                # generate the next word to try
            url = Url_pre + part + Url_post

            wordsTried = wordsTried + 1
            if wordsTried == 1000: # just for testing how fast it is
                exitThread = True
            print( 'trying ' + part)          # display the word
            print( 'At URL ' + url)

            try:
                req = urllib.request.Request(url)
                req.add_header('User-agent', 'Mozilla/5.0')
                resp = urllib.request.urlopen(req)
                result = resp.read()
                found(part, result)
            except urllib.error.HTTPError as err:
                if err.code == 404:
                    print('Page not found!')
                elif err.code == 403:
                    print('Access denied!')
                else:
                    print('Something happened! Error code', err.code)
            except urllib.error.URLError as err:
                print('Some other error happened:', err.reason)
        resultFile.close()   # note: with threadsNr > 1, the first thread to finish closes the shared file for all of them

def found(part, result):
    global exitThread
    global resultFile

    resultFile.write(part +"\n")

    if not os.path.isdir(dumpFolder + part):
        os.makedirs(dumpFolder + part)

    print('Found Part = '  + part)

wordsTried = 0                            
exitThread = False                              # flag to kill all threads
newWord = wordGenerator('', charSet)            # word generator

if not os.path.isdir(dumpFolder):
    os.makedirs(dumpFolder)

resultFile = open(dumpFolder + 'parts.txt','a')      # open file for append    

for i in range(threadsNr):
    newThread().start()
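A note on why more threads can make this slower: wordsTried, exitThread, newWord and resultFile are shared by all threads without any locking, nextWord() is not atomic, so threads can race on the generator state and duplicate or skip words, and resultFile is closed by whichever thread finishes first. A minimal sketch of one way to restructure it, assuming concurrent.futures from the standard library and a lock around the shared word stream (the helper names words, next_part and check are my own; the word order matches the wordGenerator class above):

import itertools
import threading
import urllib.request, urllib.error
from concurrent.futures import ThreadPoolExecutor

charSet  = 'abcdefghijklmnopqrstuvwxyz0123456789_-'
Url_pre  = 'http://vorratsraum.com/'
Url_post = 'alwaysTheSameTail'

def words():
    # lazy stream of words: a, b, ..., -, aa, ab, ... (same order as wordGenerator)
    for length in itertools.count(1):
        for chars in itertools.product(charSet, repeat=length):
            yield ''.join(chars)

word_iter = words()
lock = threading.Lock()

def next_part():
    # hand out words one at a time; without the lock, threads would race on the
    # shared iterator and could crash it or skip words
    with lock:
        return next(word_iter)

def check(_):
    part = next_part()
    url = Url_pre + part + Url_post
    try:
        with urllib.request.urlopen(url) as resp:
            return part, resp.read()
    except urllib.error.URLError:
        return None

with ThreadPoolExecutor(max_workers=8) as pool:
    for hit in pool.map(check, range(1000)):    # 1000 attempts, as in the test above
        if hit is not None:
            print('Found part =', hit[0])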

2 Answers

Do you want brute force or random strings? Below is a sequential brute-force approach with repeated characters allowed. How fast it runs will depend heavily on your server's response times. Also note that this is very likely to produce a denial-of-service condition rather quickly.

import itertools
import urllib.request, urllib.error

pageChars = 5
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789_-"

# Iterate over the product of the alphabet with <pageChars> elements.
# This assumes repeating characters are allowed.
# Beware: this generates len(alphabet)**pageChars possible strings.
for chars in itertools.product(alphabet, repeat=pageChars):
    pageString = ''.join(chars)

    urlString = 'https://mydomain.com/' + pageString

    try:
        response = urllib.request.urlopen(urlString)
    except urllib.error.HTTPError:
        print('No page at: %s' % urlString)
        continue

    pageData = response.read()
    # do something with the page data
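On the denial-of-service warning: the simplest mitigation is to throttle the loop, for example with a fixed delay after each request. A sketch under that assumption (the 0.1 second value is arbitrary and should be tuned for the target server):

import itertools
import time
import urllib.request, urllib.error

alphabet = "abcdefghijklmnopqrstuvwxyz0123456789_-"

for chars in itertools.product(alphabet, repeat=5):
    urlString = 'https://mydomain.com/' + ''.join(chars)
    try:
        with urllib.request.urlopen(urlString) as response:
            pageData = response.read()    # do something with the page data
    except urllib.error.HTTPError:
        pass                              # no page at this URL
    time.sleep(0.1)                       # throttle: at most ~10 requests per second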

You cannot check an "unlimited number of URLs" without it being "very slow", beginner or not.

The time your scraper takes will almost certainly be dominated by the response time of the server you are accessing, not by the efficiency of your script.

What exactly are you trying to do?
