@@ -30,10 +30,13 @@ def get_params(kwargs, key='url_param_'):
3030 names = [x for x in kwargs if x .startswith (key )]
3131 for n in range (len (names )):
3232 name = names [n ]
33+
3334 # Params are split by commas, with index corresponding to list index
3435 paramlist = kwargs .get (name ).split (',' )
36+
3537 # Remove the "url_param"
3638 name = name .replace (key , '' , 1 )
39+
3740 # Update the dictionary of dictionaries
3841 for i in range (len (paramlist )):
3942
@@ -54,27 +57,75 @@ def get_params(kwargs, key='url_param_'):
5457 return params
5558
5659
def parse_success_response(response, kwargs):
    '''parse a successful response of 200, meaning we honor the user
       request to return json, search for a regular expression, or return
       raw text. This is used by the basic GET/POST functions. For parsing
       with beautiful soup, see "get_results" and "get_url_selection"

       Parameters
       ==========
       response: the requests (200) response
       kwargs: dictionary of keyword arguments provided to function. The
               keys read here are "save_as" (defaults to "json") and
               "regex" (optional).

       Returns
       =======
       The decoded json when save_as is "json". Otherwise, if a non-empty
       regex is given, the text of the first match in response.text, or
       None when the pattern does not match. Otherwise the raw
       response.text.
    '''
    save_as = kwargs.get('save_as', 'json')
    regex = kwargs.get('regex')

    # Json has first priority, and is the default when save_as is not set
    if save_as == "json":
        return response.json()

    # As an alternative, search for a regular expression
    if regex not in ("", None):
        match = re.search(regex, response.text)

        # re.search returns None when nothing matches; return None rather
        # than failing with an AttributeError on match.group()
        return match.group() if match is not None else None

    # Otherwise, we return text
    return response.text
88+
89+
def get_headers(kwargs):
    '''Get a single set of headers from the kwargs dict. A user agent is added
       as it is helpful in most cases.

       Parameters
       ==========
       kwargs: the dictionary of keyword arguments that may contain header
               entries (format is header_<name>). A value of None removes
               the header with that name if present, which is how the
               default User-Agent can be dropped.

       Returns
       =======
       A dictionary of header names (the "header_" prefix removed) and
       their values.
    '''
    headers = {"User-Agent": "Mozilla/5.0"}

    for key, value in kwargs.items():

        # Only keys with the header_ prefix describe headers
        if not key.startswith('header_'):
            continue

        name = key.replace('header_', '', 1)

        # If the user wants to remove the User-Agent (or any) header
        if value is None:
            headers.pop(name, None)

        # The header is defined with a value
        else:
            headers[name] = value

    return headers
73114
74115
75- def get_results (url , selector , func = None , attributes = None , params = {}, get_text = False , headers = {}):
76- '''given a url, a function, an optional selector, optional attributes, and a set (dict)
77- of parameters, perform a request.
116+ def get_results (url ,
117+ selector ,
118+ func = None ,
119+ attributes = None ,
120+ params = {},
121+ get_text = False ,
122+ headers = {},
123+ regex = None ):
124+
125+ '''given a url, a function, an optional selector, optional attributes,
126+ and a set (dict) of parameters, perform a request. This function is
127+ used if the calling function needs special parsing of the html with
128+ beautiful soup. If only a post/get is needed, this is not necessary.
78129
79130 Parameters
80131 ==========
@@ -103,6 +154,11 @@ def get_results(url, selector, func=None, attributes=None, params={}, get_text=F
103154 if attributes != None :
104155 [results .append (entry .get (x )) for x in attributes ]
105156
157+ # Second priority for regular expression on text
158+ elif regex not in [None , "" ]:
159+ match = re .search (regex , entry .text )
160+ results .append (match .group ())
161+
106162 # Does the user want to get text?
107163 elif get_text == True :
108164 results .append (entry .text )
0 commit comments