realtime.patch 1.0 MB

(patch body omitted — 14,308 lines)
430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
070820709207102071120712207132071420715207162071720718207192072020721207222072320724207252072620727207282072920730207312073220733207342073520736207372073820739207402074120742207432074420745207462074720748207492075020751207522075320754207552075620757207582075920760207612076220763207642076520766207672076820769207702077120772207732077420775207762077720778207792078020781207822078320784207852078620787207882078920790207912079220793207942079520796207972079820799208002080120802208032080420805208062080720808208092081020811208122081320814208152081620817208182081920820208212082220823208242082520826208272082820829208302083120832208332083420835208362083720838208392084020841208422084320844208452084620847208482084920850208512085220853208542085520856208572085820859208602086120862208632086420865208662086720868208692087020871208722087320874208752087620877208782087920880208812088220883208842088520886208872088820889208902089120892208932089420895208962089720898208992090020901209022090320904209052090620907209082090920910209112091220913209142091520916209172091820919209202092120922209232092420925209262092720928209292093020931209322093320934209352093620937209382093920940209412094220943209442094520946209472094820949209502095120952209532095420955209562095720958209592096020961209622096320964209652096620967209682096920970209712097220973209742097520976209772097820979209802098120982209832098420985209862098720988209892099020991209922099320994209952099620997209982099921000210012100221003210042100521006210072100821009210102101121012210132101421015210162101721018210192102021021210222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182
141921420214212142221423214242142521426214272142821429214302143121432214332143421435214362143721438214392144021441214422144321444214452144621447214482144921450214512145221453214542145521456214572145821459214602146121462214632146421465214662146721468214692147021471214722147321474214752147621477214782147921480214812148221483214842148521486214872148821489214902149121492214932149421495214962149721498214992150021501215022150321504215052150621507215082150921510215112151221513215142151521516215172151821519215202152121522215232152421525215262152721528215292153021531215322153321534215352153621537215382153921540215412154221543215442154521546215472154821549215502155121552215532155421555215562155721558215592156021561215622156321564215652156621567215682156921570215712157221573215742157521576215772157821579215802158121582215832158421585215862158721588215892159021591215922159321594215952159621597215982159921600216012160221603216042160521606216072160821609216102161121612216132161421615216162161721618216192162021621216222162321624216252162621627216282162921630216312163221633216342163521636216372163821639216402164121642216432164421645216462164721648216492165021651216522165321654216552165621657216582165921660216612166221663216642166521666216672166821669216702167121672216732167421675216762167721678216792168021681216822168321684216852168621687216882168921690216912169221693216942169521696216972169821699217002170121702217032170421705217062170721708217092171021711217122171321714217152171621717217182171921720217212172221723217242172521726217272172821729217302173121732217332173421735217362173721738217392174021741217422174321744217452174621747217482174921750217512175221753217542175521756217572175821759217602176121762217632176421765217662176721768217692177021771217722177321774217752177621777217782177921780217812178221783217842178521786217872178821789217902179121792217932179421795217962179721798217992180021801218022180321804218052180621807218082180921810218112181221813218142181521816218172181821819218202182121822218232182421825218262182721828218292183021831218322183321834218352183621837218382183921840218412184221843218442184521846218472184821849218502185121852218532185421855218562185721858218592186021861218622186321864218652186621867218682186921870218712187221873218742187521876218772187821879218802188121882218832188421885218862188721888218892189021891218922189321894218952189621897218982189921900219012190221903219042190521906219072190821909219102191121912219132191421915219162191721918219192192021921219222192321924219252192621927219282192921930219312193221933219342193521936219372193821939219402194121942219432194421945219462194721948219492195021951219522195321954219552195621957219582195921960219612196221963219642196521966219672196821969219702197121972219732197421975219762197721978219792198021981219822198321984219852198621987219882198921990219912199221993219942199521996219972199821999220002200122002220032200422005220062200722008220092201022011220122201322014220152201622017220182201922020220212202222023220242202522026220272202822029220302203122032220332203422035220362203722038220392204022041220422204322044220452204622047220482204922050220512205222053220542205522056220572205822059220602206122062220632206422065220662206722068220692207022071220722207322074220752207622077220782207922080220812208222083220842208522086220872208822089220902209122092220932209422095220962209722098220992210022101221022210322104221052210622107221082210922110221112211222113221142211522116221172211822119221202212122122221232212422125221262212722128221292
213022131221322213322134221352213622137221382213922140221412214222143221442214522146221472214822149221502215122152221532215422155221562215722158221592216022161221622216322164221652216622167221682216922170221712217222173221742217522176221772217822179221802218122182221832218422185221862218722188221892219022191221922219322194221952219622197221982219922200222012220222203222042220522206222072220822209222102221122212222132221422215222162221722218222192222022221222222222322224222252222622227222282222922230222312223222233222342223522236222372223822239222402224122242222432224422245222462224722248222492225022251222522225322254222552225622257222582225922260222612226222263222642226522266222672226822269222702227122272222732227422275222762227722278222792228022281222822228322284222852228622287222882228922290222912229222293222942229522296222972229822299223002230122302223032230422305223062230722308223092231022311223122231322314223152231622317223182231922320223212232222323223242232522326223272232822329223302233122332223332233422335223362233722338223392234022341223422234322344223452234622347223482234922350223512235222353223542235522356223572235822359223602236122362223632236422365223662236722368223692237022371223722237322374223752237622377223782237922380223812238222383223842238522386223872238822389223902239122392223932239422395223962239722398223992240022401224022240322404224052240622407224082240922410224112241222413224142241522416224172241822419224202242122422224232242422425224262242722428224292243022431224322243322434224352243622437224382243922440224412244222443224442244522446224472244822449224502245122452224532245422455224562245722458224592246022461224622246322464224652246622467224682246922470224712247222473224742247522476224772247822479224802248122482224832248422485224862248722488224892249022491224922249322494224952249622497224982249922500225012250222503225042250522506225072250822509225102251122512225132251422515225162251722518225192252022521225222252322524225252252622527225282252922530225312253222533225342253522536225372253822539225402254122542225432254422545225462254722548225492255022551225522255322554225552255622557225582255922560225612256222563225642256522566225672256822569225702257122572225732257422575225762257722578225792258022581225822258322584225852258622587225882258922590225912259222593225942259522596225972259822599226002260122602226032260422605226062260722608226092261022611226122261322614226152261622617226182261922620226212262222623226242262522626226272262822629226302263122632226332263422635226362263722638226392264022641226422264322644226452264622647226482264922650226512265222653226542265522656226572265822659226602266122662226632266422665226662266722668226692267022671226722267322674226752267622677226782267922680226812268222683226842268522686226872268822689226902269122692226932269422695226962269722698226992270022701227022270322704227052270622707227082270922710227112271222713227142271522716227172271822719227202272122722227232272422725227262272722728227292273022731227322273322734227352273622737227382273922740227412274222743227442274522746227472274822749227502275122752227532275422755227562275722758227592276022761227622276322764227652276622767227682276922770227712277222773227742277522776227772277822779227802278122782227832278422785227862278722788227892279022791227922279322794227952279622797227982279922800228012280222803228042280522806228072280822809228102281122812228132281422815228162281722818228192282022821228222282322824228252282622827228282282922830228312283222833228342283522836228372283822839228402
284122842228432284422845228462284722848228492285022851228522285322854228552285622857228582285922860228612286222863228642286522866228672286822869228702287122872228732287422875228762287722878228792288022881228822288322884228852288622887228882288922890228912289222893228942289522896228972289822899229002290122902229032290422905229062290722908229092291022911229122291322914229152291622917229182291922920229212292222923229242292522926229272292822929229302293122932229332293422935229362293722938229392294022941229422294322944229452294622947229482294922950229512295222953229542295522956229572295822959229602296122962229632296422965229662296722968229692297022971229722297322974229752297622977229782297922980229812298222983229842298522986229872298822989229902299122992229932299422995229962299722998229992300023001230022300323004230052300623007230082300923010230112301223013230142301523016230172301823019230202302123022230232302423025230262302723028230292303023031230322303323034230352303623037230382303923040230412304223043230442304523046230472304823049230502305123052230532305423055230562305723058230592306023061230622306323064230652306623067230682306923070230712307223073230742307523076230772307823079230802308123082230832308423085230862308723088230892309023091230922309323094230952309623097230982309923100231012310223103231042310523106231072310823109231102311123112231132311423115231162311723118231192312023121231222312323124231252312623127231282312923130231312313223133231342313523136231372313823139231402314123142231432314423145231462314723148231492315023151231522315323154231552315623157231582315923160231612316223163231642316523166231672316823169231702317123172231732317423175231762317723178231792318023181231822318323184231852318623187231882318923190231912319223193231942319523196231972319823199232002320123202232032320423205232062320723208232092321023211232122321323214232152321623217232182321923220232212322223223232242322523226232272322823229232302323123232232332323423235232362323723238232392324023241232422324323244232452324623247232482324923250232512325223253232542325523256232572325823259232602326123262232632326423265232662326723268232692327023271232722327323274232752327623277232782327923280232812328223283232842328523286232872328823289232902329123292232932329423295232962329723298232992330023301233022330323304233052330623307233082330923310233112331223313233142331523316233172331823319233202332123322233232332423325233262332723328233292333023331233322333323334233352333623337233382333923340233412334223343233442334523346233472334823349233502335123352233532335423355233562335723358233592336023361233622336323364233652336623367233682336923370233712337223373233742337523376233772337823379233802338123382233832338423385233862338723388233892339023391233922339323394233952339623397233982339923400234012340223403234042340523406234072340823409234102341123412234132341423415234162341723418234192342023421234222342323424234252342623427234282342923430234312343223433234342343523436234372343823439234402344123442234432344423445234462344723448234492345023451234522345323454234552345623457234582345923460234612346223463234642346523466234672346823469234702347123472234732347423475234762347723478234792348023481234822348323484234852348623487234882348923490234912349223493234942349523496234972349823499235002350123502235032350423505235062350723508235092351023511235122351323514235152351623517235182351923520235212352223523235242352523526235272352823529235302353123532235332353423535235362353723538235392354023541235422354323544235452354623547235482354923550235512
355223553235542355523556235572355823559235602356123562235632356423565235662356723568235692357023571235722357323574235752357623577235782357923580235812358223583235842358523586235872358823589235902359123592235932359423595235962359723598235992360023601236022360323604236052360623607236082360923610236112361223613236142361523616236172361823619236202362123622236232362423625236262362723628236292363023631236322363323634236352363623637236382363923640236412364223643236442364523646236472364823649236502365123652236532365423655236562365723658236592366023661236622366323664236652366623667236682366923670236712367223673236742367523676236772367823679236802368123682236832368423685236862368723688236892369023691236922369323694236952369623697236982369923700237012370223703237042370523706237072370823709237102371123712237132371423715237162371723718237192372023721237222372323724237252372623727237282372923730237312373223733237342373523736237372373823739237402374123742237432374423745237462374723748237492375023751237522375323754237552375623757237582375923760237612376223763237642376523766237672376823769237702377123772237732377423775237762377723778237792378023781237822378323784237852378623787237882378923790237912379223793237942379523796237972379823799238002380123802238032380423805238062380723808238092381023811238122381323814238152381623817238182381923820238212382223823238242382523826238272382823829238302383123832238332383423835238362383723838238392384023841238422384323844238452384623847238482384923850238512385223853238542385523856238572385823859238602386123862238632386423865238662386723868238692387023871238722387323874238752387623877238782387923880238812388223883238842388523886238872388823889238902389123892238932389423895238962389723898238992390023901239022390323904239052390623907239082390923910239112391223913239142391523916239172391823919239202392123922239232392423925239262392723928239292393023931239322393323934239352393623937239382393923940239412394223943239442394523946239472394823949239502395123952239532395423955239562395723958239592396023961239622396323964239652396623967239682396923970239712397223973239742397523976239772397823979239802398123982239832398423985239862398723988239892399023991239922399323994239952399623997239982399924000240012400224003240042400524006240072400824009240102401124012240132401424015240162401724018240192402024021240222402324024240252402624027240282402924030240312403224033240342403524036240372403824039240402404124042240432404424045240462404724048240492405024051240522405324054240552405624057240582405924060240612406224063240642406524066240672406824069240702407124072240732407424075240762407724078240792408024081240822408324084240852408624087240882408924090240912409224093240942409524096240972409824099241002410124102241032410424105241062410724108241092411024111241122411324114241152411624117241182411924120241212412224123241242412524126241272412824129241302413124132241332413424135241362413724138241392414024141241422414324144241452414624147241482414924150241512415224153241542415524156241572415824159241602416124162241632416424165241662416724168241692417024171241722417324174241752417624177241782417924180241812418224183241842418524186241872418824189241902419124192241932419424195241962419724198241992420024201242022420324204242052420624207242082420924210242112421224213242142421524216242172421824219242202422124222242232422424225242262422724228242292423024231242322423324234242352423624237242382423924240242412424224243242442424524246242472424824249242502425124252242532425424255242562425724258242592426024261242622
  1. diff -Nur linux-3.18.14.orig/arch/alpha/mm/fault.c linux-3.18.14-rt/arch/alpha/mm/fault.c
  2. --- linux-3.18.14.orig/arch/alpha/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  3. +++ linux-3.18.14-rt/arch/alpha/mm/fault.c 2015-05-31 15:32:45.517635394 -0500
  4. @@ -107,7 +107,7 @@
  5. /* If we're in an interrupt context, or have no user context,
  6. we must not take the fault. */
  7. - if (!mm || in_atomic())
  8. + if (!mm || pagefault_disabled())
  9. goto no_context;
  10. #ifdef CONFIG_ALPHA_LARGE_VMALLOC
  11. diff -Nur linux-3.18.14.orig/arch/arm/include/asm/cmpxchg.h linux-3.18.14-rt/arch/arm/include/asm/cmpxchg.h
  12. --- linux-3.18.14.orig/arch/arm/include/asm/cmpxchg.h 2015-05-20 10:04:50.000000000 -0500
  13. +++ linux-3.18.14-rt/arch/arm/include/asm/cmpxchg.h 2015-05-31 15:32:45.557635393 -0500
  14. @@ -129,6 +129,8 @@
  15. #else /* min ARCH >= ARMv6 */
  16. +#define __HAVE_ARCH_CMPXCHG 1
  17. +
  18. extern void __bad_cmpxchg(volatile void *ptr, int size);
  19. /*
  20. diff -Nur linux-3.18.14.orig/arch/arm/include/asm/futex.h linux-3.18.14-rt/arch/arm/include/asm/futex.h
  21. --- linux-3.18.14.orig/arch/arm/include/asm/futex.h 2015-05-20 10:04:50.000000000 -0500
  22. +++ linux-3.18.14-rt/arch/arm/include/asm/futex.h 2015-05-31 15:32:45.561635393 -0500
  23. @@ -93,6 +93,8 @@
  24. if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
  25. return -EFAULT;
  26. + preempt_disable_rt();
  27. +
  28. __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
  29. "1: " TUSER(ldr) " %1, [%4]\n"
  30. " teq %1, %2\n"
  31. @@ -104,6 +106,8 @@
  32. : "cc", "memory");
  33. *uval = val;
  34. +
  35. + preempt_enable_rt();
  36. return ret;
  37. }
  38. diff -Nur linux-3.18.14.orig/arch/arm/include/asm/switch_to.h linux-3.18.14-rt/arch/arm/include/asm/switch_to.h
  39. --- linux-3.18.14.orig/arch/arm/include/asm/switch_to.h 2015-05-20 10:04:50.000000000 -0500
  40. +++ linux-3.18.14-rt/arch/arm/include/asm/switch_to.h 2015-05-31 15:32:45.565635393 -0500
  41. @@ -3,6 +3,13 @@
  42. #include <linux/thread_info.h>
  43. +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
  44. +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
  45. +#else
  46. +static inline void
  47. +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
  48. +#endif
  49. +
  50. /*
  51. * For v7 SMP cores running a preemptible kernel we may be pre-empted
  52. * during a TLB maintenance operation, so execute an inner-shareable dsb
  53. @@ -22,6 +29,7 @@
  54. #define switch_to(prev,next,last) \
  55. do { \
  56. + switch_kmaps(prev, next); \
  57. last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
  58. } while (0)
  59. diff -Nur linux-3.18.14.orig/arch/arm/include/asm/thread_info.h linux-3.18.14-rt/arch/arm/include/asm/thread_info.h
  60. --- linux-3.18.14.orig/arch/arm/include/asm/thread_info.h 2015-05-20 10:04:50.000000000 -0500
  61. +++ linux-3.18.14-rt/arch/arm/include/asm/thread_info.h 2015-05-31 15:32:45.585635393 -0500
  62. @@ -51,6 +51,7 @@
  63. struct thread_info {
  64. unsigned long flags; /* low level flags */
  65. int preempt_count; /* 0 => preemptable, <0 => bug */
  66. + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
  67. mm_segment_t addr_limit; /* address limit */
  68. struct task_struct *task; /* main task structure */
  69. struct exec_domain *exec_domain; /* execution domain */
  70. @@ -149,6 +150,7 @@
  71. #define TIF_SIGPENDING 0
  72. #define TIF_NEED_RESCHED 1
  73. #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
  74. +#define TIF_NEED_RESCHED_LAZY 3
  75. #define TIF_UPROBE 7
  76. #define TIF_SYSCALL_TRACE 8
  77. #define TIF_SYSCALL_AUDIT 9
  78. @@ -162,6 +164,7 @@
  79. #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
  80. #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
  81. #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
  82. +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
  83. #define _TIF_UPROBE (1 << TIF_UPROBE)
  84. #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
  85. #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
  86. diff -Nur linux-3.18.14.orig/arch/arm/Kconfig linux-3.18.14-rt/arch/arm/Kconfig
  87. --- linux-3.18.14.orig/arch/arm/Kconfig 2015-05-20 10:04:50.000000000 -0500
  88. +++ linux-3.18.14-rt/arch/arm/Kconfig 2015-05-31 15:32:45.529635394 -0500
  89. @@ -62,6 +62,7 @@
  90. select HAVE_PERF_EVENTS
  91. select HAVE_PERF_REGS
  92. select HAVE_PERF_USER_STACK_DUMP
  93. + select HAVE_PREEMPT_LAZY
  94. select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
  95. select HAVE_REGS_AND_STACK_ACCESS_API
  96. select HAVE_SYSCALL_TRACEPOINTS
  97. diff -Nur linux-3.18.14.orig/arch/arm/kernel/asm-offsets.c linux-3.18.14-rt/arch/arm/kernel/asm-offsets.c
  98. --- linux-3.18.14.orig/arch/arm/kernel/asm-offsets.c 2015-05-20 10:04:50.000000000 -0500
  99. +++ linux-3.18.14-rt/arch/arm/kernel/asm-offsets.c 2015-05-31 15:32:45.605635393 -0500
  100. @@ -64,6 +64,7 @@
  101. BLANK();
  102. DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
  103. DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
  104. + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
  105. DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
  106. DEFINE(TI_TASK, offsetof(struct thread_info, task));
  107. DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain));
  108. diff -Nur linux-3.18.14.orig/arch/arm/kernel/entry-armv.S linux-3.18.14-rt/arch/arm/kernel/entry-armv.S
  109. --- linux-3.18.14.orig/arch/arm/kernel/entry-armv.S 2015-05-20 10:04:50.000000000 -0500
  110. +++ linux-3.18.14-rt/arch/arm/kernel/entry-armv.S 2015-05-31 15:32:45.613635393 -0500
  111. @@ -207,11 +207,18 @@
  112. #ifdef CONFIG_PREEMPT
  113. get_thread_info tsk
  114. ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
  115. - ldr r0, [tsk, #TI_FLAGS] @ get flags
  116. teq r8, #0 @ if preempt count != 0
  117. + bne 1f @ return from exeption
  118. + ldr r0, [tsk, #TI_FLAGS] @ get flags
  119. + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
  120. + blne svc_preempt @ preempt!
  121. +
  122. + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
  123. + teq r8, #0 @ if preempt lazy count != 0
  124. movne r0, #0 @ force flags to 0
  125. - tst r0, #_TIF_NEED_RESCHED
  126. + tst r0, #_TIF_NEED_RESCHED_LAZY
  127. blne svc_preempt
  128. +1:
  129. #endif
  130. svc_exit r5, irq = 1 @ return from exception
  131. @@ -226,6 +233,8 @@
  132. 1: bl preempt_schedule_irq @ irq en/disable is done inside
  133. ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
  134. tst r0, #_TIF_NEED_RESCHED
  135. + bne 1b
  136. + tst r0, #_TIF_NEED_RESCHED_LAZY
  137. reteq r8 @ go again
  138. b 1b
  139. #endif
  140. diff -Nur linux-3.18.14.orig/arch/arm/kernel/process.c linux-3.18.14-rt/arch/arm/kernel/process.c
  141. --- linux-3.18.14.orig/arch/arm/kernel/process.c 2015-05-20 10:04:50.000000000 -0500
  142. +++ linux-3.18.14-rt/arch/arm/kernel/process.c 2015-05-31 15:32:45.617635393 -0500
  143. @@ -437,6 +437,30 @@
  144. }
  145. #ifdef CONFIG_MMU
  146. +/*
  147. + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
  148. + * initialized by pgtable_page_ctor() then a coredump of the vector page will
  149. + * fail.
  150. + */
  151. +static int __init vectors_user_mapping_init_page(void)
  152. +{
  153. + struct page *page;
  154. + unsigned long addr = 0xffff0000;
  155. + pgd_t *pgd;
  156. + pud_t *pud;
  157. + pmd_t *pmd;
  158. +
  159. + pgd = pgd_offset_k(addr);
  160. + pud = pud_offset(pgd, addr);
  161. + pmd = pmd_offset(pud, addr);
  162. + page = pmd_page(*(pmd));
  163. +
  164. + pgtable_page_ctor(page);
  165. +
  166. + return 0;
  167. +}
  168. +late_initcall(vectors_user_mapping_init_page);
  169. +
  170. #ifdef CONFIG_KUSER_HELPERS
  171. /*
  172. * The vectors page is always readable from user space for the
  173. diff -Nur linux-3.18.14.orig/arch/arm/kernel/process.c.orig linux-3.18.14-rt/arch/arm/kernel/process.c.orig
  174. --- linux-3.18.14.orig/arch/arm/kernel/process.c.orig 1969-12-31 18:00:00.000000000 -0600
  175. +++ linux-3.18.14-rt/arch/arm/kernel/process.c.orig 2015-05-20 10:04:50.000000000 -0500
  176. @@ -0,0 +1,560 @@
  177. +/*
  178. + * linux/arch/arm/kernel/process.c
  179. + *
  180. + * Copyright (C) 1996-2000 Russell King - Converted to ARM.
  181. + * Original Copyright (C) 1995 Linus Torvalds
  182. + *
  183. + * This program is free software; you can redistribute it and/or modify
  184. + * it under the terms of the GNU General Public License version 2 as
  185. + * published by the Free Software Foundation.
  186. + */
  187. +#include <stdarg.h>
  188. +
  189. +#include <linux/export.h>
  190. +#include <linux/sched.h>
  191. +#include <linux/kernel.h>
  192. +#include <linux/mm.h>
  193. +#include <linux/stddef.h>
  194. +#include <linux/unistd.h>
  195. +#include <linux/user.h>
  196. +#include <linux/delay.h>
  197. +#include <linux/reboot.h>
  198. +#include <linux/interrupt.h>
  199. +#include <linux/kallsyms.h>
  200. +#include <linux/init.h>
  201. +#include <linux/cpu.h>
  202. +#include <linux/elfcore.h>
  203. +#include <linux/pm.h>
  204. +#include <linux/tick.h>
  205. +#include <linux/utsname.h>
  206. +#include <linux/uaccess.h>
  207. +#include <linux/random.h>
  208. +#include <linux/hw_breakpoint.h>
  209. +#include <linux/leds.h>
  210. +#include <linux/reboot.h>
  211. +
  212. +#include <asm/cacheflush.h>
  213. +#include <asm/idmap.h>
  214. +#include <asm/processor.h>
  215. +#include <asm/thread_notify.h>
  216. +#include <asm/stacktrace.h>
  217. +#include <asm/system_misc.h>
  218. +#include <asm/mach/time.h>
  219. +#include <asm/tls.h>
  220. +#include "reboot.h"
  221. +
  222. +#ifdef CONFIG_CC_STACKPROTECTOR
  223. +#include <linux/stackprotector.h>
  224. +unsigned long __stack_chk_guard __read_mostly;
  225. +EXPORT_SYMBOL(__stack_chk_guard);
  226. +#endif
  227. +
  228. +static const char *processor_modes[] __maybe_unused = {
  229. + "USER_26", "FIQ_26" , "IRQ_26" , "SVC_26" , "UK4_26" , "UK5_26" , "UK6_26" , "UK7_26" ,
  230. + "UK8_26" , "UK9_26" , "UK10_26", "UK11_26", "UK12_26", "UK13_26", "UK14_26", "UK15_26",
  231. + "USER_32", "FIQ_32" , "IRQ_32" , "SVC_32" , "UK4_32" , "UK5_32" , "UK6_32" , "ABT_32" ,
  232. + "UK8_32" , "UK9_32" , "UK10_32", "UND_32" , "UK12_32", "UK13_32", "UK14_32", "SYS_32"
  233. +};
  234. +
  235. +static const char *isa_modes[] __maybe_unused = {
  236. + "ARM" , "Thumb" , "Jazelle", "ThumbEE"
  237. +};
  238. +
  239. +extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
  240. +typedef void (*phys_reset_t)(unsigned long);
  241. +
  242. +/*
  243. + * A temporary stack to use for CPU reset. This is static so that we
  244. + * don't clobber it with the identity mapping. When running with this
  245. + * stack, any references to the current task *will not work* so you
  246. + * should really do as little as possible before jumping to your reset
  247. + * code.
  248. + */
  249. +static u64 soft_restart_stack[16];
  250. +
  251. +static void __soft_restart(void *addr)
  252. +{
  253. + phys_reset_t phys_reset;
  254. +
  255. + /* Take out a flat memory mapping. */
  256. + setup_mm_for_reboot();
  257. +
  258. + /* Clean and invalidate caches */
  259. + flush_cache_all();
  260. +
  261. + /* Turn off caching */
  262. + cpu_proc_fin();
  263. +
  264. + /* Push out any further dirty data, and ensure cache is empty */
  265. + flush_cache_all();
  266. +
  267. + /* Switch to the identity mapping. */
  268. + phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
  269. + phys_reset((unsigned long)addr);
  270. +
  271. + /* Should never get here. */
  272. + BUG();
  273. +}
  274. +
  275. +void _soft_restart(unsigned long addr, bool disable_l2)
  276. +{
  277. + u64 *stack = soft_restart_stack + ARRAY_SIZE(soft_restart_stack);
  278. +
  279. + /* Disable interrupts first */
  280. + raw_local_irq_disable();
  281. + local_fiq_disable();
  282. +
  283. + /* Disable the L2 if we're the last man standing. */
  284. + if (disable_l2)
  285. + outer_disable();
  286. +
  287. + /* Change to the new stack and continue with the reset. */
  288. + call_with_stack(__soft_restart, (void *)addr, (void *)stack);
  289. +
  290. + /* Should never get here. */
  291. + BUG();
  292. +}
  293. +
  294. +void soft_restart(unsigned long addr)
  295. +{
  296. + _soft_restart(addr, num_online_cpus() == 1);
  297. +}
  298. +
  299. +/*
  300. + * Function pointers to optional machine specific functions
  301. + */
  302. +void (*pm_power_off)(void);
  303. +EXPORT_SYMBOL(pm_power_off);
  304. +
  305. +void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
  306. +
  307. +/*
  308. + * This is our default idle handler.
  309. + */
  310. +
  311. +void (*arm_pm_idle)(void);
  312. +
  313. +/*
  314. + * Called from the core idle loop.
  315. + */
  316. +
  317. +void arch_cpu_idle(void)
  318. +{
  319. + if (arm_pm_idle)
  320. + arm_pm_idle();
  321. + else
  322. + cpu_do_idle();
  323. + local_irq_enable();
  324. +}
  325. +
  326. +void arch_cpu_idle_prepare(void)
  327. +{
  328. + local_fiq_enable();
  329. +}
  330. +
  331. +void arch_cpu_idle_enter(void)
  332. +{
  333. + ledtrig_cpu(CPU_LED_IDLE_START);
  334. +#ifdef CONFIG_PL310_ERRATA_769419
  335. + wmb();
  336. +#endif
  337. +}
  338. +
  339. +void arch_cpu_idle_exit(void)
  340. +{
  341. + ledtrig_cpu(CPU_LED_IDLE_END);
  342. +}
  343. +
  344. +#ifdef CONFIG_HOTPLUG_CPU
  345. +void arch_cpu_idle_dead(void)
  346. +{
  347. + cpu_die();
  348. +}
  349. +#endif
  350. +
  351. +/*
  352. + * Called by kexec, immediately prior to machine_kexec().
  353. + *
  354. + * This must completely disable all secondary CPUs; simply causing those CPUs
  355. + * to execute e.g. a RAM-based pin loop is not sufficient. This allows the
  356. + * kexec'd kernel to use any and all RAM as it sees fit, without having to
  357. + * avoid any code or data used by any SW CPU pin loop. The CPU hotplug
  358. + * functionality embodied in disable_nonboot_cpus() to achieve this.
  359. + */
  360. +void machine_shutdown(void)
  361. +{
  362. + disable_nonboot_cpus();
  363. +}
  364. +
  365. +/*
  366. + * Halting simply requires that the secondary CPUs stop performing any
  367. + * activity (executing tasks, handling interrupts). smp_send_stop()
  368. + * achieves this.
  369. + */
  370. +void machine_halt(void)
  371. +{
  372. + local_irq_disable();
  373. + smp_send_stop();
  374. +
  375. + local_irq_disable();
  376. + while (1);
  377. +}
  378. +
  379. +/*
  380. + * Power-off simply requires that the secondary CPUs stop performing any
  381. + * activity (executing tasks, handling interrupts). smp_send_stop()
  382. + * achieves this. When the system power is turned off, it will take all CPUs
  383. + * with it.
  384. + */
  385. +void machine_power_off(void)
  386. +{
  387. + local_irq_disable();
  388. + smp_send_stop();
  389. +
  390. + if (pm_power_off)
  391. + pm_power_off();
  392. +}
  393. +
  394. +/*
  395. + * Restart requires that the secondary CPUs stop performing any activity
  396. + * while the primary CPU resets the system. Systems with a single CPU can
  397. + * use soft_restart() as their machine descriptor's .restart hook, since that
  398. + * will cause the only available CPU to reset. Systems with multiple CPUs must
  399. + * provide a HW restart implementation, to ensure that all CPUs reset at once.
  400. + * This is required so that any code running after reset on the primary CPU
  401. + * doesn't have to co-ordinate with other CPUs to ensure they aren't still
  402. + * executing pre-reset code, and using RAM that the primary CPU's code wishes
  403. + * to use. Implementing such co-ordination would be essentially impossible.
  404. + */
  405. +void machine_restart(char *cmd)
  406. +{
  407. + local_irq_disable();
  408. + smp_send_stop();
  409. +
  410. + if (arm_pm_restart)
  411. + arm_pm_restart(reboot_mode, cmd);
  412. + else
  413. + do_kernel_restart(cmd);
  414. +
  415. + /* Give a grace period for failure to restart of 1s */
  416. + mdelay(1000);
  417. +
  418. + /* Whoops - the platform was unable to reboot. Tell the user! */
  419. + printk("Reboot failed -- System halted\n");
  420. + local_irq_disable();
  421. + while (1);
  422. +}
  423. +
  424. +void __show_regs(struct pt_regs *regs)
  425. +{
  426. + unsigned long flags;
  427. + char buf[64];
  428. +
  429. + show_regs_print_info(KERN_DEFAULT);
  430. +
  431. + print_symbol("PC is at %s\n", instruction_pointer(regs));
  432. + print_symbol("LR is at %s\n", regs->ARM_lr);
  433. + printk("pc : [<%08lx>] lr : [<%08lx>] psr: %08lx\n"
  434. + "sp : %08lx ip : %08lx fp : %08lx\n",
  435. + regs->ARM_pc, regs->ARM_lr, regs->ARM_cpsr,
  436. + regs->ARM_sp, regs->ARM_ip, regs->ARM_fp);
  437. + printk("r10: %08lx r9 : %08lx r8 : %08lx\n",
  438. + regs->ARM_r10, regs->ARM_r9,
  439. + regs->ARM_r8);
  440. + printk("r7 : %08lx r6 : %08lx r5 : %08lx r4 : %08lx\n",
  441. + regs->ARM_r7, regs->ARM_r6,
  442. + regs->ARM_r5, regs->ARM_r4);
  443. + printk("r3 : %08lx r2 : %08lx r1 : %08lx r0 : %08lx\n",
  444. + regs->ARM_r3, regs->ARM_r2,
  445. + regs->ARM_r1, regs->ARM_r0);
  446. +
  447. + flags = regs->ARM_cpsr;
  448. + buf[0] = flags & PSR_N_BIT ? 'N' : 'n';
  449. + buf[1] = flags & PSR_Z_BIT ? 'Z' : 'z';
  450. + buf[2] = flags & PSR_C_BIT ? 'C' : 'c';
  451. + buf[3] = flags & PSR_V_BIT ? 'V' : 'v';
  452. + buf[4] = '\0';
  453. +
  454. +#ifndef CONFIG_CPU_V7M
  455. + printk("Flags: %s IRQs o%s FIQs o%s Mode %s ISA %s Segment %s\n",
  456. + buf, interrupts_enabled(regs) ? "n" : "ff",
  457. + fast_interrupts_enabled(regs) ? "n" : "ff",
  458. + processor_modes[processor_mode(regs)],
  459. + isa_modes[isa_mode(regs)],
  460. + get_fs() == get_ds() ? "kernel" : "user");
  461. +#else
  462. + printk("xPSR: %08lx\n", regs->ARM_cpsr);
  463. +#endif
  464. +
  465. +#ifdef CONFIG_CPU_CP15
  466. + {
  467. + unsigned int ctrl;
  468. +
  469. + buf[0] = '\0';
  470. +#ifdef CONFIG_CPU_CP15_MMU
  471. + {
  472. + unsigned int transbase, dac;
  473. + asm("mrc p15, 0, %0, c2, c0\n\t"
  474. + "mrc p15, 0, %1, c3, c0\n"
  475. + : "=r" (transbase), "=r" (dac));
  476. + snprintf(buf, sizeof(buf), " Table: %08x DAC: %08x",
  477. + transbase, dac);
  478. + }
  479. +#endif
  480. + asm("mrc p15, 0, %0, c1, c0\n" : "=r" (ctrl));
  481. +
  482. + printk("Control: %08x%s\n", ctrl, buf);
  483. + }
  484. +#endif
  485. +}
  486. +
  487. +void show_regs(struct pt_regs * regs)
  488. +{
  489. + __show_regs(regs);
  490. + dump_stack();
  491. +}
  492. +
  493. +ATOMIC_NOTIFIER_HEAD(thread_notify_head);
  494. +
  495. +EXPORT_SYMBOL_GPL(thread_notify_head);
  496. +
  497. +/*
  498. + * Free current thread data structures etc..
  499. + */
  500. +void exit_thread(void)
  501. +{
  502. + thread_notify(THREAD_NOTIFY_EXIT, current_thread_info());
  503. +}
  504. +
  505. +void flush_thread(void)
  506. +{
  507. + struct thread_info *thread = current_thread_info();
  508. + struct task_struct *tsk = current;
  509. +
  510. + flush_ptrace_hw_breakpoint(tsk);
  511. +
  512. + memset(thread->used_cp, 0, sizeof(thread->used_cp));
  513. + memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
  514. + memset(&thread->fpstate, 0, sizeof(union fp_state));
  515. +
  516. + flush_tls();
  517. +
  518. + thread_notify(THREAD_NOTIFY_FLUSH, thread);
  519. +}
  520. +
  521. +void release_thread(struct task_struct *dead_task)
  522. +{
  523. +}
  524. +
  525. +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
  526. +
  527. +int
  528. +copy_thread(unsigned long clone_flags, unsigned long stack_start,
  529. + unsigned long stk_sz, struct task_struct *p)
  530. +{
  531. + struct thread_info *thread = task_thread_info(p);
  532. + struct pt_regs *childregs = task_pt_regs(p);
  533. +
  534. + memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
  535. +
  536. + if (likely(!(p->flags & PF_KTHREAD))) {
  537. + *childregs = *current_pt_regs();
  538. + childregs->ARM_r0 = 0;
  539. + if (stack_start)
  540. + childregs->ARM_sp = stack_start;
  541. + } else {
  542. + memset(childregs, 0, sizeof(struct pt_regs));
  543. + thread->cpu_context.r4 = stk_sz;
  544. + thread->cpu_context.r5 = stack_start;
  545. + childregs->ARM_cpsr = SVC_MODE;
  546. + }
  547. + thread->cpu_context.pc = (unsigned long)ret_from_fork;
  548. + thread->cpu_context.sp = (unsigned long)childregs;
  549. +
  550. + clear_ptrace_hw_breakpoint(p);
  551. +
  552. + if (clone_flags & CLONE_SETTLS)
  553. + thread->tp_value[0] = childregs->ARM_r3;
  554. + thread->tp_value[1] = get_tpuser();
  555. +
  556. + thread_notify(THREAD_NOTIFY_COPY, thread);
  557. +
  558. + return 0;
  559. +}
  560. +
  561. +/*
  562. + * Fill in the task's elfregs structure for a core dump.
  563. + */
  564. +int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs)
  565. +{
  566. + elf_core_copy_regs(elfregs, task_pt_regs(t));
  567. + return 1;
  568. +}
  569. +
  570. +/*
  571. + * fill in the fpe structure for a core dump...
  572. + */
  573. +int dump_fpu (struct pt_regs *regs, struct user_fp *fp)
  574. +{
  575. + struct thread_info *thread = current_thread_info();
  576. + int used_math = thread->used_cp[1] | thread->used_cp[2];
  577. +
  578. + if (used_math)
  579. + memcpy(fp, &thread->fpstate.soft, sizeof (*fp));
  580. +
  581. + return used_math != 0;
  582. +}
  583. +EXPORT_SYMBOL(dump_fpu);
  584. +
  585. +unsigned long get_wchan(struct task_struct *p)
  586. +{
  587. + struct stackframe frame;
  588. + unsigned long stack_page;
  589. + int count = 0;
  590. + if (!p || p == current || p->state == TASK_RUNNING)
  591. + return 0;
  592. +
  593. + frame.fp = thread_saved_fp(p);
  594. + frame.sp = thread_saved_sp(p);
  595. + frame.lr = 0; /* recovered from the stack */
  596. + frame.pc = thread_saved_pc(p);
  597. + stack_page = (unsigned long)task_stack_page(p);
  598. + do {
  599. + if (frame.sp < stack_page ||
  600. + frame.sp >= stack_page + THREAD_SIZE ||
  601. + unwind_frame(&frame) < 0)
  602. + return 0;
  603. + if (!in_sched_functions(frame.pc))
  604. + return frame.pc;
  605. + } while (count ++ < 16);
  606. + return 0;
  607. +}
  608. +
  609. +unsigned long arch_randomize_brk(struct mm_struct *mm)
  610. +{
  611. + unsigned long range_end = mm->brk + 0x02000000;
  612. + return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
  613. +}
  614. +
  615. +#ifdef CONFIG_MMU
  616. +#ifdef CONFIG_KUSER_HELPERS
  617. +/*
  618. + * The vectors page is always readable from user space for the
  619. + * atomic helpers. Insert it into the gate_vma so that it is visible
  620. + * through ptrace and /proc/<pid>/mem.
  621. + */
  622. +static struct vm_area_struct gate_vma = {
  623. + .vm_start = 0xffff0000,
  624. + .vm_end = 0xffff0000 + PAGE_SIZE,
  625. + .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC,
  626. +};
  627. +
  628. +static int __init gate_vma_init(void)
  629. +{
  630. + gate_vma.vm_page_prot = PAGE_READONLY_EXEC;
  631. + return 0;
  632. +}
  633. +arch_initcall(gate_vma_init);
  634. +
  635. +struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
  636. +{
  637. + return &gate_vma;
  638. +}
  639. +
  640. +int in_gate_area(struct mm_struct *mm, unsigned long addr)
  641. +{
  642. + return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
  643. +}
  644. +
  645. +int in_gate_area_no_mm(unsigned long addr)
  646. +{
  647. + return in_gate_area(NULL, addr);
  648. +}
  649. +#define is_gate_vma(vma) ((vma) == &gate_vma)
  650. +#else
  651. +#define is_gate_vma(vma) 0
  652. +#endif
  653. +
  654. +const char *arch_vma_name(struct vm_area_struct *vma)
  655. +{
  656. + return is_gate_vma(vma) ? "[vectors]" : NULL;
  657. +}
  658. +
  659. +/* If possible, provide a placement hint at a random offset from the
  660. + * stack for the signal page.
  661. + */
  662. +static unsigned long sigpage_addr(const struct mm_struct *mm,
  663. + unsigned int npages)
  664. +{
  665. + unsigned long offset;
  666. + unsigned long first;
  667. + unsigned long last;
  668. + unsigned long addr;
  669. + unsigned int slots;
  670. +
  671. + first = PAGE_ALIGN(mm->start_stack);
  672. +
  673. + last = TASK_SIZE - (npages << PAGE_SHIFT);
  674. +
  675. + /* No room after stack? */
  676. + if (first > last)
  677. + return 0;
  678. +
  679. + /* Just enough room? */
  680. + if (first == last)
  681. + return first;
  682. +
  683. + slots = ((last - first) >> PAGE_SHIFT) + 1;
  684. +
  685. + offset = get_random_int() % slots;
  686. +
  687. + addr = first + (offset << PAGE_SHIFT);
  688. +
  689. + return addr;
  690. +}
  691. +
  692. +static struct page *signal_page;
  693. +extern struct page *get_signal_page(void);
  694. +
  695. +static const struct vm_special_mapping sigpage_mapping = {
  696. + .name = "[sigpage]",
  697. + .pages = &signal_page,
  698. +};
  699. +
  700. +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
  701. +{
  702. + struct mm_struct *mm = current->mm;
  703. + struct vm_area_struct *vma;
  704. + unsigned long addr;
  705. + unsigned long hint;
  706. + int ret = 0;
  707. +
  708. + if (!signal_page)
  709. + signal_page = get_signal_page();
  710. + if (!signal_page)
  711. + return -ENOMEM;
  712. +
  713. + down_write(&mm->mmap_sem);
  714. + hint = sigpage_addr(mm, 1);
  715. + addr = get_unmapped_area(NULL, hint, PAGE_SIZE, 0, 0);
  716. + if (IS_ERR_VALUE(addr)) {
  717. + ret = addr;
  718. + goto up_fail;
  719. + }
  720. +
  721. + vma = _install_special_mapping(mm, addr, PAGE_SIZE,
  722. + VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
  723. + &sigpage_mapping);
  724. +
  725. + if (IS_ERR(vma)) {
  726. + ret = PTR_ERR(vma);
  727. + goto up_fail;
  728. + }
  729. +
  730. + mm->context.sigpage = addr;
  731. +
  732. + up_fail:
  733. + up_write(&mm->mmap_sem);
  734. + return ret;
  735. +}
  736. +#endif
  737. diff -Nur linux-3.18.14.orig/arch/arm/kernel/signal.c linux-3.18.14-rt/arch/arm/kernel/signal.c
  738. --- linux-3.18.14.orig/arch/arm/kernel/signal.c 2015-05-20 10:04:50.000000000 -0500
  739. +++ linux-3.18.14-rt/arch/arm/kernel/signal.c 2015-05-31 15:32:45.617635393 -0500
  740. @@ -574,7 +574,8 @@
  741. do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
  742. {
  743. do {
  744. - if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  745. + if (likely(thread_flags & (_TIF_NEED_RESCHED |
  746. + _TIF_NEED_RESCHED_LAZY))) {
  747. schedule();
  748. } else {
  749. if (unlikely(!user_mode(regs)))
  750. diff -Nur linux-3.18.14.orig/arch/arm/kernel/unwind.c linux-3.18.14-rt/arch/arm/kernel/unwind.c
  751. --- linux-3.18.14.orig/arch/arm/kernel/unwind.c 2015-05-20 10:04:50.000000000 -0500
  752. +++ linux-3.18.14-rt/arch/arm/kernel/unwind.c 2015-05-31 15:32:45.653635392 -0500
  753. @@ -93,7 +93,7 @@
  754. static const struct unwind_idx *__origin_unwind_idx;
  755. extern const struct unwind_idx __stop_unwind_idx[];
  756. -static DEFINE_SPINLOCK(unwind_lock);
  757. +static DEFINE_RAW_SPINLOCK(unwind_lock);
  758. static LIST_HEAD(unwind_tables);
  759. /* Convert a prel31 symbol to an absolute address */
  760. @@ -201,7 +201,7 @@
  761. /* module unwind tables */
  762. struct unwind_table *table;
  763. - spin_lock_irqsave(&unwind_lock, flags);
  764. + raw_spin_lock_irqsave(&unwind_lock, flags);
  765. list_for_each_entry(table, &unwind_tables, list) {
  766. if (addr >= table->begin_addr &&
  767. addr < table->end_addr) {
  768. @@ -213,7 +213,7 @@
  769. break;
  770. }
  771. }
  772. - spin_unlock_irqrestore(&unwind_lock, flags);
  773. + raw_spin_unlock_irqrestore(&unwind_lock, flags);
  774. }
  775. pr_debug("%s: idx = %p\n", __func__, idx);
  776. @@ -530,9 +530,9 @@
  777. tab->begin_addr = text_addr;
  778. tab->end_addr = text_addr + text_size;
  779. - spin_lock_irqsave(&unwind_lock, flags);
  780. + raw_spin_lock_irqsave(&unwind_lock, flags);
  781. list_add_tail(&tab->list, &unwind_tables);
  782. - spin_unlock_irqrestore(&unwind_lock, flags);
  783. + raw_spin_unlock_irqrestore(&unwind_lock, flags);
  784. return tab;
  785. }
  786. @@ -544,9 +544,9 @@
  787. if (!tab)
  788. return;
  789. - spin_lock_irqsave(&unwind_lock, flags);
  790. + raw_spin_lock_irqsave(&unwind_lock, flags);
  791. list_del(&tab->list);
  792. - spin_unlock_irqrestore(&unwind_lock, flags);
  793. + raw_spin_unlock_irqrestore(&unwind_lock, flags);
  794. kfree(tab);
  795. }
  796. diff -Nur linux-3.18.14.orig/arch/arm/kvm/arm.c linux-3.18.14-rt/arch/arm/kvm/arm.c
  797. --- linux-3.18.14.orig/arch/arm/kvm/arm.c 2015-05-20 10:04:50.000000000 -0500
  798. +++ linux-3.18.14-rt/arch/arm/kvm/arm.c 2015-05-31 15:32:45.669635392 -0500
  799. @@ -455,9 +455,9 @@
  800. static void vcpu_pause(struct kvm_vcpu *vcpu)
  801. {
  802. - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
  803. + struct swait_head *wq = kvm_arch_vcpu_wq(vcpu);
  804. - wait_event_interruptible(*wq, !vcpu->arch.pause);
  805. + swait_event_interruptible(*wq, !vcpu->arch.pause);
  806. }
  807. static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
  808. diff -Nur linux-3.18.14.orig/arch/arm/kvm/arm.c.orig linux-3.18.14-rt/arch/arm/kvm/arm.c.orig
  809. --- linux-3.18.14.orig/arch/arm/kvm/arm.c.orig 1969-12-31 18:00:00.000000000 -0600
  810. +++ linux-3.18.14-rt/arch/arm/kvm/arm.c.orig 2015-05-20 10:04:50.000000000 -0500
  811. @@ -0,0 +1,1060 @@
  812. +/*
  813. + * Copyright (C) 2012 - Virtual Open Systems and Columbia University
  814. + * Author: Christoffer Dall <c.dall@virtualopensystems.com>
  815. + *
  816. + * This program is free software; you can redistribute it and/or modify
  817. + * it under the terms of the GNU General Public License, version 2, as
  818. + * published by the Free Software Foundation.
  819. + *
  820. + * This program is distributed in the hope that it will be useful,
  821. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  822. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  823. + * GNU General Public License for more details.
  824. + *
  825. + * You should have received a copy of the GNU General Public License
  826. + * along with this program; if not, write to the Free Software
  827. + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  828. + */
  829. +
  830. +#include <linux/cpu.h>
  831. +#include <linux/cpu_pm.h>
  832. +#include <linux/errno.h>
  833. +#include <linux/err.h>
  834. +#include <linux/kvm_host.h>
  835. +#include <linux/module.h>
  836. +#include <linux/vmalloc.h>
  837. +#include <linux/fs.h>
  838. +#include <linux/mman.h>
  839. +#include <linux/sched.h>
  840. +#include <linux/kvm.h>
  841. +#include <trace/events/kvm.h>
  842. +
  843. +#define CREATE_TRACE_POINTS
  844. +#include "trace.h"
  845. +
  846. +#include <asm/uaccess.h>
  847. +#include <asm/ptrace.h>
  848. +#include <asm/mman.h>
  849. +#include <asm/tlbflush.h>
  850. +#include <asm/cacheflush.h>
  851. +#include <asm/virt.h>
  852. +#include <asm/kvm_arm.h>
  853. +#include <asm/kvm_asm.h>
  854. +#include <asm/kvm_mmu.h>
  855. +#include <asm/kvm_emulate.h>
  856. +#include <asm/kvm_coproc.h>
  857. +#include <asm/kvm_psci.h>
  858. +
  859. +#ifdef REQUIRES_VIRT
  860. +__asm__(".arch_extension virt");
  861. +#endif
  862. +
  863. +static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
  864. +static kvm_cpu_context_t __percpu *kvm_host_cpu_state;
  865. +static unsigned long hyp_default_vectors;
  866. +
  867. +/* Per-CPU variable containing the currently running vcpu. */
  868. +static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
  869. +
  870. +/* The VMID used in the VTTBR */
  871. +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
  872. +static u8 kvm_next_vmid;
  873. +static DEFINE_SPINLOCK(kvm_vmid_lock);
  874. +
  875. +static bool vgic_present;
  876. +
  877. +static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
  878. +{
  879. + BUG_ON(preemptible());
  880. + __this_cpu_write(kvm_arm_running_vcpu, vcpu);
  881. +}
  882. +
  883. +/**
  884. + * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU.
  885. + * Must be called from non-preemptible context
  886. + */
  887. +struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
  888. +{
  889. + BUG_ON(preemptible());
  890. + return __this_cpu_read(kvm_arm_running_vcpu);
  891. +}
  892. +
  893. +/**
  894. + * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus.
  895. + */
  896. +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
  897. +{
  898. + return &kvm_arm_running_vcpu;
  899. +}
  900. +
  901. +int kvm_arch_hardware_enable(void)
  902. +{
  903. + return 0;
  904. +}
  905. +
  906. +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
  907. +{
  908. + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
  909. +}
  910. +
  911. +int kvm_arch_hardware_setup(void)
  912. +{
  913. + return 0;
  914. +}
  915. +
  916. +void kvm_arch_check_processor_compat(void *rtn)
  917. +{
  918. + *(int *)rtn = 0;
  919. +}
  920. +
  921. +
  922. +/**
  923. + * kvm_arch_init_vm - initializes a VM data structure
  924. + * @kvm: pointer to the KVM struct
  925. + */
  926. +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
  927. +{
  928. + int ret = 0;
  929. +
  930. + if (type)
  931. + return -EINVAL;
  932. +
  933. + ret = kvm_alloc_stage2_pgd(kvm);
  934. + if (ret)
  935. + goto out_fail_alloc;
  936. +
  937. + ret = create_hyp_mappings(kvm, kvm + 1);
  938. + if (ret)
  939. + goto out_free_stage2_pgd;
  940. +
  941. + kvm_timer_init(kvm);
  942. +
  943. + /* Mark the initial VMID generation invalid */
  944. + kvm->arch.vmid_gen = 0;
  945. +
  946. + return ret;
  947. +out_free_stage2_pgd:
  948. + kvm_free_stage2_pgd(kvm);
  949. +out_fail_alloc:
  950. + return ret;
  951. +}
  952. +
  953. +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
  954. +{
  955. + return VM_FAULT_SIGBUS;
  956. +}
  957. +
  958. +
  959. +/**
  960. + * kvm_arch_destroy_vm - destroy the VM data structure
  961. + * @kvm: pointer to the KVM struct
  962. + */
  963. +void kvm_arch_destroy_vm(struct kvm *kvm)
  964. +{
  965. + int i;
  966. +
  967. + kvm_free_stage2_pgd(kvm);
  968. +
  969. + for (i = 0; i < KVM_MAX_VCPUS; ++i) {
  970. + if (kvm->vcpus[i]) {
  971. + kvm_arch_vcpu_free(kvm->vcpus[i]);
  972. + kvm->vcpus[i] = NULL;
  973. + }
  974. + }
  975. +
  976. + kvm_vgic_destroy(kvm);
  977. +}
  978. +
  979. +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
  980. +{
  981. + int r;
  982. + switch (ext) {
  983. + case KVM_CAP_IRQCHIP:
  984. + r = vgic_present;
  985. + break;
  986. + case KVM_CAP_DEVICE_CTRL:
  987. + case KVM_CAP_USER_MEMORY:
  988. + case KVM_CAP_SYNC_MMU:
  989. + case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
  990. + case KVM_CAP_ONE_REG:
  991. + case KVM_CAP_ARM_PSCI:
  992. + case KVM_CAP_ARM_PSCI_0_2:
  993. + case KVM_CAP_READONLY_MEM:
  994. + r = 1;
  995. + break;
  996. + case KVM_CAP_COALESCED_MMIO:
  997. + r = KVM_COALESCED_MMIO_PAGE_OFFSET;
  998. + break;
  999. + case KVM_CAP_ARM_SET_DEVICE_ADDR:
  1000. + r = 1;
  1001. + break;
  1002. + case KVM_CAP_NR_VCPUS:
  1003. + r = num_online_cpus();
  1004. + break;
  1005. + case KVM_CAP_MAX_VCPUS:
  1006. + r = KVM_MAX_VCPUS;
  1007. + break;
  1008. + default:
  1009. + r = kvm_arch_dev_ioctl_check_extension(ext);
  1010. + break;
  1011. + }
  1012. + return r;
  1013. +}
  1014. +
  1015. +long kvm_arch_dev_ioctl(struct file *filp,
  1016. + unsigned int ioctl, unsigned long arg)
  1017. +{
  1018. + return -EINVAL;
  1019. +}
  1020. +
  1021. +
  1022. +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
  1023. +{
  1024. + int err;
  1025. + struct kvm_vcpu *vcpu;
  1026. +
  1027. + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) {
  1028. + err = -EBUSY;
  1029. + goto out;
  1030. + }
  1031. +
  1032. + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
  1033. + if (!vcpu) {
  1034. + err = -ENOMEM;
  1035. + goto out;
  1036. + }
  1037. +
  1038. + err = kvm_vcpu_init(vcpu, kvm, id);
  1039. + if (err)
  1040. + goto free_vcpu;
  1041. +
  1042. + err = create_hyp_mappings(vcpu, vcpu + 1);
  1043. + if (err)
  1044. + goto vcpu_uninit;
  1045. +
  1046. + return vcpu;
  1047. +vcpu_uninit:
  1048. + kvm_vcpu_uninit(vcpu);
  1049. +free_vcpu:
  1050. + kmem_cache_free(kvm_vcpu_cache, vcpu);
  1051. +out:
  1052. + return ERR_PTR(err);
  1053. +}
  1054. +
  1055. +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
  1056. +{
  1057. + return 0;
  1058. +}
  1059. +
  1060. +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
  1061. +{
  1062. + kvm_mmu_free_memory_caches(vcpu);
  1063. + kvm_timer_vcpu_terminate(vcpu);
  1064. + kvm_vgic_vcpu_destroy(vcpu);
  1065. + kmem_cache_free(kvm_vcpu_cache, vcpu);
  1066. +}
  1067. +
  1068. +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
  1069. +{
  1070. + kvm_arch_vcpu_free(vcpu);
  1071. +}
  1072. +
  1073. +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
  1074. +{
  1075. + return 0;
  1076. +}
  1077. +
  1078. +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
  1079. +{
  1080. + /* Force users to call KVM_ARM_VCPU_INIT */
  1081. + vcpu->arch.target = -1;
  1082. +
  1083. + /* Set up the timer */
  1084. + kvm_timer_vcpu_init(vcpu);
  1085. +
  1086. + return 0;
  1087. +}
  1088. +
  1089. +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
  1090. +{
  1091. + vcpu->cpu = cpu;
  1092. + vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
  1093. +
  1094. + /*
  1095. + * Check whether this vcpu requires the cache to be flushed on
  1096. + * this physical CPU. This is a consequence of doing dcache
  1097. + * operations by set/way on this vcpu. We do it here to be in
  1098. + * a non-preemptible section.
  1099. + */
  1100. + if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
  1101. + flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
  1102. +
  1103. + kvm_arm_set_running_vcpu(vcpu);
  1104. +}
  1105. +
  1106. +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
  1107. +{
  1108. + /*
  1109. + * The arch-generic KVM code expects the cpu field of a vcpu to be -1
  1110. + * if the vcpu is no longer assigned to a cpu. This is used for the
  1111. + * optimized make_all_cpus_request path.
  1112. + */
  1113. + vcpu->cpu = -1;
  1114. +
  1115. + kvm_arm_set_running_vcpu(NULL);
  1116. +}
  1117. +
  1118. +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  1119. + struct kvm_guest_debug *dbg)
  1120. +{
  1121. + return -EINVAL;
  1122. +}
  1123. +
  1124. +
  1125. +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
  1126. + struct kvm_mp_state *mp_state)
  1127. +{
  1128. + return -EINVAL;
  1129. +}
  1130. +
  1131. +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
  1132. + struct kvm_mp_state *mp_state)
  1133. +{
  1134. + return -EINVAL;
  1135. +}
  1136. +
  1137. +/**
  1138. + * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
  1139. + * @v: The VCPU pointer
  1140. + *
  1141. + * If the guest CPU is not waiting for interrupts or an interrupt line is
  1142. + * asserted, the CPU is by definition runnable.
  1143. + */
  1144. +int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
  1145. +{
  1146. + return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v);
  1147. +}
  1148. +
  1149. +/* Just ensure a guest exit from a particular CPU */
  1150. +static void exit_vm_noop(void *info)
  1151. +{
  1152. +}
  1153. +
  1154. +void force_vm_exit(const cpumask_t *mask)
  1155. +{
  1156. + smp_call_function_many(mask, exit_vm_noop, NULL, true);
  1157. +}
  1158. +
  1159. +/**
  1160. + * need_new_vmid_gen - check that the VMID is still valid
  1161. + * @kvm: The VM's VMID to checkt
  1162. + *
  1163. + * return true if there is a new generation of VMIDs being used
  1164. + *
  1165. + * The hardware supports only 256 values with the value zero reserved for the
  1166. + * host, so we check if an assigned value belongs to a previous generation,
  1167. + * which which requires us to assign a new value. If we're the first to use a
  1168. + * VMID for the new generation, we must flush necessary caches and TLBs on all
  1169. + * CPUs.
  1170. + */
  1171. +static bool need_new_vmid_gen(struct kvm *kvm)
  1172. +{
  1173. + return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
  1174. +}
  1175. +
  1176. +/**
  1177. + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
  1178. + * @kvm The guest that we are about to run
  1179. + *
  1180. + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
  1181. + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
  1182. + * caches and TLBs.
  1183. + */
  1184. +static void update_vttbr(struct kvm *kvm)
  1185. +{
  1186. + phys_addr_t pgd_phys;
  1187. + u64 vmid;
  1188. +
  1189. + if (!need_new_vmid_gen(kvm))
  1190. + return;
  1191. +
  1192. + spin_lock(&kvm_vmid_lock);
  1193. +
  1194. + /*
  1195. + * We need to re-check the vmid_gen here to ensure that if another vcpu
  1196. + * already allocated a valid vmid for this vm, then this vcpu should
  1197. + * use the same vmid.
  1198. + */
  1199. + if (!need_new_vmid_gen(kvm)) {
  1200. + spin_unlock(&kvm_vmid_lock);
  1201. + return;
  1202. + }
  1203. +
  1204. + /* First user of a new VMID generation? */
  1205. + if (unlikely(kvm_next_vmid == 0)) {
  1206. + atomic64_inc(&kvm_vmid_gen);
  1207. + kvm_next_vmid = 1;
  1208. +
  1209. + /*
  1210. + * On SMP we know no other CPUs can use this CPU's or each
  1211. + * other's VMID after force_vm_exit returns since the
  1212. + * kvm_vmid_lock blocks them from reentry to the guest.
  1213. + */
  1214. + force_vm_exit(cpu_all_mask);
  1215. + /*
  1216. + * Now broadcast TLB + ICACHE invalidation over the inner
  1217. + * shareable domain to make sure all data structures are
  1218. + * clean.
  1219. + */
  1220. + kvm_call_hyp(__kvm_flush_vm_context);
  1221. + }
  1222. +
  1223. + kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
  1224. + kvm->arch.vmid = kvm_next_vmid;
  1225. + kvm_next_vmid++;
  1226. +
  1227. + /* update vttbr to be used with the new vmid */
  1228. + pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
  1229. + BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
  1230. + vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK;
  1231. + kvm->arch.vttbr = pgd_phys | vmid;
  1232. +
  1233. + spin_unlock(&kvm_vmid_lock);
  1234. +}
  1235. +
  1236. +static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
  1237. +{
  1238. + struct kvm *kvm = vcpu->kvm;
  1239. + int ret;
  1240. +
  1241. + if (likely(vcpu->arch.has_run_once))
  1242. + return 0;
  1243. +
  1244. + vcpu->arch.has_run_once = true;
  1245. +
  1246. + /*
  1247. + * Map the VGIC hardware resources before running a vcpu the first
  1248. + * time on this VM.
  1249. + */
  1250. + if (unlikely(!vgic_initialized(kvm))) {
  1251. + ret = kvm_vgic_map_resources(kvm);
  1252. + if (ret)
  1253. + return ret;
  1254. + }
  1255. +
  1256. + /*
  1257. + * Enable the arch timers only if we have an in-kernel VGIC
  1258. + * and it has been properly initialized, since we cannot handle
  1259. + * interrupts from the virtual timer with a userspace gic.
  1260. + */
  1261. + if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
  1262. + kvm_timer_enable(kvm);
  1263. +
  1264. + return 0;
  1265. +}
  1266. +
  1267. +static void vcpu_pause(struct kvm_vcpu *vcpu)
  1268. +{
  1269. + wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
  1270. +
  1271. + wait_event_interruptible(*wq, !vcpu->arch.pause);
  1272. +}
  1273. +
  1274. +static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
  1275. +{
  1276. + return vcpu->arch.target >= 0;
  1277. +}
  1278. +
  1279. +/**
  1280. + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
  1281. + * @vcpu: The VCPU pointer
  1282. + * @run: The kvm_run structure pointer used for userspace state exchange
  1283. + *
  1284. + * This function is called through the VCPU_RUN ioctl called from user space. It
  1285. + * will execute VM code in a loop until the time slice for the process is used
  1286. + * or some emulation is needed from user space in which case the function will
  1287. + * return with return value 0 and with the kvm_run structure filled in with the
  1288. + * required data for the requested emulation.
  1289. + */
  1290. +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
  1291. +{
  1292. + int ret;
  1293. + sigset_t sigsaved;
  1294. +
  1295. + if (unlikely(!kvm_vcpu_initialized(vcpu)))
  1296. + return -ENOEXEC;
  1297. +
  1298. + ret = kvm_vcpu_first_run_init(vcpu);
  1299. + if (ret)
  1300. + return ret;
  1301. +
  1302. + if (run->exit_reason == KVM_EXIT_MMIO) {
  1303. + ret = kvm_handle_mmio_return(vcpu, vcpu->run);
  1304. + if (ret)
  1305. + return ret;
  1306. + }
  1307. +
  1308. + if (vcpu->sigset_active)
  1309. + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
  1310. +
  1311. + ret = 1;
  1312. + run->exit_reason = KVM_EXIT_UNKNOWN;
  1313. + while (ret > 0) {
  1314. + /*
  1315. + * Check conditions before entering the guest
  1316. + */
  1317. + cond_resched();
  1318. +
  1319. + update_vttbr(vcpu->kvm);
  1320. +
  1321. + if (vcpu->arch.pause)
  1322. + vcpu_pause(vcpu);
  1323. +
  1324. + kvm_vgic_flush_hwstate(vcpu);
  1325. + kvm_timer_flush_hwstate(vcpu);
  1326. +
  1327. + local_irq_disable();
  1328. +
  1329. + /*
  1330. + * Re-check atomic conditions
  1331. + */
  1332. + if (signal_pending(current)) {
  1333. + ret = -EINTR;
  1334. + run->exit_reason = KVM_EXIT_INTR;
  1335. + }
  1336. +
  1337. + if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
  1338. + local_irq_enable();
  1339. + kvm_timer_sync_hwstate(vcpu);
  1340. + kvm_vgic_sync_hwstate(vcpu);
  1341. + continue;
  1342. + }
  1343. +
  1344. + /**************************************************************
  1345. + * Enter the guest
  1346. + */
  1347. + trace_kvm_entry(*vcpu_pc(vcpu));
  1348. + kvm_guest_enter();
  1349. + vcpu->mode = IN_GUEST_MODE;
  1350. +
  1351. + ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
  1352. +
  1353. + vcpu->mode = OUTSIDE_GUEST_MODE;
  1354. + vcpu->arch.last_pcpu = smp_processor_id();
  1355. + kvm_guest_exit();
  1356. + trace_kvm_exit(*vcpu_pc(vcpu));
  1357. + /*
  1358. + * We may have taken a host interrupt in HYP mode (ie
  1359. + * while executing the guest). This interrupt is still
  1360. + * pending, as we haven't serviced it yet!
  1361. + *
  1362. + * We're now back in SVC mode, with interrupts
  1363. + * disabled. Enabling the interrupts now will have
  1364. + * the effect of taking the interrupt again, in SVC
  1365. + * mode this time.
  1366. + */
  1367. + local_irq_enable();
  1368. +
  1369. + /*
  1370. + * Back from guest
  1371. + *************************************************************/
  1372. +
  1373. + kvm_timer_sync_hwstate(vcpu);
  1374. + kvm_vgic_sync_hwstate(vcpu);
  1375. +
  1376. + ret = handle_exit(vcpu, run, ret);
  1377. + }
  1378. +
  1379. + if (vcpu->sigset_active)
  1380. + sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  1381. + return ret;
  1382. +}
  1383. +
  1384. +static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
  1385. +{
  1386. + int bit_index;
  1387. + bool set;
  1388. + unsigned long *ptr;
  1389. +
  1390. + if (number == KVM_ARM_IRQ_CPU_IRQ)
  1391. + bit_index = __ffs(HCR_VI);
  1392. + else /* KVM_ARM_IRQ_CPU_FIQ */
  1393. + bit_index = __ffs(HCR_VF);
  1394. +
  1395. + ptr = (unsigned long *)&vcpu->arch.irq_lines;
  1396. + if (level)
  1397. + set = test_and_set_bit(bit_index, ptr);
  1398. + else
  1399. + set = test_and_clear_bit(bit_index, ptr);
  1400. +
  1401. + /*
  1402. + * If we didn't change anything, no need to wake up or kick other CPUs
  1403. + */
  1404. + if (set == level)
  1405. + return 0;
  1406. +
  1407. + /*
  1408. + * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
  1409. + * trigger a world-switch round on the running physical CPU to set the
  1410. + * virtual IRQ/FIQ fields in the HCR appropriately.
  1411. + */
  1412. + kvm_vcpu_kick(vcpu);
  1413. +
  1414. + return 0;
  1415. +}
  1416. +
  1417. +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
  1418. + bool line_status)
  1419. +{
  1420. + u32 irq = irq_level->irq;
  1421. + unsigned int irq_type, vcpu_idx, irq_num;
  1422. + int nrcpus = atomic_read(&kvm->online_vcpus);
  1423. + struct kvm_vcpu *vcpu = NULL;
  1424. + bool level = irq_level->level;
  1425. +
  1426. + irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
  1427. + vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
  1428. + irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
  1429. +
  1430. + trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
  1431. +
  1432. + switch (irq_type) {
  1433. + case KVM_ARM_IRQ_TYPE_CPU:
  1434. + if (irqchip_in_kernel(kvm))
  1435. + return -ENXIO;
  1436. +
  1437. + if (vcpu_idx >= nrcpus)
  1438. + return -EINVAL;
  1439. +
  1440. + vcpu = kvm_get_vcpu(kvm, vcpu_idx);
  1441. + if (!vcpu)
  1442. + return -EINVAL;
  1443. +
  1444. + if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
  1445. + return -EINVAL;
  1446. +
  1447. + return vcpu_interrupt_line(vcpu, irq_num, level);
  1448. + case KVM_ARM_IRQ_TYPE_PPI:
  1449. + if (!irqchip_in_kernel(kvm))
  1450. + return -ENXIO;
  1451. +
  1452. + if (vcpu_idx >= nrcpus)
  1453. + return -EINVAL;
  1454. +
  1455. + vcpu = kvm_get_vcpu(kvm, vcpu_idx);
  1456. + if (!vcpu)
  1457. + return -EINVAL;
  1458. +
  1459. + if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
  1460. + return -EINVAL;
  1461. +
  1462. + return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level);
  1463. + case KVM_ARM_IRQ_TYPE_SPI:
  1464. + if (!irqchip_in_kernel(kvm))
  1465. + return -ENXIO;
  1466. +
  1467. + if (irq_num < VGIC_NR_PRIVATE_IRQS)
  1468. + return -EINVAL;
  1469. +
  1470. + return kvm_vgic_inject_irq(kvm, 0, irq_num, level);
  1471. + }
  1472. +
  1473. + return -EINVAL;
  1474. +}
  1475. +
  1476. +static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
  1477. + struct kvm_vcpu_init *init)
  1478. +{
  1479. + int ret;
  1480. +
  1481. + ret = kvm_vcpu_set_target(vcpu, init);
  1482. + if (ret)
  1483. + return ret;
  1484. +
  1485. + /*
  1486. + * Ensure a rebooted VM will fault in RAM pages and detect if the
  1487. + * guest MMU is turned off and flush the caches as needed.
  1488. + */
  1489. + if (vcpu->arch.has_run_once)
  1490. + stage2_unmap_vm(vcpu->kvm);
  1491. +
  1492. + vcpu_reset_hcr(vcpu);
  1493. +
  1494. + /*
  1495. + * Handle the "start in power-off" case by marking the VCPU as paused.
  1496. + */
  1497. + if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
  1498. + vcpu->arch.pause = true;
  1499. + else
  1500. + vcpu->arch.pause = false;
  1501. +
  1502. + return 0;
  1503. +}
  1504. +
  1505. +long kvm_arch_vcpu_ioctl(struct file *filp,
  1506. + unsigned int ioctl, unsigned long arg)
  1507. +{
  1508. + struct kvm_vcpu *vcpu = filp->private_data;
  1509. + void __user *argp = (void __user *)arg;
  1510. +
  1511. + switch (ioctl) {
  1512. + case KVM_ARM_VCPU_INIT: {
  1513. + struct kvm_vcpu_init init;
  1514. +
  1515. + if (copy_from_user(&init, argp, sizeof(init)))
  1516. + return -EFAULT;
  1517. +
  1518. + return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
  1519. + }
  1520. + case KVM_SET_ONE_REG:
  1521. + case KVM_GET_ONE_REG: {
  1522. + struct kvm_one_reg reg;
  1523. +
  1524. + if (unlikely(!kvm_vcpu_initialized(vcpu)))
  1525. + return -ENOEXEC;
  1526. +
  1527. + if (copy_from_user(&reg, argp, sizeof(reg)))
  1528. + return -EFAULT;
  1529. + if (ioctl == KVM_SET_ONE_REG)
  1530. + return kvm_arm_set_reg(vcpu, &reg);
  1531. + else
  1532. + return kvm_arm_get_reg(vcpu, &reg);
  1533. + }
  1534. + case KVM_GET_REG_LIST: {
  1535. + struct kvm_reg_list __user *user_list = argp;
  1536. + struct kvm_reg_list reg_list;
  1537. + unsigned n;
  1538. +
  1539. + if (unlikely(!kvm_vcpu_initialized(vcpu)))
  1540. + return -ENOEXEC;
  1541. +
  1542. + if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
  1543. + return -EFAULT;
  1544. + n = reg_list.n;
  1545. + reg_list.n = kvm_arm_num_regs(vcpu);
  1546. + if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
  1547. + return -EFAULT;
  1548. + if (n < reg_list.n)
  1549. + return -E2BIG;
  1550. + return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
  1551. + }
  1552. + default:
  1553. + return -EINVAL;
  1554. + }
  1555. +}
  1556. +
  1557. +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
  1558. +{
  1559. + return -EINVAL;
  1560. +}
  1561. +
  1562. +static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
  1563. + struct kvm_arm_device_addr *dev_addr)
  1564. +{
  1565. + unsigned long dev_id, type;
  1566. +
  1567. + dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
  1568. + KVM_ARM_DEVICE_ID_SHIFT;
  1569. + type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
  1570. + KVM_ARM_DEVICE_TYPE_SHIFT;
  1571. +
  1572. + switch (dev_id) {
  1573. + case KVM_ARM_DEVICE_VGIC_V2:
  1574. + if (!vgic_present)
  1575. + return -ENXIO;
  1576. + return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
  1577. + default:
  1578. + return -ENODEV;
  1579. + }
  1580. +}
  1581. +
  1582. +long kvm_arch_vm_ioctl(struct file *filp,
  1583. + unsigned int ioctl, unsigned long arg)
  1584. +{
  1585. + struct kvm *kvm = filp->private_data;
  1586. + void __user *argp = (void __user *)arg;
  1587. +
  1588. + switch (ioctl) {
  1589. + case KVM_CREATE_IRQCHIP: {
  1590. + if (vgic_present)
  1591. + return kvm_vgic_create(kvm);
  1592. + else
  1593. + return -ENXIO;
  1594. + }
  1595. + case KVM_ARM_SET_DEVICE_ADDR: {
  1596. + struct kvm_arm_device_addr dev_addr;
  1597. +
  1598. + if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
  1599. + return -EFAULT;
  1600. + return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
  1601. + }
  1602. + case KVM_ARM_PREFERRED_TARGET: {
  1603. + int err;
  1604. + struct kvm_vcpu_init init;
  1605. +
  1606. + err = kvm_vcpu_preferred_target(&init);
  1607. + if (err)
  1608. + return err;
  1609. +
  1610. + if (copy_to_user(argp, &init, sizeof(init)))
  1611. + return -EFAULT;
  1612. +
  1613. + return 0;
  1614. + }
  1615. + default:
  1616. + return -EINVAL;
  1617. + }
  1618. +}
  1619. +
  1620. +static void cpu_init_hyp_mode(void *dummy)
  1621. +{
  1622. + phys_addr_t boot_pgd_ptr;
  1623. + phys_addr_t pgd_ptr;
  1624. + unsigned long hyp_stack_ptr;
  1625. + unsigned long stack_page;
  1626. + unsigned long vector_ptr;
  1627. +
  1628. + /* Switch from the HYP stub to our own HYP init vector */
  1629. + __hyp_set_vectors(kvm_get_idmap_vector());
  1630. +
  1631. + boot_pgd_ptr = kvm_mmu_get_boot_httbr();
  1632. + pgd_ptr = kvm_mmu_get_httbr();
  1633. + stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
  1634. + hyp_stack_ptr = stack_page + PAGE_SIZE;
  1635. + vector_ptr = (unsigned long)__kvm_hyp_vector;
  1636. +
  1637. + __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
  1638. +}
  1639. +
  1640. +static int hyp_init_cpu_notify(struct notifier_block *self,
  1641. + unsigned long action, void *cpu)
  1642. +{
  1643. + switch (action) {
  1644. + case CPU_STARTING:
  1645. + case CPU_STARTING_FROZEN:
  1646. + if (__hyp_get_vectors() == hyp_default_vectors)
  1647. + cpu_init_hyp_mode(NULL);
  1648. + break;
  1649. + }
  1650. +
  1651. + return NOTIFY_OK;
  1652. +}
  1653. +
  1654. +static struct notifier_block hyp_init_cpu_nb = {
  1655. + .notifier_call = hyp_init_cpu_notify,
  1656. +};
  1657. +
  1658. +#ifdef CONFIG_CPU_PM
  1659. +static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
  1660. + unsigned long cmd,
  1661. + void *v)
  1662. +{
  1663. + if (cmd == CPU_PM_EXIT &&
  1664. + __hyp_get_vectors() == hyp_default_vectors) {
  1665. + cpu_init_hyp_mode(NULL);
  1666. + return NOTIFY_OK;
  1667. + }
  1668. +
  1669. + return NOTIFY_DONE;
  1670. +}
  1671. +
  1672. +static struct notifier_block hyp_init_cpu_pm_nb = {
  1673. + .notifier_call = hyp_init_cpu_pm_notifier,
  1674. +};
  1675. +
  1676. +static void __init hyp_cpu_pm_init(void)
  1677. +{
  1678. + cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
  1679. +}
  1680. +#else
  1681. +static inline void hyp_cpu_pm_init(void)
  1682. +{
  1683. +}
  1684. +#endif
  1685. +
  1686. +/**
  1687. + * Inits Hyp-mode on all online CPUs
  1688. + */
  1689. +static int init_hyp_mode(void)
  1690. +{
  1691. + int cpu;
  1692. + int err = 0;
  1693. +
  1694. + /*
  1695. + * Allocate Hyp PGD and setup Hyp identity mapping
  1696. + */
  1697. + err = kvm_mmu_init();
  1698. + if (err)
  1699. + goto out_err;
  1700. +
  1701. + /*
  1702. + * It is probably enough to obtain the default on one
  1703. + * CPU. It's unlikely to be different on the others.
  1704. + */
  1705. + hyp_default_vectors = __hyp_get_vectors();
  1706. +
  1707. + /*
  1708. + * Allocate stack pages for Hypervisor-mode
  1709. + */
  1710. + for_each_possible_cpu(cpu) {
  1711. + unsigned long stack_page;
  1712. +
  1713. + stack_page = __get_free_page(GFP_KERNEL);
  1714. + if (!stack_page) {
  1715. + err = -ENOMEM;
  1716. + goto out_free_stack_pages;
  1717. + }
  1718. +
  1719. + per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
  1720. + }
  1721. +
  1722. + /*
  1723. + * Map the Hyp-code called directly from the host
  1724. + */
  1725. + err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
  1726. + if (err) {
  1727. + kvm_err("Cannot map world-switch code\n");
  1728. + goto out_free_mappings;
  1729. + }
  1730. +
  1731. + /*
  1732. + * Map the Hyp stack pages
  1733. + */
  1734. + for_each_possible_cpu(cpu) {
  1735. + char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
  1736. + err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
  1737. +
  1738. + if (err) {
  1739. + kvm_err("Cannot map hyp stack\n");
  1740. + goto out_free_mappings;
  1741. + }
  1742. + }
  1743. +
  1744. + /*
  1745. + * Map the host CPU structures
  1746. + */
  1747. + kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
  1748. + if (!kvm_host_cpu_state) {
  1749. + err = -ENOMEM;
  1750. + kvm_err("Cannot allocate host CPU state\n");
  1751. + goto out_free_mappings;
  1752. + }
  1753. +
  1754. + for_each_possible_cpu(cpu) {
  1755. + kvm_cpu_context_t *cpu_ctxt;
  1756. +
  1757. + cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
  1758. + err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
  1759. +
  1760. + if (err) {
  1761. + kvm_err("Cannot map host CPU state: %d\n", err);
  1762. + goto out_free_context;
  1763. + }
  1764. + }
  1765. +
  1766. + /*
  1767. + * Execute the init code on each CPU.
  1768. + */
  1769. + on_each_cpu(cpu_init_hyp_mode, NULL, 1);
  1770. +
  1771. + /*
  1772. + * Init HYP view of VGIC
  1773. + */
  1774. + err = kvm_vgic_hyp_init();
  1775. + if (err)
  1776. + goto out_free_context;
  1777. +
  1778. +#ifdef CONFIG_KVM_ARM_VGIC
  1779. + vgic_present = true;
  1780. +#endif
  1781. +
  1782. + /*
  1783. + * Init HYP architected timer support
  1784. + */
  1785. + err = kvm_timer_hyp_init();
  1786. + if (err)
  1787. + goto out_free_mappings;
  1788. +
  1789. +#ifndef CONFIG_HOTPLUG_CPU
  1790. + free_boot_hyp_pgd();
  1791. +#endif
  1792. +
  1793. + kvm_perf_init();
  1794. +
  1795. + kvm_info("Hyp mode initialized successfully\n");
  1796. +
  1797. + return 0;
  1798. +out_free_context:
  1799. + free_percpu(kvm_host_cpu_state);
  1800. +out_free_mappings:
  1801. + free_hyp_pgds();
  1802. +out_free_stack_pages:
  1803. + for_each_possible_cpu(cpu)
  1804. + free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
  1805. +out_err:
  1806. + kvm_err("error initializing Hyp mode: %d\n", err);
  1807. + return err;
  1808. +}
  1809. +
  1810. +static void check_kvm_target_cpu(void *ret)
  1811. +{
  1812. + *(int *)ret = kvm_target_cpu();
  1813. +}
  1814. +
  1815. +/**
  1816. + * Initialize Hyp-mode and memory mappings on all CPUs.
  1817. + */
  1818. +int kvm_arch_init(void *opaque)
  1819. +{
  1820. + int err;
  1821. + int ret, cpu;
  1822. +
  1823. + if (!is_hyp_mode_available()) {
  1824. + kvm_err("HYP mode not available\n");
  1825. + return -ENODEV;
  1826. + }
  1827. +
  1828. + for_each_online_cpu(cpu) {
  1829. + smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
  1830. + if (ret < 0) {
  1831. + kvm_err("Error, CPU %d not supported!\n", cpu);
  1832. + return -ENODEV;
  1833. + }
  1834. + }
  1835. +
  1836. + cpu_notifier_register_begin();
  1837. +
  1838. + err = init_hyp_mode();
  1839. + if (err)
  1840. + goto out_err;
  1841. +
  1842. + err = __register_cpu_notifier(&hyp_init_cpu_nb);
  1843. + if (err) {
  1844. + kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
  1845. + goto out_err;
  1846. + }
  1847. +
  1848. + cpu_notifier_register_done();
  1849. +
  1850. + hyp_cpu_pm_init();
  1851. +
  1852. + kvm_coproc_table_init();
  1853. + return 0;
  1854. +out_err:
  1855. + cpu_notifier_register_done();
  1856. + return err;
  1857. +}
  1858. +
  1859. +/* NOP: Compiling as a module not supported */
  1860. +void kvm_arch_exit(void)
  1861. +{
  1862. + kvm_perf_teardown();
  1863. +}
  1864. +
  1865. +static int arm_init(void)
  1866. +{
  1867. + int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
  1868. + return rc;
  1869. +}
  1870. +
  1871. +module_init(arm_init);
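The ioctl dispatch above defines a userspace contract: KVM_ARM_VCPU_INIT must run before any register ioctl (otherwise -ENOEXEC), and KVM_GET_REG_LIST reports the required array size through -E2BIG. A minimal userspace sketch of that contract follows; the file descriptors, the helper name and the omitted error handling are assumptions for illustration only:

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Hypothetical helper: vm_fd and vcpu_fd are assumed to be open KVM fds. */
static struct kvm_reg_list *fetch_reg_list(int vm_fd, int vcpu_fd)
{
	struct kvm_vcpu_init init;
	struct kvm_reg_list probe = { .n = 0 };
	struct kvm_reg_list *list;

	/* Ask the VM which target/features the host prefers ... */
	ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init);
	/* ... and initialize the VCPU with it; register ioctls return
	 * -ENOEXEC until this has been done (see kvm_arch_vcpu_ioctl()). */
	ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);

	/* A too-small n fails with E2BIG but still reports the real count. */
	ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe);

	list = malloc(sizeof(*list) + probe.n * sizeof(__u64));
	list->n = probe.n;
	ioctl(vcpu_fd, KVM_GET_REG_LIST, list);
	return list;
}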
  1872. diff -Nur linux-3.18.14.orig/arch/arm/kvm/psci.c linux-3.18.14-rt/arch/arm/kvm/psci.c
  1873. --- linux-3.18.14.orig/arch/arm/kvm/psci.c 2015-05-20 10:04:50.000000000 -0500
  1874. +++ linux-3.18.14-rt/arch/arm/kvm/psci.c 2015-05-31 15:32:45.673635392 -0500
  1875. @@ -67,7 +67,7 @@
  1876. {
  1877. struct kvm *kvm = source_vcpu->kvm;
  1878. struct kvm_vcpu *vcpu = NULL, *tmp;
  1879. - wait_queue_head_t *wq;
  1880. + struct swait_head *wq;
  1881. unsigned long cpu_id;
  1882. unsigned long context_id;
  1883. unsigned long mpidr;
  1884. @@ -124,7 +124,7 @@
  1885. smp_mb(); /* Make sure the above is visible */
  1886. wq = kvm_arch_vcpu_wq(vcpu);
  1887. - wake_up_interruptible(wq);
  1888. + swait_wake_interruptible(wq);
  1889. return PSCI_RET_SUCCESS;
  1890. }
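The hunk above is the recurring simple-waitqueue conversion in this patch: the VCPU wait queue head becomes a struct swait_head and the wake-up side calls swait_wake_interruptible(). A hedged sketch of the same two-line conversion applied to a hypothetical driver structure; only the identifiers introduced by this patch are used, and the sleeper side would use the matching swait helpers added elsewhere in the patch:

/* Hypothetical device structure; the field change mirrors the KVM hunk. */
struct my_dev {
	struct swait_head wq;	/* was: wait_queue_head_t wq; */
	bool data_ready;
};

static void my_dev_signal(struct my_dev *dev)
{
	dev->data_ready = true;
	smp_mb();	/* publish the condition before waking the sleeper */
	swait_wake_interruptible(&dev->wq);	/* was: wake_up_interruptible(&dev->wq); */
}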
  1891. diff -Nur linux-3.18.14.orig/arch/arm/kvm/psci.c.orig linux-3.18.14-rt/arch/arm/kvm/psci.c.orig
  1892. --- linux-3.18.14.orig/arch/arm/kvm/psci.c.orig 1969-12-31 18:00:00.000000000 -0600
  1893. +++ linux-3.18.14-rt/arch/arm/kvm/psci.c.orig 2015-05-20 10:04:50.000000000 -0500
  1894. @@ -0,0 +1,337 @@
  1895. +/*
  1896. + * Copyright (C) 2012 - ARM Ltd
  1897. + * Author: Marc Zyngier <marc.zyngier@arm.com>
  1898. + *
  1899. + * This program is free software; you can redistribute it and/or modify
  1900. + * it under the terms of the GNU General Public License version 2 as
  1901. + * published by the Free Software Foundation.
  1902. + *
  1903. + * This program is distributed in the hope that it will be useful,
  1904. + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  1905. + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  1906. + * GNU General Public License for more details.
  1907. + *
  1908. + * You should have received a copy of the GNU General Public License
  1909. + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  1910. + */
  1911. +
  1912. +#include <linux/preempt.h>
  1913. +#include <linux/kvm_host.h>
  1914. +#include <linux/wait.h>
  1915. +
  1916. +#include <asm/cputype.h>
  1917. +#include <asm/kvm_emulate.h>
  1918. +#include <asm/kvm_psci.h>
  1919. +
  1920. +/*
  1921. + * This is an implementation of the Power State Coordination Interface
  1922. + * as described in ARM document number ARM DEN 0022A.
  1923. + */
  1924. +
  1925. +#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
  1926. +
  1927. +static unsigned long psci_affinity_mask(unsigned long affinity_level)
  1928. +{
  1929. + if (affinity_level <= 3)
  1930. + return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
  1931. +
  1932. + return 0;
  1933. +}
  1934. +
  1935. +static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
  1936. +{
  1937. + /*
1938. + * NOTE: For simplicity, we treat VCPU suspend emulation the same
1939. + * as WFI (Wait-for-interrupt) emulation.
  1940. + *
  1941. + * This means for KVM the wakeup events are interrupts and
  1942. + * this is consistent with intended use of StateID as described
  1943. + * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
  1944. + *
1945. + * Further, we also treat a power-down request the same as a
1946. + * stand-by request, as per section 5.4.2 clause 3 of PSCI v0.2
  1947. + * specification (ARM DEN 0022A). This means all suspend states
  1948. + * for KVM will preserve the register state.
  1949. + */
  1950. + kvm_vcpu_block(vcpu);
  1951. +
  1952. + return PSCI_RET_SUCCESS;
  1953. +}
  1954. +
  1955. +static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
  1956. +{
  1957. + vcpu->arch.pause = true;
  1958. +}
  1959. +
  1960. +static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
  1961. +{
  1962. + struct kvm *kvm = source_vcpu->kvm;
  1963. + struct kvm_vcpu *vcpu = NULL, *tmp;
  1964. + wait_queue_head_t *wq;
  1965. + unsigned long cpu_id;
  1966. + unsigned long context_id;
  1967. + unsigned long mpidr;
  1968. + phys_addr_t target_pc;
  1969. + int i;
  1970. +
  1971. + cpu_id = *vcpu_reg(source_vcpu, 1);
  1972. + if (vcpu_mode_is_32bit(source_vcpu))
  1973. + cpu_id &= ~((u32) 0);
  1974. +
  1975. + kvm_for_each_vcpu(i, tmp, kvm) {
  1976. + mpidr = kvm_vcpu_get_mpidr(tmp);
  1977. + if ((mpidr & MPIDR_HWID_BITMASK) == (cpu_id & MPIDR_HWID_BITMASK)) {
  1978. + vcpu = tmp;
  1979. + break;
  1980. + }
  1981. + }
  1982. +
  1983. + /*
  1984. + * Make sure the caller requested a valid CPU and that the CPU is
  1985. + * turned off.
  1986. + */
  1987. + if (!vcpu)
  1988. + return PSCI_RET_INVALID_PARAMS;
  1989. + if (!vcpu->arch.pause) {
  1990. + if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
  1991. + return PSCI_RET_ALREADY_ON;
  1992. + else
  1993. + return PSCI_RET_INVALID_PARAMS;
  1994. + }
  1995. +
  1996. + target_pc = *vcpu_reg(source_vcpu, 2);
  1997. + context_id = *vcpu_reg(source_vcpu, 3);
  1998. +
  1999. + kvm_reset_vcpu(vcpu);
  2000. +
  2001. + /* Gracefully handle Thumb2 entry point */
  2002. + if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
  2003. + target_pc &= ~((phys_addr_t) 1);
  2004. + vcpu_set_thumb(vcpu);
  2005. + }
  2006. +
  2007. + /* Propagate caller endianness */
  2008. + if (kvm_vcpu_is_be(source_vcpu))
  2009. + kvm_vcpu_set_be(vcpu);
  2010. +
  2011. + *vcpu_pc(vcpu) = target_pc;
  2012. + /*
  2013. + * NOTE: We always update r0 (or x0) because for PSCI v0.1
2014. + * the general purpose registers are undefined upon CPU_ON.
  2015. + */
  2016. + *vcpu_reg(vcpu, 0) = context_id;
  2017. + vcpu->arch.pause = false;
  2018. + smp_mb(); /* Make sure the above is visible */
  2019. +
  2020. + wq = kvm_arch_vcpu_wq(vcpu);
  2021. + wake_up_interruptible(wq);
  2022. +
  2023. + return PSCI_RET_SUCCESS;
  2024. +}
  2025. +
  2026. +static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
  2027. +{
  2028. + int i;
  2029. + unsigned long mpidr;
  2030. + unsigned long target_affinity;
  2031. + unsigned long target_affinity_mask;
  2032. + unsigned long lowest_affinity_level;
  2033. + struct kvm *kvm = vcpu->kvm;
  2034. + struct kvm_vcpu *tmp;
  2035. +
  2036. + target_affinity = *vcpu_reg(vcpu, 1);
  2037. + lowest_affinity_level = *vcpu_reg(vcpu, 2);
  2038. +
  2039. + /* Determine target affinity mask */
  2040. + target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
  2041. + if (!target_affinity_mask)
  2042. + return PSCI_RET_INVALID_PARAMS;
  2043. +
  2044. + /* Ignore other bits of target affinity */
  2045. + target_affinity &= target_affinity_mask;
  2046. +
  2047. + /*
2048. + * If one or more VCPUs matching the target affinity are running
2049. + * then report ON, else OFF.
  2050. + */
  2051. + kvm_for_each_vcpu(i, tmp, kvm) {
  2052. + mpidr = kvm_vcpu_get_mpidr(tmp);
  2053. + if (((mpidr & target_affinity_mask) == target_affinity) &&
  2054. + !tmp->arch.pause) {
  2055. + return PSCI_0_2_AFFINITY_LEVEL_ON;
  2056. + }
  2057. + }
  2058. +
  2059. + return PSCI_0_2_AFFINITY_LEVEL_OFF;
  2060. +}
  2061. +
  2062. +static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
  2063. +{
  2064. + int i;
  2065. + struct kvm_vcpu *tmp;
  2066. +
  2067. + /*
  2068. + * The KVM ABI specifies that a system event exit may call KVM_RUN
2069. + * again and may perform shutdown/reboot at a later time than when the
  2070. + * actual request is made. Since we are implementing PSCI and a
  2071. + * caller of PSCI reboot and shutdown expects that the system shuts
  2072. + * down or reboots immediately, let's make sure that VCPUs are not run
  2073. + * after this call is handled and before the VCPUs have been
  2074. + * re-initialized.
  2075. + */
  2076. + kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
  2077. + tmp->arch.pause = true;
  2078. + kvm_vcpu_kick(tmp);
  2079. + }
  2080. +
  2081. + memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
  2082. + vcpu->run->system_event.type = type;
  2083. + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
  2084. +}
  2085. +
  2086. +static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
  2087. +{
  2088. + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
  2089. +}
  2090. +
  2091. +static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
  2092. +{
  2093. + kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
  2094. +}
  2095. +
  2096. +int kvm_psci_version(struct kvm_vcpu *vcpu)
  2097. +{
  2098. + if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
  2099. + return KVM_ARM_PSCI_0_2;
  2100. +
  2101. + return KVM_ARM_PSCI_0_1;
  2102. +}
  2103. +
  2104. +static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
  2105. +{
  2106. + int ret = 1;
  2107. + unsigned long psci_fn = *vcpu_reg(vcpu, 0) & ~((u32) 0);
  2108. + unsigned long val;
  2109. +
  2110. + switch (psci_fn) {
  2111. + case PSCI_0_2_FN_PSCI_VERSION:
  2112. + /*
  2113. + * Bits[31:16] = Major Version = 0
  2114. + * Bits[15:0] = Minor Version = 2
  2115. + */
  2116. + val = 2;
  2117. + break;
  2118. + case PSCI_0_2_FN_CPU_SUSPEND:
  2119. + case PSCI_0_2_FN64_CPU_SUSPEND:
  2120. + val = kvm_psci_vcpu_suspend(vcpu);
  2121. + break;
  2122. + case PSCI_0_2_FN_CPU_OFF:
  2123. + kvm_psci_vcpu_off(vcpu);
  2124. + val = PSCI_RET_SUCCESS;
  2125. + break;
  2126. + case PSCI_0_2_FN_CPU_ON:
  2127. + case PSCI_0_2_FN64_CPU_ON:
  2128. + val = kvm_psci_vcpu_on(vcpu);
  2129. + break;
  2130. + case PSCI_0_2_FN_AFFINITY_INFO:
  2131. + case PSCI_0_2_FN64_AFFINITY_INFO:
  2132. + val = kvm_psci_vcpu_affinity_info(vcpu);
  2133. + break;
  2134. + case PSCI_0_2_FN_MIGRATE:
  2135. + case PSCI_0_2_FN64_MIGRATE:
  2136. + val = PSCI_RET_NOT_SUPPORTED;
  2137. + break;
  2138. + case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
  2139. + /*
  2140. + * Trusted OS is MP hence does not require migration
  2141. + * or
  2142. + * Trusted OS is not present
  2143. + */
  2144. + val = PSCI_0_2_TOS_MP;
  2145. + break;
  2146. + case PSCI_0_2_FN_MIGRATE_INFO_UP_CPU:
  2147. + case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
  2148. + val = PSCI_RET_NOT_SUPPORTED;
  2149. + break;
  2150. + case PSCI_0_2_FN_SYSTEM_OFF:
  2151. + kvm_psci_system_off(vcpu);
  2152. + /*
2153. + * We shouldn't be going back to guest VCPU after
  2154. + * receiving SYSTEM_OFF request.
  2155. + *
2156. + * If user space accidentally/deliberately resumes
  2157. + * guest VCPU after SYSTEM_OFF request then guest
  2158. + * VCPU should see internal failure from PSCI return
  2159. + * value. To achieve this, we preload r0 (or x0) with
  2160. + * PSCI return value INTERNAL_FAILURE.
  2161. + */
  2162. + val = PSCI_RET_INTERNAL_FAILURE;
  2163. + ret = 0;
  2164. + break;
  2165. + case PSCI_0_2_FN_SYSTEM_RESET:
  2166. + kvm_psci_system_reset(vcpu);
  2167. + /*
  2168. + * Same reason as SYSTEM_OFF for preloading r0 (or x0)
  2169. + * with PSCI return value INTERNAL_FAILURE.
  2170. + */
  2171. + val = PSCI_RET_INTERNAL_FAILURE;
  2172. + ret = 0;
  2173. + break;
  2174. + default:
  2175. + return -EINVAL;
  2176. + }
  2177. +
  2178. + *vcpu_reg(vcpu, 0) = val;
  2179. + return ret;
  2180. +}
  2181. +
  2182. +static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
  2183. +{
  2184. + unsigned long psci_fn = *vcpu_reg(vcpu, 0) & ~((u32) 0);
  2185. + unsigned long val;
  2186. +
  2187. + switch (psci_fn) {
  2188. + case KVM_PSCI_FN_CPU_OFF:
  2189. + kvm_psci_vcpu_off(vcpu);
  2190. + val = PSCI_RET_SUCCESS;
  2191. + break;
  2192. + case KVM_PSCI_FN_CPU_ON:
  2193. + val = kvm_psci_vcpu_on(vcpu);
  2194. + break;
  2195. + case KVM_PSCI_FN_CPU_SUSPEND:
  2196. + case KVM_PSCI_FN_MIGRATE:
  2197. + val = PSCI_RET_NOT_SUPPORTED;
  2198. + break;
  2199. + default:
  2200. + return -EINVAL;
  2201. + }
  2202. +
  2203. + *vcpu_reg(vcpu, 0) = val;
  2204. + return 1;
  2205. +}
  2206. +
  2207. +/**
  2208. + * kvm_psci_call - handle PSCI call if r0 value is in range
  2209. + * @vcpu: Pointer to the VCPU struct
  2210. + *
  2211. + * Handle PSCI calls from guests through traps from HVC instructions.
  2212. + * The calling convention is similar to SMC calls to the secure world
  2213. + * where the function number is placed in r0.
  2214. + *
  2215. + * This function returns: > 0 (success), 0 (success but exit to user
  2216. + * space), and < 0 (errors)
  2217. + *
  2218. + * Errors:
  2219. + * -EINVAL: Unrecognized PSCI function
  2220. + */
  2221. +int kvm_psci_call(struct kvm_vcpu *vcpu)
  2222. +{
  2223. + switch (kvm_psci_version(vcpu)) {
  2224. + case KVM_ARM_PSCI_0_2:
  2225. + return kvm_psci_0_2_call(vcpu);
  2226. + case KVM_ARM_PSCI_0_1:
  2227. + return kvm_psci_0_1_call(vcpu);
  2228. + default:
  2229. + return -EINVAL;
  2230. + };
  2231. +}
  2232. diff -Nur linux-3.18.14.orig/arch/arm/mach-at91/at91rm9200_time.c linux-3.18.14-rt/arch/arm/mach-at91/at91rm9200_time.c
  2233. --- linux-3.18.14.orig/arch/arm/mach-at91/at91rm9200_time.c 2015-05-20 10:04:50.000000000 -0500
  2234. +++ linux-3.18.14-rt/arch/arm/mach-at91/at91rm9200_time.c 2015-05-31 15:32:45.673635392 -0500
  2235. @@ -135,6 +135,7 @@
  2236. break;
  2237. case CLOCK_EVT_MODE_SHUTDOWN:
  2238. case CLOCK_EVT_MODE_UNUSED:
  2239. + remove_irq(NR_IRQS_LEGACY + AT91_ID_SYS, &at91rm9200_timer_irq);
  2240. case CLOCK_EVT_MODE_RESUME:
  2241. irqmask = 0;
  2242. break;
  2243. diff -Nur linux-3.18.14.orig/arch/arm/mach-exynos/platsmp.c linux-3.18.14-rt/arch/arm/mach-exynos/platsmp.c
  2244. --- linux-3.18.14.orig/arch/arm/mach-exynos/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2245. +++ linux-3.18.14-rt/arch/arm/mach-exynos/platsmp.c 2015-05-31 15:32:45.673635392 -0500
  2246. @@ -137,7 +137,7 @@
  2247. return (void __iomem *)(S5P_VA_SCU);
  2248. }
  2249. -static DEFINE_SPINLOCK(boot_lock);
  2250. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2251. static void exynos_secondary_init(unsigned int cpu)
  2252. {
  2253. @@ -150,8 +150,8 @@
  2254. /*
  2255. * Synchronise with the boot thread.
  2256. */
  2257. - spin_lock(&boot_lock);
  2258. - spin_unlock(&boot_lock);
  2259. + raw_spin_lock(&boot_lock);
  2260. + raw_spin_unlock(&boot_lock);
  2261. }
  2262. static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
  2263. @@ -165,7 +165,7 @@
  2264. * Set synchronisation state between this boot processor
  2265. * and the secondary one
  2266. */
  2267. - spin_lock(&boot_lock);
  2268. + raw_spin_lock(&boot_lock);
  2269. /*
  2270. * The secondary processor is waiting to be released from
  2271. @@ -192,7 +192,7 @@
  2272. if (timeout == 0) {
  2273. printk(KERN_ERR "cpu1 power enable failed");
  2274. - spin_unlock(&boot_lock);
  2275. + raw_spin_unlock(&boot_lock);
  2276. return -ETIMEDOUT;
  2277. }
  2278. }
  2279. @@ -242,7 +242,7 @@
  2280. * calibrations, then wait for it to finish
  2281. */
  2282. fail:
  2283. - spin_unlock(&boot_lock);
  2284. + raw_spin_unlock(&boot_lock);
  2285. return pen_release != -1 ? ret : 0;
  2286. }
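This is the first of several identical conversions (mach-hisi, mach-omap2, mach-prima2, mach-qcom, mach-spear, mach-sti, mach-ux500 and plat-versatile follow the same pattern below): the CPU-bringup boot_lock must remain a true spinning lock, because on PREEMPT_RT an ordinary spinlock becomes a sleeping lock and the secondary CPU cannot schedule at this point. A minimal sketch of the resulting pattern, with illustrative function names:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(boot_lock);	/* was: DEFINE_SPINLOCK(boot_lock) */

/* Secondary CPU: synchronise with the boot CPU, then release immediately. */
static void example_secondary_init(unsigned int cpu)
{
	raw_spin_lock(&boot_lock);
	raw_spin_unlock(&boot_lock);
}

/* Boot CPU: hold the lock while releasing the secondary core. */
static int example_boot_secondary(unsigned int cpu)
{
	raw_spin_lock(&boot_lock);
	/* ... write pen_release / kick the secondary core here ... */
	raw_spin_unlock(&boot_lock);
	return 0;
}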
  2287. diff -Nur linux-3.18.14.orig/arch/arm/mach-hisi/platmcpm.c linux-3.18.14-rt/arch/arm/mach-hisi/platmcpm.c
  2288. --- linux-3.18.14.orig/arch/arm/mach-hisi/platmcpm.c 2015-05-20 10:04:50.000000000 -0500
  2289. +++ linux-3.18.14-rt/arch/arm/mach-hisi/platmcpm.c 2015-05-31 15:32:45.677635392 -0500
  2290. @@ -57,7 +57,7 @@
  2291. static void __iomem *sysctrl, *fabric;
  2292. static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
  2293. -static DEFINE_SPINLOCK(boot_lock);
  2294. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2295. static u32 fabric_phys_addr;
  2296. /*
  2297. * [0]: bootwrapper physical address
  2298. @@ -104,7 +104,7 @@
  2299. if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
  2300. return -EINVAL;
  2301. - spin_lock_irq(&boot_lock);
  2302. + raw_spin_lock_irq(&boot_lock);
  2303. if (hip04_cpu_table[cluster][cpu])
  2304. goto out;
  2305. @@ -133,7 +133,7 @@
  2306. udelay(20);
  2307. out:
  2308. hip04_cpu_table[cluster][cpu]++;
  2309. - spin_unlock_irq(&boot_lock);
  2310. + raw_spin_unlock_irq(&boot_lock);
  2311. return 0;
  2312. }
  2313. @@ -149,7 +149,7 @@
  2314. __mcpm_cpu_going_down(cpu, cluster);
  2315. - spin_lock(&boot_lock);
  2316. + raw_spin_lock(&boot_lock);
  2317. BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
  2318. hip04_cpu_table[cluster][cpu]--;
  2319. if (hip04_cpu_table[cluster][cpu] == 1) {
  2320. @@ -162,7 +162,7 @@
  2321. last_man = hip04_cluster_is_down(cluster);
  2322. if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
  2323. - spin_unlock(&boot_lock);
  2324. + raw_spin_unlock(&boot_lock);
  2325. /* Since it's Cortex A15, disable L2 prefetching. */
  2326. asm volatile(
  2327. "mcr p15, 1, %0, c15, c0, 3 \n\t"
  2328. @@ -173,7 +173,7 @@
  2329. hip04_set_snoop_filter(cluster, 0);
  2330. __mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
  2331. } else {
  2332. - spin_unlock(&boot_lock);
  2333. + raw_spin_unlock(&boot_lock);
  2334. v7_exit_coherency_flush(louis);
  2335. }
  2336. @@ -192,7 +192,7 @@
  2337. cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
  2338. count = TIMEOUT_MSEC / POLL_MSEC;
  2339. - spin_lock_irq(&boot_lock);
  2340. + raw_spin_lock_irq(&boot_lock);
  2341. for (tries = 0; tries < count; tries++) {
  2342. if (hip04_cpu_table[cluster][cpu]) {
  2343. ret = -EBUSY;
  2344. @@ -202,10 +202,10 @@
  2345. data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
  2346. if (data & CORE_WFI_STATUS(cpu))
  2347. break;
  2348. - spin_unlock_irq(&boot_lock);
  2349. + raw_spin_unlock_irq(&boot_lock);
  2350. /* Wait for clean L2 when the whole cluster is down. */
  2351. msleep(POLL_MSEC);
  2352. - spin_lock_irq(&boot_lock);
  2353. + raw_spin_lock_irq(&boot_lock);
  2354. }
  2355. if (tries >= count)
  2356. goto err;
  2357. @@ -220,10 +220,10 @@
  2358. }
  2359. if (tries >= count)
  2360. goto err;
  2361. - spin_unlock_irq(&boot_lock);
  2362. + raw_spin_unlock_irq(&boot_lock);
  2363. return 0;
  2364. err:
  2365. - spin_unlock_irq(&boot_lock);
  2366. + raw_spin_unlock_irq(&boot_lock);
  2367. return ret;
  2368. }
  2369. @@ -235,10 +235,10 @@
  2370. cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
  2371. cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
  2372. - spin_lock(&boot_lock);
  2373. + raw_spin_lock(&boot_lock);
  2374. if (!hip04_cpu_table[cluster][cpu])
  2375. hip04_cpu_table[cluster][cpu] = 1;
  2376. - spin_unlock(&boot_lock);
  2377. + raw_spin_unlock(&boot_lock);
  2378. }
  2379. static void __naked hip04_mcpm_power_up_setup(unsigned int affinity_level)
  2380. diff -Nur linux-3.18.14.orig/arch/arm/mach-omap2/omap-smp.c linux-3.18.14-rt/arch/arm/mach-omap2/omap-smp.c
  2381. --- linux-3.18.14.orig/arch/arm/mach-omap2/omap-smp.c 2015-05-20 10:04:50.000000000 -0500
  2382. +++ linux-3.18.14-rt/arch/arm/mach-omap2/omap-smp.c 2015-05-31 15:32:45.697635392 -0500
  2383. @@ -43,7 +43,7 @@
  2384. /* SCU base address */
  2385. static void __iomem *scu_base;
  2386. -static DEFINE_SPINLOCK(boot_lock);
  2387. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2388. void __iomem *omap4_get_scu_base(void)
  2389. {
  2390. @@ -74,8 +74,8 @@
  2391. /*
  2392. * Synchronise with the boot thread.
  2393. */
  2394. - spin_lock(&boot_lock);
  2395. - spin_unlock(&boot_lock);
  2396. + raw_spin_lock(&boot_lock);
  2397. + raw_spin_unlock(&boot_lock);
  2398. }
  2399. static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
  2400. @@ -89,7 +89,7 @@
  2401. * Set synchronisation state between this boot processor
  2402. * and the secondary one
  2403. */
  2404. - spin_lock(&boot_lock);
  2405. + raw_spin_lock(&boot_lock);
  2406. /*
  2407. * Update the AuxCoreBoot0 with boot state for secondary core.
  2408. @@ -166,7 +166,7 @@
  2409. * Now the secondary core is starting up let it run its
  2410. * calibrations, then wait for it to finish
  2411. */
  2412. - spin_unlock(&boot_lock);
  2413. + raw_spin_unlock(&boot_lock);
  2414. return 0;
  2415. }
  2416. diff -Nur linux-3.18.14.orig/arch/arm/mach-prima2/platsmp.c linux-3.18.14-rt/arch/arm/mach-prima2/platsmp.c
  2417. --- linux-3.18.14.orig/arch/arm/mach-prima2/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2418. +++ linux-3.18.14-rt/arch/arm/mach-prima2/platsmp.c 2015-05-31 15:32:45.721635392 -0500
  2419. @@ -23,7 +23,7 @@
  2420. static void __iomem *scu_base;
  2421. static void __iomem *rsc_base;
  2422. -static DEFINE_SPINLOCK(boot_lock);
  2423. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2424. static struct map_desc scu_io_desc __initdata = {
  2425. .length = SZ_4K,
  2426. @@ -56,8 +56,8 @@
  2427. /*
  2428. * Synchronise with the boot thread.
  2429. */
  2430. - spin_lock(&boot_lock);
  2431. - spin_unlock(&boot_lock);
  2432. + raw_spin_lock(&boot_lock);
  2433. + raw_spin_unlock(&boot_lock);
  2434. }
  2435. static struct of_device_id rsc_ids[] = {
  2436. @@ -95,7 +95,7 @@
  2437. /* make sure write buffer is drained */
  2438. mb();
  2439. - spin_lock(&boot_lock);
  2440. + raw_spin_lock(&boot_lock);
  2441. /*
  2442. * The secondary processor is waiting to be released from
  2443. @@ -127,7 +127,7 @@
  2444. * now the secondary core is starting up let it run its
  2445. * calibrations, then wait for it to finish
  2446. */
  2447. - spin_unlock(&boot_lock);
  2448. + raw_spin_unlock(&boot_lock);
  2449. return pen_release != -1 ? -ENOSYS : 0;
  2450. }
  2451. diff -Nur linux-3.18.14.orig/arch/arm/mach-qcom/platsmp.c linux-3.18.14-rt/arch/arm/mach-qcom/platsmp.c
  2452. --- linux-3.18.14.orig/arch/arm/mach-qcom/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2453. +++ linux-3.18.14-rt/arch/arm/mach-qcom/platsmp.c 2015-05-31 15:32:45.741635391 -0500
  2454. @@ -46,7 +46,7 @@
  2455. extern void secondary_startup(void);
  2456. -static DEFINE_SPINLOCK(boot_lock);
  2457. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2458. #ifdef CONFIG_HOTPLUG_CPU
  2459. static void __ref qcom_cpu_die(unsigned int cpu)
  2460. @@ -60,8 +60,8 @@
  2461. /*
  2462. * Synchronise with the boot thread.
  2463. */
  2464. - spin_lock(&boot_lock);
  2465. - spin_unlock(&boot_lock);
  2466. + raw_spin_lock(&boot_lock);
  2467. + raw_spin_unlock(&boot_lock);
  2468. }
  2469. static int scss_release_secondary(unsigned int cpu)
  2470. @@ -284,7 +284,7 @@
  2471. * set synchronisation state between this boot processor
  2472. * and the secondary one
  2473. */
  2474. - spin_lock(&boot_lock);
  2475. + raw_spin_lock(&boot_lock);
  2476. /*
  2477. * Send the secondary CPU a soft interrupt, thereby causing
  2478. @@ -297,7 +297,7 @@
  2479. * now the secondary core is starting up let it run its
  2480. * calibrations, then wait for it to finish
  2481. */
  2482. - spin_unlock(&boot_lock);
  2483. + raw_spin_unlock(&boot_lock);
  2484. return ret;
  2485. }
  2486. diff -Nur linux-3.18.14.orig/arch/arm/mach-spear/platsmp.c linux-3.18.14-rt/arch/arm/mach-spear/platsmp.c
  2487. --- linux-3.18.14.orig/arch/arm/mach-spear/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2488. +++ linux-3.18.14-rt/arch/arm/mach-spear/platsmp.c 2015-05-31 15:32:45.749635392 -0500
  2489. @@ -32,7 +32,7 @@
  2490. sync_cache_w(&pen_release);
  2491. }
  2492. -static DEFINE_SPINLOCK(boot_lock);
  2493. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2494. static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
  2495. @@ -47,8 +47,8 @@
  2496. /*
  2497. * Synchronise with the boot thread.
  2498. */
  2499. - spin_lock(&boot_lock);
  2500. - spin_unlock(&boot_lock);
  2501. + raw_spin_lock(&boot_lock);
  2502. + raw_spin_unlock(&boot_lock);
  2503. }
  2504. static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
  2505. @@ -59,7 +59,7 @@
  2506. * set synchronisation state between this boot processor
  2507. * and the secondary one
  2508. */
  2509. - spin_lock(&boot_lock);
  2510. + raw_spin_lock(&boot_lock);
  2511. /*
  2512. * The secondary processor is waiting to be released from
  2513. @@ -84,7 +84,7 @@
  2514. * now the secondary core is starting up let it run its
  2515. * calibrations, then wait for it to finish
  2516. */
  2517. - spin_unlock(&boot_lock);
  2518. + raw_spin_unlock(&boot_lock);
  2519. return pen_release != -1 ? -ENOSYS : 0;
  2520. }
  2521. diff -Nur linux-3.18.14.orig/arch/arm/mach-sti/platsmp.c linux-3.18.14-rt/arch/arm/mach-sti/platsmp.c
  2522. --- linux-3.18.14.orig/arch/arm/mach-sti/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2523. +++ linux-3.18.14-rt/arch/arm/mach-sti/platsmp.c 2015-05-31 15:32:45.765635392 -0500
  2524. @@ -34,7 +34,7 @@
  2525. sync_cache_w(&pen_release);
  2526. }
  2527. -static DEFINE_SPINLOCK(boot_lock);
  2528. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2529. static void sti_secondary_init(unsigned int cpu)
  2530. {
  2531. @@ -49,8 +49,8 @@
  2532. /*
  2533. * Synchronise with the boot thread.
  2534. */
  2535. - spin_lock(&boot_lock);
  2536. - spin_unlock(&boot_lock);
  2537. + raw_spin_lock(&boot_lock);
  2538. + raw_spin_unlock(&boot_lock);
  2539. }
  2540. static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
  2541. @@ -61,7 +61,7 @@
  2542. * set synchronisation state between this boot processor
  2543. * and the secondary one
  2544. */
  2545. - spin_lock(&boot_lock);
  2546. + raw_spin_lock(&boot_lock);
  2547. /*
  2548. * The secondary processor is waiting to be released from
  2549. @@ -92,7 +92,7 @@
  2550. * now the secondary core is starting up let it run its
  2551. * calibrations, then wait for it to finish
  2552. */
  2553. - spin_unlock(&boot_lock);
  2554. + raw_spin_unlock(&boot_lock);
  2555. return pen_release != -1 ? -ENOSYS : 0;
  2556. }
  2557. diff -Nur linux-3.18.14.orig/arch/arm/mach-ux500/platsmp.c linux-3.18.14-rt/arch/arm/mach-ux500/platsmp.c
  2558. --- linux-3.18.14.orig/arch/arm/mach-ux500/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2559. +++ linux-3.18.14-rt/arch/arm/mach-ux500/platsmp.c 2015-05-31 15:32:45.793635391 -0500
  2560. @@ -51,7 +51,7 @@
  2561. return NULL;
  2562. }
  2563. -static DEFINE_SPINLOCK(boot_lock);
  2564. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2565. static void ux500_secondary_init(unsigned int cpu)
  2566. {
  2567. @@ -64,8 +64,8 @@
  2568. /*
  2569. * Synchronise with the boot thread.
  2570. */
  2571. - spin_lock(&boot_lock);
  2572. - spin_unlock(&boot_lock);
  2573. + raw_spin_lock(&boot_lock);
  2574. + raw_spin_unlock(&boot_lock);
  2575. }
  2576. static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle)
  2577. @@ -76,7 +76,7 @@
  2578. * set synchronisation state between this boot processor
  2579. * and the secondary one
  2580. */
  2581. - spin_lock(&boot_lock);
  2582. + raw_spin_lock(&boot_lock);
  2583. /*
  2584. * The secondary processor is waiting to be released from
  2585. @@ -97,7 +97,7 @@
  2586. * now the secondary core is starting up let it run its
  2587. * calibrations, then wait for it to finish
  2588. */
  2589. - spin_unlock(&boot_lock);
  2590. + raw_spin_unlock(&boot_lock);
  2591. return pen_release != -1 ? -ENOSYS : 0;
  2592. }
  2593. diff -Nur linux-3.18.14.orig/arch/arm/mm/fault.c linux-3.18.14-rt/arch/arm/mm/fault.c
  2594. --- linux-3.18.14.orig/arch/arm/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2595. +++ linux-3.18.14-rt/arch/arm/mm/fault.c 2015-05-31 15:32:45.797635391 -0500
  2596. @@ -277,7 +277,7 @@
  2597. * If we're in an interrupt or have no user
  2598. * context, we must not take the fault..
  2599. */
  2600. - if (in_atomic() || !mm)
  2601. + if (!mm || pagefault_disabled())
  2602. goto no_context;
  2603. if (user_mode(regs))
  2604. @@ -431,6 +431,9 @@
  2605. if (addr < TASK_SIZE)
  2606. return do_page_fault(addr, fsr, regs);
  2607. + if (interrupts_enabled(regs))
  2608. + local_irq_enable();
  2609. +
  2610. if (user_mode(regs))
  2611. goto bad_area;
  2612. @@ -498,6 +501,9 @@
  2613. static int
  2614. do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
  2615. {
  2616. + if (interrupts_enabled(regs))
  2617. + local_irq_enable();
  2618. +
  2619. do_bad_area(addr, fsr, regs);
  2620. return 0;
  2621. }
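The in_atomic() test is replaced here, and in the avr32, cris, frv, ia64, m32r, m68k, microblaze, mips, mn10300 and parisc fault handlers further down, because on PREEMPT_RT an elevated preempt count no longer implies that page faults are forbidden; the patch instead tracks an explicit per-task pagefault-disable state queried via pagefault_disabled(). A hedged sketch of the guard each converted handler now applies (function name and return values are illustrative):

#include <linux/sched.h>

static int example_do_page_fault(unsigned long addr, struct pt_regs *regs)
{
	struct mm_struct *mm = current->mm;

	/* was: if (in_atomic() || !mm) goto no_context; */
	if (!mm || pagefault_disabled())
		goto no_context;

	/* ... normal user-fault handling ... */
	return 0;

no_context:
	/* ... kernel fault fixup path ... */
	return -EFAULT;
}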
  2622. diff -Nur linux-3.18.14.orig/arch/arm/mm/highmem.c linux-3.18.14-rt/arch/arm/mm/highmem.c
  2623. --- linux-3.18.14.orig/arch/arm/mm/highmem.c 2015-05-20 10:04:50.000000000 -0500
  2624. +++ linux-3.18.14-rt/arch/arm/mm/highmem.c 2015-05-31 15:32:45.805635391 -0500
  2625. @@ -53,6 +53,7 @@
  2626. void *kmap_atomic(struct page *page)
  2627. {
  2628. + pte_t pte = mk_pte(page, kmap_prot);
  2629. unsigned int idx;
  2630. unsigned long vaddr;
  2631. void *kmap;
  2632. @@ -91,7 +92,10 @@
  2633. * in place, so the contained TLB flush ensures the TLB is updated
  2634. * with the new mapping.
  2635. */
  2636. - set_fixmap_pte(idx, mk_pte(page, kmap_prot));
  2637. +#ifdef CONFIG_PREEMPT_RT_FULL
  2638. + current->kmap_pte[type] = pte;
  2639. +#endif
  2640. + set_fixmap_pte(idx, pte);
  2641. return (void *)vaddr;
  2642. }
  2643. @@ -108,12 +112,15 @@
  2644. if (cache_is_vivt())
  2645. __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
  2646. +#ifdef CONFIG_PREEMPT_RT_FULL
  2647. + current->kmap_pte[type] = __pte(0);
  2648. +#endif
  2649. #ifdef CONFIG_DEBUG_HIGHMEM
  2650. BUG_ON(vaddr != __fix_to_virt(idx));
  2651. - set_fixmap_pte(idx, __pte(0));
  2652. #else
  2653. (void) idx; /* to kill a warning */
  2654. #endif
  2655. + set_fixmap_pte(idx, __pte(0));
  2656. kmap_atomic_idx_pop();
  2657. } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
  2658. /* this address was obtained through kmap_high_get() */
  2659. @@ -125,6 +132,7 @@
  2660. void *kmap_atomic_pfn(unsigned long pfn)
  2661. {
  2662. + pte_t pte = pfn_pte(pfn, kmap_prot);
  2663. unsigned long vaddr;
  2664. int idx, type;
  2665. struct page *page = pfn_to_page(pfn);
  2666. @@ -139,7 +147,10 @@
  2667. #ifdef CONFIG_DEBUG_HIGHMEM
  2668. BUG_ON(!pte_none(*(fixmap_page_table + idx)));
  2669. #endif
  2670. - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
  2671. +#ifdef CONFIG_PREEMPT_RT_FULL
  2672. + current->kmap_pte[type] = pte;
  2673. +#endif
  2674. + set_fixmap_pte(idx, pte);
  2675. return (void *)vaddr;
  2676. }
  2677. @@ -153,3 +164,28 @@
  2678. return pte_page(get_fixmap_pte(vaddr));
  2679. }
  2680. +
  2681. +#if defined CONFIG_PREEMPT_RT_FULL
  2682. +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
  2683. +{
  2684. + int i;
  2685. +
  2686. + /*
  2687. + * Clear @prev's kmap_atomic mappings
  2688. + */
  2689. + for (i = 0; i < prev_p->kmap_idx; i++) {
  2690. + int idx = i + KM_TYPE_NR * smp_processor_id();
  2691. +
  2692. + set_fixmap_pte(idx, __pte(0));
  2693. + }
  2694. + /*
  2695. + * Restore @next_p's kmap_atomic mappings
  2696. + */
  2697. + for (i = 0; i < next_p->kmap_idx; i++) {
  2698. + int idx = i + KM_TYPE_NR * smp_processor_id();
  2699. +
  2700. + if (!pte_none(next_p->kmap_pte[i]))
  2701. + set_fixmap_pte(idx, next_p->kmap_pte[i]);
  2702. + }
  2703. +}
  2704. +#endif
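For callers, kmap_atomic() keeps its usual interface; what changes is that on PREEMPT_RT_FULL the pte is also recorded in current->kmap_pte[], so the switch_kmaps() hook above can replay the per-CPU fixmap slots when a task holding an atomic kmap is switched out and back in. A short usage sketch with a hypothetical function name, assuming the unchanged API:

#include <linux/highmem.h>
#include <linux/string.h>

static void example_zero_highpage(struct page *page)
{
	/* On RT this section may be preempted: the pte saved in
	 * current->kmap_pte[] is re-installed by switch_kmaps(). */
	void *addr = kmap_atomic(page);

	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr);	/* clears the saved slot again on RT */
}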
  2705. diff -Nur linux-3.18.14.orig/arch/arm/plat-versatile/platsmp.c linux-3.18.14-rt/arch/arm/plat-versatile/platsmp.c
  2706. --- linux-3.18.14.orig/arch/arm/plat-versatile/platsmp.c 2015-05-20 10:04:50.000000000 -0500
  2707. +++ linux-3.18.14-rt/arch/arm/plat-versatile/platsmp.c 2015-05-31 15:32:45.889635390 -0500
  2708. @@ -30,7 +30,7 @@
  2709. sync_cache_w(&pen_release);
  2710. }
  2711. -static DEFINE_SPINLOCK(boot_lock);
  2712. +static DEFINE_RAW_SPINLOCK(boot_lock);
  2713. void versatile_secondary_init(unsigned int cpu)
  2714. {
  2715. @@ -43,8 +43,8 @@
  2716. /*
  2717. * Synchronise with the boot thread.
  2718. */
  2719. - spin_lock(&boot_lock);
  2720. - spin_unlock(&boot_lock);
  2721. + raw_spin_lock(&boot_lock);
  2722. + raw_spin_unlock(&boot_lock);
  2723. }
  2724. int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
  2725. @@ -55,7 +55,7 @@
  2726. * Set synchronisation state between this boot processor
  2727. * and the secondary one
  2728. */
  2729. - spin_lock(&boot_lock);
  2730. + raw_spin_lock(&boot_lock);
  2731. /*
  2732. * This is really belt and braces; we hold unintended secondary
  2733. @@ -85,7 +85,7 @@
  2734. * now the secondary core is starting up let it run its
  2735. * calibrations, then wait for it to finish
  2736. */
  2737. - spin_unlock(&boot_lock);
  2738. + raw_spin_unlock(&boot_lock);
  2739. return pen_release != -1 ? -ENOSYS : 0;
  2740. }
  2741. diff -Nur linux-3.18.14.orig/arch/arm64/include/asm/thread_info.h linux-3.18.14-rt/arch/arm64/include/asm/thread_info.h
  2742. --- linux-3.18.14.orig/arch/arm64/include/asm/thread_info.h 2015-05-20 10:04:50.000000000 -0500
  2743. +++ linux-3.18.14-rt/arch/arm64/include/asm/thread_info.h 2015-05-31 15:32:45.925635390 -0500
  2744. @@ -50,6 +50,7 @@
  2745. struct exec_domain *exec_domain; /* execution domain */
  2746. struct restart_block restart_block;
  2747. int preempt_count; /* 0 => preemptable, <0 => bug */
  2748. + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
  2749. int cpu; /* cpu */
  2750. };
  2751. @@ -108,6 +109,7 @@
  2752. #define TIF_NEED_RESCHED 1
  2753. #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
  2754. #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */
  2755. +#define TIF_NEED_RESCHED_LAZY 4
  2756. #define TIF_NOHZ 7
  2757. #define TIF_SYSCALL_TRACE 8
  2758. #define TIF_SYSCALL_AUDIT 9
  2759. @@ -124,6 +126,7 @@
  2760. #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
  2761. #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
  2762. #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE)
  2763. +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
  2764. #define _TIF_NOHZ (1 << TIF_NOHZ)
  2765. #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
  2766. #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
  2767. diff -Nur linux-3.18.14.orig/arch/arm64/Kconfig linux-3.18.14-rt/arch/arm64/Kconfig
  2768. --- linux-3.18.14.orig/arch/arm64/Kconfig 2015-05-20 10:04:50.000000000 -0500
  2769. +++ linux-3.18.14-rt/arch/arm64/Kconfig 2015-05-31 15:32:45.905635390 -0500
  2770. @@ -59,8 +59,10 @@
  2771. select HAVE_PERF_REGS
  2772. select HAVE_PERF_USER_STACK_DUMP
  2773. select HAVE_RCU_TABLE_FREE
  2774. + select HAVE_PREEMPT_LAZY
  2775. select HAVE_SYSCALL_TRACEPOINTS
  2776. select IRQ_DOMAIN
  2777. + select IRQ_FORCED_THREADING
  2778. select MODULES_USE_ELF_RELA
  2779. select NO_BOOTMEM
  2780. select OF
  2781. diff -Nur linux-3.18.14.orig/arch/arm64/kernel/asm-offsets.c linux-3.18.14-rt/arch/arm64/kernel/asm-offsets.c
  2782. --- linux-3.18.14.orig/arch/arm64/kernel/asm-offsets.c 2015-05-20 10:04:50.000000000 -0500
  2783. +++ linux-3.18.14-rt/arch/arm64/kernel/asm-offsets.c 2015-05-31 15:32:45.925635390 -0500
  2784. @@ -36,6 +36,7 @@
  2785. BLANK();
  2786. DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
  2787. DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
  2788. + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
  2789. DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
  2790. DEFINE(TI_TASK, offsetof(struct thread_info, task));
  2791. DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain));
  2792. diff -Nur linux-3.18.14.orig/arch/arm64/kernel/entry.S linux-3.18.14-rt/arch/arm64/kernel/entry.S
  2793. --- linux-3.18.14.orig/arch/arm64/kernel/entry.S 2015-05-20 10:04:50.000000000 -0500
  2794. +++ linux-3.18.14-rt/arch/arm64/kernel/entry.S 2015-05-31 15:32:45.925635390 -0500
  2795. @@ -367,11 +367,16 @@
  2796. #ifdef CONFIG_PREEMPT
  2797. get_thread_info tsk
  2798. ldr w24, [tsk, #TI_PREEMPT] // get preempt count
  2799. - cbnz w24, 1f // preempt count != 0
  2800. + cbnz w24, 2f // preempt count != 0
  2801. ldr x0, [tsk, #TI_FLAGS] // get flags
  2802. - tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
  2803. - bl el1_preempt
  2804. + tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
  2805. +
  2806. + ldr w24, [tsk, #TI_PREEMPT_LAZY] // get preempt lazy count
  2807. + cbnz w24, 2f // preempt lazy count != 0
  2808. + tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling?
  2809. 1:
  2810. + bl el1_preempt
  2811. +2:
  2812. #endif
  2813. #ifdef CONFIG_TRACE_IRQFLAGS
  2814. bl trace_hardirqs_on
  2815. @@ -385,6 +390,7 @@
  2816. 1: bl preempt_schedule_irq // irq en/disable is done inside
  2817. ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
  2818. tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
  2819. + tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling?
  2820. ret x24
  2821. #endif
  2822. @@ -621,6 +627,7 @@
  2823. str x0, [sp, #S_X0] // returned x0
  2824. work_pending:
  2825. tbnz x1, #TIF_NEED_RESCHED, work_resched
  2826. + tbnz x1, #TIF_NEED_RESCHED_LAZY, work_resched
  2827. /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
  2828. ldr x2, [sp, #S_PSTATE]
  2829. mov x0, sp // 'regs'
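Taken together, the arm64 thread_info and entry.S changes implement a two-level preemption check: an urgent TIF_NEED_RESCHED preempts whenever the hard preempt count is zero, while TIF_NEED_RESCHED_LAZY only preempts when the new preempt_lazy_count is also zero. A hedged C rendering of the decision made by the modified el1 interrupt-return path (illustrative only; the authoritative logic is the assembly above):

static bool example_irq_exit_should_preempt(struct thread_info *ti)
{
	if (ti->preempt_count)
		return false;		/* preemption hard-disabled */
	if (ti->flags & _TIF_NEED_RESCHED)
		return true;		/* urgent resched always wins */
	if (ti->preempt_lazy_count)
		return false;		/* lazy preemption disabled */
	return !!(ti->flags & _TIF_NEED_RESCHED_LAZY);	/* lazy resched allowed */
}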
  2830. diff -Nur linux-3.18.14.orig/arch/arm64/kernel/perf_event.c linux-3.18.14-rt/arch/arm64/kernel/perf_event.c
  2831. --- linux-3.18.14.orig/arch/arm64/kernel/perf_event.c 2015-05-20 10:04:50.000000000 -0500
  2832. +++ linux-3.18.14-rt/arch/arm64/kernel/perf_event.c 2015-05-31 15:32:45.925635390 -0500
  2833. @@ -461,7 +461,7 @@
  2834. }
  2835. err = request_irq(irq, armpmu->handle_irq,
  2836. - IRQF_NOBALANCING,
  2837. + IRQF_NOBALANCING | IRQF_NO_THREAD,
  2838. "arm-pmu", armpmu);
  2839. if (err) {
  2840. pr_err("unable to request IRQ%d for ARM PMU counters\n",
  2841. diff -Nur linux-3.18.14.orig/arch/avr32/mm/fault.c linux-3.18.14-rt/arch/avr32/mm/fault.c
  2842. --- linux-3.18.14.orig/arch/avr32/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2843. +++ linux-3.18.14-rt/arch/avr32/mm/fault.c 2015-05-31 15:32:45.933635390 -0500
  2844. @@ -81,7 +81,7 @@
  2845. * If we're in an interrupt or have no user context, we must
  2846. * not take the fault...
  2847. */
  2848. - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
  2849. + if (!mm || regs->sr & SYSREG_BIT(GM) || pagefault_disabled())
  2850. goto no_context;
  2851. local_irq_enable();
  2852. diff -Nur linux-3.18.14.orig/arch/cris/mm/fault.c linux-3.18.14-rt/arch/cris/mm/fault.c
  2853. --- linux-3.18.14.orig/arch/cris/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2854. +++ linux-3.18.14-rt/arch/cris/mm/fault.c 2015-05-31 15:32:45.945635390 -0500
  2855. @@ -113,7 +113,7 @@
  2856. * user context, we must not take the fault.
  2857. */
  2858. - if (in_atomic() || !mm)
  2859. + if (!mm || pagefault_disabled())
  2860. goto no_context;
  2861. if (user_mode(regs))
  2862. diff -Nur linux-3.18.14.orig/arch/frv/mm/fault.c linux-3.18.14-rt/arch/frv/mm/fault.c
  2863. --- linux-3.18.14.orig/arch/frv/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2864. +++ linux-3.18.14-rt/arch/frv/mm/fault.c 2015-05-31 15:32:45.953635390 -0500
  2865. @@ -78,7 +78,7 @@
  2866. * If we're in an interrupt or have no user
  2867. * context, we must not take the fault..
  2868. */
  2869. - if (in_atomic() || !mm)
  2870. + if (!mm || pagefault_disabled())
  2871. goto no_context;
  2872. if (user_mode(__frame))
  2873. diff -Nur linux-3.18.14.orig/arch/ia64/mm/fault.c linux-3.18.14-rt/arch/ia64/mm/fault.c
  2874. --- linux-3.18.14.orig/arch/ia64/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2875. +++ linux-3.18.14-rt/arch/ia64/mm/fault.c 2015-05-31 15:32:45.961635389 -0500
  2876. @@ -96,7 +96,7 @@
  2877. /*
  2878. * If we're in an interrupt or have no user context, we must not take the fault..
  2879. */
  2880. - if (in_atomic() || !mm)
  2881. + if (!mm || pagefault_disabled())
  2882. goto no_context;
  2883. #ifdef CONFIG_VIRTUAL_MEM_MAP
  2884. diff -Nur linux-3.18.14.orig/arch/Kconfig linux-3.18.14-rt/arch/Kconfig
  2885. --- linux-3.18.14.orig/arch/Kconfig 2015-05-20 10:04:50.000000000 -0500
  2886. +++ linux-3.18.14-rt/arch/Kconfig 2015-05-31 15:32:45.501635394 -0500
  2887. @@ -6,6 +6,7 @@
  2888. tristate "OProfile system profiling"
  2889. depends on PROFILING
  2890. depends on HAVE_OPROFILE
  2891. + depends on !PREEMPT_RT_FULL
  2892. select RING_BUFFER
  2893. select RING_BUFFER_ALLOW_SWAP
  2894. help
  2895. diff -Nur linux-3.18.14.orig/arch/m32r/mm/fault.c linux-3.18.14-rt/arch/m32r/mm/fault.c
  2896. --- linux-3.18.14.orig/arch/m32r/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2897. +++ linux-3.18.14-rt/arch/m32r/mm/fault.c 2015-05-31 15:32:45.985635389 -0500
  2898. @@ -114,7 +114,7 @@
  2899. * If we're in an interrupt or have no user context or are running in an
  2900. * atomic region then we must not take the fault..
  2901. */
  2902. - if (in_atomic() || !mm)
  2903. + if (!mm || pagefault_disabled())
  2904. goto bad_area_nosemaphore;
  2905. if (error_code & ACE_USERMODE)
  2906. diff -Nur linux-3.18.14.orig/arch/m68k/mm/fault.c linux-3.18.14-rt/arch/m68k/mm/fault.c
  2907. --- linux-3.18.14.orig/arch/m68k/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2908. +++ linux-3.18.14-rt/arch/m68k/mm/fault.c 2015-05-31 15:32:45.985635389 -0500
  2909. @@ -81,7 +81,7 @@
  2910. * If we're in an interrupt or have no user
  2911. * context, we must not take the fault..
  2912. */
  2913. - if (in_atomic() || !mm)
  2914. + if (!mm || pagefault_disabled())
  2915. goto no_context;
  2916. if (user_mode(regs))
  2917. diff -Nur linux-3.18.14.orig/arch/microblaze/mm/fault.c linux-3.18.14-rt/arch/microblaze/mm/fault.c
  2918. --- linux-3.18.14.orig/arch/microblaze/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2919. +++ linux-3.18.14-rt/arch/microblaze/mm/fault.c 2015-05-31 15:32:46.005635389 -0500
  2920. @@ -107,7 +107,7 @@
  2921. if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
  2922. is_write = 0;
  2923. - if (unlikely(in_atomic() || !mm)) {
  2924. + if (unlikely(!mm || pagefault_disabled())) {
  2925. if (kernel_mode(regs))
  2926. goto bad_area_nosemaphore;
  2927. diff -Nur linux-3.18.14.orig/arch/mips/Kconfig linux-3.18.14-rt/arch/mips/Kconfig
  2928. --- linux-3.18.14.orig/arch/mips/Kconfig 2015-05-20 10:04:50.000000000 -0500
  2929. +++ linux-3.18.14-rt/arch/mips/Kconfig 2015-05-31 15:32:46.033635389 -0500
  2930. @@ -2196,7 +2196,7 @@
  2931. #
  2932. config HIGHMEM
  2933. bool "High Memory Support"
  2934. - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
  2935. + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
  2936. config CPU_SUPPORTS_HIGHMEM
  2937. bool
  2938. diff -Nur linux-3.18.14.orig/arch/mips/kernel/signal.c linux-3.18.14-rt/arch/mips/kernel/signal.c
  2939. --- linux-3.18.14.orig/arch/mips/kernel/signal.c 2015-05-20 10:04:50.000000000 -0500
  2940. +++ linux-3.18.14-rt/arch/mips/kernel/signal.c 2015-05-31 15:32:46.057635389 -0500
  2941. @@ -613,6 +613,7 @@
  2942. __u32 thread_info_flags)
  2943. {
  2944. local_irq_enable();
  2945. + preempt_check_resched();
  2946. user_exit();
  2947. diff -Nur linux-3.18.14.orig/arch/mips/mm/fault.c linux-3.18.14-rt/arch/mips/mm/fault.c
  2948. --- linux-3.18.14.orig/arch/mips/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2949. +++ linux-3.18.14-rt/arch/mips/mm/fault.c 2015-05-31 15:32:46.069635388 -0500
  2950. @@ -89,7 +89,7 @@
  2951. * If we're in an interrupt or have no user
  2952. * context, we must not take the fault..
  2953. */
  2954. - if (in_atomic() || !mm)
  2955. + if (!mm || pagefault_disabled())
  2956. goto bad_area_nosemaphore;
  2957. if (user_mode(regs))
  2958. diff -Nur linux-3.18.14.orig/arch/mips/mm/init.c linux-3.18.14-rt/arch/mips/mm/init.c
  2959. --- linux-3.18.14.orig/arch/mips/mm/init.c 2015-05-20 10:04:50.000000000 -0500
  2960. +++ linux-3.18.14-rt/arch/mips/mm/init.c 2015-05-31 15:32:46.069635388 -0500
  2961. @@ -90,7 +90,7 @@
  2962. BUG_ON(Page_dcache_dirty(page));
  2963. - pagefault_disable();
  2964. + raw_pagefault_disable();
  2965. idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
  2966. idx += in_interrupt() ? FIX_N_COLOURS : 0;
  2967. vaddr = __fix_to_virt(FIX_CMAP_END - idx);
  2968. @@ -146,7 +146,7 @@
  2969. tlbw_use_hazard();
  2970. write_c0_entryhi(old_ctx);
  2971. local_irq_restore(flags);
  2972. - pagefault_enable();
  2973. + raw_pagefault_enable();
  2974. }
  2975. void copy_user_highpage(struct page *to, struct page *from,
  2976. diff -Nur linux-3.18.14.orig/arch/mn10300/mm/fault.c linux-3.18.14-rt/arch/mn10300/mm/fault.c
  2977. --- linux-3.18.14.orig/arch/mn10300/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2978. +++ linux-3.18.14-rt/arch/mn10300/mm/fault.c 2015-05-31 15:32:46.113635388 -0500
  2979. @@ -168,7 +168,7 @@
  2980. * If we're in an interrupt or have no user
  2981. * context, we must not take the fault..
  2982. */
  2983. - if (in_atomic() || !mm)
  2984. + if (!mm || pagefault_disabled())
  2985. goto no_context;
  2986. if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
  2987. diff -Nur linux-3.18.14.orig/arch/parisc/mm/fault.c linux-3.18.14-rt/arch/parisc/mm/fault.c
  2988. --- linux-3.18.14.orig/arch/parisc/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  2989. +++ linux-3.18.14-rt/arch/parisc/mm/fault.c 2015-05-31 15:32:46.113635388 -0500
  2990. @@ -207,7 +207,7 @@
  2991. int fault;
  2992. unsigned int flags;
  2993. - if (in_atomic())
  2994. + if (pagefault_disabled())
  2995. goto no_context;
  2996. tsk = current;
  2997. diff -Nur linux-3.18.14.orig/arch/powerpc/include/asm/kvm_host.h linux-3.18.14-rt/arch/powerpc/include/asm/kvm_host.h
  2998. --- linux-3.18.14.orig/arch/powerpc/include/asm/kvm_host.h 2015-05-20 10:04:50.000000000 -0500
  2999. +++ linux-3.18.14-rt/arch/powerpc/include/asm/kvm_host.h 2015-05-31 15:32:46.145635388 -0500
  3000. @@ -296,7 +296,7 @@
  3001. u8 in_guest;
  3002. struct list_head runnable_threads;
  3003. spinlock_t lock;
  3004. - wait_queue_head_t wq;
  3005. + struct swait_head wq;
  3006. u64 stolen_tb;
  3007. u64 preempt_tb;
  3008. struct kvm_vcpu *runner;
  3009. @@ -618,7 +618,7 @@
  3010. u8 prodded;
  3011. u32 last_inst;
  3012. - wait_queue_head_t *wqp;
  3013. + struct swait_head *wqp;
  3014. struct kvmppc_vcore *vcore;
  3015. int ret;
  3016. int trap;
  3017. diff -Nur linux-3.18.14.orig/arch/powerpc/include/asm/thread_info.h linux-3.18.14-rt/arch/powerpc/include/asm/thread_info.h
  3018. --- linux-3.18.14.orig/arch/powerpc/include/asm/thread_info.h 2015-05-20 10:04:50.000000000 -0500
  3019. +++ linux-3.18.14-rt/arch/powerpc/include/asm/thread_info.h 2015-05-31 15:32:46.165635388 -0500
  3020. @@ -43,6 +43,8 @@
  3021. int cpu; /* cpu we're on */
  3022. int preempt_count; /* 0 => preemptable,
  3023. <0 => BUG */
  3024. + int preempt_lazy_count; /* 0 => preemptable,
  3025. + <0 => BUG */
  3026. struct restart_block restart_block;
  3027. unsigned long local_flags; /* private flags for thread */
  3028. @@ -88,8 +90,7 @@
  3029. #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
  3030. #define TIF_SIGPENDING 1 /* signal pending */
  3031. #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
  3032. -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
  3033. - TIF_NEED_RESCHED */
  3034. +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
  3035. #define TIF_32BIT 4 /* 32 bit binary */
  3036. #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
  3037. #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
  3038. @@ -107,6 +108,8 @@
  3039. #if defined(CONFIG_PPC64)
  3040. #define TIF_ELF2ABI 18 /* function descriptors must die! */
  3041. #endif
  3042. +#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
  3043. + TIF_NEED_RESCHED */
  3044. /* as above, but as bit values */
  3045. #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
  3046. @@ -125,14 +128,16 @@
  3047. #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
  3048. #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
  3049. #define _TIF_NOHZ (1<<TIF_NOHZ)
  3050. +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
  3051. #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
  3052. _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
  3053. _TIF_NOHZ)
  3054. #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
  3055. _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
  3056. - _TIF_RESTORE_TM)
  3057. + _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
  3058. #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
  3059. +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
  3060. /* Bits in local_flags */
  3061. /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
  3062. diff -Nur linux-3.18.14.orig/arch/powerpc/Kconfig linux-3.18.14-rt/arch/powerpc/Kconfig
  3063. --- linux-3.18.14.orig/arch/powerpc/Kconfig 2015-05-20 10:04:50.000000000 -0500
  3064. +++ linux-3.18.14-rt/arch/powerpc/Kconfig 2015-05-31 15:32:46.141635388 -0500
  3065. @@ -60,10 +60,11 @@
  3066. config RWSEM_GENERIC_SPINLOCK
  3067. bool
  3068. + default y if PREEMPT_RT_FULL
  3069. config RWSEM_XCHGADD_ALGORITHM
  3070. bool
  3071. - default y
  3072. + default y if !PREEMPT_RT_FULL
  3073. config GENERIC_LOCKBREAK
  3074. bool
  3075. @@ -136,6 +137,7 @@
  3076. select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
  3077. select GENERIC_STRNCPY_FROM_USER
  3078. select GENERIC_STRNLEN_USER
  3079. + select HAVE_PREEMPT_LAZY
  3080. select HAVE_MOD_ARCH_SPECIFIC
  3081. select MODULES_USE_ELF_RELA
  3082. select CLONE_BACKWARDS
  3083. @@ -303,7 +305,7 @@
  3084. config HIGHMEM
  3085. bool "High memory support"
  3086. - depends on PPC32
  3087. + depends on PPC32 && !PREEMPT_RT_FULL
  3088. source kernel/Kconfig.hz
  3089. source kernel/Kconfig.preempt
  3090. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/asm-offsets.c linux-3.18.14-rt/arch/powerpc/kernel/asm-offsets.c
  3091. --- linux-3.18.14.orig/arch/powerpc/kernel/asm-offsets.c 2015-05-20 10:04:50.000000000 -0500
  3092. +++ linux-3.18.14-rt/arch/powerpc/kernel/asm-offsets.c 2015-05-31 15:32:46.205635388 -0500
  3093. @@ -159,6 +159,7 @@
  3094. DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
  3095. DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
  3096. DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
  3097. + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
  3098. DEFINE(TI_TASK, offsetof(struct thread_info, task));
  3099. DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
  3100. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/entry_32.S linux-3.18.14-rt/arch/powerpc/kernel/entry_32.S
  3101. --- linux-3.18.14.orig/arch/powerpc/kernel/entry_32.S 2015-05-20 10:04:50.000000000 -0500
  3102. +++ linux-3.18.14-rt/arch/powerpc/kernel/entry_32.S 2015-05-31 15:32:46.217635387 -0500
  3103. @@ -890,7 +890,14 @@
  3104. cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
  3105. bne restore
  3106. andi. r8,r8,_TIF_NEED_RESCHED
  3107. + bne+ 1f
  3108. + lwz r0,TI_PREEMPT_LAZY(r9)
  3109. + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
  3110. + bne restore
  3111. + lwz r0,TI_FLAGS(r9)
  3112. + andi. r0,r0,_TIF_NEED_RESCHED_LAZY
  3113. beq+ restore
  3114. +1:
  3115. lwz r3,_MSR(r1)
  3116. andi. r0,r3,MSR_EE /* interrupts off? */
  3117. beq restore /* don't schedule if so */
  3118. @@ -901,11 +908,11 @@
  3119. */
  3120. bl trace_hardirqs_off
  3121. #endif
  3122. -1: bl preempt_schedule_irq
  3123. +2: bl preempt_schedule_irq
  3124. CURRENT_THREAD_INFO(r9, r1)
  3125. lwz r3,TI_FLAGS(r9)
  3126. - andi. r0,r3,_TIF_NEED_RESCHED
  3127. - bne- 1b
  3128. + andi. r0,r3,_TIF_NEED_RESCHED_MASK
  3129. + bne- 2b
  3130. #ifdef CONFIG_TRACE_IRQFLAGS
  3131. /* And now, to properly rebalance the above, we tell lockdep they
  3132. * are being turned back on, which will happen when we return
  3133. @@ -1226,7 +1233,7 @@
  3134. #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
  3135. do_work: /* r10 contains MSR_KERNEL here */
  3136. - andi. r0,r9,_TIF_NEED_RESCHED
  3137. + andi. r0,r9,_TIF_NEED_RESCHED_MASK
  3138. beq do_user_signal
  3139. do_resched: /* r10 contains MSR_KERNEL here */
  3140. @@ -1247,7 +1254,7 @@
  3141. MTMSRD(r10) /* disable interrupts */
  3142. CURRENT_THREAD_INFO(r9, r1)
  3143. lwz r9,TI_FLAGS(r9)
  3144. - andi. r0,r9,_TIF_NEED_RESCHED
  3145. + andi. r0,r9,_TIF_NEED_RESCHED_MASK
  3146. bne- do_resched
  3147. andi. r0,r9,_TIF_USER_WORK_MASK
  3148. beq restore_user
  3149. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/entry_64.S linux-3.18.14-rt/arch/powerpc/kernel/entry_64.S
  3150. --- linux-3.18.14.orig/arch/powerpc/kernel/entry_64.S 2015-05-20 10:04:50.000000000 -0500
  3151. +++ linux-3.18.14-rt/arch/powerpc/kernel/entry_64.S 2015-05-31 15:32:46.241635387 -0500
  3152. @@ -644,7 +644,7 @@
  3153. #else
  3154. beq restore
  3155. #endif
  3156. -1: andi. r0,r4,_TIF_NEED_RESCHED
  3157. +1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
  3158. beq 2f
  3159. bl restore_interrupts
  3160. SCHEDULE_USER
  3161. @@ -706,10 +706,18 @@
  3162. #ifdef CONFIG_PREEMPT
  3163. /* Check if we need to preempt */
  3164. + lwz r8,TI_PREEMPT(r9)
  3165. + cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
  3166. + bne restore
  3167. andi. r0,r4,_TIF_NEED_RESCHED
  3168. + bne+ check_count
  3169. +
  3170. + andi. r0,r4,_TIF_NEED_RESCHED_LAZY
  3171. beq+ restore
  3172. + lwz r8,TI_PREEMPT_LAZY(r9)
  3173. +
  3174. /* Check that preempt_count() == 0 and interrupts are enabled */
  3175. - lwz r8,TI_PREEMPT(r9)
  3176. +check_count:
  3177. cmpwi cr1,r8,0
  3178. ld r0,SOFTE(r1)
  3179. cmpdi r0,0
  3180. @@ -726,7 +734,7 @@
  3181. /* Re-test flags and eventually loop */
  3182. CURRENT_THREAD_INFO(r9, r1)
  3183. ld r4,TI_FLAGS(r9)
  3184. - andi. r0,r4,_TIF_NEED_RESCHED
  3185. + andi. r0,r4,_TIF_NEED_RESCHED_MASK
  3186. bne 1b
  3187. /*
  3188. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/irq.c linux-3.18.14-rt/arch/powerpc/kernel/irq.c
  3189. --- linux-3.18.14.orig/arch/powerpc/kernel/irq.c 2015-05-20 10:04:50.000000000 -0500
  3190. +++ linux-3.18.14-rt/arch/powerpc/kernel/irq.c 2015-05-31 15:32:46.245635387 -0500
  3191. @@ -615,6 +615,7 @@
  3192. }
  3193. }
  3194. +#ifndef CONFIG_PREEMPT_RT_FULL
  3195. void do_softirq_own_stack(void)
  3196. {
  3197. struct thread_info *curtp, *irqtp;
  3198. @@ -632,6 +633,7 @@
  3199. if (irqtp->flags)
  3200. set_bits(irqtp->flags, &curtp->flags);
  3201. }
  3202. +#endif
  3203. irq_hw_number_t virq_to_hw(unsigned int virq)
  3204. {
  3205. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/misc_32.S linux-3.18.14-rt/arch/powerpc/kernel/misc_32.S
  3206. --- linux-3.18.14.orig/arch/powerpc/kernel/misc_32.S 2015-05-20 10:04:50.000000000 -0500
  3207. +++ linux-3.18.14-rt/arch/powerpc/kernel/misc_32.S 2015-05-31 15:32:46.261635387 -0500
  3208. @@ -40,6 +40,7 @@
  3209. * We store the saved ksp_limit in the unused part
  3210. * of the STACK_FRAME_OVERHEAD
  3211. */
  3212. +#ifndef CONFIG_PREEMPT_RT_FULL
  3213. _GLOBAL(call_do_softirq)
  3214. mflr r0
  3215. stw r0,4(r1)
  3216. @@ -56,6 +57,7 @@
  3217. stw r10,THREAD+KSP_LIMIT(r2)
  3218. mtlr r0
  3219. blr
  3220. +#endif
  3221. /*
  3222. * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
  3223. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/misc_64.S linux-3.18.14-rt/arch/powerpc/kernel/misc_64.S
  3224. --- linux-3.18.14.orig/arch/powerpc/kernel/misc_64.S 2015-05-20 10:04:50.000000000 -0500
  3225. +++ linux-3.18.14-rt/arch/powerpc/kernel/misc_64.S 2015-05-31 15:32:46.261635387 -0500
  3226. @@ -29,6 +29,7 @@
  3227. .text
  3228. +#ifndef CONFIG_PREEMPT_RT_FULL
  3229. _GLOBAL(call_do_softirq)
  3230. mflr r0
  3231. std r0,16(r1)
  3232. @@ -39,6 +40,7 @@
  3233. ld r0,16(r1)
  3234. mtlr r0
  3235. blr
  3236. +#endif
  3237. _GLOBAL(call_do_irq)
  3238. mflr r0
  3239. diff -Nur linux-3.18.14.orig/arch/powerpc/kernel/time.c linux-3.18.14-rt/arch/powerpc/kernel/time.c
  3240. --- linux-3.18.14.orig/arch/powerpc/kernel/time.c 2015-05-20 10:04:50.000000000 -0500
  3241. +++ linux-3.18.14-rt/arch/powerpc/kernel/time.c 2015-05-31 15:32:46.261635387 -0500
  3242. @@ -424,7 +424,7 @@
  3243. EXPORT_SYMBOL(profile_pc);
  3244. #endif
  3245. -#ifdef CONFIG_IRQ_WORK
  3246. +#if defined(CONFIG_IRQ_WORK)
  3247. /*
  3248. * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  3249. diff -Nur linux-3.18.14.orig/arch/powerpc/kvm/book3s_hv.c linux-3.18.14-rt/arch/powerpc/kvm/book3s_hv.c
  3250. --- linux-3.18.14.orig/arch/powerpc/kvm/book3s_hv.c 2015-05-20 10:04:50.000000000 -0500
  3251. +++ linux-3.18.14-rt/arch/powerpc/kvm/book3s_hv.c 2015-05-31 15:32:46.301635387 -0500
  3252. @@ -84,11 +84,11 @@
  3253. {
  3254. int me;
  3255. int cpu = vcpu->cpu;
  3256. - wait_queue_head_t *wqp;
  3257. + struct swait_head *wqp;
  3258. wqp = kvm_arch_vcpu_wq(vcpu);
  3259. - if (waitqueue_active(wqp)) {
  3260. - wake_up_interruptible(wqp);
  3261. + if (swaitqueue_active(wqp)) {
  3262. + swait_wake_interruptible(wqp);
  3263. ++vcpu->stat.halt_wakeup;
  3264. }
  3265. @@ -639,8 +639,8 @@
  3266. tvcpu->arch.prodded = 1;
  3267. smp_mb();
  3268. if (vcpu->arch.ceded) {
  3269. - if (waitqueue_active(&vcpu->wq)) {
  3270. - wake_up_interruptible(&vcpu->wq);
  3271. + if (swaitqueue_active(&vcpu->wq)) {
  3272. + swait_wake_interruptible(&vcpu->wq);
  3273. vcpu->stat.halt_wakeup++;
  3274. }
  3275. }
  3276. @@ -1357,7 +1357,7 @@
  3277. INIT_LIST_HEAD(&vcore->runnable_threads);
  3278. spin_lock_init(&vcore->lock);
  3279. - init_waitqueue_head(&vcore->wq);
  3280. + init_swait_head(&vcore->wq);
  3281. vcore->preempt_tb = TB_NIL;
  3282. vcore->lpcr = kvm->arch.lpcr;
  3283. vcore->first_vcpuid = core * threads_per_subcore;
  3284. @@ -1826,13 +1826,13 @@
  3285. */
  3286. static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
  3287. {
  3288. - DEFINE_WAIT(wait);
  3289. + DEFINE_SWAITER(wait);
  3290. - prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  3291. + swait_prepare(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  3292. vc->vcore_state = VCORE_SLEEPING;
  3293. spin_unlock(&vc->lock);
  3294. schedule();
  3295. - finish_wait(&vc->wq, &wait);
  3296. + swait_finish(&vc->wq, &wait);
  3297. spin_lock(&vc->lock);
  3298. vc->vcore_state = VCORE_INACTIVE;
  3299. }
  3300. @@ -1873,7 +1873,7 @@
  3301. kvmppc_create_dtl_entry(vcpu, vc);
  3302. kvmppc_start_thread(vcpu);
  3303. } else if (vc->vcore_state == VCORE_SLEEPING) {
  3304. - wake_up(&vc->wq);
  3305. + swait_wake(&vc->wq);
  3306. }
  3307. }
  3308. diff -Nur linux-3.18.14.orig/arch/powerpc/kvm/Kconfig linux-3.18.14-rt/arch/powerpc/kvm/Kconfig
  3309. --- linux-3.18.14.orig/arch/powerpc/kvm/Kconfig 2015-05-20 10:04:50.000000000 -0500
  3310. +++ linux-3.18.14-rt/arch/powerpc/kvm/Kconfig 2015-05-31 15:32:46.281635387 -0500
  3311. @@ -157,6 +157,7 @@
  3312. config KVM_MPIC
  3313. bool "KVM in-kernel MPIC emulation"
  3314. depends on KVM && E500
  3315. + depends on !PREEMPT_RT_FULL
  3316. select HAVE_KVM_IRQCHIP
  3317. select HAVE_KVM_IRQFD
  3318. select HAVE_KVM_IRQ_ROUTING
  3319. diff -Nur linux-3.18.14.orig/arch/powerpc/mm/fault.c linux-3.18.14-rt/arch/powerpc/mm/fault.c
  3320. --- linux-3.18.14.orig/arch/powerpc/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  3321. +++ linux-3.18.14-rt/arch/powerpc/mm/fault.c 2015-05-31 15:32:46.325635386 -0500
  3322. @@ -273,7 +273,7 @@
  3323. if (!arch_irq_disabled_regs(regs))
  3324. local_irq_enable();
  3325. - if (in_atomic() || mm == NULL) {
  3326. + if (in_atomic() || mm == NULL || pagefault_disabled()) {
  3327. if (!user_mode(regs)) {
  3328. rc = SIGSEGV;
  3329. goto bail;
  3330. diff -Nur linux-3.18.14.orig/arch/s390/include/asm/kvm_host.h linux-3.18.14-rt/arch/s390/include/asm/kvm_host.h
  3331. --- linux-3.18.14.orig/arch/s390/include/asm/kvm_host.h 2015-05-20 10:04:50.000000000 -0500
  3332. +++ linux-3.18.14-rt/arch/s390/include/asm/kvm_host.h 2015-05-31 15:32:46.369635386 -0500
  3333. @@ -311,7 +311,7 @@
  3334. struct list_head list;
  3335. atomic_t active;
  3336. struct kvm_s390_float_interrupt *float_int;
  3337. - wait_queue_head_t *wq;
  3338. + struct swait_head *wq;
  3339. atomic_t *cpuflags;
  3340. unsigned int action_bits;
  3341. };
  3342. diff -Nur linux-3.18.14.orig/arch/s390/kvm/interrupt.c linux-3.18.14-rt/arch/s390/kvm/interrupt.c
  3343. --- linux-3.18.14.orig/arch/s390/kvm/interrupt.c 2015-05-20 10:04:50.000000000 -0500
  3344. +++ linux-3.18.14-rt/arch/s390/kvm/interrupt.c 2015-05-31 15:32:46.385635386 -0500
  3345. @@ -620,13 +620,13 @@
  3346. void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  3347. {
  3348. - if (waitqueue_active(&vcpu->wq)) {
  3349. + if (swaitqueue_active(&vcpu->wq)) {
  3350. /*
  3351. * The vcpu gave up the cpu voluntarily, mark it as a good
  3352. * yield-candidate.
  3353. */
  3354. vcpu->preempted = true;
  3355. - wake_up_interruptible(&vcpu->wq);
  3356. + swait_wake_interruptible(&vcpu->wq);
  3357. vcpu->stat.halt_wakeup++;
  3358. }
  3359. }
  3360. @@ -747,7 +747,7 @@
  3361. spin_lock(&li->lock);
  3362. list_add(&inti->list, &li->list);
  3363. atomic_set(&li->active, 1);
  3364. - BUG_ON(waitqueue_active(li->wq));
  3365. + BUG_ON(swaitqueue_active(li->wq));
  3366. spin_unlock(&li->lock);
  3367. return 0;
  3368. }
  3369. @@ -772,7 +772,7 @@
  3370. spin_lock(&li->lock);
  3371. list_add(&inti->list, &li->list);
  3372. atomic_set(&li->active, 1);
  3373. - BUG_ON(waitqueue_active(li->wq));
  3374. + BUG_ON(swaitqueue_active(li->wq));
  3375. spin_unlock(&li->lock);
  3376. return 0;
  3377. }
  3378. diff -Nur linux-3.18.14.orig/arch/s390/kvm/interrupt.c.orig linux-3.18.14-rt/arch/s390/kvm/interrupt.c.orig
  3379. --- linux-3.18.14.orig/arch/s390/kvm/interrupt.c.orig 1969-12-31 18:00:00.000000000 -0600
  3380. +++ linux-3.18.14-rt/arch/s390/kvm/interrupt.c.orig 2015-05-20 10:04:50.000000000 -0500
  3381. @@ -0,0 +1,1541 @@
  3382. +/*
  3383. + * handling kvm guest interrupts
  3384. + *
  3385. + * Copyright IBM Corp. 2008,2014
  3386. + *
  3387. + * This program is free software; you can redistribute it and/or modify
  3388. + * it under the terms of the GNU General Public License (version 2 only)
  3389. + * as published by the Free Software Foundation.
  3390. + *
  3391. + * Author(s): Carsten Otte <cotte@de.ibm.com>
  3392. + */
  3393. +
  3394. +#include <linux/interrupt.h>
  3395. +#include <linux/kvm_host.h>
  3396. +#include <linux/hrtimer.h>
  3397. +#include <linux/mmu_context.h>
  3398. +#include <linux/signal.h>
  3399. +#include <linux/slab.h>
  3400. +#include <linux/vmalloc.h>
  3401. +#include <asm/asm-offsets.h>
  3402. +#include <asm/uaccess.h>
  3403. +#include "kvm-s390.h"
  3404. +#include "gaccess.h"
  3405. +#include "trace-s390.h"
  3406. +
  3407. +#define IOINT_SCHID_MASK 0x0000ffff
  3408. +#define IOINT_SSID_MASK 0x00030000
  3409. +#define IOINT_CSSID_MASK 0x03fc0000
  3410. +#define IOINT_AI_MASK 0x04000000
  3411. +#define PFAULT_INIT 0x0600
  3412. +
  3413. +static int __must_check deliver_ckc_interrupt(struct kvm_vcpu *vcpu);
  3414. +
  3415. +static int is_ioint(u64 type)
  3416. +{
  3417. + return ((type & 0xfffe0000u) != 0xfffe0000u);
  3418. +}
  3419. +
  3420. +int psw_extint_disabled(struct kvm_vcpu *vcpu)
  3421. +{
  3422. + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT);
  3423. +}
  3424. +
  3425. +static int psw_ioint_disabled(struct kvm_vcpu *vcpu)
  3426. +{
  3427. + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO);
  3428. +}
  3429. +
  3430. +static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
  3431. +{
  3432. + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_MCHECK);
  3433. +}
  3434. +
  3435. +static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
  3436. +{
  3437. + if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
  3438. + (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) ||
  3439. + (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT))
  3440. + return 0;
  3441. + return 1;
  3442. +}
  3443. +
  3444. +static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
  3445. +{
  3446. + if (psw_extint_disabled(vcpu) ||
  3447. + !(vcpu->arch.sie_block->gcr[0] & 0x800ul))
  3448. + return 0;
  3449. + if (guestdbg_enabled(vcpu) && guestdbg_sstep_enabled(vcpu))
  3450. + /* No timer interrupts when single stepping */
  3451. + return 0;
  3452. + return 1;
  3453. +}
  3454. +
  3455. +static u64 int_word_to_isc_bits(u32 int_word)
  3456. +{
  3457. + u8 isc = (int_word & 0x38000000) >> 27;
  3458. +
  3459. + return (0x80 >> isc) << 24;
  3460. +}
  3461. +
  3462. +static int __must_check __interrupt_is_deliverable(struct kvm_vcpu *vcpu,
  3463. + struct kvm_s390_interrupt_info *inti)
  3464. +{
  3465. + switch (inti->type) {
  3466. + case KVM_S390_INT_EXTERNAL_CALL:
  3467. + if (psw_extint_disabled(vcpu))
  3468. + return 0;
  3469. + if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
  3470. + return 1;
  3471. + return 0;
  3472. + case KVM_S390_INT_EMERGENCY:
  3473. + if (psw_extint_disabled(vcpu))
  3474. + return 0;
  3475. + if (vcpu->arch.sie_block->gcr[0] & 0x4000ul)
  3476. + return 1;
  3477. + return 0;
  3478. + case KVM_S390_INT_CLOCK_COMP:
  3479. + return ckc_interrupts_enabled(vcpu);
  3480. + case KVM_S390_INT_CPU_TIMER:
  3481. + if (psw_extint_disabled(vcpu))
  3482. + return 0;
  3483. + if (vcpu->arch.sie_block->gcr[0] & 0x400ul)
  3484. + return 1;
  3485. + return 0;
  3486. + case KVM_S390_INT_SERVICE:
  3487. + case KVM_S390_INT_PFAULT_INIT:
  3488. + case KVM_S390_INT_PFAULT_DONE:
  3489. + case KVM_S390_INT_VIRTIO:
  3490. + if (psw_extint_disabled(vcpu))
  3491. + return 0;
  3492. + if (vcpu->arch.sie_block->gcr[0] & 0x200ul)
  3493. + return 1;
  3494. + return 0;
  3495. + case KVM_S390_PROGRAM_INT:
  3496. + case KVM_S390_SIGP_STOP:
  3497. + case KVM_S390_SIGP_SET_PREFIX:
  3498. + case KVM_S390_RESTART:
  3499. + return 1;
  3500. + case KVM_S390_MCHK:
  3501. + if (psw_mchk_disabled(vcpu))
  3502. + return 0;
  3503. + if (vcpu->arch.sie_block->gcr[14] & inti->mchk.cr14)
  3504. + return 1;
  3505. + return 0;
  3506. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  3507. + if (psw_ioint_disabled(vcpu))
  3508. + return 0;
  3509. + if (vcpu->arch.sie_block->gcr[6] &
  3510. + int_word_to_isc_bits(inti->io.io_int_word))
  3511. + return 1;
  3512. + return 0;
  3513. + default:
  3514. + printk(KERN_WARNING "illegal interrupt type %llx\n",
  3515. + inti->type);
  3516. + BUG();
  3517. + }
  3518. + return 0;
  3519. +}
  3520. +
  3521. +static void __set_cpu_idle(struct kvm_vcpu *vcpu)
  3522. +{
  3523. + atomic_set_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
  3524. + set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
  3525. +}
  3526. +
  3527. +static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
  3528. +{
  3529. + atomic_clear_mask(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
  3530. + clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
  3531. +}
  3532. +
  3533. +static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
  3534. +{
  3535. + atomic_clear_mask(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
  3536. + &vcpu->arch.sie_block->cpuflags);
  3537. + vcpu->arch.sie_block->lctl = 0x0000;
  3538. + vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT);
  3539. +
  3540. + if (guestdbg_enabled(vcpu)) {
  3541. + vcpu->arch.sie_block->lctl |= (LCTL_CR0 | LCTL_CR9 |
  3542. + LCTL_CR10 | LCTL_CR11);
  3543. + vcpu->arch.sie_block->ictl |= (ICTL_STCTL | ICTL_PINT);
  3544. + }
  3545. +
  3546. + if (vcpu->arch.local_int.action_bits & ACTION_STOP_ON_STOP)
  3547. + atomic_set_mask(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
  3548. +}
  3549. +
  3550. +static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
  3551. +{
  3552. + atomic_set_mask(flag, &vcpu->arch.sie_block->cpuflags);
  3553. +}
  3554. +
  3555. +static void __set_intercept_indicator(struct kvm_vcpu *vcpu,
  3556. + struct kvm_s390_interrupt_info *inti)
  3557. +{
  3558. + switch (inti->type) {
  3559. + case KVM_S390_INT_EXTERNAL_CALL:
  3560. + case KVM_S390_INT_EMERGENCY:
  3561. + case KVM_S390_INT_SERVICE:
  3562. + case KVM_S390_INT_PFAULT_INIT:
  3563. + case KVM_S390_INT_PFAULT_DONE:
  3564. + case KVM_S390_INT_VIRTIO:
  3565. + case KVM_S390_INT_CLOCK_COMP:
  3566. + case KVM_S390_INT_CPU_TIMER:
  3567. + if (psw_extint_disabled(vcpu))
  3568. + __set_cpuflag(vcpu, CPUSTAT_EXT_INT);
  3569. + else
  3570. + vcpu->arch.sie_block->lctl |= LCTL_CR0;
  3571. + break;
  3572. + case KVM_S390_SIGP_STOP:
  3573. + __set_cpuflag(vcpu, CPUSTAT_STOP_INT);
  3574. + break;
  3575. + case KVM_S390_MCHK:
  3576. + if (psw_mchk_disabled(vcpu))
  3577. + vcpu->arch.sie_block->ictl |= ICTL_LPSW;
  3578. + else
  3579. + vcpu->arch.sie_block->lctl |= LCTL_CR14;
  3580. + break;
  3581. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  3582. + if (psw_ioint_disabled(vcpu))
  3583. + __set_cpuflag(vcpu, CPUSTAT_IO_INT);
  3584. + else
  3585. + vcpu->arch.sie_block->lctl |= LCTL_CR6;
  3586. + break;
  3587. + default:
  3588. + BUG();
  3589. + }
  3590. +}
  3591. +
  3592. +static u16 get_ilc(struct kvm_vcpu *vcpu)
  3593. +{
  3594. + const unsigned short table[] = { 2, 4, 4, 6 };
  3595. +
  3596. + switch (vcpu->arch.sie_block->icptcode) {
  3597. + case ICPT_INST:
  3598. + case ICPT_INSTPROGI:
  3599. + case ICPT_OPEREXC:
  3600. + case ICPT_PARTEXEC:
  3601. + case ICPT_IOINST:
  3602. + /* last instruction only stored for these icptcodes */
  3603. + return table[vcpu->arch.sie_block->ipa >> 14];
  3604. + case ICPT_PROGI:
  3605. + return vcpu->arch.sie_block->pgmilc;
  3606. + default:
  3607. + return 0;
  3608. + }
  3609. +}
  3610. +
  3611. +static int __must_check __deliver_prog_irq(struct kvm_vcpu *vcpu,
  3612. + struct kvm_s390_pgm_info *pgm_info)
  3613. +{
  3614. + int rc = 0;
  3615. + u16 ilc = get_ilc(vcpu);
  3616. +
  3617. + switch (pgm_info->code & ~PGM_PER) {
  3618. + case PGM_AFX_TRANSLATION:
  3619. + case PGM_ASX_TRANSLATION:
  3620. + case PGM_EX_TRANSLATION:
  3621. + case PGM_LFX_TRANSLATION:
  3622. + case PGM_LSTE_SEQUENCE:
  3623. + case PGM_LSX_TRANSLATION:
  3624. + case PGM_LX_TRANSLATION:
  3625. + case PGM_PRIMARY_AUTHORITY:
  3626. + case PGM_SECONDARY_AUTHORITY:
  3627. + case PGM_SPACE_SWITCH:
  3628. + rc = put_guest_lc(vcpu, pgm_info->trans_exc_code,
  3629. + (u64 *)__LC_TRANS_EXC_CODE);
  3630. + break;
  3631. + case PGM_ALEN_TRANSLATION:
  3632. + case PGM_ALE_SEQUENCE:
  3633. + case PGM_ASTE_INSTANCE:
  3634. + case PGM_ASTE_SEQUENCE:
  3635. + case PGM_ASTE_VALIDITY:
  3636. + case PGM_EXTENDED_AUTHORITY:
  3637. + rc = put_guest_lc(vcpu, pgm_info->exc_access_id,
  3638. + (u8 *)__LC_EXC_ACCESS_ID);
  3639. + break;
  3640. + case PGM_ASCE_TYPE:
  3641. + case PGM_PAGE_TRANSLATION:
  3642. + case PGM_REGION_FIRST_TRANS:
  3643. + case PGM_REGION_SECOND_TRANS:
  3644. + case PGM_REGION_THIRD_TRANS:
  3645. + case PGM_SEGMENT_TRANSLATION:
  3646. + rc = put_guest_lc(vcpu, pgm_info->trans_exc_code,
  3647. + (u64 *)__LC_TRANS_EXC_CODE);
  3648. + rc |= put_guest_lc(vcpu, pgm_info->exc_access_id,
  3649. + (u8 *)__LC_EXC_ACCESS_ID);
  3650. + rc |= put_guest_lc(vcpu, pgm_info->op_access_id,
  3651. + (u8 *)__LC_OP_ACCESS_ID);
  3652. + break;
  3653. + case PGM_MONITOR:
  3654. + rc = put_guest_lc(vcpu, pgm_info->mon_class_nr,
  3655. + (u16 *)__LC_MON_CLASS_NR);
  3656. + rc |= put_guest_lc(vcpu, pgm_info->mon_code,
  3657. + (u64 *)__LC_MON_CODE);
  3658. + break;
  3659. + case PGM_DATA:
  3660. + rc = put_guest_lc(vcpu, pgm_info->data_exc_code,
  3661. + (u32 *)__LC_DATA_EXC_CODE);
  3662. + break;
  3663. + case PGM_PROTECTION:
  3664. + rc = put_guest_lc(vcpu, pgm_info->trans_exc_code,
  3665. + (u64 *)__LC_TRANS_EXC_CODE);
  3666. + rc |= put_guest_lc(vcpu, pgm_info->exc_access_id,
  3667. + (u8 *)__LC_EXC_ACCESS_ID);
  3668. + break;
  3669. + }
  3670. +
  3671. + if (pgm_info->code & PGM_PER) {
  3672. + rc |= put_guest_lc(vcpu, pgm_info->per_code,
  3673. + (u8 *) __LC_PER_CODE);
  3674. + rc |= put_guest_lc(vcpu, pgm_info->per_atmid,
  3675. + (u8 *)__LC_PER_ATMID);
  3676. + rc |= put_guest_lc(vcpu, pgm_info->per_address,
  3677. + (u64 *) __LC_PER_ADDRESS);
  3678. + rc |= put_guest_lc(vcpu, pgm_info->per_access_id,
  3679. + (u8 *) __LC_PER_ACCESS_ID);
  3680. + }
  3681. +
  3682. + rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
  3683. + rc |= put_guest_lc(vcpu, pgm_info->code,
  3684. + (u16 *)__LC_PGM_INT_CODE);
  3685. + rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
  3686. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3687. + rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW,
  3688. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3689. +
  3690. + return rc;
  3691. +}
  3692. +
  3693. +static int __must_check __do_deliver_interrupt(struct kvm_vcpu *vcpu,
  3694. + struct kvm_s390_interrupt_info *inti)
  3695. +{
  3696. + const unsigned short table[] = { 2, 4, 4, 6 };
  3697. + int rc = 0;
  3698. +
  3699. + switch (inti->type) {
  3700. + case KVM_S390_INT_EMERGENCY:
  3701. + VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
  3702. + vcpu->stat.deliver_emergency_signal++;
  3703. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3704. + inti->emerg.code, 0);
  3705. + rc = put_guest_lc(vcpu, 0x1201, (u16 *)__LC_EXT_INT_CODE);
  3706. + rc |= put_guest_lc(vcpu, inti->emerg.code,
  3707. + (u16 *)__LC_EXT_CPU_ADDR);
  3708. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3709. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3710. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3711. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3712. + break;
  3713. + case KVM_S390_INT_EXTERNAL_CALL:
  3714. + VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
  3715. + vcpu->stat.deliver_external_call++;
  3716. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3717. + inti->extcall.code, 0);
  3718. + rc = put_guest_lc(vcpu, 0x1202, (u16 *)__LC_EXT_INT_CODE);
  3719. + rc |= put_guest_lc(vcpu, inti->extcall.code,
  3720. + (u16 *)__LC_EXT_CPU_ADDR);
  3721. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3722. + &vcpu->arch.sie_block->gpsw,
  3723. + sizeof(psw_t));
  3724. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3725. + &vcpu->arch.sie_block->gpsw,
  3726. + sizeof(psw_t));
  3727. + break;
  3728. + case KVM_S390_INT_CLOCK_COMP:
  3729. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3730. + inti->ext.ext_params, 0);
  3731. + rc = deliver_ckc_interrupt(vcpu);
  3732. + break;
  3733. + case KVM_S390_INT_CPU_TIMER:
  3734. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3735. + inti->ext.ext_params, 0);
  3736. + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER,
  3737. + (u16 *)__LC_EXT_INT_CODE);
  3738. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3739. + &vcpu->arch.sie_block->gpsw,
  3740. + sizeof(psw_t));
  3741. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3742. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3743. + rc |= put_guest_lc(vcpu, inti->ext.ext_params,
  3744. + (u32 *)__LC_EXT_PARAMS);
  3745. + break;
  3746. + case KVM_S390_INT_SERVICE:
  3747. + VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
  3748. + inti->ext.ext_params);
  3749. + vcpu->stat.deliver_service_signal++;
  3750. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3751. + inti->ext.ext_params, 0);
  3752. + rc = put_guest_lc(vcpu, 0x2401, (u16 *)__LC_EXT_INT_CODE);
  3753. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3754. + &vcpu->arch.sie_block->gpsw,
  3755. + sizeof(psw_t));
  3756. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3757. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3758. + rc |= put_guest_lc(vcpu, inti->ext.ext_params,
  3759. + (u32 *)__LC_EXT_PARAMS);
  3760. + break;
  3761. + case KVM_S390_INT_PFAULT_INIT:
  3762. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
  3763. + inti->ext.ext_params2);
  3764. + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE,
  3765. + (u16 *) __LC_EXT_INT_CODE);
  3766. + rc |= put_guest_lc(vcpu, PFAULT_INIT, (u16 *) __LC_EXT_CPU_ADDR);
  3767. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3768. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3769. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3770. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3771. + rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
  3772. + (u64 *) __LC_EXT_PARAMS2);
  3773. + break;
  3774. + case KVM_S390_INT_PFAULT_DONE:
  3775. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, 0,
  3776. + inti->ext.ext_params2);
  3777. + rc = put_guest_lc(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE);
  3778. + rc |= put_guest_lc(vcpu, 0x0680, (u16 *)__LC_EXT_CPU_ADDR);
  3779. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3780. + &vcpu->arch.sie_block->gpsw,
  3781. + sizeof(psw_t));
  3782. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3783. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3784. + rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
  3785. + (u64 *)__LC_EXT_PARAMS2);
  3786. + break;
  3787. + case KVM_S390_INT_VIRTIO:
  3788. + VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
  3789. + inti->ext.ext_params, inti->ext.ext_params2);
  3790. + vcpu->stat.deliver_virtio_interrupt++;
  3791. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3792. + inti->ext.ext_params,
  3793. + inti->ext.ext_params2);
  3794. + rc = put_guest_lc(vcpu, 0x2603, (u16 *)__LC_EXT_INT_CODE);
  3795. + rc |= put_guest_lc(vcpu, 0x0d00, (u16 *)__LC_EXT_CPU_ADDR);
  3796. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3797. + &vcpu->arch.sie_block->gpsw,
  3798. + sizeof(psw_t));
  3799. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3800. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3801. + rc |= put_guest_lc(vcpu, inti->ext.ext_params,
  3802. + (u32 *)__LC_EXT_PARAMS);
  3803. + rc |= put_guest_lc(vcpu, inti->ext.ext_params2,
  3804. + (u64 *)__LC_EXT_PARAMS2);
  3805. + break;
  3806. + case KVM_S390_SIGP_STOP:
  3807. + VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
  3808. + vcpu->stat.deliver_stop_signal++;
  3809. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3810. + 0, 0);
  3811. + __set_intercept_indicator(vcpu, inti);
  3812. + break;
  3813. +
  3814. + case KVM_S390_SIGP_SET_PREFIX:
  3815. + VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
  3816. + inti->prefix.address);
  3817. + vcpu->stat.deliver_prefix_signal++;
  3818. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3819. + inti->prefix.address, 0);
  3820. + kvm_s390_set_prefix(vcpu, inti->prefix.address);
  3821. + break;
  3822. +
  3823. + case KVM_S390_RESTART:
  3824. + VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
  3825. + vcpu->stat.deliver_restart_signal++;
  3826. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3827. + 0, 0);
  3828. + rc = write_guest_lc(vcpu,
  3829. + offsetof(struct _lowcore, restart_old_psw),
  3830. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3831. + rc |= read_guest_lc(vcpu, offsetof(struct _lowcore, restart_psw),
  3832. + &vcpu->arch.sie_block->gpsw,
  3833. + sizeof(psw_t));
  3834. + break;
  3835. + case KVM_S390_PROGRAM_INT:
  3836. + VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x",
  3837. + inti->pgm.code,
  3838. + table[vcpu->arch.sie_block->ipa >> 14]);
  3839. + vcpu->stat.deliver_program_int++;
  3840. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3841. + inti->pgm.code, 0);
  3842. + rc = __deliver_prog_irq(vcpu, &inti->pgm);
  3843. + break;
  3844. +
  3845. + case KVM_S390_MCHK:
  3846. + VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx",
  3847. + inti->mchk.mcic);
  3848. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3849. + inti->mchk.cr14,
  3850. + inti->mchk.mcic);
  3851. + rc = kvm_s390_vcpu_store_status(vcpu,
  3852. + KVM_S390_STORE_STATUS_PREFIXED);
  3853. + rc |= put_guest_lc(vcpu, inti->mchk.mcic, (u64 *)__LC_MCCK_CODE);
  3854. + rc |= write_guest_lc(vcpu, __LC_MCK_OLD_PSW,
  3855. + &vcpu->arch.sie_block->gpsw,
  3856. + sizeof(psw_t));
  3857. + rc |= read_guest_lc(vcpu, __LC_MCK_NEW_PSW,
  3858. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3859. + break;
  3860. +
  3861. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  3862. + {
  3863. + __u32 param0 = ((__u32)inti->io.subchannel_id << 16) |
  3864. + inti->io.subchannel_nr;
  3865. + __u64 param1 = ((__u64)inti->io.io_int_parm << 32) |
  3866. + inti->io.io_int_word;
  3867. + VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type);
  3868. + vcpu->stat.deliver_io_int++;
  3869. + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
  3870. + param0, param1);
  3871. + rc = put_guest_lc(vcpu, inti->io.subchannel_id,
  3872. + (u16 *)__LC_SUBCHANNEL_ID);
  3873. + rc |= put_guest_lc(vcpu, inti->io.subchannel_nr,
  3874. + (u16 *)__LC_SUBCHANNEL_NR);
  3875. + rc |= put_guest_lc(vcpu, inti->io.io_int_parm,
  3876. + (u32 *)__LC_IO_INT_PARM);
  3877. + rc |= put_guest_lc(vcpu, inti->io.io_int_word,
  3878. + (u32 *)__LC_IO_INT_WORD);
  3879. + rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
  3880. + &vcpu->arch.sie_block->gpsw,
  3881. + sizeof(psw_t));
  3882. + rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
  3883. + &vcpu->arch.sie_block->gpsw,
  3884. + sizeof(psw_t));
  3885. + break;
  3886. + }
  3887. + default:
  3888. + BUG();
  3889. + }
  3890. +
  3891. + return rc;
  3892. +}
  3893. +
  3894. +static int __must_check deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
  3895. +{
  3896. + int rc;
  3897. +
  3898. + rc = put_guest_lc(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE);
  3899. + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW,
  3900. + &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
  3901. + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW,
  3902. + &vcpu->arch.sie_block->gpsw,
  3903. + sizeof(psw_t));
  3904. + return rc;
  3905. +}
  3906. +
  3907. +/* Check whether SIGP interpretation facility has an external call pending */
  3908. +int kvm_s390_si_ext_call_pending(struct kvm_vcpu *vcpu)
  3909. +{
  3910. + atomic_t *sigp_ctrl = &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl;
  3911. +
  3912. + if (!psw_extint_disabled(vcpu) &&
  3913. + (vcpu->arch.sie_block->gcr[0] & 0x2000ul) &&
  3914. + (atomic_read(sigp_ctrl) & SIGP_CTRL_C) &&
  3915. + (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
  3916. + return 1;
  3917. +
  3918. + return 0;
  3919. +}
  3920. +
  3921. +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
  3922. +{
  3923. + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
  3924. + struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
  3925. + struct kvm_s390_interrupt_info *inti;
  3926. + int rc = 0;
  3927. +
  3928. + if (atomic_read(&li->active)) {
  3929. + spin_lock(&li->lock);
  3930. + list_for_each_entry(inti, &li->list, list)
  3931. + if (__interrupt_is_deliverable(vcpu, inti)) {
  3932. + rc = 1;
  3933. + break;
  3934. + }
  3935. + spin_unlock(&li->lock);
  3936. + }
  3937. +
  3938. + if ((!rc) && atomic_read(&fi->active)) {
  3939. + spin_lock(&fi->lock);
  3940. + list_for_each_entry(inti, &fi->list, list)
  3941. + if (__interrupt_is_deliverable(vcpu, inti)) {
  3942. + rc = 1;
  3943. + break;
  3944. + }
  3945. + spin_unlock(&fi->lock);
  3946. + }
  3947. +
  3948. + if (!rc && kvm_cpu_has_pending_timer(vcpu))
  3949. + rc = 1;
  3950. +
  3951. + if (!rc && kvm_s390_si_ext_call_pending(vcpu))
  3952. + rc = 1;
  3953. +
  3954. + return rc;
  3955. +}
  3956. +
  3957. +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
  3958. +{
  3959. + if (!(vcpu->arch.sie_block->ckc <
  3960. + get_tod_clock_fast() + vcpu->arch.sie_block->epoch))
  3961. + return 0;
  3962. + if (!ckc_interrupts_enabled(vcpu))
  3963. + return 0;
  3964. + return 1;
  3965. +}
  3966. +
  3967. +int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
  3968. +{
  3969. + u64 now, sltime;
  3970. +
  3971. + vcpu->stat.exit_wait_state++;
  3972. +
  3973. + /* fast path */
  3974. + if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu))
  3975. + return 0;
  3976. +
  3977. + if (psw_interrupts_disabled(vcpu)) {
  3978. + VCPU_EVENT(vcpu, 3, "%s", "disabled wait");
  3979. + return -EOPNOTSUPP; /* disabled wait */
  3980. + }
  3981. +
  3982. + __set_cpu_idle(vcpu);
  3983. + if (!ckc_interrupts_enabled(vcpu)) {
  3984. + VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
  3985. + goto no_timer;
  3986. + }
  3987. +
  3988. + now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
  3989. + sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
  3990. + hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
  3991. + VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
  3992. +no_timer:
  3993. + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
  3994. + kvm_vcpu_block(vcpu);
  3995. + __unset_cpu_idle(vcpu);
  3996. + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
  3997. +
  3998. + hrtimer_cancel(&vcpu->arch.ckc_timer);
  3999. + return 0;
  4000. +}
  4001. +
  4002. +void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  4003. +{
  4004. + if (waitqueue_active(&vcpu->wq)) {
  4005. + /*
  4006. + * The vcpu gave up the cpu voluntarily, mark it as a good
  4007. + * yield-candidate.
  4008. + */
  4009. + vcpu->preempted = true;
  4010. + wake_up_interruptible(&vcpu->wq);
  4011. + vcpu->stat.halt_wakeup++;
  4012. + }
  4013. +}
  4014. +
  4015. +enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
  4016. +{
  4017. + struct kvm_vcpu *vcpu;
  4018. + u64 now, sltime;
  4019. +
  4020. + vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
  4021. + now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
  4022. + sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
  4023. +
  4024. + /*
  4025. + * If the monotonic clock runs faster than the tod clock we might be
  4026. + * woken up too early and have to go back to sleep to avoid deadlocks.
  4027. + */
  4028. + if (vcpu->arch.sie_block->ckc > now &&
  4029. + hrtimer_forward_now(timer, ns_to_ktime(sltime)))
  4030. + return HRTIMER_RESTART;
  4031. + kvm_s390_vcpu_wakeup(vcpu);
  4032. + return HRTIMER_NORESTART;
  4033. +}
  4034. +
  4035. +void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu)
  4036. +{
  4037. + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
  4038. + struct kvm_s390_interrupt_info *n, *inti = NULL;
  4039. +
  4040. + spin_lock(&li->lock);
  4041. + list_for_each_entry_safe(inti, n, &li->list, list) {
  4042. + list_del(&inti->list);
  4043. + kfree(inti);
  4044. + }
  4045. + atomic_set(&li->active, 0);
  4046. + spin_unlock(&li->lock);
  4047. +
  4048. + /* clear pending external calls set by sigp interpretation facility */
  4049. + atomic_clear_mask(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
  4050. + atomic_clear_mask(SIGP_CTRL_C,
  4051. + &vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].ctrl);
  4052. +}
  4053. +
  4054. +int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
  4055. +{
  4056. + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
  4057. + struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int;
  4058. + struct kvm_s390_interrupt_info *n, *inti = NULL;
  4059. + int deliver;
  4060. + int rc = 0;
  4061. +
  4062. + __reset_intercept_indicators(vcpu);
  4063. + if (atomic_read(&li->active)) {
  4064. + do {
  4065. + deliver = 0;
  4066. + spin_lock(&li->lock);
  4067. + list_for_each_entry_safe(inti, n, &li->list, list) {
  4068. + if (__interrupt_is_deliverable(vcpu, inti)) {
  4069. + list_del(&inti->list);
  4070. + deliver = 1;
  4071. + break;
  4072. + }
  4073. + __set_intercept_indicator(vcpu, inti);
  4074. + }
  4075. + if (list_empty(&li->list))
  4076. + atomic_set(&li->active, 0);
  4077. + spin_unlock(&li->lock);
  4078. + if (deliver) {
  4079. + rc = __do_deliver_interrupt(vcpu, inti);
  4080. + kfree(inti);
  4081. + }
  4082. + } while (!rc && deliver);
  4083. + }
  4084. +
  4085. + if (!rc && kvm_cpu_has_pending_timer(vcpu))
  4086. + rc = deliver_ckc_interrupt(vcpu);
  4087. +
  4088. + if (!rc && atomic_read(&fi->active)) {
  4089. + do {
  4090. + deliver = 0;
  4091. + spin_lock(&fi->lock);
  4092. + list_for_each_entry_safe(inti, n, &fi->list, list) {
  4093. + if (__interrupt_is_deliverable(vcpu, inti)) {
  4094. + list_del(&inti->list);
  4095. + fi->irq_count--;
  4096. + deliver = 1;
  4097. + break;
  4098. + }
  4099. + __set_intercept_indicator(vcpu, inti);
  4100. + }
  4101. + if (list_empty(&fi->list))
  4102. + atomic_set(&fi->active, 0);
  4103. + spin_unlock(&fi->lock);
  4104. + if (deliver) {
  4105. + rc = __do_deliver_interrupt(vcpu, inti);
  4106. + kfree(inti);
  4107. + }
  4108. + } while (!rc && deliver);
  4109. + }
  4110. +
  4111. + return rc;
  4112. +}
  4113. +
  4114. +int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
  4115. +{
  4116. + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
  4117. + struct kvm_s390_interrupt_info *inti;
  4118. +
  4119. + inti = kzalloc(sizeof(*inti), GFP_KERNEL);
  4120. + if (!inti)
  4121. + return -ENOMEM;
  4122. +
  4123. + inti->type = KVM_S390_PROGRAM_INT;
  4124. + inti->pgm.code = code;
  4125. +
  4126. + VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
  4127. + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1);
  4128. + spin_lock(&li->lock);
  4129. + list_add(&inti->list, &li->list);
  4130. + atomic_set(&li->active, 1);
  4131. + BUG_ON(waitqueue_active(li->wq));
  4132. + spin_unlock(&li->lock);
  4133. + return 0;
  4134. +}
  4135. +
  4136. +int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
  4137. + struct kvm_s390_pgm_info *pgm_info)
  4138. +{
  4139. + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
  4140. + struct kvm_s390_interrupt_info *inti;
  4141. +
  4142. + inti = kzalloc(sizeof(*inti), GFP_KERNEL);
  4143. + if (!inti)
  4144. + return -ENOMEM;
  4145. +
  4146. + VCPU_EVENT(vcpu, 3, "inject: prog irq %d (from kernel)",
  4147. + pgm_info->code);
  4148. + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
  4149. + pgm_info->code, 0, 1);
  4150. +
  4151. + inti->type = KVM_S390_PROGRAM_INT;
  4152. + memcpy(&inti->pgm, pgm_info, sizeof(inti->pgm));
  4153. + spin_lock(&li->lock);
  4154. + list_add(&inti->list, &li->list);
  4155. + atomic_set(&li->active, 1);
  4156. + BUG_ON(waitqueue_active(li->wq));
  4157. + spin_unlock(&li->lock);
  4158. + return 0;
  4159. +}
  4160. +
  4161. +struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
  4162. + u64 cr6, u64 schid)
  4163. +{
  4164. + struct kvm_s390_float_interrupt *fi;
  4165. + struct kvm_s390_interrupt_info *inti, *iter;
  4166. +
  4167. + if ((!schid && !cr6) || (schid && cr6))
  4168. + return NULL;
  4169. + fi = &kvm->arch.float_int;
  4170. + spin_lock(&fi->lock);
  4171. + inti = NULL;
  4172. + list_for_each_entry(iter, &fi->list, list) {
  4173. + if (!is_ioint(iter->type))
  4174. + continue;
  4175. + if (cr6 &&
  4176. + ((cr6 & int_word_to_isc_bits(iter->io.io_int_word)) == 0))
  4177. + continue;
  4178. + if (schid) {
  4179. + if (((schid & 0x00000000ffff0000) >> 16) !=
  4180. + iter->io.subchannel_id)
  4181. + continue;
  4182. + if ((schid & 0x000000000000ffff) !=
  4183. + iter->io.subchannel_nr)
  4184. + continue;
  4185. + }
  4186. + inti = iter;
  4187. + break;
  4188. + }
  4189. + if (inti) {
  4190. + list_del_init(&inti->list);
  4191. + fi->irq_count--;
  4192. + }
  4193. + if (list_empty(&fi->list))
  4194. + atomic_set(&fi->active, 0);
  4195. + spin_unlock(&fi->lock);
  4196. + return inti;
  4197. +}
  4198. +
  4199. +static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
  4200. +{
  4201. + struct kvm_s390_local_interrupt *li;
  4202. + struct kvm_s390_float_interrupt *fi;
  4203. + struct kvm_s390_interrupt_info *iter;
  4204. + struct kvm_vcpu *dst_vcpu = NULL;
  4205. + int sigcpu;
  4206. + int rc = 0;
  4207. +
  4208. + fi = &kvm->arch.float_int;
  4209. + spin_lock(&fi->lock);
  4210. + if (fi->irq_count >= KVM_S390_MAX_FLOAT_IRQS) {
  4211. + rc = -EINVAL;
  4212. + goto unlock_fi;
  4213. + }
  4214. + fi->irq_count++;
  4215. + if (!is_ioint(inti->type)) {
  4216. + list_add_tail(&inti->list, &fi->list);
  4217. + } else {
  4218. + u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
  4219. +
  4220. + /* Keep I/O interrupts sorted in isc order. */
  4221. + list_for_each_entry(iter, &fi->list, list) {
  4222. + if (!is_ioint(iter->type))
  4223. + continue;
  4224. + if (int_word_to_isc_bits(iter->io.io_int_word)
  4225. + <= isc_bits)
  4226. + continue;
  4227. + break;
  4228. + }
  4229. + list_add_tail(&inti->list, &iter->list);
  4230. + }
  4231. + atomic_set(&fi->active, 1);
  4232. + if (atomic_read(&kvm->online_vcpus) == 0)
  4233. + goto unlock_fi;
  4234. + sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
  4235. + if (sigcpu == KVM_MAX_VCPUS) {
  4236. + do {
  4237. + sigcpu = fi->next_rr_cpu++;
  4238. + if (sigcpu == KVM_MAX_VCPUS)
  4239. + sigcpu = fi->next_rr_cpu = 0;
  4240. + } while (kvm_get_vcpu(kvm, sigcpu) == NULL);
  4241. + }
  4242. + dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
  4243. + li = &dst_vcpu->arch.local_int;
  4244. + spin_lock(&li->lock);
  4245. + atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
  4246. + spin_unlock(&li->lock);
  4247. + kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
  4248. +unlock_fi:
  4249. + spin_unlock(&fi->lock);
  4250. + return rc;
  4251. +}
  4252. +
  4253. +int kvm_s390_inject_vm(struct kvm *kvm,
  4254. + struct kvm_s390_interrupt *s390int)
  4255. +{
  4256. + struct kvm_s390_interrupt_info *inti;
  4257. + int rc;
  4258. +
  4259. + inti = kzalloc(sizeof(*inti), GFP_KERNEL);
  4260. + if (!inti)
  4261. + return -ENOMEM;
  4262. +
  4263. + inti->type = s390int->type;
  4264. + switch (inti->type) {
  4265. + case KVM_S390_INT_VIRTIO:
  4266. + VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
  4267. + s390int->parm, s390int->parm64);
  4268. + inti->ext.ext_params = s390int->parm;
  4269. + inti->ext.ext_params2 = s390int->parm64;
  4270. + break;
  4271. + case KVM_S390_INT_SERVICE:
  4272. + VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
  4273. + inti->ext.ext_params = s390int->parm;
  4274. + break;
  4275. + case KVM_S390_INT_PFAULT_DONE:
  4276. + inti->type = s390int->type;
  4277. + inti->ext.ext_params2 = s390int->parm64;
  4278. + break;
  4279. + case KVM_S390_MCHK:
  4280. + VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
  4281. + s390int->parm64);
  4282. + inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
  4283. + inti->mchk.mcic = s390int->parm64;
  4284. + break;
  4285. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  4286. + if (inti->type & IOINT_AI_MASK)
  4287. + VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
  4288. + else
  4289. + VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
  4290. + s390int->type & IOINT_CSSID_MASK,
  4291. + s390int->type & IOINT_SSID_MASK,
  4292. + s390int->type & IOINT_SCHID_MASK);
  4293. + inti->io.subchannel_id = s390int->parm >> 16;
  4294. + inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
  4295. + inti->io.io_int_parm = s390int->parm64 >> 32;
  4296. + inti->io.io_int_word = s390int->parm64 & 0x00000000ffffffffull;
  4297. + break;
  4298. + default:
  4299. + kfree(inti);
  4300. + return -EINVAL;
  4301. + }
  4302. + trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
  4303. + 2);
  4304. +
  4305. + rc = __inject_vm(kvm, inti);
  4306. + if (rc)
  4307. + kfree(inti);
  4308. + return rc;
  4309. +}
  4310. +
  4311. +int kvm_s390_reinject_io_int(struct kvm *kvm,
  4312. + struct kvm_s390_interrupt_info *inti)
  4313. +{
  4314. + return __inject_vm(kvm, inti);
  4315. +}
  4316. +
  4317. +int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
  4318. + struct kvm_s390_interrupt *s390int)
  4319. +{
  4320. + struct kvm_s390_local_interrupt *li;
  4321. + struct kvm_s390_interrupt_info *inti;
  4322. +
  4323. + inti = kzalloc(sizeof(*inti), GFP_KERNEL);
  4324. + if (!inti)
  4325. + return -ENOMEM;
  4326. +
  4327. + switch (s390int->type) {
  4328. + case KVM_S390_PROGRAM_INT:
  4329. + if (s390int->parm & 0xffff0000) {
  4330. + kfree(inti);
  4331. + return -EINVAL;
  4332. + }
  4333. + inti->type = s390int->type;
  4334. + inti->pgm.code = s390int->parm;
  4335. + VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)",
  4336. + s390int->parm);
  4337. + break;
  4338. + case KVM_S390_SIGP_SET_PREFIX:
  4339. + inti->prefix.address = s390int->parm;
  4340. + inti->type = s390int->type;
  4341. + VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)",
  4342. + s390int->parm);
  4343. + break;
  4344. + case KVM_S390_SIGP_STOP:
  4345. + case KVM_S390_RESTART:
  4346. + case KVM_S390_INT_CLOCK_COMP:
  4347. + case KVM_S390_INT_CPU_TIMER:
  4348. + VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
  4349. + inti->type = s390int->type;
  4350. + break;
  4351. + case KVM_S390_INT_EXTERNAL_CALL:
  4352. + if (s390int->parm & 0xffff0000) {
  4353. + kfree(inti);
  4354. + return -EINVAL;
  4355. + }
  4356. + VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
  4357. + s390int->parm);
  4358. + inti->type = s390int->type;
  4359. + inti->extcall.code = s390int->parm;
  4360. + break;
  4361. + case KVM_S390_INT_EMERGENCY:
  4362. + if (s390int->parm & 0xffff0000) {
  4363. + kfree(inti);
  4364. + return -EINVAL;
  4365. + }
  4366. + VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm);
  4367. + inti->type = s390int->type;
  4368. + inti->emerg.code = s390int->parm;
  4369. + break;
  4370. + case KVM_S390_MCHK:
  4371. + VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx",
  4372. + s390int->parm64);
  4373. + inti->type = s390int->type;
  4374. + inti->mchk.mcic = s390int->parm64;
  4375. + break;
  4376. + case KVM_S390_INT_PFAULT_INIT:
  4377. + inti->type = s390int->type;
  4378. + inti->ext.ext_params2 = s390int->parm64;
  4379. + break;
  4380. + case KVM_S390_INT_VIRTIO:
  4381. + case KVM_S390_INT_SERVICE:
  4382. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  4383. + default:
  4384. + kfree(inti);
  4385. + return -EINVAL;
  4386. + }
  4387. + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm,
  4388. + s390int->parm64, 2);
  4389. +
  4390. + li = &vcpu->arch.local_int;
  4391. + spin_lock(&li->lock);
  4392. + if (inti->type == KVM_S390_PROGRAM_INT)
  4393. + list_add(&inti->list, &li->list);
  4394. + else
  4395. + list_add_tail(&inti->list, &li->list);
  4396. + atomic_set(&li->active, 1);
  4397. + if (inti->type == KVM_S390_SIGP_STOP)
  4398. + li->action_bits |= ACTION_STOP_ON_STOP;
  4399. + atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
  4400. + spin_unlock(&li->lock);
  4401. + kvm_s390_vcpu_wakeup(vcpu);
  4402. + return 0;
  4403. +}
  4404. +
  4405. +void kvm_s390_clear_float_irqs(struct kvm *kvm)
  4406. +{
  4407. + struct kvm_s390_float_interrupt *fi;
  4408. + struct kvm_s390_interrupt_info *n, *inti = NULL;
  4409. +
  4410. + fi = &kvm->arch.float_int;
  4411. + spin_lock(&fi->lock);
  4412. + list_for_each_entry_safe(inti, n, &fi->list, list) {
  4413. + list_del(&inti->list);
  4414. + kfree(inti);
  4415. + }
  4416. + fi->irq_count = 0;
  4417. + atomic_set(&fi->active, 0);
  4418. + spin_unlock(&fi->lock);
  4419. +}
  4420. +
  4421. +static void inti_to_irq(struct kvm_s390_interrupt_info *inti,
  4422. + struct kvm_s390_irq *irq)
  4423. +{
  4424. + irq->type = inti->type;
  4425. + switch (inti->type) {
  4426. + case KVM_S390_INT_PFAULT_INIT:
  4427. + case KVM_S390_INT_PFAULT_DONE:
  4428. + case KVM_S390_INT_VIRTIO:
  4429. + case KVM_S390_INT_SERVICE:
  4430. + irq->u.ext = inti->ext;
  4431. + break;
  4432. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  4433. + irq->u.io = inti->io;
  4434. + break;
  4435. + case KVM_S390_MCHK:
  4436. + irq->u.mchk = inti->mchk;
  4437. + break;
  4438. + }
  4439. +}
  4440. +
  4441. +static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
  4442. +{
  4443. + struct kvm_s390_interrupt_info *inti;
  4444. + struct kvm_s390_float_interrupt *fi;
  4445. + struct kvm_s390_irq *buf;
  4446. + int max_irqs;
  4447. + int ret = 0;
  4448. + int n = 0;
  4449. +
  4450. + if (len > KVM_S390_FLIC_MAX_BUFFER || len == 0)
  4451. + return -EINVAL;
  4452. +
  4453. + /*
  4454. + * We are already using -ENOMEM to signal
  4455. + * userspace it may retry with a bigger buffer,
  4456. + * so we need to use something else for this case
  4457. + */
  4458. + buf = vzalloc(len);
  4459. + if (!buf)
  4460. + return -ENOBUFS;
  4461. +
  4462. + max_irqs = len / sizeof(struct kvm_s390_irq);
  4463. +
  4464. + fi = &kvm->arch.float_int;
  4465. + spin_lock(&fi->lock);
  4466. + list_for_each_entry(inti, &fi->list, list) {
  4467. + if (n == max_irqs) {
  4468. + /* signal userspace to try again */
  4469. + ret = -ENOMEM;
  4470. + break;
  4471. + }
  4472. + inti_to_irq(inti, &buf[n]);
  4473. + n++;
  4474. + }
  4475. + spin_unlock(&fi->lock);
  4476. + if (!ret && n > 0) {
  4477. + if (copy_to_user(usrbuf, buf, sizeof(struct kvm_s390_irq) * n))
  4478. + ret = -EFAULT;
  4479. + }
  4480. + vfree(buf);
  4481. +
  4482. + return ret < 0 ? ret : n;
  4483. +}
  4484. +
  4485. +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
  4486. +{
  4487. + int r;
  4488. +
  4489. + switch (attr->group) {
  4490. + case KVM_DEV_FLIC_GET_ALL_IRQS:
  4491. + r = get_all_floating_irqs(dev->kvm, (u8 __user *) attr->addr,
  4492. + attr->attr);
  4493. + break;
  4494. + default:
  4495. + r = -EINVAL;
  4496. + }
  4497. +
  4498. + return r;
  4499. +}
  4500. +
  4501. +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
  4502. + u64 addr)
  4503. +{
  4504. + struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
  4505. + void *target = NULL;
  4506. + void __user *source;
  4507. + u64 size;
  4508. +
  4509. + if (get_user(inti->type, (u64 __user *)addr))
  4510. + return -EFAULT;
  4511. +
  4512. + switch (inti->type) {
  4513. + case KVM_S390_INT_PFAULT_INIT:
  4514. + case KVM_S390_INT_PFAULT_DONE:
  4515. + case KVM_S390_INT_VIRTIO:
  4516. + case KVM_S390_INT_SERVICE:
  4517. + target = (void *) &inti->ext;
  4518. + source = &uptr->u.ext;
  4519. + size = sizeof(inti->ext);
  4520. + break;
  4521. + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
  4522. + target = (void *) &inti->io;
  4523. + source = &uptr->u.io;
  4524. + size = sizeof(inti->io);
  4525. + break;
  4526. + case KVM_S390_MCHK:
  4527. + target = (void *) &inti->mchk;
  4528. + source = &uptr->u.mchk;
  4529. + size = sizeof(inti->mchk);
  4530. + break;
  4531. + default:
  4532. + return -EINVAL;
  4533. + }
  4534. +
  4535. + if (copy_from_user(target, source, size))
  4536. + return -EFAULT;
  4537. +
  4538. + return 0;
  4539. +}
  4540. +
  4541. +static int enqueue_floating_irq(struct kvm_device *dev,
  4542. + struct kvm_device_attr *attr)
  4543. +{
  4544. + struct kvm_s390_interrupt_info *inti = NULL;
  4545. + int r = 0;
  4546. + int len = attr->attr;
  4547. +
  4548. + if (len % sizeof(struct kvm_s390_irq) != 0)
  4549. + return -EINVAL;
  4550. + else if (len > KVM_S390_FLIC_MAX_BUFFER)
  4551. + return -EINVAL;
  4552. +
  4553. + while (len >= sizeof(struct kvm_s390_irq)) {
  4554. + inti = kzalloc(sizeof(*inti), GFP_KERNEL);
  4555. + if (!inti)
  4556. + return -ENOMEM;
  4557. +
  4558. + r = copy_irq_from_user(inti, attr->addr);
  4559. + if (r) {
  4560. + kfree(inti);
  4561. + return r;
  4562. + }
  4563. + r = __inject_vm(dev->kvm, inti);
  4564. + if (r) {
  4565. + kfree(inti);
  4566. + return r;
  4567. + }
  4568. + len -= sizeof(struct kvm_s390_irq);
  4569. + attr->addr += sizeof(struct kvm_s390_irq);
  4570. + }
  4571. +
  4572. + return r;
  4573. +}
  4574. +
  4575. +static struct s390_io_adapter *get_io_adapter(struct kvm *kvm, unsigned int id)
  4576. +{
  4577. + if (id >= MAX_S390_IO_ADAPTERS)
  4578. + return NULL;
  4579. + return kvm->arch.adapters[id];
  4580. +}
  4581. +
  4582. +static int register_io_adapter(struct kvm_device *dev,
  4583. + struct kvm_device_attr *attr)
  4584. +{
  4585. + struct s390_io_adapter *adapter;
  4586. + struct kvm_s390_io_adapter adapter_info;
  4587. +
  4588. + if (copy_from_user(&adapter_info,
  4589. + (void __user *)attr->addr, sizeof(adapter_info)))
  4590. + return -EFAULT;
  4591. +
  4592. + if ((adapter_info.id >= MAX_S390_IO_ADAPTERS) ||
  4593. + (dev->kvm->arch.adapters[adapter_info.id] != NULL))
  4594. + return -EINVAL;
  4595. +
  4596. + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
  4597. + if (!adapter)
  4598. + return -ENOMEM;
  4599. +
  4600. + INIT_LIST_HEAD(&adapter->maps);
  4601. + init_rwsem(&adapter->maps_lock);
  4602. + atomic_set(&adapter->nr_maps, 0);
  4603. + adapter->id = adapter_info.id;
  4604. + adapter->isc = adapter_info.isc;
  4605. + adapter->maskable = adapter_info.maskable;
  4606. + adapter->masked = false;
  4607. + adapter->swap = adapter_info.swap;
  4608. + dev->kvm->arch.adapters[adapter->id] = adapter;
  4609. +
  4610. + return 0;
  4611. +}
  4612. +
  4613. +int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked)
  4614. +{
  4615. + int ret;
  4616. + struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
  4617. +
  4618. + if (!adapter || !adapter->maskable)
  4619. + return -EINVAL;
  4620. + ret = adapter->masked;
  4621. + adapter->masked = masked;
  4622. + return ret;
  4623. +}
  4624. +
  4625. +static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
  4626. +{
  4627. + struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
  4628. + struct s390_map_info *map;
  4629. + int ret;
  4630. +
  4631. + if (!adapter || !addr)
  4632. + return -EINVAL;
  4633. +
  4634. + map = kzalloc(sizeof(*map), GFP_KERNEL);
  4635. + if (!map) {
  4636. + ret = -ENOMEM;
  4637. + goto out;
  4638. + }
  4639. + INIT_LIST_HEAD(&map->list);
  4640. + map->guest_addr = addr;
  4641. + map->addr = gmap_translate(kvm->arch.gmap, addr);
  4642. + if (map->addr == -EFAULT) {
  4643. + ret = -EFAULT;
  4644. + goto out;
  4645. + }
  4646. + ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
  4647. + if (ret < 0)
  4648. + goto out;
  4649. + BUG_ON(ret != 1);
  4650. + down_write(&adapter->maps_lock);
  4651. + if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) {
  4652. + list_add_tail(&map->list, &adapter->maps);
  4653. + ret = 0;
  4654. + } else {
  4655. + put_page(map->page);
  4656. + ret = -EINVAL;
  4657. + }
  4658. + up_write(&adapter->maps_lock);
  4659. +out:
  4660. + if (ret)
  4661. + kfree(map);
  4662. + return ret;
  4663. +}
  4664. +
  4665. +static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr)
  4666. +{
  4667. + struct s390_io_adapter *adapter = get_io_adapter(kvm, id);
  4668. + struct s390_map_info *map, *tmp;
  4669. + int found = 0;
  4670. +
  4671. + if (!adapter || !addr)
  4672. + return -EINVAL;
  4673. +
  4674. + down_write(&adapter->maps_lock);
  4675. + list_for_each_entry_safe(map, tmp, &adapter->maps, list) {
  4676. + if (map->guest_addr == addr) {
  4677. + found = 1;
  4678. + atomic_dec(&adapter->nr_maps);
  4679. + list_del(&map->list);
  4680. + put_page(map->page);
  4681. + kfree(map);
  4682. + break;
  4683. + }
  4684. + }
  4685. + up_write(&adapter->maps_lock);
  4686. +
  4687. + return found ? 0 : -EINVAL;
  4688. +}
  4689. +
  4690. +void kvm_s390_destroy_adapters(struct kvm *kvm)
  4691. +{
  4692. + int i;
  4693. + struct s390_map_info *map, *tmp;
  4694. +
  4695. + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) {
  4696. + if (!kvm->arch.adapters[i])
  4697. + continue;
  4698. + list_for_each_entry_safe(map, tmp,
  4699. + &kvm->arch.adapters[i]->maps, list) {
  4700. + list_del(&map->list);
  4701. + put_page(map->page);
  4702. + kfree(map);
  4703. + }
  4704. + kfree(kvm->arch.adapters[i]);
  4705. + }
  4706. +}
  4707. +
  4708. +static int modify_io_adapter(struct kvm_device *dev,
  4709. + struct kvm_device_attr *attr)
  4710. +{
  4711. + struct kvm_s390_io_adapter_req req;
  4712. + struct s390_io_adapter *adapter;
  4713. + int ret;
  4714. +
  4715. + if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
  4716. + return -EFAULT;
  4717. +
  4718. + adapter = get_io_adapter(dev->kvm, req.id);
  4719. + if (!adapter)
  4720. + return -EINVAL;
  4721. + switch (req.type) {
  4722. + case KVM_S390_IO_ADAPTER_MASK:
  4723. + ret = kvm_s390_mask_adapter(dev->kvm, req.id, req.mask);
  4724. + if (ret > 0)
  4725. + ret = 0;
  4726. + break;
  4727. + case KVM_S390_IO_ADAPTER_MAP:
  4728. + ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr);
  4729. + break;
  4730. + case KVM_S390_IO_ADAPTER_UNMAP:
  4731. + ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr);
  4732. + break;
  4733. + default:
  4734. + ret = -EINVAL;
  4735. + }
  4736. +
  4737. + return ret;
  4738. +}
  4739. +
  4740. +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
  4741. +{
  4742. + int r = 0;
  4743. + unsigned int i;
  4744. + struct kvm_vcpu *vcpu;
  4745. +
  4746. + switch (attr->group) {
  4747. + case KVM_DEV_FLIC_ENQUEUE:
  4748. + r = enqueue_floating_irq(dev, attr);
  4749. + break;
  4750. + case KVM_DEV_FLIC_CLEAR_IRQS:
  4751. + kvm_s390_clear_float_irqs(dev->kvm);
  4752. + break;
  4753. + case KVM_DEV_FLIC_APF_ENABLE:
  4754. + dev->kvm->arch.gmap->pfault_enabled = 1;
  4755. + break;
  4756. + case KVM_DEV_FLIC_APF_DISABLE_WAIT:
  4757. + dev->kvm->arch.gmap->pfault_enabled = 0;
  4758. + /*
  4759. + * Make sure no async faults are in transition when
  4760. + * clearing the queues. So we don't need to worry
  4761. + * about late coming workers.
  4762. + */
  4763. + synchronize_srcu(&dev->kvm->srcu);
  4764. + kvm_for_each_vcpu(i, vcpu, dev->kvm)
  4765. + kvm_clear_async_pf_completion_queue(vcpu);
  4766. + break;
  4767. + case KVM_DEV_FLIC_ADAPTER_REGISTER:
  4768. + r = register_io_adapter(dev, attr);
  4769. + break;
  4770. + case KVM_DEV_FLIC_ADAPTER_MODIFY:
  4771. + r = modify_io_adapter(dev, attr);
  4772. + break;
  4773. + default:
  4774. + r = -EINVAL;
  4775. + }
  4776. +
  4777. + return r;
  4778. +}
  4779. +
  4780. +static int flic_create(struct kvm_device *dev, u32 type)
  4781. +{
  4782. + if (!dev)
  4783. + return -EINVAL;
  4784. + if (dev->kvm->arch.flic)
  4785. + return -EINVAL;
  4786. + dev->kvm->arch.flic = dev;
  4787. + return 0;
  4788. +}
  4789. +
  4790. +static void flic_destroy(struct kvm_device *dev)
  4791. +{
  4792. + dev->kvm->arch.flic = NULL;
  4793. + kfree(dev);
  4794. +}
  4795. +
  4796. +/* s390 floating irq controller (flic) */
  4797. +struct kvm_device_ops kvm_flic_ops = {
  4798. + .name = "kvm-flic",
  4799. + .get_attr = flic_get_attr,
  4800. + .set_attr = flic_set_attr,
  4801. + .create = flic_create,
  4802. + .destroy = flic_destroy,
  4803. +};
  4804. +
  4805. +static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
  4806. +{
  4807. + unsigned long bit;
  4808. +
  4809. + bit = bit_nr + (addr % PAGE_SIZE) * 8;
  4810. +
  4811. + return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit;
  4812. +}
  4813. +
  4814. +static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter,
  4815. + u64 addr)
  4816. +{
  4817. + struct s390_map_info *map;
  4818. +
  4819. + if (!adapter)
  4820. + return NULL;
  4821. +
  4822. + list_for_each_entry(map, &adapter->maps, list) {
  4823. + if (map->guest_addr == addr)
  4824. + return map;
  4825. + }
  4826. + return NULL;
  4827. +}
  4828. +
  4829. +static int adapter_indicators_set(struct kvm *kvm,
  4830. + struct s390_io_adapter *adapter,
  4831. + struct kvm_s390_adapter_int *adapter_int)
  4832. +{
  4833. + unsigned long bit;
  4834. + int summary_set, idx;
  4835. + struct s390_map_info *info;
  4836. + void *map;
  4837. +
  4838. + info = get_map_info(adapter, adapter_int->ind_addr);
  4839. + if (!info)
  4840. + return -1;
  4841. + map = page_address(info->page);
  4842. + bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap);
  4843. + set_bit(bit, map);
  4844. + idx = srcu_read_lock(&kvm->srcu);
  4845. + mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
  4846. + set_page_dirty_lock(info->page);
  4847. + info = get_map_info(adapter, adapter_int->summary_addr);
  4848. + if (!info) {
  4849. + srcu_read_unlock(&kvm->srcu, idx);
  4850. + return -1;
  4851. + }
  4852. + map = page_address(info->page);
  4853. + bit = get_ind_bit(info->addr, adapter_int->summary_offset,
  4854. + adapter->swap);
  4855. + summary_set = test_and_set_bit(bit, map);
  4856. + mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT);
  4857. + set_page_dirty_lock(info->page);
  4858. + srcu_read_unlock(&kvm->srcu, idx);
  4859. + return summary_set ? 0 : 1;
  4860. +}
  4861. +
  4862. +/*
  4863. + * < 0 - not injected due to error
  4864. + * = 0 - coalesced, summary indicator already active
  4865. + * > 0 - injected interrupt
  4866. + */
  4867. +static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
  4868. + struct kvm *kvm, int irq_source_id, int level,
  4869. + bool line_status)
  4870. +{
  4871. + int ret;
  4872. + struct s390_io_adapter *adapter;
  4873. +
  4874. + /* We're only interested in the 0->1 transition. */
  4875. + if (!level)
  4876. + return 0;
  4877. + adapter = get_io_adapter(kvm, e->adapter.adapter_id);
  4878. + if (!adapter)
  4879. + return -1;
  4880. + down_read(&adapter->maps_lock);
  4881. + ret = adapter_indicators_set(kvm, adapter, &e->adapter);
  4882. + up_read(&adapter->maps_lock);
  4883. + if ((ret > 0) && !adapter->masked) {
  4884. + struct kvm_s390_interrupt s390int = {
  4885. + .type = KVM_S390_INT_IO(1, 0, 0, 0),
  4886. + .parm = 0,
  4887. + .parm64 = (adapter->isc << 27) | 0x80000000,
  4888. + };
  4889. + ret = kvm_s390_inject_vm(kvm, &s390int);
  4890. + if (ret == 0)
  4891. + ret = 1;
  4892. + }
  4893. + return ret;
  4894. +}
  4895. +
  4896. +int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
  4897. + const struct kvm_irq_routing_entry *ue)
  4898. +{
  4899. + int ret;
  4900. +
  4901. + switch (ue->type) {
  4902. + case KVM_IRQ_ROUTING_S390_ADAPTER:
  4903. + e->set = set_adapter_int;
  4904. + e->adapter.summary_addr = ue->u.adapter.summary_addr;
  4905. + e->adapter.ind_addr = ue->u.adapter.ind_addr;
  4906. + e->adapter.summary_offset = ue->u.adapter.summary_offset;
  4907. + e->adapter.ind_offset = ue->u.adapter.ind_offset;
  4908. + e->adapter.adapter_id = ue->u.adapter.adapter_id;
  4909. + ret = 0;
  4910. + break;
  4911. + default:
  4912. + ret = -EINVAL;
  4913. + }
  4914. +
  4915. + return ret;
  4916. +}
  4917. +
  4918. +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
  4919. + int irq_source_id, int level, bool line_status)
  4920. +{
  4921. + return -EINVAL;
  4922. +}
  4923. diff -Nur linux-3.18.14.orig/arch/s390/mm/fault.c linux-3.18.14-rt/arch/s390/mm/fault.c
  4924. --- linux-3.18.14.orig/arch/s390/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  4925. +++ linux-3.18.14-rt/arch/s390/mm/fault.c 2015-05-31 15:32:46.401635385 -0500
  4926. @@ -435,7 +435,8 @@
  4927. * user context.
  4928. */
  4929. fault = VM_FAULT_BADCONTEXT;
  4930. - if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
  4931. + if (unlikely(!user_space_fault(regs) || !mm ||
  4932. + tsk->pagefault_disabled))
  4933. goto out;
  4934. address = trans_exc_code & __FAIL_ADDR_MASK;
  4935. diff -Nur linux-3.18.14.orig/arch/score/mm/fault.c linux-3.18.14-rt/arch/score/mm/fault.c
  4936. --- linux-3.18.14.orig/arch/score/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  4937. +++ linux-3.18.14-rt/arch/score/mm/fault.c 2015-05-31 15:32:46.413635385 -0500
  4938. @@ -73,7 +73,7 @@
  4939. * If we're in an interrupt or have no user
  4940. * context, we must not take the fault..
  4941. */
  4942. - if (in_atomic() || !mm)
  4943. + if (!mm || pagefault_disabled())
  4944. goto bad_area_nosemaphore;
  4945. if (user_mode(regs))
  4946. diff -Nur linux-3.18.14.orig/arch/sh/kernel/irq.c linux-3.18.14-rt/arch/sh/kernel/irq.c
  4947. --- linux-3.18.14.orig/arch/sh/kernel/irq.c 2015-05-20 10:04:50.000000000 -0500
  4948. +++ linux-3.18.14-rt/arch/sh/kernel/irq.c 2015-05-31 15:32:46.429635385 -0500
  4949. @@ -149,6 +149,7 @@
  4950. hardirq_ctx[cpu] = NULL;
  4951. }
  4952. +#ifndef CONFIG_PREEMPT_RT_FULL
  4953. void do_softirq_own_stack(void)
  4954. {
  4955. struct thread_info *curctx;
  4956. @@ -176,6 +177,7 @@
  4957. "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
  4958. );
  4959. }
  4960. +#endif
  4961. #else
  4962. static inline void handle_one_irq(unsigned int irq)
  4963. {
  4964. diff -Nur linux-3.18.14.orig/arch/sh/mm/fault.c linux-3.18.14-rt/arch/sh/mm/fault.c
  4965. --- linux-3.18.14.orig/arch/sh/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  4966. +++ linux-3.18.14-rt/arch/sh/mm/fault.c 2015-05-31 15:32:46.469635385 -0500
  4967. @@ -440,7 +440,7 @@
  4968. * If we're in an interrupt, have no user context or are running
  4969. * in an atomic region then we must not take the fault:
  4970. */
  4971. - if (unlikely(in_atomic() || !mm)) {
  4972. + if (unlikely(!mm || pagefault_disabled())) {
  4973. bad_area_nosemaphore(regs, error_code, address);
  4974. return;
  4975. }
  4976. diff -Nur linux-3.18.14.orig/arch/sparc/Kconfig linux-3.18.14-rt/arch/sparc/Kconfig
  4977. --- linux-3.18.14.orig/arch/sparc/Kconfig 2015-05-20 10:04:50.000000000 -0500
  4978. +++ linux-3.18.14-rt/arch/sparc/Kconfig 2015-05-31 15:32:46.469635385 -0500
  4979. @@ -182,12 +182,10 @@
  4980. source kernel/Kconfig.hz
  4981. config RWSEM_GENERIC_SPINLOCK
  4982. - bool
  4983. - default y if SPARC32
  4984. + def_bool PREEMPT_RT_FULL
  4985. config RWSEM_XCHGADD_ALGORITHM
  4986. - bool
  4987. - default y if SPARC64
  4988. + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
  4989. config GENERIC_HWEIGHT
  4990. bool
  4991. @@ -528,6 +526,10 @@
  4992. source "fs/Kconfig.binfmt"
  4993. +config EARLY_PRINTK
  4994. + bool
  4995. + default y
  4996. +
  4997. config COMPAT
  4998. bool
  4999. depends on SPARC64
  5000. diff -Nur linux-3.18.14.orig/arch/sparc/kernel/irq_64.c linux-3.18.14-rt/arch/sparc/kernel/irq_64.c
  5001. --- linux-3.18.14.orig/arch/sparc/kernel/irq_64.c 2015-05-20 10:04:50.000000000 -0500
  5002. +++ linux-3.18.14-rt/arch/sparc/kernel/irq_64.c 2015-05-31 15:32:46.477635385 -0500
  5003. @@ -849,6 +849,7 @@
  5004. set_irq_regs(old_regs);
  5005. }
  5006. +#ifndef CONFIG_PREEMPT_RT_FULL
  5007. void do_softirq_own_stack(void)
  5008. {
  5009. void *orig_sp, *sp = softirq_stack[smp_processor_id()];
  5010. @@ -863,6 +864,7 @@
  5011. __asm__ __volatile__("mov %0, %%sp"
  5012. : : "r" (orig_sp));
  5013. }
  5014. +#endif
  5015. #ifdef CONFIG_HOTPLUG_CPU
  5016. void fixup_irqs(void)
  5017. diff -Nur linux-3.18.14.orig/arch/sparc/kernel/setup_32.c linux-3.18.14-rt/arch/sparc/kernel/setup_32.c
  5018. --- linux-3.18.14.orig/arch/sparc/kernel/setup_32.c 2015-05-20 10:04:50.000000000 -0500
  5019. +++ linux-3.18.14-rt/arch/sparc/kernel/setup_32.c 2015-05-31 15:32:46.489635385 -0500
  5020. @@ -309,6 +309,7 @@
  5021. boot_flags_init(*cmdline_p);
  5022. + early_console = &prom_early_console;
  5023. register_console(&prom_early_console);
  5024. printk("ARCH: ");
  5025. diff -Nur linux-3.18.14.orig/arch/sparc/kernel/setup_64.c linux-3.18.14-rt/arch/sparc/kernel/setup_64.c
  5026. --- linux-3.18.14.orig/arch/sparc/kernel/setup_64.c 2015-05-20 10:04:50.000000000 -0500
  5027. +++ linux-3.18.14-rt/arch/sparc/kernel/setup_64.c 2015-05-31 15:32:46.509635384 -0500
  5028. @@ -563,6 +563,12 @@
  5029. pause_patch();
  5030. }
  5031. +static inline void register_prom_console(void)
  5032. +{
  5033. + early_console = &prom_early_console;
  5034. + register_console(&prom_early_console);
  5035. +}
  5036. +
  5037. void __init setup_arch(char **cmdline_p)
  5038. {
  5039. /* Initialize PROM console and command line. */
  5040. @@ -574,7 +580,7 @@
  5041. #ifdef CONFIG_EARLYFB
  5042. if (btext_find_display())
  5043. #endif
  5044. - register_console(&prom_early_console);
  5045. + register_prom_console();
  5046. if (tlb_type == hypervisor)
  5047. printk("ARCH: SUN4V\n");
  5048. diff -Nur linux-3.18.14.orig/arch/sparc/mm/fault_32.c linux-3.18.14-rt/arch/sparc/mm/fault_32.c
  5049. --- linux-3.18.14.orig/arch/sparc/mm/fault_32.c 2015-05-20 10:04:50.000000000 -0500
  5050. +++ linux-3.18.14-rt/arch/sparc/mm/fault_32.c 2015-05-31 15:32:46.529635385 -0500
  5051. @@ -196,7 +196,7 @@
  5052. * If we're in an interrupt or have no user
  5053. * context, we must not take the fault..
  5054. */
  5055. - if (in_atomic() || !mm)
  5056. + if (!mm || pagefault_disabled())
  5057. goto no_context;
  5058. perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
  5059. diff -Nur linux-3.18.14.orig/arch/sparc/mm/fault_64.c linux-3.18.14-rt/arch/sparc/mm/fault_64.c
  5060. --- linux-3.18.14.orig/arch/sparc/mm/fault_64.c 2015-05-20 10:04:50.000000000 -0500
  5061. +++ linux-3.18.14-rt/arch/sparc/mm/fault_64.c 2015-05-31 15:32:46.529635385 -0500
  5062. @@ -330,7 +330,7 @@
  5063. * If we're in an interrupt or have no user
  5064. * context, we must not take the fault..
  5065. */
  5066. - if (in_atomic() || !mm)
  5067. + if (!mm || pagefault_disabled())
  5068. goto intr_or_no_mm;
  5069. perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
  5070. diff -Nur linux-3.18.14.orig/arch/tile/mm/fault.c linux-3.18.14-rt/arch/tile/mm/fault.c
  5071. --- linux-3.18.14.orig/arch/tile/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  5072. +++ linux-3.18.14-rt/arch/tile/mm/fault.c 2015-05-31 15:32:46.533635385 -0500
  5073. @@ -357,7 +357,7 @@
  5074. * If we're in an interrupt, have no user context or are running in an
  5075. * atomic region then we must not take the fault.
  5076. */
  5077. - if (in_atomic() || !mm) {
  5078. + if (!mm || pagefault_disabled()) {
  5079. vma = NULL; /* happy compiler */
  5080. goto bad_area_nosemaphore;
  5081. }
  5082. diff -Nur linux-3.18.14.orig/arch/um/kernel/trap.c linux-3.18.14-rt/arch/um/kernel/trap.c
  5083. --- linux-3.18.14.orig/arch/um/kernel/trap.c 2015-05-20 10:04:50.000000000 -0500
  5084. +++ linux-3.18.14-rt/arch/um/kernel/trap.c 2015-05-31 15:32:46.537635384 -0500
  5085. @@ -38,7 +38,7 @@
  5086. * If the fault was during atomic operation, don't take the fault, just
  5087. * fail.
  5088. */
  5089. - if (in_atomic())
  5090. + if (pagefault_disabled())
  5091. goto out_nosemaphore;
  5092. if (is_user)
  5093. diff -Nur linux-3.18.14.orig/arch/x86/crypto/aesni-intel_glue.c linux-3.18.14-rt/arch/x86/crypto/aesni-intel_glue.c
  5094. --- linux-3.18.14.orig/arch/x86/crypto/aesni-intel_glue.c 2015-05-20 10:04:50.000000000 -0500
  5095. +++ linux-3.18.14-rt/arch/x86/crypto/aesni-intel_glue.c 2015-05-31 15:32:46.569635384 -0500
  5096. @@ -381,14 +381,14 @@
  5097. err = blkcipher_walk_virt(desc, &walk);
  5098. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5099. - kernel_fpu_begin();
  5100. while ((nbytes = walk.nbytes)) {
  5101. + kernel_fpu_begin();
  5102. aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  5103. - nbytes & AES_BLOCK_MASK);
  5104. + nbytes & AES_BLOCK_MASK);
  5105. + kernel_fpu_end();
  5106. nbytes &= AES_BLOCK_SIZE - 1;
  5107. err = blkcipher_walk_done(desc, &walk, nbytes);
  5108. }
  5109. - kernel_fpu_end();
  5110. return err;
  5111. }
  5112. @@ -405,14 +405,14 @@
  5113. err = blkcipher_walk_virt(desc, &walk);
  5114. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5115. - kernel_fpu_begin();
  5116. while ((nbytes = walk.nbytes)) {
  5117. + kernel_fpu_begin();
  5118. aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  5119. nbytes & AES_BLOCK_MASK);
  5120. + kernel_fpu_end();
  5121. nbytes &= AES_BLOCK_SIZE - 1;
  5122. err = blkcipher_walk_done(desc, &walk, nbytes);
  5123. }
  5124. - kernel_fpu_end();
  5125. return err;
  5126. }
  5127. @@ -429,14 +429,14 @@
  5128. err = blkcipher_walk_virt(desc, &walk);
  5129. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5130. - kernel_fpu_begin();
  5131. while ((nbytes = walk.nbytes)) {
  5132. + kernel_fpu_begin();
  5133. aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  5134. nbytes & AES_BLOCK_MASK, walk.iv);
  5135. + kernel_fpu_end();
  5136. nbytes &= AES_BLOCK_SIZE - 1;
  5137. err = blkcipher_walk_done(desc, &walk, nbytes);
  5138. }
  5139. - kernel_fpu_end();
  5140. return err;
  5141. }
  5142. @@ -453,14 +453,14 @@
  5143. err = blkcipher_walk_virt(desc, &walk);
  5144. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5145. - kernel_fpu_begin();
  5146. while ((nbytes = walk.nbytes)) {
  5147. + kernel_fpu_begin();
  5148. aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  5149. nbytes & AES_BLOCK_MASK, walk.iv);
  5150. + kernel_fpu_end();
  5151. nbytes &= AES_BLOCK_SIZE - 1;
  5152. err = blkcipher_walk_done(desc, &walk, nbytes);
  5153. }
  5154. - kernel_fpu_end();
  5155. return err;
  5156. }
  5157. @@ -512,18 +512,20 @@
  5158. err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
  5159. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5160. - kernel_fpu_begin();
  5161. while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
  5162. + kernel_fpu_begin();
  5163. aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  5164. nbytes & AES_BLOCK_MASK, walk.iv);
  5165. + kernel_fpu_end();
  5166. nbytes &= AES_BLOCK_SIZE - 1;
  5167. err = blkcipher_walk_done(desc, &walk, nbytes);
  5168. }
  5169. if (walk.nbytes) {
  5170. + kernel_fpu_begin();
  5171. ctr_crypt_final(ctx, &walk);
  5172. + kernel_fpu_end();
  5173. err = blkcipher_walk_done(desc, &walk, 0);
  5174. }
  5175. - kernel_fpu_end();
  5176. return err;
  5177. }
  5178. diff -Nur linux-3.18.14.orig/arch/x86/crypto/cast5_avx_glue.c linux-3.18.14-rt/arch/x86/crypto/cast5_avx_glue.c
  5179. --- linux-3.18.14.orig/arch/x86/crypto/cast5_avx_glue.c 2015-05-20 10:04:50.000000000 -0500
  5180. +++ linux-3.18.14-rt/arch/x86/crypto/cast5_avx_glue.c 2015-05-31 15:32:46.585635384 -0500
  5181. @@ -60,7 +60,7 @@
  5182. static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
  5183. bool enc)
  5184. {
  5185. - bool fpu_enabled = false;
  5186. + bool fpu_enabled;
  5187. struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  5188. const unsigned int bsize = CAST5_BLOCK_SIZE;
  5189. unsigned int nbytes;
  5190. @@ -76,7 +76,7 @@
  5191. u8 *wsrc = walk->src.virt.addr;
  5192. u8 *wdst = walk->dst.virt.addr;
  5193. - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
  5194. + fpu_enabled = cast5_fpu_begin(false, nbytes);
  5195. /* Process multi-block batch */
  5196. if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
  5197. @@ -104,10 +104,9 @@
  5198. } while (nbytes >= bsize);
  5199. done:
  5200. + cast5_fpu_end(fpu_enabled);
  5201. err = blkcipher_walk_done(desc, walk, nbytes);
  5202. }
  5203. -
  5204. - cast5_fpu_end(fpu_enabled);
  5205. return err;
  5206. }
  5207. @@ -228,7 +227,7 @@
  5208. static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  5209. struct scatterlist *src, unsigned int nbytes)
  5210. {
  5211. - bool fpu_enabled = false;
  5212. + bool fpu_enabled;
  5213. struct blkcipher_walk walk;
  5214. int err;
  5215. @@ -237,12 +236,11 @@
  5216. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5217. while ((nbytes = walk.nbytes)) {
  5218. - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
  5219. + fpu_enabled = cast5_fpu_begin(false, nbytes);
  5220. nbytes = __cbc_decrypt(desc, &walk);
  5221. + cast5_fpu_end(fpu_enabled);
  5222. err = blkcipher_walk_done(desc, &walk, nbytes);
  5223. }
  5224. -
  5225. - cast5_fpu_end(fpu_enabled);
  5226. return err;
  5227. }
  5228. @@ -312,7 +310,7 @@
  5229. static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  5230. struct scatterlist *src, unsigned int nbytes)
  5231. {
  5232. - bool fpu_enabled = false;
  5233. + bool fpu_enabled;
  5234. struct blkcipher_walk walk;
  5235. int err;
  5236. @@ -321,13 +319,12 @@
  5237. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  5238. while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
  5239. - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
  5240. + fpu_enabled = cast5_fpu_begin(false, nbytes);
  5241. nbytes = __ctr_crypt(desc, &walk);
  5242. + cast5_fpu_end(fpu_enabled);
  5243. err = blkcipher_walk_done(desc, &walk, nbytes);
  5244. }
  5245. - cast5_fpu_end(fpu_enabled);
  5246. -
  5247. if (walk.nbytes) {
  5248. ctr_crypt_final(desc, &walk);
  5249. err = blkcipher_walk_done(desc, &walk, 0);
  5250. diff -Nur linux-3.18.14.orig/arch/x86/crypto/glue_helper.c linux-3.18.14-rt/arch/x86/crypto/glue_helper.c
  5251. --- linux-3.18.14.orig/arch/x86/crypto/glue_helper.c 2015-05-20 10:04:50.000000000 -0500
  5252. +++ linux-3.18.14-rt/arch/x86/crypto/glue_helper.c 2015-05-31 15:32:46.589635384 -0500
  5253. @@ -39,7 +39,7 @@
  5254. void *ctx = crypto_blkcipher_ctx(desc->tfm);
  5255. const unsigned int bsize = 128 / 8;
  5256. unsigned int nbytes, i, func_bytes;
  5257. - bool fpu_enabled = false;
  5258. + bool fpu_enabled;
  5259. int err;
  5260. err = blkcipher_walk_virt(desc, walk);
  5261. @@ -49,7 +49,7 @@
  5262. u8 *wdst = walk->dst.virt.addr;
  5263. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  5264. - desc, fpu_enabled, nbytes);
  5265. + desc, false, nbytes);
  5266. for (i = 0; i < gctx->num_funcs; i++) {
  5267. func_bytes = bsize * gctx->funcs[i].num_blocks;
  5268. @@ -71,10 +71,10 @@
  5269. }
  5270. done:
  5271. + glue_fpu_end(fpu_enabled);
  5272. err = blkcipher_walk_done(desc, walk, nbytes);
  5273. }
  5274. - glue_fpu_end(fpu_enabled);
  5275. return err;
  5276. }
  5277. @@ -194,7 +194,7 @@
  5278. struct scatterlist *src, unsigned int nbytes)
  5279. {
  5280. const unsigned int bsize = 128 / 8;
  5281. - bool fpu_enabled = false;
  5282. + bool fpu_enabled;
  5283. struct blkcipher_walk walk;
  5284. int err;
  5285. @@ -203,12 +203,12 @@
  5286. while ((nbytes = walk.nbytes)) {
  5287. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  5288. - desc, fpu_enabled, nbytes);
  5289. + desc, false, nbytes);
  5290. nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
  5291. + glue_fpu_end(fpu_enabled);
  5292. err = blkcipher_walk_done(desc, &walk, nbytes);
  5293. }
  5294. - glue_fpu_end(fpu_enabled);
  5295. return err;
  5296. }
  5297. EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
  5298. @@ -278,7 +278,7 @@
  5299. struct scatterlist *src, unsigned int nbytes)
  5300. {
  5301. const unsigned int bsize = 128 / 8;
  5302. - bool fpu_enabled = false;
  5303. + bool fpu_enabled;
  5304. struct blkcipher_walk walk;
  5305. int err;
  5306. @@ -287,13 +287,12 @@
  5307. while ((nbytes = walk.nbytes) >= bsize) {
  5308. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  5309. - desc, fpu_enabled, nbytes);
  5310. + desc, false, nbytes);
  5311. nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
  5312. + glue_fpu_end(fpu_enabled);
  5313. err = blkcipher_walk_done(desc, &walk, nbytes);
  5314. }
  5315. - glue_fpu_end(fpu_enabled);
  5316. -
  5317. if (walk.nbytes) {
  5318. glue_ctr_crypt_final_128bit(
  5319. gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
  5320. @@ -348,7 +347,7 @@
  5321. void *tweak_ctx, void *crypt_ctx)
  5322. {
  5323. const unsigned int bsize = 128 / 8;
  5324. - bool fpu_enabled = false;
  5325. + bool fpu_enabled;
  5326. struct blkcipher_walk walk;
  5327. int err;
  5328. @@ -361,21 +360,21 @@
  5329. /* set minimum length to bsize, for tweak_fn */
  5330. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  5331. - desc, fpu_enabled,
  5332. + desc, false,
  5333. nbytes < bsize ? bsize : nbytes);
  5334. -
  5335. /* calculate first value of T */
  5336. tweak_fn(tweak_ctx, walk.iv, walk.iv);
  5337. + glue_fpu_end(fpu_enabled);
  5338. while (nbytes) {
  5339. + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  5340. + desc, false, nbytes);
  5341. nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
  5342. + glue_fpu_end(fpu_enabled);
  5343. err = blkcipher_walk_done(desc, &walk, nbytes);
  5344. nbytes = walk.nbytes;
  5345. }
  5346. -
  5347. - glue_fpu_end(fpu_enabled);
  5348. -
  5349. return err;
  5350. }
  5351. EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
  5352. diff -Nur linux-3.18.14.orig/arch/x86/include/asm/preempt.h linux-3.18.14-rt/arch/x86/include/asm/preempt.h
  5353. --- linux-3.18.14.orig/arch/x86/include/asm/preempt.h 2015-05-20 10:04:50.000000000 -0500
  5354. +++ linux-3.18.14-rt/arch/x86/include/asm/preempt.h 2015-05-31 15:32:46.597635384 -0500
  5355. @@ -85,17 +85,33 @@
  5356. * a decrement which hits zero means we have no preempt_count and should
  5357. * reschedule.
  5358. */
  5359. -static __always_inline bool __preempt_count_dec_and_test(void)
  5360. +static __always_inline bool ____preempt_count_dec_and_test(void)
  5361. {
  5362. GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
  5363. }
  5364. +static __always_inline bool __preempt_count_dec_and_test(void)
  5365. +{
  5366. + if (____preempt_count_dec_and_test())
  5367. + return true;
  5368. +#ifdef CONFIG_PREEMPT_LAZY
  5369. + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
  5370. +#else
  5371. + return false;
  5372. +#endif
  5373. +}
  5374. +
  5375. /*
  5376. * Returns true when we need to resched and can (barring IRQ state).
  5377. */
  5378. static __always_inline bool should_resched(void)
  5379. {
  5380. +#ifdef CONFIG_PREEMPT_LAZY
  5381. + return unlikely(!raw_cpu_read_4(__preempt_count) || \
  5382. + test_thread_flag(TIF_NEED_RESCHED_LAZY));
  5383. +#else
  5384. return unlikely(!raw_cpu_read_4(__preempt_count));
  5385. +#endif
  5386. }
  5387. #ifdef CONFIG_PREEMPT
  5388. diff -Nur linux-3.18.14.orig/arch/x86/include/asm/signal.h linux-3.18.14-rt/arch/x86/include/asm/signal.h
  5389. --- linux-3.18.14.orig/arch/x86/include/asm/signal.h 2015-05-20 10:04:50.000000000 -0500
  5390. +++ linux-3.18.14-rt/arch/x86/include/asm/signal.h 2015-05-31 15:32:46.597635384 -0500
  5391. @@ -23,6 +23,19 @@
  5392. unsigned long sig[_NSIG_WORDS];
  5393. } sigset_t;
  5394. +/*
  5395. + * Because some traps use the IST stack, we must keep preemption
  5396. + * disabled while calling do_trap(), but do_trap() may call
  5397. + * force_sig_info() which will grab the signal spin_locks for the
  5398. + * task, which in PREEMPT_RT_FULL are mutexes. By defining
  5399. + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
  5400. + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
  5401. + * trap.
  5402. + */
  5403. +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64)
  5404. +#define ARCH_RT_DELAYS_SIGNAL_SEND
  5405. +#endif
  5406. +
  5407. #ifndef CONFIG_COMPAT
  5408. typedef sigset_t compat_sigset_t;
  5409. #endif
  5410. diff -Nur linux-3.18.14.orig/arch/x86/include/asm/stackprotector.h linux-3.18.14-rt/arch/x86/include/asm/stackprotector.h
  5411. --- linux-3.18.14.orig/arch/x86/include/asm/stackprotector.h 2015-05-20 10:04:50.000000000 -0500
  5412. +++ linux-3.18.14-rt/arch/x86/include/asm/stackprotector.h 2015-05-31 15:32:46.613635384 -0500
  5413. @@ -57,7 +57,7 @@
  5414. */
  5415. static __always_inline void boot_init_stack_canary(void)
  5416. {
  5417. - u64 canary;
  5418. + u64 uninitialized_var(canary);
  5419. u64 tsc;
  5420. #ifdef CONFIG_X86_64
  5421. @@ -68,8 +68,16 @@
  5422. * of randomness. The TSC only matters for very early init,
  5423. * there it already has some randomness on most systems. Later
  5424. * on during the bootup the random pool has true entropy too.
  5425. + *
  5426. + * For preempt-rt we need to weaken the randomness a bit, as
  5427. + * we can't call into the random generator from atomic context
  5428. + * due to locking constraints. We just leave canary
  5429. + * uninitialized and use the TSC based randomness on top of
  5430. + * it.
  5431. */
  5432. +#ifndef CONFIG_PREEMPT_RT_FULL
  5433. get_random_bytes(&canary, sizeof(canary));
  5434. +#endif
  5435. tsc = __native_read_tsc();
  5436. canary += tsc + (tsc << 32UL);
  5437. diff -Nur linux-3.18.14.orig/arch/x86/include/asm/thread_info.h linux-3.18.14-rt/arch/x86/include/asm/thread_info.h
  5438. --- linux-3.18.14.orig/arch/x86/include/asm/thread_info.h 2015-05-20 10:04:50.000000000 -0500
  5439. +++ linux-3.18.14-rt/arch/x86/include/asm/thread_info.h 2015-05-31 15:32:46.621635383 -0500
  5440. @@ -30,6 +30,8 @@
  5441. __u32 status; /* thread synchronous flags */
  5442. __u32 cpu; /* current CPU */
  5443. int saved_preempt_count;
  5444. + int preempt_lazy_count; /* 0 => lazy preemptable
  5445. + <0 => BUG */
  5446. mm_segment_t addr_limit;
  5447. struct restart_block restart_block;
  5448. void __user *sysenter_return;
  5449. @@ -75,6 +77,7 @@
  5450. #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
  5451. #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
  5452. #define TIF_SECCOMP 8 /* secure computing */
  5453. +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
  5454. #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
  5455. #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
  5456. #define TIF_UPROBE 12 /* breakpointed or singlestepping */
  5457. @@ -100,6 +103,7 @@
  5458. #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
  5459. #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
  5460. #define _TIF_SECCOMP (1 << TIF_SECCOMP)
  5461. +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
  5462. #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
  5463. #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
  5464. #define _TIF_UPROBE (1 << TIF_UPROBE)
  5465. @@ -150,6 +154,8 @@
  5466. #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
  5467. #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
  5468. +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
  5469. +
  5470. #define STACK_WARN (THREAD_SIZE/8)
  5471. #define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
  5472. diff -Nur linux-3.18.14.orig/arch/x86/include/asm/uv/uv_bau.h linux-3.18.14-rt/arch/x86/include/asm/uv/uv_bau.h
  5473. --- linux-3.18.14.orig/arch/x86/include/asm/uv/uv_bau.h 2015-05-20 10:04:50.000000000 -0500
  5474. +++ linux-3.18.14-rt/arch/x86/include/asm/uv/uv_bau.h 2015-05-31 15:32:46.621635383 -0500
  5475. @@ -615,9 +615,9 @@
  5476. cycles_t send_message;
  5477. cycles_t period_end;
  5478. cycles_t period_time;
  5479. - spinlock_t uvhub_lock;
  5480. - spinlock_t queue_lock;
  5481. - spinlock_t disable_lock;
  5482. + raw_spinlock_t uvhub_lock;
  5483. + raw_spinlock_t queue_lock;
  5484. + raw_spinlock_t disable_lock;
  5485. /* tunables */
  5486. int max_concurr;
  5487. int max_concurr_const;
  5488. @@ -776,15 +776,15 @@
  5489. * to be lowered below the current 'v'. atomic_add_unless can only stop
  5490. * on equal.
  5491. */
  5492. -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
  5493. +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
  5494. {
  5495. - spin_lock(lock);
  5496. + raw_spin_lock(lock);
  5497. if (atomic_read(v) >= u) {
  5498. - spin_unlock(lock);
  5499. + raw_spin_unlock(lock);
  5500. return 0;
  5501. }
  5502. atomic_inc(v);
  5503. - spin_unlock(lock);
  5504. + raw_spin_unlock(lock);
  5505. return 1;
  5506. }
  5507. diff -Nur linux-3.18.14.orig/arch/x86/include/asm/uv/uv_hub.h linux-3.18.14-rt/arch/x86/include/asm/uv/uv_hub.h
  5508. --- linux-3.18.14.orig/arch/x86/include/asm/uv/uv_hub.h 2015-05-20 10:04:50.000000000 -0500
  5509. +++ linux-3.18.14-rt/arch/x86/include/asm/uv/uv_hub.h 2015-05-31 15:32:46.621635383 -0500
  5510. @@ -492,7 +492,7 @@
  5511. unsigned short nr_online_cpus;
  5512. unsigned short pnode;
  5513. short memory_nid;
  5514. - spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */
  5515. + raw_spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */
  5516. unsigned long nmi_count; /* obsolete, see uv_hub_nmi */
  5517. };
  5518. extern struct uv_blade_info *uv_blade_info;
  5519. diff -Nur linux-3.18.14.orig/arch/x86/Kconfig linux-3.18.14-rt/arch/x86/Kconfig
  5520. --- linux-3.18.14.orig/arch/x86/Kconfig 2015-05-20 10:04:50.000000000 -0500
  5521. +++ linux-3.18.14-rt/arch/x86/Kconfig 2015-05-31 15:32:46.561635384 -0500
  5522. @@ -21,6 +21,7 @@
  5523. ### Arch settings
  5524. config X86
  5525. def_bool y
  5526. + select HAVE_PREEMPT_LAZY
  5527. select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
  5528. select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
  5529. select ARCH_HAS_FAST_MULTIPLIER
  5530. @@ -197,8 +198,11 @@
  5531. def_bool y
  5532. depends on ISA_DMA_API
  5533. +config RWSEM_GENERIC_SPINLOCK
  5534. + def_bool PREEMPT_RT_FULL
  5535. +
  5536. config RWSEM_XCHGADD_ALGORITHM
  5537. - def_bool y
  5538. + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
  5539. config GENERIC_CALIBRATE_DELAY
  5540. def_bool y
  5541. @@ -811,7 +815,7 @@
  5542. config MAXSMP
  5543. bool "Enable Maximum number of SMP Processors and NUMA Nodes"
  5544. depends on X86_64 && SMP && DEBUG_KERNEL
  5545. - select CPUMASK_OFFSTACK
  5546. + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
  5547. ---help---
  5548. Enable maximum number of CPUS and NUMA Nodes for this architecture.
  5549. If unsure, say N.
  5550. diff -Nur linux-3.18.14.orig/arch/x86/kernel/apic/io_apic.c linux-3.18.14-rt/arch/x86/kernel/apic/io_apic.c
  5551. --- linux-3.18.14.orig/arch/x86/kernel/apic/io_apic.c 2015-05-20 10:04:50.000000000 -0500
  5552. +++ linux-3.18.14-rt/arch/x86/kernel/apic/io_apic.c 2015-05-31 15:32:46.629635384 -0500
  5553. @@ -2494,7 +2494,8 @@
  5554. static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
  5555. {
  5556. /* If we are moving the irq we need to mask it */
  5557. - if (unlikely(irqd_is_setaffinity_pending(data))) {
  5558. + if (unlikely(irqd_is_setaffinity_pending(data) &&
  5559. + !irqd_irq_inprogress(data))) {
  5560. mask_ioapic(cfg);
  5561. return true;
  5562. }
  5563. diff -Nur linux-3.18.14.orig/arch/x86/kernel/apic/x2apic_uv_x.c linux-3.18.14-rt/arch/x86/kernel/apic/x2apic_uv_x.c
  5564. --- linux-3.18.14.orig/arch/x86/kernel/apic/x2apic_uv_x.c 2015-05-20 10:04:50.000000000 -0500
  5565. +++ linux-3.18.14-rt/arch/x86/kernel/apic/x2apic_uv_x.c 2015-05-31 15:32:46.629635384 -0500
  5566. @@ -918,7 +918,7 @@
  5567. uv_blade_info[blade].pnode = pnode;
  5568. uv_blade_info[blade].nr_possible_cpus = 0;
  5569. uv_blade_info[blade].nr_online_cpus = 0;
  5570. - spin_lock_init(&uv_blade_info[blade].nmi_lock);
  5571. + raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
  5572. min_pnode = min(pnode, min_pnode);
  5573. max_pnode = max(pnode, max_pnode);
  5574. blade++;
  5575. diff -Nur linux-3.18.14.orig/arch/x86/kernel/asm-offsets.c linux-3.18.14-rt/arch/x86/kernel/asm-offsets.c
  5576. --- linux-3.18.14.orig/arch/x86/kernel/asm-offsets.c 2015-05-20 10:04:50.000000000 -0500
  5577. +++ linux-3.18.14-rt/arch/x86/kernel/asm-offsets.c 2015-05-31 15:32:46.633635383 -0500
  5578. @@ -32,6 +32,7 @@
  5579. OFFSET(TI_flags, thread_info, flags);
  5580. OFFSET(TI_status, thread_info, status);
  5581. OFFSET(TI_addr_limit, thread_info, addr_limit);
  5582. + OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
  5583. BLANK();
  5584. OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
  5585. @@ -71,4 +72,5 @@
  5586. BLANK();
  5587. DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
  5588. + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
  5589. }
  5590. diff -Nur linux-3.18.14.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-3.18.14-rt/arch/x86/kernel/cpu/mcheck/mce.c
  5591. --- linux-3.18.14.orig/arch/x86/kernel/cpu/mcheck/mce.c 2015-05-20 10:04:50.000000000 -0500
  5592. +++ linux-3.18.14-rt/arch/x86/kernel/cpu/mcheck/mce.c 2015-05-31 15:32:46.641635383 -0500
  5593. @@ -41,6 +41,8 @@
  5594. #include <linux/debugfs.h>
  5595. #include <linux/irq_work.h>
  5596. #include <linux/export.h>
  5597. +#include <linux/jiffies.h>
  5598. +#include <linux/work-simple.h>
  5599. #include <asm/processor.h>
  5600. #include <asm/mce.h>
  5601. @@ -1266,7 +1268,7 @@
  5602. static unsigned long check_interval = 5 * 60; /* 5 minutes */
  5603. static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
  5604. -static DEFINE_PER_CPU(struct timer_list, mce_timer);
  5605. +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
  5606. static unsigned long mce_adjust_timer_default(unsigned long interval)
  5607. {
  5608. @@ -1283,14 +1285,11 @@
  5609. return test_and_clear_bit(0, v);
  5610. }
  5611. -static void mce_timer_fn(unsigned long data)
  5612. +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
  5613. {
  5614. - struct timer_list *t = this_cpu_ptr(&mce_timer);
  5615. unsigned long iv;
  5616. int notify;
  5617. - WARN_ON(smp_processor_id() != data);
  5618. -
  5619. if (mce_available(this_cpu_ptr(&cpu_info))) {
  5620. machine_check_poll(MCP_TIMESTAMP,
  5621. this_cpu_ptr(&mce_poll_banks));
  5622. @@ -1313,9 +1312,11 @@
  5623. __this_cpu_write(mce_next_interval, iv);
  5624. /* Might have become 0 after CMCI storm subsided */
  5625. if (iv) {
  5626. - t->expires = jiffies + iv;
  5627. - add_timer_on(t, smp_processor_id());
  5628. + hrtimer_forward_now(timer, ns_to_ktime(
  5629. + jiffies_to_usecs(iv) * 1000ULL));
  5630. + return HRTIMER_RESTART;
  5631. }
  5632. + return HRTIMER_NORESTART;
  5633. }
  5634. /*
  5635. @@ -1323,28 +1324,37 @@
  5636. */
  5637. void mce_timer_kick(unsigned long interval)
  5638. {
  5639. - struct timer_list *t = this_cpu_ptr(&mce_timer);
  5640. - unsigned long when = jiffies + interval;
  5641. + struct hrtimer *t = this_cpu_ptr(&mce_timer);
  5642. unsigned long iv = __this_cpu_read(mce_next_interval);
  5643. - if (timer_pending(t)) {
  5644. - if (time_before(when, t->expires))
  5645. - mod_timer_pinned(t, when);
  5646. + if (hrtimer_active(t)) {
  5647. + s64 exp;
  5648. + s64 intv_us;
  5649. +
  5650. + intv_us = jiffies_to_usecs(interval);
  5651. + exp = ktime_to_us(hrtimer_expires_remaining(t));
  5652. + if (intv_us < exp) {
  5653. + hrtimer_cancel(t);
  5654. + hrtimer_start_range_ns(t,
  5655. + ns_to_ktime(intv_us * 1000),
  5656. + 0, HRTIMER_MODE_REL_PINNED);
  5657. + }
  5658. } else {
  5659. - t->expires = round_jiffies(when);
  5660. - add_timer_on(t, smp_processor_id());
  5661. + hrtimer_start_range_ns(t,
  5662. + ns_to_ktime(jiffies_to_usecs(interval) * 1000ULL),
  5663. + 0, HRTIMER_MODE_REL_PINNED);
  5664. }
  5665. if (interval < iv)
  5666. __this_cpu_write(mce_next_interval, interval);
  5667. }
  5668. -/* Must not be called in IRQ context where del_timer_sync() can deadlock */
  5669. +/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */
  5670. static void mce_timer_delete_all(void)
  5671. {
  5672. int cpu;
  5673. for_each_online_cpu(cpu)
  5674. - del_timer_sync(&per_cpu(mce_timer, cpu));
  5675. + hrtimer_cancel(&per_cpu(mce_timer, cpu));
  5676. }
  5677. static void mce_do_trigger(struct work_struct *work)
  5678. @@ -1354,6 +1364,56 @@
  5679. static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  5680. +static void __mce_notify_work(struct swork_event *event)
  5681. +{
  5682. + /* Not more than two messages every minute */
  5683. + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
  5684. +
  5685. + /* wake processes polling /dev/mcelog */
  5686. + wake_up_interruptible(&mce_chrdev_wait);
  5687. +
  5688. + /*
  5689. + * There is no risk of missing notifications because
  5690. + * work_pending is always cleared before the function is
  5691. + * executed.
  5692. + */
  5693. + if (mce_helper[0] && !work_pending(&mce_trigger_work))
  5694. + schedule_work(&mce_trigger_work);
  5695. +
  5696. + if (__ratelimit(&ratelimit))
  5697. + pr_info(HW_ERR "Machine check events logged\n");
  5698. +}
  5699. +
  5700. +#ifdef CONFIG_PREEMPT_RT_FULL
  5701. +static bool notify_work_ready __read_mostly;
  5702. +static struct swork_event notify_work;
  5703. +
  5704. +static int mce_notify_work_init(void)
  5705. +{
  5706. + int err;
  5707. +
  5708. + err = swork_get();
  5709. + if (err)
  5710. + return err;
  5711. +
  5712. + INIT_SWORK(&notify_work, __mce_notify_work);
  5713. + notify_work_ready = true;
  5714. + return 0;
  5715. +}
  5716. +
  5717. +static void mce_notify_work(void)
  5718. +{
  5719. + if (notify_work_ready)
  5720. + swork_queue(&notify_work);
  5721. +}
  5722. +#else
  5723. +static void mce_notify_work(void)
  5724. +{
  5725. + __mce_notify_work(NULL);
  5726. +}
  5727. +static inline int mce_notify_work_init(void) { return 0; }
  5728. +#endif
  5729. +
  5730. /*
  5731. * Notify the user(s) about new machine check events.
  5732. * Can be called from interrupt context, but not from machine check/NMI
  5733. @@ -1361,19 +1421,8 @@
  5734. */
  5735. int mce_notify_irq(void)
  5736. {
  5737. - /* Not more than two messages every minute */
  5738. - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
  5739. -
  5740. if (test_and_clear_bit(0, &mce_need_notify)) {
  5741. - /* wake processes polling /dev/mcelog */
  5742. - wake_up_interruptible(&mce_chrdev_wait);
  5743. -
  5744. - if (mce_helper[0])
  5745. - schedule_work(&mce_trigger_work);
  5746. -
  5747. - if (__ratelimit(&ratelimit))
  5748. - pr_info(HW_ERR "Machine check events logged\n");
  5749. -
  5750. + mce_notify_work();
  5751. return 1;
  5752. }
  5753. return 0;
  5754. @@ -1644,7 +1693,7 @@
  5755. }
  5756. }
  5757. -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
  5758. +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
  5759. {
  5760. unsigned long iv = check_interval * HZ;
  5761. @@ -1653,16 +1702,17 @@
  5762. per_cpu(mce_next_interval, cpu) = iv;
  5763. - t->expires = round_jiffies(jiffies + iv);
  5764. - add_timer_on(t, cpu);
  5765. + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
  5766. + 0, HRTIMER_MODE_REL_PINNED);
  5767. }
  5768. static void __mcheck_cpu_init_timer(void)
  5769. {
  5770. - struct timer_list *t = this_cpu_ptr(&mce_timer);
  5771. + struct hrtimer *t = this_cpu_ptr(&mce_timer);
  5772. unsigned int cpu = smp_processor_id();
  5773. - setup_timer(t, mce_timer_fn, cpu);
  5774. + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  5775. + t->function = mce_timer_fn;
  5776. mce_start_timer(cpu, t);
  5777. }
  5778. @@ -2339,6 +2389,8 @@
  5779. if (!mce_available(raw_cpu_ptr(&cpu_info)))
  5780. return;
  5781. + hrtimer_cancel(this_cpu_ptr(&mce_timer));
  5782. +
  5783. if (!(action & CPU_TASKS_FROZEN))
  5784. cmci_clear();
  5785. for (i = 0; i < mca_cfg.banks; i++) {
  5786. @@ -2365,6 +2417,7 @@
  5787. if (b->init)
  5788. wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
  5789. }
  5790. + __mcheck_cpu_init_timer();
  5791. }
  5792. /* Get notified when a cpu comes on/off. Be hotplug friendly. */
  5793. @@ -2372,7 +2425,6 @@
  5794. mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
  5795. {
  5796. unsigned int cpu = (unsigned long)hcpu;
  5797. - struct timer_list *t = &per_cpu(mce_timer, cpu);
  5798. switch (action & ~CPU_TASKS_FROZEN) {
  5799. case CPU_ONLINE:
  5800. @@ -2392,11 +2444,9 @@
  5801. break;
  5802. case CPU_DOWN_PREPARE:
  5803. smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
  5804. - del_timer_sync(t);
  5805. break;
  5806. case CPU_DOWN_FAILED:
  5807. smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
  5808. - mce_start_timer(cpu, t);
  5809. break;
  5810. }
  5811. @@ -2435,6 +2485,10 @@
  5812. goto err_out;
  5813. }
  5814. + err = mce_notify_work_init();
  5815. + if (err)
  5816. + goto err_out;
  5817. +
  5818. if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
  5819. err = -ENOMEM;
  5820. goto err_out;
  5821. diff -Nur linux-3.18.14.orig/arch/x86/kernel/entry_32.S linux-3.18.14-rt/arch/x86/kernel/entry_32.S
  5822. --- linux-3.18.14.orig/arch/x86/kernel/entry_32.S 2015-05-20 10:04:50.000000000 -0500
  5823. +++ linux-3.18.14-rt/arch/x86/kernel/entry_32.S 2015-05-31 15:32:46.641635383 -0500
  5824. @@ -359,8 +359,24 @@
  5825. ENTRY(resume_kernel)
  5826. DISABLE_INTERRUPTS(CLBR_ANY)
  5827. need_resched:
  5828. + # preempt count == 0 + NEED_RS set?
  5829. cmpl $0,PER_CPU_VAR(__preempt_count)
  5830. +#ifndef CONFIG_PREEMPT_LAZY
  5831. jnz restore_all
  5832. +#else
  5833. + jz test_int_off
  5834. +
5835. + # at least preempt count == 0 ?
  5836. + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
  5837. + jne restore_all
  5838. +
  5839. + cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
  5840. + jnz restore_all
  5841. +
  5842. + testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
  5843. + jz restore_all
  5844. +test_int_off:
  5845. +#endif
  5846. testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
  5847. jz restore_all
  5848. call preempt_schedule_irq
  5849. @@ -591,7 +607,7 @@
  5850. ALIGN
  5851. RING0_PTREGS_FRAME # can't unwind into user space anyway
  5852. work_pending:
  5853. - testb $_TIF_NEED_RESCHED, %cl
  5854. + testl $_TIF_NEED_RESCHED_MASK, %ecx
  5855. jz work_notifysig
  5856. work_resched:
  5857. call schedule
  5858. @@ -604,7 +620,7 @@
  5859. andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
  5860. # than syscall tracing?
  5861. jz restore_all
  5862. - testb $_TIF_NEED_RESCHED, %cl
  5863. + testl $_TIF_NEED_RESCHED_MASK, %ecx
  5864. jnz work_resched
  5865. work_notifysig: # deal with pending signals and
  5866. diff -Nur linux-3.18.14.orig/arch/x86/kernel/entry_64.S linux-3.18.14-rt/arch/x86/kernel/entry_64.S
  5867. --- linux-3.18.14.orig/arch/x86/kernel/entry_64.S 2015-05-20 10:04:50.000000000 -0500
  5868. +++ linux-3.18.14-rt/arch/x86/kernel/entry_64.S 2015-05-31 15:32:46.649635383 -0500
  5869. @@ -454,8 +454,8 @@
  5870. /* Handle reschedules */
  5871. /* edx: work, edi: workmask */
  5872. sysret_careful:
  5873. - bt $TIF_NEED_RESCHED,%edx
  5874. - jnc sysret_signal
  5875. + testl $_TIF_NEED_RESCHED_MASK,%edx
  5876. + jz sysret_signal
  5877. TRACE_IRQS_ON
  5878. ENABLE_INTERRUPTS(CLBR_NONE)
  5879. pushq_cfi %rdi
  5880. @@ -554,8 +554,8 @@
  5881. /* First do a reschedule test. */
  5882. /* edx: work, edi: workmask */
  5883. int_careful:
  5884. - bt $TIF_NEED_RESCHED,%edx
  5885. - jnc int_very_careful
  5886. + testl $_TIF_NEED_RESCHED_MASK,%edx
  5887. + jz int_very_careful
  5888. TRACE_IRQS_ON
  5889. ENABLE_INTERRUPTS(CLBR_NONE)
  5890. pushq_cfi %rdi
  5891. @@ -870,8 +870,8 @@
  5892. /* edi: workmask, edx: work */
  5893. retint_careful:
  5894. CFI_RESTORE_STATE
  5895. - bt $TIF_NEED_RESCHED,%edx
  5896. - jnc retint_signal
  5897. + testl $_TIF_NEED_RESCHED_MASK,%edx
  5898. + jz retint_signal
  5899. TRACE_IRQS_ON
  5900. ENABLE_INTERRUPTS(CLBR_NONE)
  5901. pushq_cfi %rdi
  5902. @@ -903,7 +903,22 @@
  5903. /* rcx: threadinfo. interrupts off. */
  5904. ENTRY(retint_kernel)
  5905. cmpl $0,PER_CPU_VAR(__preempt_count)
  5906. +#ifndef CONFIG_PREEMPT_LAZY
  5907. jnz retint_restore_args
  5908. +#else
  5909. + jz check_int_off
  5910. +
5911. + # at least preempt count == 0 ?
  5912. + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
  5913. + jnz retint_restore_args
  5914. +
  5915. + cmpl $0, TI_preempt_lazy_count(%rcx)
  5916. + jnz retint_restore_args
  5917. +
  5918. + bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
  5919. + jnc retint_restore_args
  5920. +check_int_off:
  5921. +#endif
  5922. bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
  5923. jnc retint_restore_args
  5924. call preempt_schedule_irq
  5925. @@ -1119,6 +1134,7 @@
  5926. jmp 2b
  5927. .previous
  5928. +#ifndef CONFIG_PREEMPT_RT_FULL
  5929. /* Call softirq on interrupt stack. Interrupts are off. */
  5930. ENTRY(do_softirq_own_stack)
  5931. CFI_STARTPROC
  5932. @@ -1138,6 +1154,7 @@
  5933. ret
  5934. CFI_ENDPROC
  5935. END(do_softirq_own_stack)
  5936. +#endif
  5937. #ifdef CONFIG_XEN
  5938. idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  5939. @@ -1302,7 +1319,7 @@
  5940. movq %rsp,%rdi /* &pt_regs */
  5941. call sync_regs
  5942. movq %rax,%rsp /* switch stack for scheduling */
  5943. - testl $_TIF_NEED_RESCHED,%ebx
  5944. + testl $_TIF_NEED_RESCHED_MASK,%ebx
  5945. jnz paranoid_schedule
  5946. movl %ebx,%edx /* arg3: thread flags */
  5947. TRACE_IRQS_ON
  5948. diff -Nur linux-3.18.14.orig/arch/x86/kernel/irq_32.c linux-3.18.14-rt/arch/x86/kernel/irq_32.c
  5949. --- linux-3.18.14.orig/arch/x86/kernel/irq_32.c 2015-05-20 10:04:50.000000000 -0500
  5950. +++ linux-3.18.14-rt/arch/x86/kernel/irq_32.c 2015-05-31 15:32:46.653635383 -0500
  5951. @@ -142,6 +142,7 @@
  5952. cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
  5953. }
  5954. +#ifndef CONFIG_PREEMPT_RT_FULL
  5955. void do_softirq_own_stack(void)
  5956. {
  5957. struct thread_info *curstk;
  5958. @@ -160,6 +161,7 @@
  5959. call_on_stack(__do_softirq, isp);
  5960. }
  5961. +#endif
  5962. bool handle_irq(unsigned irq, struct pt_regs *regs)
  5963. {
  5964. diff -Nur linux-3.18.14.orig/arch/x86/kernel/process_32.c linux-3.18.14-rt/arch/x86/kernel/process_32.c
  5965. --- linux-3.18.14.orig/arch/x86/kernel/process_32.c 2015-05-20 10:04:50.000000000 -0500
  5966. +++ linux-3.18.14-rt/arch/x86/kernel/process_32.c 2015-05-31 15:32:46.653635383 -0500
  5967. @@ -35,6 +35,7 @@
  5968. #include <linux/uaccess.h>
  5969. #include <linux/io.h>
  5970. #include <linux/kdebug.h>
  5971. +#include <linux/highmem.h>
  5972. #include <asm/pgtable.h>
  5973. #include <asm/ldt.h>
  5974. @@ -214,6 +215,35 @@
  5975. }
  5976. EXPORT_SYMBOL_GPL(start_thread);
  5977. +#ifdef CONFIG_PREEMPT_RT_FULL
  5978. +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
  5979. +{
  5980. + int i;
  5981. +
  5982. + /*
  5983. + * Clear @prev's kmap_atomic mappings
  5984. + */
  5985. + for (i = 0; i < prev_p->kmap_idx; i++) {
  5986. + int idx = i + KM_TYPE_NR * smp_processor_id();
  5987. + pte_t *ptep = kmap_pte - idx;
  5988. +
  5989. + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
  5990. + }
  5991. + /*
  5992. + * Restore @next_p's kmap_atomic mappings
  5993. + */
  5994. + for (i = 0; i < next_p->kmap_idx; i++) {
  5995. + int idx = i + KM_TYPE_NR * smp_processor_id();
  5996. +
  5997. + if (!pte_none(next_p->kmap_pte[i]))
  5998. + set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
  5999. + }
  6000. +}
  6001. +#else
  6002. +static inline void
  6003. +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
  6004. +#endif
  6005. +
  6006. /*
  6007. * switch_to(x,y) should switch tasks from x to y.
  6008. @@ -301,6 +331,8 @@
  6009. task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
  6010. __switch_to_xtra(prev_p, next_p, tss);
  6011. + switch_kmaps(prev_p, next_p);
  6012. +
  6013. /*
  6014. * Leave lazy mode, flushing any hypercalls made here.
  6015. * This must be done before restoring TLS segments so
  6016. diff -Nur linux-3.18.14.orig/arch/x86/kernel/signal.c linux-3.18.14-rt/arch/x86/kernel/signal.c
  6017. --- linux-3.18.14.orig/arch/x86/kernel/signal.c 2015-05-20 10:04:50.000000000 -0500
  6018. +++ linux-3.18.14-rt/arch/x86/kernel/signal.c 2015-05-31 15:32:46.653635383 -0500
  6019. @@ -746,6 +746,14 @@
  6020. mce_notify_process();
  6021. #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
  6022. +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
  6023. + if (unlikely(current->forced_info.si_signo)) {
  6024. + struct task_struct *t = current;
  6025. + force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
  6026. + t->forced_info.si_signo = 0;
  6027. + }
  6028. +#endif
  6029. +
  6030. if (thread_info_flags & _TIF_UPROBE)
  6031. uprobe_notify_resume(regs);
  6032. diff -Nur linux-3.18.14.orig/arch/x86/kernel/traps.c linux-3.18.14-rt/arch/x86/kernel/traps.c
  6033. --- linux-3.18.14.orig/arch/x86/kernel/traps.c 2015-05-20 10:04:50.000000000 -0500
  6034. +++ linux-3.18.14-rt/arch/x86/kernel/traps.c 2015-05-31 15:32:46.657635383 -0500
  6035. @@ -87,9 +87,21 @@
  6036. local_irq_enable();
  6037. }
  6038. -static inline void preempt_conditional_sti(struct pt_regs *regs)
  6039. +static inline void conditional_sti_ist(struct pt_regs *regs)
  6040. {
  6041. +#ifdef CONFIG_X86_64
  6042. + /*
  6043. + * X86_64 uses a per CPU stack on the IST for certain traps
  6044. + * like int3. The task can not be preempted when using one
  6045. + * of these stacks, thus preemption must be disabled, otherwise
  6046. + * the stack can be corrupted if the task is scheduled out,
  6047. + * and another task comes in and uses this stack.
  6048. + *
  6049. + * On x86_32 the task keeps its own stack and it is OK if the
  6050. + * task schedules out.
  6051. + */
  6052. preempt_count_inc();
  6053. +#endif
  6054. if (regs->flags & X86_EFLAGS_IF)
  6055. local_irq_enable();
  6056. }
  6057. @@ -100,11 +112,13 @@
  6058. local_irq_disable();
  6059. }
  6060. -static inline void preempt_conditional_cli(struct pt_regs *regs)
  6061. +static inline void conditional_cli_ist(struct pt_regs *regs)
  6062. {
  6063. if (regs->flags & X86_EFLAGS_IF)
  6064. local_irq_disable();
  6065. +#ifdef CONFIG_X86_64
  6066. preempt_count_dec();
  6067. +#endif
  6068. }
  6069. static nokprobe_inline int
  6070. @@ -372,9 +386,9 @@
  6071. * as we may switch to the interrupt stack.
  6072. */
  6073. debug_stack_usage_inc();
  6074. - preempt_conditional_sti(regs);
  6075. + conditional_sti_ist(regs);
  6076. do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
  6077. - preempt_conditional_cli(regs);
  6078. + conditional_cli_ist(regs);
  6079. debug_stack_usage_dec();
  6080. exit:
  6081. exception_exit(prev_state);
  6082. @@ -517,12 +531,12 @@
  6083. debug_stack_usage_inc();
  6084. /* It's safe to allow irq's after DR6 has been saved */
  6085. - preempt_conditional_sti(regs);
  6086. + conditional_sti_ist(regs);
  6087. if (regs->flags & X86_VM_MASK) {
  6088. handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
  6089. X86_TRAP_DB);
  6090. - preempt_conditional_cli(regs);
  6091. + conditional_cli_ist(regs);
  6092. debug_stack_usage_dec();
  6093. goto exit;
  6094. }
  6095. @@ -542,7 +556,7 @@
  6096. si_code = get_si_code(tsk->thread.debugreg6);
  6097. if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
  6098. send_sigtrap(tsk, regs, error_code, si_code);
  6099. - preempt_conditional_cli(regs);
  6100. + conditional_cli_ist(regs);
  6101. debug_stack_usage_dec();
  6102. exit:
  6103. diff -Nur linux-3.18.14.orig/arch/x86/kvm/lapic.c linux-3.18.14-rt/arch/x86/kvm/lapic.c
  6104. --- linux-3.18.14.orig/arch/x86/kvm/lapic.c 2015-05-20 10:04:50.000000000 -0500
  6105. +++ linux-3.18.14-rt/arch/x86/kvm/lapic.c 2015-05-31 15:32:46.693635383 -0500
  6106. @@ -1034,8 +1034,38 @@
  6107. apic->divide_count);
  6108. }
  6109. +
  6110. +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data);
  6111. +
  6112. +static void apic_timer_expired(struct hrtimer *data)
  6113. +{
  6114. + int ret, i = 0;
  6115. + enum hrtimer_restart r;
  6116. + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
  6117. +
  6118. + r = apic_timer_fn(data);
  6119. +
  6120. + if (r == HRTIMER_RESTART) {
  6121. + do {
  6122. + ret = hrtimer_start_expires(data, HRTIMER_MODE_ABS);
  6123. + if (ret == -ETIME)
  6124. + hrtimer_add_expires_ns(&ktimer->timer,
  6125. + ktimer->period);
  6126. + i++;
  6127. + } while (ret == -ETIME && i < 10);
  6128. +
  6129. + if (ret == -ETIME) {
  6130. + printk_once(KERN_ERR "%s: failed to reprogram timer\n",
  6131. + __func__);
  6132. + WARN_ON_ONCE(1);
  6133. + }
  6134. + }
  6135. +}
  6136. +
  6137. +
  6138. static void start_apic_timer(struct kvm_lapic *apic)
  6139. {
  6140. + int ret;
  6141. ktime_t now;
  6142. atomic_set(&apic->lapic_timer.pending, 0);
  6143. @@ -1065,9 +1095,11 @@
  6144. }
  6145. }
  6146. - hrtimer_start(&apic->lapic_timer.timer,
  6147. + ret = hrtimer_start(&apic->lapic_timer.timer,
  6148. ktime_add_ns(now, apic->lapic_timer.period),
  6149. HRTIMER_MODE_ABS);
  6150. + if (ret == -ETIME)
  6151. + apic_timer_expired(&apic->lapic_timer.timer);
  6152. apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
  6153. PRIx64 ", "
  6154. @@ -1097,8 +1129,10 @@
  6155. ns = (tscdeadline - guest_tsc) * 1000000ULL;
  6156. do_div(ns, this_tsc_khz);
  6157. }
  6158. - hrtimer_start(&apic->lapic_timer.timer,
  6159. + ret = hrtimer_start(&apic->lapic_timer.timer,
  6160. ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
  6161. + if (ret == -ETIME)
  6162. + apic_timer_expired(&apic->lapic_timer.timer);
  6163. local_irq_restore(flags);
  6164. }
  6165. @@ -1539,7 +1573,7 @@
  6166. struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
  6167. struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
  6168. struct kvm_vcpu *vcpu = apic->vcpu;
  6169. - wait_queue_head_t *q = &vcpu->wq;
  6170. + struct swait_head *q = &vcpu->wq;
  6171. /*
  6172. * There is a race window between reading and incrementing, but we do
  6173. @@ -1553,8 +1587,8 @@
  6174. kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
  6175. }
  6176. - if (waitqueue_active(q))
  6177. - wake_up_interruptible(q);
  6178. + if (swaitqueue_active(q))
  6179. + swait_wake_interruptible(q);
  6180. if (lapic_is_periodic(apic)) {
  6181. hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
  6182. @@ -1587,6 +1621,7 @@
  6183. hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
  6184. HRTIMER_MODE_ABS);
  6185. apic->lapic_timer.timer.function = apic_timer_fn;
  6186. + apic->lapic_timer.timer.irqsafe = 1;
  6187. /*
  6188. * APIC is created enabled. This will prevent kvm_lapic_set_base from
  6189. @@ -1707,7 +1742,8 @@
  6190. timer = &vcpu->arch.apic->lapic_timer.timer;
  6191. if (hrtimer_cancel(timer))
  6192. - hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  6193. + if (hrtimer_start_expires(timer, HRTIMER_MODE_ABS) == -ETIME)
  6194. + apic_timer_expired(timer);
  6195. }
  6196. /*
  6197. diff -Nur linux-3.18.14.orig/arch/x86/kvm/x86.c linux-3.18.14-rt/arch/x86/kvm/x86.c
  6198. --- linux-3.18.14.orig/arch/x86/kvm/x86.c 2015-05-20 10:04:50.000000000 -0500
  6199. +++ linux-3.18.14-rt/arch/x86/kvm/x86.c 2015-05-31 15:32:46.697635383 -0500
  6200. @@ -5772,6 +5772,13 @@
  6201. goto out;
  6202. }
  6203. +#ifdef CONFIG_PREEMPT_RT_FULL
  6204. + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  6205. + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
  6206. + return -EOPNOTSUPP;
  6207. + }
  6208. +#endif
  6209. +
  6210. r = kvm_mmu_module_init();
  6211. if (r)
  6212. goto out_free_percpu;
  6213. diff -Nur linux-3.18.14.orig/arch/x86/mm/fault.c linux-3.18.14-rt/arch/x86/mm/fault.c
  6214. --- linux-3.18.14.orig/arch/x86/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  6215. +++ linux-3.18.14-rt/arch/x86/mm/fault.c 2015-05-31 15:32:46.729635382 -0500
  6216. @@ -1128,7 +1128,7 @@
  6217. * If we're in an interrupt, have no user context or are running
  6218. * in an atomic region then we must not take the fault:
  6219. */
  6220. - if (unlikely(in_atomic() || !mm)) {
  6221. + if (unlikely(!mm || pagefault_disabled())) {
  6222. bad_area_nosemaphore(regs, error_code, address);
  6223. return;
  6224. }
  6225. diff -Nur linux-3.18.14.orig/arch/x86/mm/highmem_32.c linux-3.18.14-rt/arch/x86/mm/highmem_32.c
  6226. --- linux-3.18.14.orig/arch/x86/mm/highmem_32.c 2015-05-20 10:04:50.000000000 -0500
  6227. +++ linux-3.18.14-rt/arch/x86/mm/highmem_32.c 2015-05-31 15:32:46.729635382 -0500
  6228. @@ -32,6 +32,7 @@
  6229. */
  6230. void *kmap_atomic_prot(struct page *page, pgprot_t prot)
  6231. {
  6232. + pte_t pte = mk_pte(page, prot);
  6233. unsigned long vaddr;
  6234. int idx, type;
  6235. @@ -45,7 +46,10 @@
  6236. idx = type + KM_TYPE_NR*smp_processor_id();
  6237. vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
  6238. BUG_ON(!pte_none(*(kmap_pte-idx)));
  6239. - set_pte(kmap_pte-idx, mk_pte(page, prot));
  6240. +#ifdef CONFIG_PREEMPT_RT_FULL
  6241. + current->kmap_pte[type] = pte;
  6242. +#endif
  6243. + set_pte(kmap_pte-idx, pte);
  6244. arch_flush_lazy_mmu_mode();
  6245. return (void *)vaddr;
  6246. @@ -88,6 +92,9 @@
  6247. * is a bad idea also, in case the page changes cacheability
  6248. * attributes or becomes a protected page in a hypervisor.
  6249. */
  6250. +#ifdef CONFIG_PREEMPT_RT_FULL
  6251. + current->kmap_pte[type] = __pte(0);
  6252. +#endif
  6253. kpte_clear_flush(kmap_pte-idx, vaddr);
  6254. kmap_atomic_idx_pop();
  6255. arch_flush_lazy_mmu_mode();
  6256. diff -Nur linux-3.18.14.orig/arch/x86/mm/iomap_32.c linux-3.18.14-rt/arch/x86/mm/iomap_32.c
  6257. --- linux-3.18.14.orig/arch/x86/mm/iomap_32.c 2015-05-20 10:04:50.000000000 -0500
  6258. +++ linux-3.18.14-rt/arch/x86/mm/iomap_32.c 2015-05-31 15:32:46.733635383 -0500
  6259. @@ -56,6 +56,7 @@
  6260. void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
  6261. {
  6262. + pte_t pte = pfn_pte(pfn, prot);
  6263. unsigned long vaddr;
  6264. int idx, type;
  6265. @@ -64,7 +65,12 @@
  6266. type = kmap_atomic_idx_push();
  6267. idx = type + KM_TYPE_NR * smp_processor_id();
  6268. vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
  6269. - set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
  6270. + WARN_ON(!pte_none(*(kmap_pte - idx)));
  6271. +
  6272. +#ifdef CONFIG_PREEMPT_RT_FULL
  6273. + current->kmap_pte[type] = pte;
  6274. +#endif
  6275. + set_pte(kmap_pte - idx, pte);
  6276. arch_flush_lazy_mmu_mode();
  6277. return (void *)vaddr;
  6278. @@ -110,6 +116,9 @@
  6279. * is a bad idea also, in case the page changes cacheability
  6280. * attributes or becomes a protected page in a hypervisor.
  6281. */
  6282. +#ifdef CONFIG_PREEMPT_RT_FULL
  6283. + current->kmap_pte[type] = __pte(0);
  6284. +#endif
  6285. kpte_clear_flush(kmap_pte-idx, vaddr);
  6286. kmap_atomic_idx_pop();
  6287. }
  6288. diff -Nur linux-3.18.14.orig/arch/x86/platform/uv/tlb_uv.c linux-3.18.14-rt/arch/x86/platform/uv/tlb_uv.c
  6289. --- linux-3.18.14.orig/arch/x86/platform/uv/tlb_uv.c 2015-05-20 10:04:50.000000000 -0500
  6290. +++ linux-3.18.14-rt/arch/x86/platform/uv/tlb_uv.c 2015-05-31 15:32:46.733635383 -0500
  6291. @@ -714,9 +714,9 @@
  6292. quiesce_local_uvhub(hmaster);
  6293. - spin_lock(&hmaster->queue_lock);
  6294. + raw_spin_lock(&hmaster->queue_lock);
  6295. reset_with_ipi(&bau_desc->distribution, bcp);
  6296. - spin_unlock(&hmaster->queue_lock);
  6297. + raw_spin_unlock(&hmaster->queue_lock);
  6298. end_uvhub_quiesce(hmaster);
  6299. @@ -736,9 +736,9 @@
  6300. quiesce_local_uvhub(hmaster);
  6301. - spin_lock(&hmaster->queue_lock);
  6302. + raw_spin_lock(&hmaster->queue_lock);
  6303. reset_with_ipi(&bau_desc->distribution, bcp);
  6304. - spin_unlock(&hmaster->queue_lock);
  6305. + raw_spin_unlock(&hmaster->queue_lock);
  6306. end_uvhub_quiesce(hmaster);
  6307. @@ -759,7 +759,7 @@
  6308. cycles_t tm1;
  6309. hmaster = bcp->uvhub_master;
  6310. - spin_lock(&hmaster->disable_lock);
  6311. + raw_spin_lock(&hmaster->disable_lock);
  6312. if (!bcp->baudisabled) {
  6313. stat->s_bau_disabled++;
  6314. tm1 = get_cycles();
  6315. @@ -772,7 +772,7 @@
  6316. }
  6317. }
  6318. }
  6319. - spin_unlock(&hmaster->disable_lock);
  6320. + raw_spin_unlock(&hmaster->disable_lock);
  6321. }
  6322. static void count_max_concurr(int stat, struct bau_control *bcp,
  6323. @@ -835,7 +835,7 @@
  6324. */
  6325. static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
  6326. {
  6327. - spinlock_t *lock = &hmaster->uvhub_lock;
  6328. + raw_spinlock_t *lock = &hmaster->uvhub_lock;
  6329. atomic_t *v;
  6330. v = &hmaster->active_descriptor_count;
  6331. @@ -968,7 +968,7 @@
  6332. struct bau_control *hmaster;
  6333. hmaster = bcp->uvhub_master;
  6334. - spin_lock(&hmaster->disable_lock);
  6335. + raw_spin_lock(&hmaster->disable_lock);
  6336. if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
  6337. stat->s_bau_reenabled++;
  6338. for_each_present_cpu(tcpu) {
  6339. @@ -980,10 +980,10 @@
  6340. tbcp->period_giveups = 0;
  6341. }
  6342. }
  6343. - spin_unlock(&hmaster->disable_lock);
  6344. + raw_spin_unlock(&hmaster->disable_lock);
  6345. return 0;
  6346. }
  6347. - spin_unlock(&hmaster->disable_lock);
  6348. + raw_spin_unlock(&hmaster->disable_lock);
  6349. return -1;
  6350. }
  6351. @@ -1899,9 +1899,9 @@
  6352. bcp->cong_reps = congested_reps;
  6353. bcp->disabled_period = sec_2_cycles(disabled_period);
  6354. bcp->giveup_limit = giveup_limit;
  6355. - spin_lock_init(&bcp->queue_lock);
  6356. - spin_lock_init(&bcp->uvhub_lock);
  6357. - spin_lock_init(&bcp->disable_lock);
  6358. + raw_spin_lock_init(&bcp->queue_lock);
  6359. + raw_spin_lock_init(&bcp->uvhub_lock);
  6360. + raw_spin_lock_init(&bcp->disable_lock);
  6361. }
  6362. }
  6363. diff -Nur linux-3.18.14.orig/arch/x86/platform/uv/uv_time.c linux-3.18.14-rt/arch/x86/platform/uv/uv_time.c
  6364. --- linux-3.18.14.orig/arch/x86/platform/uv/uv_time.c 2015-05-20 10:04:50.000000000 -0500
  6365. +++ linux-3.18.14-rt/arch/x86/platform/uv/uv_time.c 2015-05-31 15:32:46.737635383 -0500
  6366. @@ -58,7 +58,7 @@
  6367. /* There is one of these allocated per node */
  6368. struct uv_rtc_timer_head {
  6369. - spinlock_t lock;
  6370. + raw_spinlock_t lock;
  6371. /* next cpu waiting for timer, local node relative: */
  6372. int next_cpu;
  6373. /* number of cpus on this node: */
  6374. @@ -178,7 +178,7 @@
  6375. uv_rtc_deallocate_timers();
  6376. return -ENOMEM;
  6377. }
  6378. - spin_lock_init(&head->lock);
  6379. + raw_spin_lock_init(&head->lock);
  6380. head->ncpus = uv_blade_nr_possible_cpus(bid);
  6381. head->next_cpu = -1;
  6382. blade_info[bid] = head;
  6383. @@ -232,7 +232,7 @@
  6384. unsigned long flags;
  6385. int next_cpu;
  6386. - spin_lock_irqsave(&head->lock, flags);
  6387. + raw_spin_lock_irqsave(&head->lock, flags);
  6388. next_cpu = head->next_cpu;
  6389. *t = expires;
  6390. @@ -244,12 +244,12 @@
  6391. if (uv_setup_intr(cpu, expires)) {
  6392. *t = ULLONG_MAX;
  6393. uv_rtc_find_next_timer(head, pnode);
  6394. - spin_unlock_irqrestore(&head->lock, flags);
  6395. + raw_spin_unlock_irqrestore(&head->lock, flags);
  6396. return -ETIME;
  6397. }
  6398. }
  6399. - spin_unlock_irqrestore(&head->lock, flags);
  6400. + raw_spin_unlock_irqrestore(&head->lock, flags);
  6401. return 0;
  6402. }
  6403. @@ -268,7 +268,7 @@
  6404. unsigned long flags;
  6405. int rc = 0;
  6406. - spin_lock_irqsave(&head->lock, flags);
  6407. + raw_spin_lock_irqsave(&head->lock, flags);
  6408. if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
  6409. rc = 1;
  6410. @@ -280,7 +280,7 @@
  6411. uv_rtc_find_next_timer(head, pnode);
  6412. }
  6413. - spin_unlock_irqrestore(&head->lock, flags);
  6414. + raw_spin_unlock_irqrestore(&head->lock, flags);
  6415. return rc;
  6416. }
  6417. @@ -300,13 +300,18 @@
  6418. static cycle_t uv_read_rtc(struct clocksource *cs)
  6419. {
  6420. unsigned long offset;
  6421. + cycle_t cycles;
  6422. + preempt_disable();
  6423. if (uv_get_min_hub_revision_id() == 1)
  6424. offset = 0;
  6425. else
  6426. offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
  6427. - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
  6428. + cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
  6429. + preempt_enable();
  6430. +
  6431. + return cycles;
  6432. }
  6433. /*
  6434. diff -Nur linux-3.18.14.orig/arch/xtensa/mm/fault.c linux-3.18.14-rt/arch/xtensa/mm/fault.c
  6435. --- linux-3.18.14.orig/arch/xtensa/mm/fault.c 2015-05-20 10:04:50.000000000 -0500
  6436. +++ linux-3.18.14-rt/arch/xtensa/mm/fault.c 2015-05-31 15:32:46.741635382 -0500
  6437. @@ -57,7 +57,7 @@
  6438. /* If we're in an interrupt or have no user
  6439. * context, we must not take the fault..
  6440. */
  6441. - if (in_atomic() || !mm) {
  6442. + if (!mm || pagefault_disabled()) {
  6443. bad_page_fault(regs, address, SIGSEGV);
  6444. return;
  6445. }
  6446. diff -Nur linux-3.18.14.orig/block/blk-core.c linux-3.18.14-rt/block/blk-core.c
  6447. --- linux-3.18.14.orig/block/blk-core.c 2015-05-20 10:04:50.000000000 -0500
  6448. +++ linux-3.18.14-rt/block/blk-core.c 2015-05-31 15:32:46.757635382 -0500
  6449. @@ -100,6 +100,9 @@
  6450. INIT_LIST_HEAD(&rq->queuelist);
  6451. INIT_LIST_HEAD(&rq->timeout_list);
  6452. +#ifdef CONFIG_PREEMPT_RT_FULL
  6453. + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
  6454. +#endif
  6455. rq->cpu = -1;
  6456. rq->q = q;
  6457. rq->__sector = (sector_t) -1;
  6458. @@ -194,7 +197,7 @@
  6459. **/
  6460. void blk_start_queue(struct request_queue *q)
  6461. {
  6462. - WARN_ON(!irqs_disabled());
  6463. + WARN_ON_NONRT(!irqs_disabled());
  6464. queue_flag_clear(QUEUE_FLAG_STOPPED, q);
  6465. __blk_run_queue(q);
  6466. @@ -627,7 +630,7 @@
  6467. q->bypass_depth = 1;
  6468. __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
  6469. - init_waitqueue_head(&q->mq_freeze_wq);
  6470. + init_swait_head(&q->mq_freeze_wq);
  6471. if (blkcg_init_queue(q))
  6472. goto fail_bdi;
  6473. @@ -3037,7 +3040,7 @@
  6474. blk_run_queue_async(q);
  6475. else
  6476. __blk_run_queue(q);
  6477. - spin_unlock(q->queue_lock);
  6478. + spin_unlock_irq(q->queue_lock);
  6479. }
  6480. static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
  6481. @@ -3085,7 +3088,6 @@
  6482. void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
  6483. {
  6484. struct request_queue *q;
  6485. - unsigned long flags;
  6486. struct request *rq;
  6487. LIST_HEAD(list);
  6488. unsigned int depth;
  6489. @@ -3105,11 +3107,6 @@
  6490. q = NULL;
  6491. depth = 0;
  6492. - /*
  6493. - * Save and disable interrupts here, to avoid doing it for every
  6494. - * queue lock we have to take.
  6495. - */
  6496. - local_irq_save(flags);
  6497. while (!list_empty(&list)) {
  6498. rq = list_entry_rq(list.next);
  6499. list_del_init(&rq->queuelist);
  6500. @@ -3122,7 +3119,7 @@
  6501. queue_unplugged(q, depth, from_schedule);
  6502. q = rq->q;
  6503. depth = 0;
  6504. - spin_lock(q->queue_lock);
  6505. + spin_lock_irq(q->queue_lock);
  6506. }
  6507. /*
  6508. @@ -3149,8 +3146,6 @@
  6509. */
  6510. if (q)
  6511. queue_unplugged(q, depth, from_schedule);
  6512. -
  6513. - local_irq_restore(flags);
  6514. }
  6515. void blk_finish_plug(struct blk_plug *plug)
  6516. diff -Nur linux-3.18.14.orig/block/blk-ioc.c linux-3.18.14-rt/block/blk-ioc.c
  6517. --- linux-3.18.14.orig/block/blk-ioc.c 2015-05-20 10:04:50.000000000 -0500
  6518. +++ linux-3.18.14-rt/block/blk-ioc.c 2015-05-31 15:32:46.761635382 -0500
  6519. @@ -7,6 +7,7 @@
  6520. #include <linux/bio.h>
  6521. #include <linux/blkdev.h>
  6522. #include <linux/slab.h>
  6523. +#include <linux/delay.h>
  6524. #include "blk.h"
  6525. @@ -109,7 +110,7 @@
  6526. spin_unlock(q->queue_lock);
  6527. } else {
  6528. spin_unlock_irqrestore(&ioc->lock, flags);
  6529. - cpu_relax();
  6530. + cpu_chill();
  6531. spin_lock_irqsave_nested(&ioc->lock, flags, 1);
  6532. }
  6533. }
  6534. @@ -187,7 +188,7 @@
  6535. spin_unlock(icq->q->queue_lock);
  6536. } else {
  6537. spin_unlock_irqrestore(&ioc->lock, flags);
  6538. - cpu_relax();
  6539. + cpu_chill();
  6540. goto retry;
  6541. }
  6542. }
  6543. diff -Nur linux-3.18.14.orig/block/blk-iopoll.c linux-3.18.14-rt/block/blk-iopoll.c
  6544. --- linux-3.18.14.orig/block/blk-iopoll.c 2015-05-20 10:04:50.000000000 -0500
  6545. +++ linux-3.18.14-rt/block/blk-iopoll.c 2015-05-31 15:32:46.761635382 -0500
  6546. @@ -35,6 +35,7 @@
  6547. list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
  6548. __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
  6549. local_irq_restore(flags);
  6550. + preempt_check_resched_rt();
  6551. }
  6552. EXPORT_SYMBOL(blk_iopoll_sched);
  6553. @@ -132,6 +133,7 @@
  6554. __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
  6555. local_irq_enable();
  6556. + preempt_check_resched_rt();
  6557. }
  6558. /**
  6559. @@ -201,6 +203,7 @@
  6560. this_cpu_ptr(&blk_cpu_iopoll));
  6561. __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
  6562. local_irq_enable();
  6563. + preempt_check_resched_rt();
  6564. }
  6565. return NOTIFY_OK;
  6566. diff -Nur linux-3.18.14.orig/block/blk-mq.c linux-3.18.14-rt/block/blk-mq.c
  6567. --- linux-3.18.14.orig/block/blk-mq.c 2015-05-20 10:04:50.000000000 -0500
  6568. +++ linux-3.18.14-rt/block/blk-mq.c 2015-05-31 15:32:46.789635382 -0500
  6569. @@ -85,7 +85,7 @@
  6570. if (percpu_ref_tryget_live(&q->mq_usage_counter))
  6571. return 0;
  6572. - ret = wait_event_interruptible(q->mq_freeze_wq,
  6573. + ret = swait_event_interruptible(q->mq_freeze_wq,
  6574. !q->mq_freeze_depth || blk_queue_dying(q));
  6575. if (blk_queue_dying(q))
  6576. return -ENODEV;
  6577. @@ -104,7 +104,7 @@
  6578. struct request_queue *q =
  6579. container_of(ref, struct request_queue, mq_usage_counter);
  6580. - wake_up_all(&q->mq_freeze_wq);
  6581. + swait_wake_all(&q->mq_freeze_wq);
  6582. }
  6583. static void blk_mq_freeze_queue_start(struct request_queue *q)
  6584. @@ -123,7 +123,7 @@
  6585. static void blk_mq_freeze_queue_wait(struct request_queue *q)
  6586. {
  6587. - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
  6588. + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
  6589. }
  6590. /*
  6591. @@ -146,7 +146,7 @@
  6592. spin_unlock_irq(q->queue_lock);
  6593. if (wake) {
  6594. percpu_ref_reinit(&q->mq_usage_counter);
  6595. - wake_up_all(&q->mq_freeze_wq);
  6596. + swait_wake_all(&q->mq_freeze_wq);
  6597. }
  6598. }
  6599. @@ -194,6 +194,9 @@
  6600. rq->resid_len = 0;
  6601. rq->sense = NULL;
  6602. +#ifdef CONFIG_PREEMPT_RT_FULL
  6603. + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
  6604. +#endif
  6605. INIT_LIST_HEAD(&rq->timeout_list);
  6606. rq->timeout = 0;
  6607. @@ -313,6 +316,17 @@
  6608. }
  6609. EXPORT_SYMBOL(blk_mq_end_request);
  6610. +#ifdef CONFIG_PREEMPT_RT_FULL
  6611. +
  6612. +void __blk_mq_complete_request_remote_work(struct work_struct *work)
  6613. +{
  6614. + struct request *rq = container_of(work, struct request, work);
  6615. +
  6616. + rq->q->softirq_done_fn(rq);
  6617. +}
  6618. +
  6619. +#else
  6620. +
  6621. static void __blk_mq_complete_request_remote(void *data)
  6622. {
  6623. struct request *rq = data;
  6624. @@ -320,6 +334,8 @@
  6625. rq->q->softirq_done_fn(rq);
  6626. }
  6627. +#endif
  6628. +
  6629. static void blk_mq_ipi_complete_request(struct request *rq)
  6630. {
  6631. struct blk_mq_ctx *ctx = rq->mq_ctx;
  6632. @@ -331,19 +347,23 @@
  6633. return;
  6634. }
  6635. - cpu = get_cpu();
  6636. + cpu = get_cpu_light();
  6637. if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
  6638. shared = cpus_share_cache(cpu, ctx->cpu);
  6639. if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
  6640. +#ifdef CONFIG_PREEMPT_RT_FULL
  6641. + schedule_work_on(ctx->cpu, &rq->work);
  6642. +#else
  6643. rq->csd.func = __blk_mq_complete_request_remote;
  6644. rq->csd.info = rq;
  6645. rq->csd.flags = 0;
  6646. smp_call_function_single_async(ctx->cpu, &rq->csd);
  6647. +#endif
  6648. } else {
  6649. rq->q->softirq_done_fn(rq);
  6650. }
  6651. - put_cpu();
  6652. + put_cpu_light();
  6653. }
  6654. void __blk_mq_complete_request(struct request *rq)
  6655. @@ -814,9 +834,9 @@
  6656. test_bit(BLK_MQ_S_STOPPED, &hctx->state))
  6657. continue;
  6658. - preempt_disable();
  6659. + migrate_disable();
  6660. blk_mq_run_hw_queue(hctx, async);
  6661. - preempt_enable();
  6662. + migrate_enable();
  6663. }
  6664. }
  6665. EXPORT_SYMBOL(blk_mq_run_queues);
  6666. @@ -843,9 +863,9 @@
  6667. {
  6668. clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  6669. - preempt_disable();
  6670. + migrate_disable();
  6671. blk_mq_run_hw_queue(hctx, false);
  6672. - preempt_enable();
  6673. + migrate_enable();
  6674. }
  6675. EXPORT_SYMBOL(blk_mq_start_hw_queue);
  6676. @@ -870,9 +890,9 @@
  6677. continue;
  6678. clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  6679. - preempt_disable();
  6680. + migrate_disable();
  6681. blk_mq_run_hw_queue(hctx, async);
  6682. - preempt_enable();
  6683. + migrate_enable();
  6684. }
  6685. }
  6686. EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
  6687. @@ -1494,7 +1514,7 @@
  6688. {
  6689. struct blk_mq_hw_ctx *hctx = data;
  6690. - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
  6691. + if (action == CPU_POST_DEAD)
  6692. return blk_mq_hctx_cpu_offline(hctx, cpu);
  6693. else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
  6694. return blk_mq_hctx_cpu_online(hctx, cpu);
  6695. diff -Nur linux-3.18.14.orig/block/blk-mq-cpu.c linux-3.18.14-rt/block/blk-mq-cpu.c
  6696. --- linux-3.18.14.orig/block/blk-mq-cpu.c 2015-05-20 10:04:50.000000000 -0500
  6697. +++ linux-3.18.14-rt/block/blk-mq-cpu.c 2015-05-31 15:32:46.773635382 -0500
  6698. @@ -16,7 +16,7 @@
  6699. #include "blk-mq.h"
  6700. static LIST_HEAD(blk_mq_cpu_notify_list);
  6701. -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
  6702. +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
  6703. static int blk_mq_main_cpu_notify(struct notifier_block *self,
  6704. unsigned long action, void *hcpu)
  6705. @@ -25,7 +25,10 @@
  6706. struct blk_mq_cpu_notifier *notify;
  6707. int ret = NOTIFY_OK;
  6708. - raw_spin_lock(&blk_mq_cpu_notify_lock);
  6709. + if (action != CPU_POST_DEAD)
  6710. + return NOTIFY_OK;
  6711. +
  6712. + spin_lock(&blk_mq_cpu_notify_lock);
  6713. list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
  6714. ret = notify->notify(notify->data, action, cpu);
  6715. @@ -33,7 +36,7 @@
  6716. break;
  6717. }
  6718. - raw_spin_unlock(&blk_mq_cpu_notify_lock);
  6719. + spin_unlock(&blk_mq_cpu_notify_lock);
  6720. return ret;
  6721. }
  6722. @@ -41,16 +44,16 @@
  6723. {
  6724. BUG_ON(!notifier->notify);
  6725. - raw_spin_lock(&blk_mq_cpu_notify_lock);
  6726. + spin_lock(&blk_mq_cpu_notify_lock);
  6727. list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
  6728. - raw_spin_unlock(&blk_mq_cpu_notify_lock);
  6729. + spin_unlock(&blk_mq_cpu_notify_lock);
  6730. }
  6731. void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
  6732. {
  6733. - raw_spin_lock(&blk_mq_cpu_notify_lock);
  6734. + spin_lock(&blk_mq_cpu_notify_lock);
  6735. list_del(&notifier->list);
  6736. - raw_spin_unlock(&blk_mq_cpu_notify_lock);
  6737. + spin_unlock(&blk_mq_cpu_notify_lock);
  6738. }
  6739. void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
  6740. diff -Nur linux-3.18.14.orig/block/blk-mq.h linux-3.18.14-rt/block/blk-mq.h
  6741. --- linux-3.18.14.orig/block/blk-mq.h 2015-05-20 10:04:50.000000000 -0500
  6742. +++ linux-3.18.14-rt/block/blk-mq.h 2015-05-31 15:32:46.789635382 -0500
  6743. @@ -73,7 +73,10 @@
  6744. static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
  6745. unsigned int cpu)
  6746. {
  6747. - return per_cpu_ptr(q->queue_ctx, cpu);
  6748. + struct blk_mq_ctx *ctx;
  6749. +
  6750. + ctx = per_cpu_ptr(q->queue_ctx, cpu);
  6751. + return ctx;
  6752. }
  6753. /*
  6754. @@ -84,12 +87,12 @@
  6755. */
  6756. static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
  6757. {
  6758. - return __blk_mq_get_ctx(q, get_cpu());
  6759. + return __blk_mq_get_ctx(q, get_cpu_light());
  6760. }
  6761. static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
  6762. {
  6763. - put_cpu();
  6764. + put_cpu_light();
  6765. }
  6766. struct blk_mq_alloc_data {
  6767. diff -Nur linux-3.18.14.orig/block/blk-softirq.c linux-3.18.14-rt/block/blk-softirq.c
  6768. --- linux-3.18.14.orig/block/blk-softirq.c 2015-05-20 10:04:50.000000000 -0500
  6769. +++ linux-3.18.14-rt/block/blk-softirq.c 2015-05-31 15:32:46.789635382 -0500
  6770. @@ -51,6 +51,7 @@
  6771. raise_softirq_irqoff(BLOCK_SOFTIRQ);
  6772. local_irq_restore(flags);
  6773. + preempt_check_resched_rt();
  6774. }
  6775. /*
  6776. @@ -93,6 +94,7 @@
  6777. this_cpu_ptr(&blk_cpu_done));
  6778. raise_softirq_irqoff(BLOCK_SOFTIRQ);
  6779. local_irq_enable();
  6780. + preempt_check_resched_rt();
  6781. }
  6782. return NOTIFY_OK;
  6783. @@ -150,6 +152,7 @@
  6784. goto do_local;
  6785. local_irq_restore(flags);
  6786. + preempt_check_resched_rt();
  6787. }
  6788. /**
  6789. diff -Nur linux-3.18.14.orig/block/bounce.c linux-3.18.14-rt/block/bounce.c
  6790. --- linux-3.18.14.orig/block/bounce.c 2015-05-20 10:04:50.000000000 -0500
  6791. +++ linux-3.18.14-rt/block/bounce.c 2015-05-31 15:32:46.793635382 -0500
  6792. @@ -54,11 +54,11 @@
  6793. unsigned long flags;
  6794. unsigned char *vto;
  6795. - local_irq_save(flags);
  6796. + local_irq_save_nort(flags);
  6797. vto = kmap_atomic(to->bv_page);
  6798. memcpy(vto + to->bv_offset, vfrom, to->bv_len);
  6799. kunmap_atomic(vto);
  6800. - local_irq_restore(flags);
  6801. + local_irq_restore_nort(flags);
  6802. }
  6803. #else /* CONFIG_HIGHMEM */
  6804. diff -Nur linux-3.18.14.orig/crypto/algapi.c linux-3.18.14-rt/crypto/algapi.c
  6805. --- linux-3.18.14.orig/crypto/algapi.c 2015-05-20 10:04:50.000000000 -0500
  6806. +++ linux-3.18.14-rt/crypto/algapi.c 2015-05-31 15:32:46.809635382 -0500
  6807. @@ -698,13 +698,13 @@
  6808. int crypto_register_notifier(struct notifier_block *nb)
  6809. {
  6810. - return blocking_notifier_chain_register(&crypto_chain, nb);
  6811. + return srcu_notifier_chain_register(&crypto_chain, nb);
  6812. }
  6813. EXPORT_SYMBOL_GPL(crypto_register_notifier);
  6814. int crypto_unregister_notifier(struct notifier_block *nb)
  6815. {
  6816. - return blocking_notifier_chain_unregister(&crypto_chain, nb);
  6817. + return srcu_notifier_chain_unregister(&crypto_chain, nb);
  6818. }
  6819. EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
  6820. diff -Nur linux-3.18.14.orig/crypto/api.c linux-3.18.14-rt/crypto/api.c
  6821. --- linux-3.18.14.orig/crypto/api.c 2015-05-20 10:04:50.000000000 -0500
  6822. +++ linux-3.18.14-rt/crypto/api.c 2015-05-31 15:32:46.861635382 -0500
  6823. @@ -31,7 +31,7 @@
  6824. DECLARE_RWSEM(crypto_alg_sem);
  6825. EXPORT_SYMBOL_GPL(crypto_alg_sem);
  6826. -BLOCKING_NOTIFIER_HEAD(crypto_chain);
  6827. +SRCU_NOTIFIER_HEAD(crypto_chain);
  6828. EXPORT_SYMBOL_GPL(crypto_chain);
  6829. static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
  6830. @@ -236,10 +236,10 @@
  6831. {
  6832. int ok;
  6833. - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
  6834. + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
  6835. if (ok == NOTIFY_DONE) {
  6836. request_module("cryptomgr");
  6837. - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
  6838. + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
  6839. }
  6840. return ok;
  6841. diff -Nur linux-3.18.14.orig/crypto/internal.h linux-3.18.14-rt/crypto/internal.h
  6842. --- linux-3.18.14.orig/crypto/internal.h 2015-05-20 10:04:50.000000000 -0500
  6843. +++ linux-3.18.14-rt/crypto/internal.h 2015-05-31 15:32:46.865635381 -0500
  6844. @@ -48,7 +48,7 @@
  6845. extern struct list_head crypto_alg_list;
  6846. extern struct rw_semaphore crypto_alg_sem;
  6847. -extern struct blocking_notifier_head crypto_chain;
  6848. +extern struct srcu_notifier_head crypto_chain;
  6849. #ifdef CONFIG_PROC_FS
  6850. void __init crypto_init_proc(void);
  6851. @@ -142,7 +142,7 @@
  6852. static inline void crypto_notify(unsigned long val, void *v)
  6853. {
  6854. - blocking_notifier_call_chain(&crypto_chain, val, v);
  6855. + srcu_notifier_call_chain(&crypto_chain, val, v);
  6856. }
  6857. #endif /* _CRYPTO_INTERNAL_H */
  6858. diff -Nur linux-3.18.14.orig/Documentation/hwlat_detector.txt linux-3.18.14-rt/Documentation/hwlat_detector.txt
  6859. --- linux-3.18.14.orig/Documentation/hwlat_detector.txt 1969-12-31 18:00:00.000000000 -0600
  6860. +++ linux-3.18.14-rt/Documentation/hwlat_detector.txt 2015-05-31 15:32:45.457635394 -0500
  6861. @@ -0,0 +1,64 @@
  6862. +Introduction:
  6863. +-------------
  6864. +
  6865. +The module hwlat_detector is a special purpose kernel module that is used to
  6866. +detect large system latencies induced by the behavior of certain underlying
  6867. +hardware or firmware, independent of Linux itself. The code was developed
  6868. +originally to detect SMIs (System Management Interrupts) on x86 systems,
  6869. +however there is nothing x86 specific about this patchset. It was
  6870. +originally written for use by the "RT" patch since the Real Time
  6871. +kernel is highly latency sensitive.
  6872. +
  6873. +SMIs are usually not serviced by the Linux kernel, which typically does not
  6874. +even know that they are occurring. SMIs are instead set up by BIOS code
  6875. +and are serviced by BIOS code, usually for "critical" events such as
  6876. +management of thermal sensors and fans. Sometimes though, SMIs are used for
  6877. +other tasks and those tasks can spend an inordinate amount of time in the
  6878. +handler (sometimes measured in milliseconds). Obviously this is a problem if
  6879. +you are trying to keep event service latencies down in the microsecond range.
  6880. +
  6881. +The hardware latency detector works by hogging all of the cpus for configurable
  6882. +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
  6883. +for some period, then looking for gaps in the TSC data. Any gap indicates a
  6884. +time when the polling was interrupted and since the machine is stopped and
  6885. +interrupts turned off the only thing that could do that would be an SMI.
  6886. +
  6887. +Note that the SMI detector should *NEVER* be used in a production environment.
  6888. +It is intended to be run manually to determine if the hardware platform has a
  6889. +problem with long system firmware service routines.
  6890. +
  6891. +Usage:
  6892. +------
  6893. +
  6894. +Loading the module hwlat_detector with the parameter "enabled=1" (or
  6895. +toggling on the "enable" entry in the "hwlat_detector" debugfs directory) is the only
  6896. +step required to start the hwlat_detector. It is possible to redefine the
  6897. +threshold in microseconds (us) above which latency spikes will be taken
  6898. +into account (parameter "threshold=").
  6899. +
  6900. +Example:
  6901. +
  6902. + # modprobe hwlat_detector enabled=1 threshold=100
  6903. +
  6904. +After the module is loaded, it creates a directory named "hwlat_detector" under
  6905. +the debugfs mountpoint ("/debug/hwlat_detector" in this text). It is necessary
  6906. +to have debugfs mounted, which might be on /sys/debug on your system.
  6907. +
  6908. +The /debug/hwlat_detector interface contains the following files:
  6909. +
  6910. +count - number of latency spikes observed since last reset
  6911. +enable - a global enable/disable toggle (0/1), resets count
  6912. +max - maximum hardware latency actually observed (usecs)
  6913. +sample - a pipe from which to read current raw sample data
  6914. + in the format <timestamp> <latency observed usecs>
  6915. + (can be opened O_NONBLOCK for a single sample)
  6916. +threshold - minimum latency value to be considered (usecs)
  6917. +width - time period to sample with CPUs held (usecs)
  6918. + must be less than the total window size (enforced)
  6919. +window - total period of sampling, width being inside (usecs)
  6920. +
  6921. +By default we will set width to 500,000 and window to 1,000,000, meaning that
  6922. +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
  6923. +observe any latencies that exceed the threshold (initially 100 usecs),
  6924. +then we write to a global sample ring buffer of 8K samples, which is
  6925. +consumed by reading from the "sample" (pipe) debugfs file interface.
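
For illustration, a minimal user-space sketch (not part of the patch itself) of consuming that interface: it opens the "sample" pipe with O_NONBLOCK, as described above, reads a single raw sample and prints it. The debugfs path is an assumption; adjust it to wherever debugfs is mounted on your system.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	/* O_NONBLOCK yields a single sample instead of blocking on the pipe */
	int fd = open("/sys/kernel/debug/hwlat_detector/sample",
		      O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		/* each line is "<timestamp> <latency observed usecs>" */
		printf("%s", buf);
	}
	close(fd);
	return 0;
}
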
  6926. diff -Nur linux-3.18.14.orig/Documentation/sysrq.txt linux-3.18.14-rt/Documentation/sysrq.txt
  6927. --- linux-3.18.14.orig/Documentation/sysrq.txt 2015-05-20 10:04:50.000000000 -0500
  6928. +++ linux-3.18.14-rt/Documentation/sysrq.txt 2015-05-31 15:32:45.461635394 -0500
  6929. @@ -59,10 +59,17 @@
  6930. On other - If you know of the key combos for other architectures, please
  6931. let me know so I can add them to this section.
  6932. -On all - write a character to /proc/sysrq-trigger. e.g.:
  6933. -
  6934. +On all - write a character to /proc/sysrq-trigger, e.g.:
  6935. echo t > /proc/sysrq-trigger
  6936. +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
  6937. + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
  6938. + Send an ICMP echo request with this pattern plus the particular
  6939. + SysRq command key. Example:
  6940. + # ping -c1 -s57 -p0102030468
  6941. + will trigger the SysRq-H (help) command.
  6942. +
  6943. +
  6944. * What are the 'command' keys?
  6945. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  6946. 'b' - Will immediately reboot the system without syncing or unmounting
  6947. diff -Nur linux-3.18.14.orig/Documentation/trace/histograms.txt linux-3.18.14-rt/Documentation/trace/histograms.txt
  6948. --- linux-3.18.14.orig/Documentation/trace/histograms.txt 1969-12-31 18:00:00.000000000 -0600
  6949. +++ linux-3.18.14-rt/Documentation/trace/histograms.txt 2015-05-31 15:32:45.461635394 -0500
  6950. @@ -0,0 +1,186 @@
  6951. + Using the Linux Kernel Latency Histograms
  6952. +
  6953. +
  6954. +This document gives a short explanation of how to enable, configure and use
  6955. +latency histograms. Latency histograms are primarily relevant in the
  6956. +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
  6957. +and are used in the quality management of the Linux real-time
  6958. +capabilities.
  6959. +
  6960. +
  6961. +* Purpose of latency histograms
  6962. +
  6963. +A latency histogram continuously accumulates the frequencies of latency
  6964. +data. There are two types of histograms:
  6965. +- potential sources of latencies
  6966. +- effective latencies
  6967. +
  6968. +
  6969. +* Potential sources of latencies
  6970. +
  6971. +Potential sources of latencies are code segments where interrupts,
  6972. +preemption or both are disabled (aka critical sections). To create
  6973. +histograms of potential sources of latency, the kernel stores the time
  6974. +stamp at the start of a critical section, determines the time elapsed
  6975. +when the end of the section is reached, and increments the frequency
  6976. +counter of that latency value - irrespective of whether any concurrently
  6977. +running process is affected by latency or not.
  6978. +- Configuration items (in the Kernel hacking/Tracers submenu)
  6979. + CONFIG_INTERRUPT_OFF_LATENCY
  6980. + CONFIG_PREEMPT_OFF_LATENCY
  6981. +
  6982. +
  6983. +* Effective latencies
  6984. +
  6985. +Effective latencies actually occur during wakeup of a process. To
  6986. +determine effective latencies, the kernel stores the time stamp when a
  6987. +process is scheduled to be woken up, and determines the duration of the
  6988. +wakeup time shortly before control is passed over to this process. Note
  6989. +that the apparent latency in user space may be somewhat longer, since the
  6990. +process may be interrupted after control is passed over to it but before
  6991. +the execution in user space takes place. Simply measuring the interval
  6992. +between enqueuing and wakeup may also not be appropriate in cases when a
  6993. +process is scheduled as a result of a timer expiration. The timer may have
  6994. +missed its deadline, e.g. due to disabled interrupts, but this latency
  6995. +would not be registered. Therefore, the offsets of missed timers are
  6996. +recorded in a separate histogram. If both wakeup latency and missed timer
  6997. +offsets are configured and enabled, a third histogram may be enabled that
  6998. +records the overall latency as a sum of the timer latency, if any, and the
  6999. +wakeup latency. This histogram is called "timerandwakeup".
  7000. +- Configuration items (in the Kernel hacking/Tracers submenu)
  7001. + CONFIG_WAKEUP_LATENCY
  7002. + CONFIG_MISSED_TIMER_OFSETS
  7003. +
  7004. +
  7005. +* Usage
  7006. +
  7007. +The interface to the administration of the latency histograms is located
  7008. +in the debugfs file system. To mount it, either enter
  7009. +
  7010. +mount -t sysfs nodev /sys
  7011. +mount -t debugfs nodev /sys/kernel/debug
  7012. +
  7013. +from shell command line level, or add
  7014. +
  7015. +nodev /sys sysfs defaults 0 0
  7016. +nodev /sys/kernel/debug debugfs defaults 0 0
  7017. +
  7018. +to the file /etc/fstab. All latency histogram related files are then
  7019. +available in the directory /sys/kernel/debug/tracing/latency_hist. A
  7020. +particular histogram type is enabled by writing non-zero to the related
  7021. +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
  7022. +Select "preemptirqsoff" for the histograms of potential sources of
  7023. +latencies and "wakeup" for histograms of effective latencies etc. The
  7024. +histogram data - one per CPU - are available in the files
  7025. +
  7026. +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
  7027. +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
  7028. +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
  7029. +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
  7030. +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
  7031. +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
  7032. +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
  7033. +
  7034. +The histograms are reset by writing non-zero to the file "reset" in a
  7035. +particular latency directory. To reset all latency data, use
  7036. +
  7037. +#!/bin/sh
  7038. +
  7039. +TRACINGDIR=/sys/kernel/debug/tracing
  7040. +HISTDIR=$TRACINGDIR/latency_hist
  7041. +
  7042. +if test -d $HISTDIR
  7043. +then
  7044. + cd $HISTDIR
  7045. + for i in `find . | grep /reset$`
  7046. + do
  7047. + echo 1 >$i
  7048. + done
  7049. +fi
  7050. +
  7051. +
  7052. +* Data format
  7053. +
  7054. +Latency data are stored with a resolution of one microsecond. The
  7055. +maximum latency is 10,240 microseconds. The data are only valid if the
  7056. +overflow register is empty. Every output line contains the latency in
  7057. +microseconds in the first column and the number of samples in the second
  7058. +column. To display only lines with a positive latency count, use, for
  7059. +example,
  7060. +
  7061. +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
  7062. +
  7063. +#Minimum latency: 0 microseconds.
  7064. +#Average latency: 0 microseconds.
  7065. +#Maximum latency: 25 microseconds.
  7066. +#Total samples: 3104770694
  7067. +#There are 0 samples greater or equal than 10240 microseconds
  7068. +#usecs samples
  7069. + 0 2984486876
  7070. + 1 49843506
  7071. + 2 58219047
  7072. + 3 5348126
  7073. + 4 2187960
  7074. + 5 3388262
  7075. + 6 959289
  7076. + 7 208294
  7077. + 8 40420
  7078. + 9 4485
  7079. + 10 14918
  7080. + 11 18340
  7081. + 12 25052
  7082. + 13 19455
  7083. + 14 5602
  7084. + 15 969
  7085. + 16 47
  7086. + 17 18
  7087. + 18 14
  7088. + 19 1
  7089. + 20 3
  7090. + 21 2
  7091. + 22 5
  7092. + 23 2
  7093. + 25 1
  7094. +
  7095. +
  7096. +* Wakeup latency of a selected process
  7097. +
  7098. +To only collect wakeup latency data of a particular process, write the
  7099. +PID of the requested process to
  7100. +
  7101. +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
  7102. +
  7103. +PIDs are not considered, if this variable is set to 0.
  7104. +
  7105. +
  7106. +* Details of the process with the highest wakeup latency so far
  7107. +
  7108. +Selected data of the process that suffered from the highest wakeup
  7109. +latency that occurred in a particular CPU are available in the file
  7110. +
  7111. +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
  7112. +
  7113. +In addition, other relevant system data at the time when the
  7114. +latency occurred are given.
  7115. +
  7116. +The format of the data is (all in one line):
  7117. +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
  7118. +<- <PID> <Priority> <Command> <Timestamp>
  7119. +
  7120. +The value of <Timeroffset> is only relevant in the combined timer
  7121. +and wakeup latency recording. In the wakeup recording, it is
  7122. +always 0, in the missed_timer_offsets recording, it is the same
  7123. +as <Latency>.
  7124. +
  7125. +When retrospectively searching for the origin of a latency while
  7126. +tracing was not enabled, it may be helpful to know the name and
  7127. +some basic data of the task that (finally) switched to the
  7128. +late real-time task. In addition to the victim's data, the
  7129. +data of the possible culprit are therefore also displayed after the
  7130. +"<-" symbol.
  7131. +
  7132. +Finally, the timestamp of the time when the latency occurred
  7133. +in <seconds>.<microseconds> after the most recent system boot
  7134. +is provided.
  7135. +
  7136. +These data are also reset when the wakeup histogram is reset.
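
As a worked example of that line format, here is a minimal user-space sketch (not part of the patch itself) that parses one max_latency-CPUx entry with fscanf. The path, buffer sizes and field widths are assumptions for illustration only, and the histogram must already be enabled for the file to contain data.

#include <stdio.h>

int main(void)
{
	char victim[32], culprit[32], stamp[32];
	int vpid, vprio, cpid, cprio;
	unsigned int latency, timeroffset;
	/* path assumes the wakeup histogram is enabled and CPU0 recorded a latency */
	FILE *f = fopen("/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPU0", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* <PID> <Priority> <Latency> (<Timeroffset>) <Command> <- <PID> <Priority> <Command> <Timestamp> */
	if (fscanf(f, "%d %d %u (%u) %31s <- %d %d %31s %31s",
		   &vpid, &vprio, &latency, &timeroffset, victim,
		   &cpid, &cprio, culprit, stamp) == 9)
		printf("%s (pid %d) waited %u us; previous task was %s (pid %d) at %s\n",
		       victim, vpid, latency, culprit, cpid, stamp);
	fclose(f);
	return 0;
}
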
  7137. diff -Nur linux-3.18.14.orig/drivers/acpi/acpica/acglobal.h linux-3.18.14-rt/drivers/acpi/acpica/acglobal.h
  7138. --- linux-3.18.14.orig/drivers/acpi/acpica/acglobal.h 2015-05-20 10:04:50.000000000 -0500
  7139. +++ linux-3.18.14-rt/drivers/acpi/acpica/acglobal.h 2015-05-31 15:32:46.885635381 -0500
  7140. @@ -112,7 +112,7 @@
  7141. * interrupt level
  7142. */
  7143. ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
  7144. -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
  7145. +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
  7146. ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
  7147. /* Mutex for _OSI support */
  7148. diff -Nur linux-3.18.14.orig/drivers/acpi/acpica/hwregs.c linux-3.18.14-rt/drivers/acpi/acpica/hwregs.c
  7149. --- linux-3.18.14.orig/drivers/acpi/acpica/hwregs.c 2015-05-20 10:04:50.000000000 -0500
  7150. +++ linux-3.18.14-rt/drivers/acpi/acpica/hwregs.c 2015-05-31 15:32:46.929635381 -0500
  7151. @@ -269,14 +269,14 @@
  7152. ACPI_BITMASK_ALL_FIXED_STATUS,
  7153. ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
  7154. - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
  7155. + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
  7156. /* Clear the fixed events in PM1 A/B */
  7157. status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
  7158. ACPI_BITMASK_ALL_FIXED_STATUS);
  7159. - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
  7160. + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
  7161. if (ACPI_FAILURE(status)) {
  7162. goto exit;
  7163. diff -Nur linux-3.18.14.orig/drivers/acpi/acpica/hwxface.c linux-3.18.14-rt/drivers/acpi/acpica/hwxface.c
  7164. --- linux-3.18.14.orig/drivers/acpi/acpica/hwxface.c 2015-05-20 10:04:50.000000000 -0500
  7165. +++ linux-3.18.14-rt/drivers/acpi/acpica/hwxface.c 2015-05-31 15:32:46.973635380 -0500
  7166. @@ -374,7 +374,7 @@
  7167. return_ACPI_STATUS(AE_BAD_PARAMETER);
  7168. }
  7169. - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
  7170. + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
  7171. /*
  7172. * At this point, we know that the parent register is one of the
  7173. @@ -435,7 +435,7 @@
  7174. unlock_and_exit:
  7175. - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
  7176. + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
  7177. return_ACPI_STATUS(status);
  7178. }
  7179. diff -Nur linux-3.18.14.orig/drivers/acpi/acpica/utmutex.c linux-3.18.14-rt/drivers/acpi/acpica/utmutex.c
  7180. --- linux-3.18.14.orig/drivers/acpi/acpica/utmutex.c 2015-05-20 10:04:50.000000000 -0500
  7181. +++ linux-3.18.14-rt/drivers/acpi/acpica/utmutex.c 2015-05-31 15:32:46.973635380 -0500
  7182. @@ -88,7 +88,7 @@
  7183. return_ACPI_STATUS (status);
  7184. }
  7185. - status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
  7186. + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
  7187. if (ACPI_FAILURE (status)) {
  7188. return_ACPI_STATUS (status);
  7189. }
  7190. @@ -141,7 +141,7 @@
  7191. /* Delete the spinlocks */
  7192. acpi_os_delete_lock(acpi_gbl_gpe_lock);
  7193. - acpi_os_delete_lock(acpi_gbl_hardware_lock);
  7194. + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
  7195. acpi_os_delete_lock(acpi_gbl_reference_count_lock);
  7196. /* Delete the reader/writer lock */
  7197. diff -Nur linux-3.18.14.orig/drivers/ata/libata-sff.c linux-3.18.14-rt/drivers/ata/libata-sff.c
  7198. --- linux-3.18.14.orig/drivers/ata/libata-sff.c 2015-05-20 10:04:50.000000000 -0500
  7199. +++ linux-3.18.14-rt/drivers/ata/libata-sff.c 2015-05-31 15:32:46.993635380 -0500
  7200. @@ -678,9 +678,9 @@
  7201. unsigned long flags;
  7202. unsigned int consumed;
  7203. - local_irq_save(flags);
  7204. + local_irq_save_nort(flags);
  7205. consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
  7206. - local_irq_restore(flags);
  7207. + local_irq_restore_nort(flags);
  7208. return consumed;
  7209. }
  7210. @@ -719,7 +719,7 @@
  7211. unsigned long flags;
  7212. /* FIXME: use a bounce buffer */
  7213. - local_irq_save(flags);
  7214. + local_irq_save_nort(flags);
  7215. buf = kmap_atomic(page);
  7216. /* do the actual data transfer */
  7217. @@ -727,7 +727,7 @@
  7218. do_write);
  7219. kunmap_atomic(buf);
  7220. - local_irq_restore(flags);
  7221. + local_irq_restore_nort(flags);
  7222. } else {
  7223. buf = page_address(page);
  7224. ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
  7225. @@ -864,7 +864,7 @@
  7226. unsigned long flags;
  7227. /* FIXME: use bounce buffer */
  7228. - local_irq_save(flags);
  7229. + local_irq_save_nort(flags);
  7230. buf = kmap_atomic(page);
  7231. /* do the actual data transfer */
  7232. @@ -872,7 +872,7 @@
  7233. count, rw);
  7234. kunmap_atomic(buf);
  7235. - local_irq_restore(flags);
  7236. + local_irq_restore_nort(flags);
  7237. } else {
  7238. buf = page_address(page);
  7239. consumed = ap->ops->sff_data_xfer(dev, buf + offset,
  7240. diff -Nur linux-3.18.14.orig/drivers/char/random.c linux-3.18.14-rt/drivers/char/random.c
  7241. --- linux-3.18.14.orig/drivers/char/random.c 2015-05-20 10:04:50.000000000 -0500
  7242. +++ linux-3.18.14-rt/drivers/char/random.c 2015-05-31 15:32:47.013635380 -0500
  7243. @@ -776,8 +776,6 @@
  7244. } sample;
  7245. long delta, delta2, delta3;
  7246. - preempt_disable();
  7247. -
  7248. sample.jiffies = jiffies;
  7249. sample.cycles = random_get_entropy();
  7250. sample.num = num;
  7251. @@ -818,7 +816,6 @@
  7252. */
  7253. credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
  7254. }
  7255. - preempt_enable();
  7256. }
  7257. void add_input_randomness(unsigned int type, unsigned int code,
  7258. @@ -871,28 +868,27 @@
  7259. return *(ptr + f->reg_idx++);
  7260. }
  7261. -void add_interrupt_randomness(int irq, int irq_flags)
  7262. +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
  7263. {
  7264. struct entropy_store *r;
  7265. struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
  7266. - struct pt_regs *regs = get_irq_regs();
  7267. unsigned long now = jiffies;
  7268. cycles_t cycles = random_get_entropy();
  7269. __u32 c_high, j_high;
  7270. - __u64 ip;
  7271. unsigned long seed;
  7272. int credit = 0;
  7273. if (cycles == 0)
  7274. - cycles = get_reg(fast_pool, regs);
  7275. + cycles = get_reg(fast_pool, NULL);
  7276. c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
  7277. j_high = (sizeof(now) > 4) ? now >> 32 : 0;
  7278. fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
  7279. fast_pool->pool[1] ^= now ^ c_high;
  7280. - ip = regs ? instruction_pointer(regs) : _RET_IP_;
  7281. + if (!ip)
  7282. + ip = _RET_IP_;
  7283. fast_pool->pool[2] ^= ip;
  7284. fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
  7285. - get_reg(fast_pool, regs);
  7286. + get_reg(fast_pool, NULL);
  7287. fast_mix(fast_pool);
  7288. add_interrupt_bench(cycles);
  7289. diff -Nur linux-3.18.14.orig/drivers/clocksource/tcb_clksrc.c linux-3.18.14-rt/drivers/clocksource/tcb_clksrc.c
  7290. --- linux-3.18.14.orig/drivers/clocksource/tcb_clksrc.c 2015-05-20 10:04:50.000000000 -0500
  7291. +++ linux-3.18.14-rt/drivers/clocksource/tcb_clksrc.c 2015-05-31 15:32:47.025635380 -0500
  7292. @@ -23,8 +23,7 @@
  7293. * this 32 bit free-running counter. the second channel is not used.
  7294. *
  7295. * - The third channel may be used to provide a 16-bit clockevent
  7296. - * source, used in either periodic or oneshot mode. This runs
  7297. - * at 32 KiHZ, and can handle delays of up to two seconds.
  7298. + * source, used in either periodic or oneshot mode.
  7299. *
  7300. * A boot clocksource and clockevent source are also currently needed,
  7301. * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
  7302. @@ -74,6 +73,7 @@
  7303. struct tc_clkevt_device {
  7304. struct clock_event_device clkevt;
  7305. struct clk *clk;
  7306. + u32 freq;
  7307. void __iomem *regs;
  7308. };
  7309. @@ -82,13 +82,6 @@
  7310. return container_of(clkevt, struct tc_clkevt_device, clkevt);
  7311. }
  7312. -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
  7313. - * because using one of the divided clocks would usually mean the
  7314. - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
  7315. - *
  7316. - * A divided clock could be good for high resolution timers, since
  7317. - * 30.5 usec resolution can seem "low".
  7318. - */
  7319. static u32 timer_clock;
  7320. static void tc_mode(enum clock_event_mode m, struct clock_event_device *d)
  7321. @@ -111,11 +104,12 @@
  7322. case CLOCK_EVT_MODE_PERIODIC:
  7323. clk_enable(tcd->clk);
  7324. - /* slow clock, count up to RC, then irq and restart */
  7325. + /* count up to RC, then irq and restart */
  7326. __raw_writel(timer_clock
  7327. | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
  7328. regs + ATMEL_TC_REG(2, CMR));
  7329. - __raw_writel((32768 + HZ/2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
  7330. + __raw_writel((tcd->freq + HZ / 2) / HZ,
  7331. + tcaddr + ATMEL_TC_REG(2, RC));
  7332. /* Enable clock and interrupts on RC compare */
  7333. __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
  7334. @@ -128,7 +122,7 @@
  7335. case CLOCK_EVT_MODE_ONESHOT:
  7336. clk_enable(tcd->clk);
  7337. - /* slow clock, count up to RC, then irq and stop */
  7338. + /* count up to RC, then irq and stop */
  7339. __raw_writel(timer_clock | ATMEL_TC_CPCSTOP
  7340. | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
  7341. regs + ATMEL_TC_REG(2, CMR));
  7342. @@ -157,8 +151,12 @@
  7343. .name = "tc_clkevt",
  7344. .features = CLOCK_EVT_FEAT_PERIODIC
  7345. | CLOCK_EVT_FEAT_ONESHOT,
  7346. +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
  7347. /* Should be lower than at91rm9200's system timer */
  7348. .rating = 125,
  7349. +#else
  7350. + .rating = 200,
  7351. +#endif
  7352. .set_next_event = tc_next_event,
  7353. .set_mode = tc_mode,
  7354. },
  7355. @@ -178,8 +176,9 @@
  7356. return IRQ_NONE;
  7357. }
  7358. -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
  7359. +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
  7360. {
  7361. + unsigned divisor = atmel_tc_divisors[divisor_idx];
  7362. int ret;
  7363. struct clk *t2_clk = tc->clk[2];
  7364. int irq = tc->irq[2];
  7365. @@ -193,7 +192,11 @@
  7366. clkevt.regs = tc->regs;
  7367. clkevt.clk = t2_clk;
  7368. - timer_clock = clk32k_divisor_idx;
  7369. + timer_clock = divisor_idx;
  7370. + if (!divisor)
  7371. + clkevt.freq = 32768;
  7372. + else
  7373. + clkevt.freq = clk_get_rate(t2_clk) / divisor;
  7374. clkevt.clkevt.cpumask = cpumask_of(0);
  7375. @@ -203,7 +206,7 @@
  7376. return ret;
  7377. }
  7378. - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
  7379. + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
  7380. return ret;
  7381. }
  7382. @@ -340,7 +343,11 @@
  7383. goto err_disable_t1;
  7384. /* channel 2: periodic and oneshot timer support */
  7385. +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
  7386. ret = setup_clkevents(tc, clk32k_divisor_idx);
  7387. +#else
  7388. + ret = setup_clkevents(tc, best_divisor_idx);
  7389. +#endif
  7390. if (ret)
  7391. goto err_unregister_clksrc;
  7392. diff -Nur linux-3.18.14.orig/drivers/clocksource/timer-atmel-pit.c linux-3.18.14-rt/drivers/clocksource/timer-atmel-pit.c
  7393. --- linux-3.18.14.orig/drivers/clocksource/timer-atmel-pit.c 2015-05-20 10:04:50.000000000 -0500
  7394. +++ linux-3.18.14-rt/drivers/clocksource/timer-atmel-pit.c 2015-05-31 15:32:47.025635380 -0500
  7395. @@ -90,6 +90,7 @@
  7396. return elapsed;
  7397. }
  7398. +static struct irqaction at91sam926x_pit_irq;
  7399. /*
  7400. * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
  7401. */
  7402. @@ -100,6 +101,8 @@
  7403. switch (mode) {
  7404. case CLOCK_EVT_MODE_PERIODIC:
  7405. + /* Set up irq handler */
  7406. + setup_irq(at91sam926x_pit_irq.irq, &at91sam926x_pit_irq);
  7407. /* update clocksource counter */
  7408. data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
  7409. pit_write(data->base, AT91_PIT_MR,
  7410. @@ -113,6 +116,7 @@
  7411. /* disable irq, leaving the clocksource active */
  7412. pit_write(data->base, AT91_PIT_MR,
  7413. (data->cycle - 1) | AT91_PIT_PITEN);
  7414. + remove_irq(at91sam926x_pit_irq.irq, &at91sam926x_pit_irq);
  7415. break;
  7416. case CLOCK_EVT_MODE_RESUME:
  7417. break;
  7418. diff -Nur linux-3.18.14.orig/drivers/cpufreq/Kconfig.x86 linux-3.18.14-rt/drivers/cpufreq/Kconfig.x86
  7419. --- linux-3.18.14.orig/drivers/cpufreq/Kconfig.x86 2015-05-20 10:04:50.000000000 -0500
  7420. +++ linux-3.18.14-rt/drivers/cpufreq/Kconfig.x86 2015-05-31 15:32:47.065635380 -0500
  7421. @@ -113,7 +113,7 @@
  7422. config X86_POWERNOW_K8
  7423. tristate "AMD Opteron/Athlon64 PowerNow!"
  7424. - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
  7425. + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
  7426. help
  7427. This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
  7428. Support for K10 and newer processors is now in acpi-cpufreq.
  7429. diff -Nur linux-3.18.14.orig/drivers/gpio/gpio-omap.c linux-3.18.14-rt/drivers/gpio/gpio-omap.c
  7430. --- linux-3.18.14.orig/drivers/gpio/gpio-omap.c 2015-05-20 10:04:50.000000000 -0500
  7431. +++ linux-3.18.14-rt/drivers/gpio/gpio-omap.c 2015-05-31 15:32:47.073635379 -0500
  7432. @@ -57,7 +57,7 @@
  7433. u32 saved_datain;
  7434. u32 level_mask;
  7435. u32 toggle_mask;
  7436. - spinlock_t lock;
  7437. + raw_spinlock_t lock;
  7438. struct gpio_chip chip;
  7439. struct clk *dbck;
  7440. u32 mod_usage;
  7441. @@ -503,19 +503,19 @@
  7442. (type & (IRQ_TYPE_LEVEL_LOW|IRQ_TYPE_LEVEL_HIGH)))
  7443. return -EINVAL;
  7444. - spin_lock_irqsave(&bank->lock, flags);
  7445. + raw_spin_lock_irqsave(&bank->lock, flags);
  7446. offset = GPIO_INDEX(bank, gpio);
  7447. retval = omap_set_gpio_triggering(bank, offset, type);
  7448. if (!LINE_USED(bank->mod_usage, offset)) {
  7449. omap_enable_gpio_module(bank, offset);
  7450. omap_set_gpio_direction(bank, offset, 1);
  7451. } else if (!omap_gpio_is_input(bank, BIT(offset))) {
  7452. - spin_unlock_irqrestore(&bank->lock, flags);
  7453. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7454. return -EINVAL;
  7455. }
  7456. bank->irq_usage |= BIT(GPIO_INDEX(bank, gpio));
  7457. - spin_unlock_irqrestore(&bank->lock, flags);
  7458. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7459. if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
  7460. __irq_set_handler_locked(d->irq, handle_level_irq);
  7461. @@ -633,14 +633,14 @@
  7462. return -EINVAL;
  7463. }
  7464. - spin_lock_irqsave(&bank->lock, flags);
  7465. + raw_spin_lock_irqsave(&bank->lock, flags);
  7466. if (enable)
  7467. bank->context.wake_en |= gpio_bit;
  7468. else
  7469. bank->context.wake_en &= ~gpio_bit;
  7470. writel_relaxed(bank->context.wake_en, bank->base + bank->regs->wkup_en);
  7471. - spin_unlock_irqrestore(&bank->lock, flags);
  7472. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7473. return 0;
  7474. }
  7475. @@ -675,7 +675,7 @@
  7476. if (!BANK_USED(bank))
  7477. pm_runtime_get_sync(bank->dev);
  7478. - spin_lock_irqsave(&bank->lock, flags);
  7479. + raw_spin_lock_irqsave(&bank->lock, flags);
  7480. /* Set trigger to none. You need to enable the desired trigger with
  7481. * request_irq() or set_irq_type(). Only do this if the IRQ line has
  7482. * not already been requested.
  7483. @@ -685,7 +685,7 @@
  7484. omap_enable_gpio_module(bank, offset);
  7485. }
  7486. bank->mod_usage |= BIT(offset);
  7487. - spin_unlock_irqrestore(&bank->lock, flags);
  7488. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7489. return 0;
  7490. }
  7491. @@ -695,11 +695,11 @@
  7492. struct gpio_bank *bank = container_of(chip, struct gpio_bank, chip);
  7493. unsigned long flags;
  7494. - spin_lock_irqsave(&bank->lock, flags);
  7495. + raw_spin_lock_irqsave(&bank->lock, flags);
  7496. bank->mod_usage &= ~(BIT(offset));
  7497. omap_disable_gpio_module(bank, offset);
  7498. omap_reset_gpio(bank, bank->chip.base + offset);
  7499. - spin_unlock_irqrestore(&bank->lock, flags);
  7500. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7501. /*
  7502. * If this is the last gpio to be freed in the bank,
  7503. @@ -799,12 +799,12 @@
  7504. unsigned long flags;
  7505. unsigned offset = GPIO_INDEX(bank, gpio);
  7506. - spin_lock_irqsave(&bank->lock, flags);
  7507. + raw_spin_lock_irqsave(&bank->lock, flags);
  7508. gpio_unlock_as_irq(&bank->chip, offset);
  7509. bank->irq_usage &= ~(BIT(offset));
  7510. omap_disable_gpio_module(bank, offset);
  7511. omap_reset_gpio(bank, gpio);
  7512. - spin_unlock_irqrestore(&bank->lock, flags);
  7513. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7514. /*
  7515. * If this is the last IRQ to be freed in the bank,
  7516. @@ -828,10 +828,10 @@
  7517. unsigned int gpio = omap_irq_to_gpio(bank, d->hwirq);
  7518. unsigned long flags;
  7519. - spin_lock_irqsave(&bank->lock, flags);
  7520. + raw_spin_lock_irqsave(&bank->lock, flags);
  7521. omap_set_gpio_irqenable(bank, gpio, 0);
  7522. omap_set_gpio_triggering(bank, GPIO_INDEX(bank, gpio), IRQ_TYPE_NONE);
  7523. - spin_unlock_irqrestore(&bank->lock, flags);
  7524. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7525. }
  7526. static void omap_gpio_unmask_irq(struct irq_data *d)
  7527. @@ -842,7 +842,7 @@
  7528. u32 trigger = irqd_get_trigger_type(d);
  7529. unsigned long flags;
  7530. - spin_lock_irqsave(&bank->lock, flags);
  7531. + raw_spin_lock_irqsave(&bank->lock, flags);
  7532. if (trigger)
  7533. omap_set_gpio_triggering(bank, GPIO_INDEX(bank, gpio), trigger);
  7534. @@ -854,7 +854,7 @@
  7535. }
  7536. omap_set_gpio_irqenable(bank, gpio, 1);
  7537. - spin_unlock_irqrestore(&bank->lock, flags);
  7538. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7539. }
  7540. /*---------------------------------------------------------------------*/
  7541. @@ -867,9 +867,9 @@
  7542. OMAP_MPUIO_GPIO_MASKIT / bank->stride;
  7543. unsigned long flags;
  7544. - spin_lock_irqsave(&bank->lock, flags);
  7545. + raw_spin_lock_irqsave(&bank->lock, flags);
  7546. writel_relaxed(0xffff & ~bank->context.wake_en, mask_reg);
  7547. - spin_unlock_irqrestore(&bank->lock, flags);
  7548. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7549. return 0;
  7550. }
  7551. @@ -882,9 +882,9 @@
  7552. OMAP_MPUIO_GPIO_MASKIT / bank->stride;
  7553. unsigned long flags;
  7554. - spin_lock_irqsave(&bank->lock, flags);
  7555. + raw_spin_lock_irqsave(&bank->lock, flags);
  7556. writel_relaxed(bank->context.wake_en, mask_reg);
  7557. - spin_unlock_irqrestore(&bank->lock, flags);
  7558. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7559. return 0;
  7560. }
  7561. @@ -930,9 +930,9 @@
  7562. bank = container_of(chip, struct gpio_bank, chip);
  7563. reg = bank->base + bank->regs->direction;
  7564. - spin_lock_irqsave(&bank->lock, flags);
  7565. + raw_spin_lock_irqsave(&bank->lock, flags);
  7566. dir = !!(readl_relaxed(reg) & BIT(offset));
  7567. - spin_unlock_irqrestore(&bank->lock, flags);
  7568. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7569. return dir;
  7570. }
  7571. @@ -942,9 +942,9 @@
  7572. unsigned long flags;
  7573. bank = container_of(chip, struct gpio_bank, chip);
  7574. - spin_lock_irqsave(&bank->lock, flags);
  7575. + raw_spin_lock_irqsave(&bank->lock, flags);
  7576. omap_set_gpio_direction(bank, offset, 1);
  7577. - spin_unlock_irqrestore(&bank->lock, flags);
  7578. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7579. return 0;
  7580. }
  7581. @@ -968,10 +968,10 @@
  7582. unsigned long flags;
  7583. bank = container_of(chip, struct gpio_bank, chip);
  7584. - spin_lock_irqsave(&bank->lock, flags);
  7585. + raw_spin_lock_irqsave(&bank->lock, flags);
  7586. bank->set_dataout(bank, offset, value);
  7587. omap_set_gpio_direction(bank, offset, 0);
  7588. - spin_unlock_irqrestore(&bank->lock, flags);
  7589. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7590. return 0;
  7591. }
  7592. @@ -983,9 +983,9 @@
  7593. bank = container_of(chip, struct gpio_bank, chip);
  7594. - spin_lock_irqsave(&bank->lock, flags);
  7595. + raw_spin_lock_irqsave(&bank->lock, flags);
  7596. omap2_set_gpio_debounce(bank, offset, debounce);
  7597. - spin_unlock_irqrestore(&bank->lock, flags);
  7598. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7599. return 0;
  7600. }
  7601. @@ -996,9 +996,9 @@
  7602. unsigned long flags;
  7603. bank = container_of(chip, struct gpio_bank, chip);
  7604. - spin_lock_irqsave(&bank->lock, flags);
  7605. + raw_spin_lock_irqsave(&bank->lock, flags);
  7606. bank->set_dataout(bank, offset, value);
  7607. - spin_unlock_irqrestore(&bank->lock, flags);
  7608. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7609. }
  7610. /*---------------------------------------------------------------------*/
  7611. @@ -1223,7 +1223,7 @@
  7612. else
  7613. bank->set_dataout = omap_set_gpio_dataout_mask;
  7614. - spin_lock_init(&bank->lock);
  7615. + raw_spin_lock_init(&bank->lock);
  7616. /* Static mapping, never released */
  7617. res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
  7618. @@ -1270,7 +1270,7 @@
  7619. unsigned long flags;
  7620. u32 wake_low, wake_hi;
  7621. - spin_lock_irqsave(&bank->lock, flags);
  7622. + raw_spin_lock_irqsave(&bank->lock, flags);
  7623. /*
  7624. * Only edges can generate a wakeup event to the PRCM.
  7625. @@ -1323,7 +1323,7 @@
  7626. bank->get_context_loss_count(bank->dev);
  7627. omap_gpio_dbck_disable(bank);
  7628. - spin_unlock_irqrestore(&bank->lock, flags);
  7629. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7630. return 0;
  7631. }
  7632. @@ -1338,7 +1338,7 @@
  7633. unsigned long flags;
  7634. int c;
  7635. - spin_lock_irqsave(&bank->lock, flags);
  7636. + raw_spin_lock_irqsave(&bank->lock, flags);
  7637. /*
  7638. * On the first resume during the probe, the context has not
  7639. @@ -1374,14 +1374,14 @@
  7640. if (c != bank->context_loss_count) {
  7641. omap_gpio_restore_context(bank);
  7642. } else {
  7643. - spin_unlock_irqrestore(&bank->lock, flags);
  7644. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7645. return 0;
  7646. }
  7647. }
  7648. }
  7649. if (!bank->workaround_enabled) {
  7650. - spin_unlock_irqrestore(&bank->lock, flags);
  7651. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7652. return 0;
  7653. }
  7654. @@ -1436,7 +1436,7 @@
  7655. }
  7656. bank->workaround_enabled = false;
  7657. - spin_unlock_irqrestore(&bank->lock, flags);
  7658. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  7659. return 0;
  7660. }
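The gpio-omap hunks above follow the standard -rt conversion: a lock taken from the irq-chip callbacks (hard interrupt context) must remain a true spinning lock, so spinlock_t becomes raw_spinlock_t and every acquire/release gains the raw_ prefix. A minimal sketch of the pattern with a hypothetical structure (not this driver's code):

	#include <linux/types.h>
	#include <linux/spinlock.h>

	struct my_bank {
		raw_spinlock_t lock;            /* was: spinlock_t lock; */
		u32 irq_usage;
	};

	static void my_bank_mask(struct my_bank *bank, unsigned int bit)
	{
		unsigned long flags;

		/* raw_spin_lock_irqsave() never sleeps, even on PREEMPT_RT */
		raw_spin_lock_irqsave(&bank->lock, flags);
		bank->irq_usage &= ~(1u << bit);
		raw_spin_unlock_irqrestore(&bank->lock, flags);
	}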
  7661. diff -Nur linux-3.18.14.orig/drivers/gpu/drm/i915/i915_gem.c linux-3.18.14-rt/drivers/gpu/drm/i915/i915_gem.c
  7662. --- linux-3.18.14.orig/drivers/gpu/drm/i915/i915_gem.c 2015-05-20 10:04:50.000000000 -0500
  7663. +++ linux-3.18.14-rt/drivers/gpu/drm/i915/i915_gem.c 2015-05-31 15:32:47.081635379 -0500
  7664. @@ -5144,7 +5144,7 @@
  7665. if (!mutex_is_locked(mutex))
  7666. return false;
  7667. -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
  7668. +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && !defined(CONFIG_PREEMPT_RT_BASE)
  7669. return mutex->owner == task;
  7670. #else
  7671. /* Since UP may be pre-empted, we cannot assume that we own the lock */
  7672. diff -Nur linux-3.18.14.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c linux-3.18.14-rt/drivers/gpu/drm/i915/i915_gem_execbuffer.c
  7673. --- linux-3.18.14.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2015-05-20 10:04:50.000000000 -0500
  7674. +++ linux-3.18.14-rt/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2015-05-31 15:32:47.121635379 -0500
  7675. @@ -1170,7 +1170,9 @@
  7676. return ret;
  7677. }
  7678. +#ifndef CONFIG_PREEMPT_RT_BASE
  7679. trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
  7680. +#endif
  7681. i915_gem_execbuffer_move_to_active(vmas, ring);
  7682. i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
  7683. diff -Nur linux-3.18.14.orig/drivers/i2c/busses/i2c-omap.c linux-3.18.14-rt/drivers/i2c/busses/i2c-omap.c
  7684. --- linux-3.18.14.orig/drivers/i2c/busses/i2c-omap.c 2015-05-20 10:04:50.000000000 -0500
  7685. +++ linux-3.18.14-rt/drivers/i2c/busses/i2c-omap.c 2015-05-31 15:32:47.125635379 -0500
  7686. @@ -875,15 +875,12 @@
  7687. u16 mask;
  7688. u16 stat;
  7689. - spin_lock(&dev->lock);
  7690. - mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG);
  7691. stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG);
  7692. + mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG);
  7693. if (stat & mask)
  7694. ret = IRQ_WAKE_THREAD;
  7695. - spin_unlock(&dev->lock);
  7696. -
  7697. return ret;
  7698. }
  7699. diff -Nur linux-3.18.14.orig/drivers/ide/alim15x3.c linux-3.18.14-rt/drivers/ide/alim15x3.c
  7700. --- linux-3.18.14.orig/drivers/ide/alim15x3.c 2015-05-20 10:04:50.000000000 -0500
  7701. +++ linux-3.18.14-rt/drivers/ide/alim15x3.c 2015-05-31 15:32:47.137635379 -0500
  7702. @@ -234,7 +234,7 @@
  7703. isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
  7704. - local_irq_save(flags);
  7705. + local_irq_save_nort(flags);
  7706. if (m5229_revision < 0xC2) {
  7707. /*
  7708. @@ -325,7 +325,7 @@
  7709. }
  7710. pci_dev_put(north);
  7711. pci_dev_put(isa_dev);
  7712. - local_irq_restore(flags);
  7713. + local_irq_restore_nort(flags);
  7714. return 0;
  7715. }
  7716. diff -Nur linux-3.18.14.orig/drivers/ide/hpt366.c linux-3.18.14-rt/drivers/ide/hpt366.c
  7717. --- linux-3.18.14.orig/drivers/ide/hpt366.c 2015-05-20 10:04:50.000000000 -0500
  7718. +++ linux-3.18.14-rt/drivers/ide/hpt366.c 2015-05-31 15:32:47.169635379 -0500
  7719. @@ -1241,7 +1241,7 @@
  7720. dma_old = inb(base + 2);
  7721. - local_irq_save(flags);
  7722. + local_irq_save_nort(flags);
  7723. dma_new = dma_old;
  7724. pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
  7725. @@ -1252,7 +1252,7 @@
  7726. if (dma_new != dma_old)
  7727. outb(dma_new, base + 2);
  7728. - local_irq_restore(flags);
  7729. + local_irq_restore_nort(flags);
  7730. printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
  7731. hwif->name, base, base + 7);
  7732. diff -Nur linux-3.18.14.orig/drivers/ide/ide-io.c linux-3.18.14-rt/drivers/ide/ide-io.c
  7733. --- linux-3.18.14.orig/drivers/ide/ide-io.c 2015-05-20 10:04:50.000000000 -0500
  7734. +++ linux-3.18.14-rt/drivers/ide/ide-io.c 2015-05-31 15:32:47.169635379 -0500
  7735. @@ -659,7 +659,7 @@
  7736. /* disable_irq_nosync ?? */
  7737. disable_irq(hwif->irq);
  7738. /* local CPU only, as if we were handling an interrupt */
  7739. - local_irq_disable();
  7740. + local_irq_disable_nort();
  7741. if (hwif->polling) {
  7742. startstop = handler(drive);
  7743. } else if (drive_is_ready(drive)) {
  7744. diff -Nur linux-3.18.14.orig/drivers/ide/ide-iops.c linux-3.18.14-rt/drivers/ide/ide-iops.c
  7745. --- linux-3.18.14.orig/drivers/ide/ide-iops.c 2015-05-20 10:04:50.000000000 -0500
  7746. +++ linux-3.18.14-rt/drivers/ide/ide-iops.c 2015-05-31 15:32:47.185635379 -0500
  7747. @@ -129,12 +129,12 @@
  7748. if ((stat & ATA_BUSY) == 0)
  7749. break;
  7750. - local_irq_restore(flags);
  7751. + local_irq_restore_nort(flags);
  7752. *rstat = stat;
  7753. return -EBUSY;
  7754. }
  7755. }
  7756. - local_irq_restore(flags);
  7757. + local_irq_restore_nort(flags);
  7758. }
  7759. /*
  7760. * Allow status to settle, then read it again.
  7761. diff -Nur linux-3.18.14.orig/drivers/ide/ide-io-std.c linux-3.18.14-rt/drivers/ide/ide-io-std.c
  7762. --- linux-3.18.14.orig/drivers/ide/ide-io-std.c 2015-05-20 10:04:50.000000000 -0500
  7763. +++ linux-3.18.14-rt/drivers/ide/ide-io-std.c 2015-05-31 15:32:47.169635379 -0500
  7764. @@ -175,7 +175,7 @@
  7765. unsigned long uninitialized_var(flags);
  7766. if ((io_32bit & 2) && !mmio) {
  7767. - local_irq_save(flags);
  7768. + local_irq_save_nort(flags);
  7769. ata_vlb_sync(io_ports->nsect_addr);
  7770. }
  7771. @@ -186,7 +186,7 @@
  7772. insl(data_addr, buf, words);
  7773. if ((io_32bit & 2) && !mmio)
  7774. - local_irq_restore(flags);
  7775. + local_irq_restore_nort(flags);
  7776. if (((len + 1) & 3) < 2)
  7777. return;
  7778. @@ -219,7 +219,7 @@
  7779. unsigned long uninitialized_var(flags);
  7780. if ((io_32bit & 2) && !mmio) {
  7781. - local_irq_save(flags);
  7782. + local_irq_save_nort(flags);
  7783. ata_vlb_sync(io_ports->nsect_addr);
  7784. }
  7785. @@ -230,7 +230,7 @@
  7786. outsl(data_addr, buf, words);
  7787. if ((io_32bit & 2) && !mmio)
  7788. - local_irq_restore(flags);
  7789. + local_irq_restore_nort(flags);
  7790. if (((len + 1) & 3) < 2)
  7791. return;
  7792. diff -Nur linux-3.18.14.orig/drivers/ide/ide-probe.c linux-3.18.14-rt/drivers/ide/ide-probe.c
  7793. --- linux-3.18.14.orig/drivers/ide/ide-probe.c 2015-05-20 10:04:50.000000000 -0500
  7794. +++ linux-3.18.14-rt/drivers/ide/ide-probe.c 2015-05-31 15:32:47.185635379 -0500
  7795. @@ -196,10 +196,10 @@
  7796. int bswap = 1;
  7797. /* local CPU only; some systems need this */
  7798. - local_irq_save(flags);
  7799. + local_irq_save_nort(flags);
  7800. /* read 512 bytes of id info */
  7801. hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
  7802. - local_irq_restore(flags);
  7803. + local_irq_restore_nort(flags);
  7804. drive->dev_flags |= IDE_DFLAG_ID_READ;
  7805. #ifdef DEBUG
  7806. diff -Nur linux-3.18.14.orig/drivers/ide/ide-taskfile.c linux-3.18.14-rt/drivers/ide/ide-taskfile.c
  7807. --- linux-3.18.14.orig/drivers/ide/ide-taskfile.c 2015-05-20 10:04:50.000000000 -0500
  7808. +++ linux-3.18.14-rt/drivers/ide/ide-taskfile.c 2015-05-31 15:32:47.185635379 -0500
  7809. @@ -250,7 +250,7 @@
  7810. page_is_high = PageHighMem(page);
  7811. if (page_is_high)
  7812. - local_irq_save(flags);
  7813. + local_irq_save_nort(flags);
  7814. buf = kmap_atomic(page) + offset;
  7815. @@ -271,7 +271,7 @@
  7816. kunmap_atomic(buf);
  7817. if (page_is_high)
  7818. - local_irq_restore(flags);
  7819. + local_irq_restore_nort(flags);
  7820. len -= nr_bytes;
  7821. }
  7822. @@ -414,7 +414,7 @@
  7823. }
  7824. if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
  7825. - local_irq_disable();
  7826. + local_irq_disable_nort();
  7827. ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
  7828. diff -Nur linux-3.18.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-3.18.14-rt/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
  7829. --- linux-3.18.14.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2015-05-20 10:04:50.000000000 -0500
  7830. +++ linux-3.18.14-rt/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2015-05-31 15:32:47.205635378 -0500
  7831. @@ -796,7 +796,7 @@
  7832. ipoib_mcast_stop_thread(dev, 0);
  7833. - local_irq_save(flags);
  7834. + local_irq_save_nort(flags);
  7835. netif_addr_lock(dev);
  7836. spin_lock(&priv->lock);
  7837. @@ -878,7 +878,7 @@
  7838. spin_unlock(&priv->lock);
  7839. netif_addr_unlock(dev);
  7840. - local_irq_restore(flags);
  7841. + local_irq_restore_nort(flags);
  7842. /* We have to cancel outside of the spinlock */
  7843. list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
  7844. diff -Nur linux-3.18.14.orig/drivers/input/gameport/gameport.c linux-3.18.14-rt/drivers/input/gameport/gameport.c
  7845. --- linux-3.18.14.orig/drivers/input/gameport/gameport.c 2015-05-20 10:04:50.000000000 -0500
  7846. +++ linux-3.18.14-rt/drivers/input/gameport/gameport.c 2015-05-31 15:32:47.225635378 -0500
  7847. @@ -124,12 +124,12 @@
  7848. tx = 1 << 30;
  7849. for(i = 0; i < 50; i++) {
  7850. - local_irq_save(flags);
  7851. + local_irq_save_nort(flags);
  7852. GET_TIME(t1);
  7853. for (t = 0; t < 50; t++) gameport_read(gameport);
  7854. GET_TIME(t2);
  7855. GET_TIME(t3);
  7856. - local_irq_restore(flags);
  7857. + local_irq_restore_nort(flags);
  7858. udelay(i * 10);
  7859. if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
  7860. }
  7861. @@ -148,11 +148,11 @@
  7862. tx = 1 << 30;
  7863. for(i = 0; i < 50; i++) {
  7864. - local_irq_save(flags);
  7865. + local_irq_save_nort(flags);
  7866. rdtscl(t1);
  7867. for (t = 0; t < 50; t++) gameport_read(gameport);
  7868. rdtscl(t2);
  7869. - local_irq_restore(flags);
  7870. + local_irq_restore_nort(flags);
  7871. udelay(i * 10);
  7872. if (t2 - t1 < tx) tx = t2 - t1;
  7873. }
  7874. diff -Nur linux-3.18.14.orig/drivers/leds/trigger/Kconfig linux-3.18.14-rt/drivers/leds/trigger/Kconfig
  7875. --- linux-3.18.14.orig/drivers/leds/trigger/Kconfig 2015-05-20 10:04:50.000000000 -0500
  7876. +++ linux-3.18.14-rt/drivers/leds/trigger/Kconfig 2015-05-31 15:32:47.229635378 -0500
  7877. @@ -61,7 +61,7 @@
  7878. config LEDS_TRIGGER_CPU
  7879. bool "LED CPU Trigger"
  7880. - depends on LEDS_TRIGGERS
  7881. + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
  7882. help
  7883. This allows LEDs to be controlled by active CPUs. This shows
  7884. the active CPUs across an array of LEDs so you can see which
  7885. diff -Nur linux-3.18.14.orig/drivers/md/bcache/Kconfig linux-3.18.14-rt/drivers/md/bcache/Kconfig
  7886. --- linux-3.18.14.orig/drivers/md/bcache/Kconfig 2015-05-20 10:04:50.000000000 -0500
  7887. +++ linux-3.18.14-rt/drivers/md/bcache/Kconfig 2015-05-31 15:32:47.245635378 -0500
  7888. @@ -1,6 +1,7 @@
  7889. config BCACHE
  7890. tristate "Block device as cache"
  7891. + depends on !PREEMPT_RT_FULL
  7892. ---help---
  7893. Allows a block device to be used as cache for other devices; uses
  7894. a btree for indexing and the layout is optimized for SSDs.
  7895. diff -Nur linux-3.18.14.orig/drivers/md/dm.c linux-3.18.14-rt/drivers/md/dm.c
  7896. --- linux-3.18.14.orig/drivers/md/dm.c 2015-05-20 10:04:50.000000000 -0500
  7897. +++ linux-3.18.14-rt/drivers/md/dm.c 2015-05-31 15:32:47.261635378 -0500
  7898. @@ -1898,14 +1898,14 @@
  7899. if (map_request(ti, clone, md))
  7900. goto requeued;
  7901. - BUG_ON(!irqs_disabled());
  7902. + BUG_ON_NONRT(!irqs_disabled());
  7903. spin_lock(q->queue_lock);
  7904. }
  7905. goto out;
  7906. requeued:
  7907. - BUG_ON(!irqs_disabled());
  7908. + BUG_ON_NONRT(!irqs_disabled());
  7909. spin_lock(q->queue_lock);
  7910. delay_and_out:
  7911. diff -Nur linux-3.18.14.orig/drivers/md/raid5.c linux-3.18.14-rt/drivers/md/raid5.c
  7912. --- linux-3.18.14.orig/drivers/md/raid5.c 2015-05-20 10:04:50.000000000 -0500
  7913. +++ linux-3.18.14-rt/drivers/md/raid5.c 2015-05-31 15:32:47.265635378 -0500
  7914. @@ -1649,8 +1649,9 @@
  7915. struct raid5_percpu *percpu;
  7916. unsigned long cpu;
  7917. - cpu = get_cpu();
  7918. + cpu = get_cpu_light();
  7919. percpu = per_cpu_ptr(conf->percpu, cpu);
  7920. + spin_lock(&percpu->lock);
  7921. if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
  7922. ops_run_biofill(sh);
  7923. overlap_clear++;
  7924. @@ -1702,7 +1703,8 @@
  7925. if (test_and_clear_bit(R5_Overlap, &dev->flags))
  7926. wake_up(&sh->raid_conf->wait_for_overlap);
  7927. }
  7928. - put_cpu();
  7929. + spin_unlock(&percpu->lock);
  7930. + put_cpu_light();
  7931. }
  7932. static int grow_one_stripe(struct r5conf *conf, int hash)
  7933. @@ -5708,6 +5710,7 @@
  7934. __func__, cpu);
  7935. break;
  7936. }
  7937. + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
  7938. }
  7939. put_online_cpus();
  7940. diff -Nur linux-3.18.14.orig/drivers/md/raid5.h linux-3.18.14-rt/drivers/md/raid5.h
  7941. --- linux-3.18.14.orig/drivers/md/raid5.h 2015-05-20 10:04:50.000000000 -0500
  7942. +++ linux-3.18.14-rt/drivers/md/raid5.h 2015-05-31 15:32:47.293635378 -0500
  7943. @@ -457,6 +457,7 @@
  7944. int recovery_disabled;
  7945. /* per cpu variables */
  7946. struct raid5_percpu {
  7947. + spinlock_t lock; /* Protection for -RT */
  7948. struct page *spare_page; /* Used when checking P/Q in raid6 */
  7949. void *scribble; /* space for constructing buffer
  7950. * lists and performing address
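The raid5 change above swaps get_cpu()/put_cpu() for get_cpu_light()/put_cpu_light() and adds a per-CPU spinlock, because on -rt the per-CPU section stays preemptible and the scratch data needs a real lock. A sketch of that access pattern with hypothetical names (my_pcpu/my_state are illustrative, not from raid5):

	#include <linux/percpu.h>
	#include <linux/spinlock.h>
	#include <linux/smp.h>

	struct my_state {
		spinlock_t lock;        /* serializes users that may now be preempted */
		void *scratch;
	};

	static DEFINE_PER_CPU(struct my_state, my_pcpu);

	static void use_scratch(void)
	{
		struct my_state *st;
		int cpu;

		cpu = get_cpu_light();          /* -rt: migrate_disable(), still preemptible */
		st = &per_cpu(my_pcpu, cpu);
		spin_lock(&st->lock);           /* sleeping lock on -rt; real protection */
		/* ... work on st->scratch ... */
		spin_unlock(&st->lock);
		put_cpu_light();
	}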
  7951. diff -Nur linux-3.18.14.orig/drivers/misc/hwlat_detector.c linux-3.18.14-rt/drivers/misc/hwlat_detector.c
  7952. --- linux-3.18.14.orig/drivers/misc/hwlat_detector.c 1969-12-31 18:00:00.000000000 -0600
  7953. +++ linux-3.18.14-rt/drivers/misc/hwlat_detector.c 2015-05-31 15:32:47.377635377 -0500
  7954. @@ -0,0 +1,1240 @@
  7955. +/*
  7956. + * hwlat_detector.c - A simple Hardware Latency detector.
  7957. + *
  7958. + * Use this module to detect large system latencies induced by the behavior of
  7959. + * certain underlying system hardware or firmware, independent of Linux itself.
  7960. + * The code was developed originally to detect the presence of SMIs on Intel
  7961. + * and AMD systems, although there is no dependency upon x86 herein.
  7962. + *
  7963. + * The classical example usage of this module is in detecting the presence of
  7964. + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
  7965. + * somewhat special form of hardware interrupt spawned from earlier CPU debug
  7966. + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
  7967. + * LPC (or other device) to generate a special interrupt under certain
  7968. + * circumstances, for example, upon expiration of a special SMI timer device,
  7969. + * due to certain external thermal readings, on certain I/O address accesses,
  7970. + * and other situations. An SMI hits a special CPU pin, triggers a special
  7971. + * SMI mode (complete with special memory map), and the OS is unaware.
  7972. + *
  7973. + * Although certain hardware-inducing latencies are necessary (for example,
  7974. + * a modern system often requires an SMI handler for correct thermal control
  7975. + * and remote management) they can wreak havoc upon any OS-level performance
  7976. + * guarantees toward low-latency, especially when the OS is not even made
  7977. + * aware of the presence of these interrupts. For this reason, we need a
  7978. + * somewhat brute force mechanism to detect these interrupts. In this case,
  7979. + * we do it by hogging all of the CPU(s) for configurable timer intervals,
  7980. + * sampling the built-in CPU timer, looking for discontiguous readings.
  7981. + *
  7982. + * WARNING: This implementation necessarily introduces latencies. Therefore,
  7983. + * you should NEVER use this module in a production environment
  7984. + * requiring any kind of low-latency performance guarantee(s).
  7985. + *
  7986. + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
  7987. + *
  7988. + * Includes useful feedback from Clark Williams <clark@redhat.com>
  7989. + *
  7990. + * This file is licensed under the terms of the GNU General Public
  7991. + * License version 2. This program is licensed "as is" without any
  7992. + * warranty of any kind, whether express or implied.
  7993. + */
  7994. +
  7995. +#include <linux/module.h>
  7996. +#include <linux/init.h>
  7997. +#include <linux/ring_buffer.h>
  7998. +#include <linux/time.h>
  7999. +#include <linux/hrtimer.h>
  8000. +#include <linux/kthread.h>
  8001. +#include <linux/debugfs.h>
  8002. +#include <linux/seq_file.h>
  8003. +#include <linux/uaccess.h>
  8004. +#include <linux/version.h>
  8005. +#include <linux/delay.h>
  8006. +#include <linux/slab.h>
  8007. +#include <linux/trace_clock.h>
  8008. +
  8009. +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */
  8010. +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */
  8011. +#define U64STR_SIZE 22 /* 20 digits max */
  8012. +
  8013. +#define VERSION "1.0.0"
  8014. +#define BANNER "hwlat_detector: "
  8015. +#define DRVNAME "hwlat_detector"
  8016. +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */
  8017. +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
  8018. +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
  8019. +
  8020. +/* Module metadata */
  8021. +
  8022. +MODULE_LICENSE("GPL");
  8023. +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
  8024. +MODULE_DESCRIPTION("A simple hardware latency detector");
  8025. +MODULE_VERSION(VERSION);
  8026. +
  8027. +/* Module parameters */
  8028. +
  8029. +static int debug;
  8030. +static int enabled;
  8031. +static int threshold;
  8032. +
  8033. +module_param(debug, int, 0); /* enable debug */
  8034. +module_param(enabled, int, 0); /* enable detector */
  8035. +module_param(threshold, int, 0); /* latency threshold */
  8036. +
  8037. +/* Buffering and sampling */
  8038. +
  8039. +static struct ring_buffer *ring_buffer; /* sample buffer */
  8040. +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */
  8041. +static unsigned long buf_size = BUF_SIZE_DEFAULT;
  8042. +static struct task_struct *kthread; /* sampling thread */
  8043. +
  8044. +/* DebugFS filesystem entries */
  8045. +
  8046. +static struct dentry *debug_dir; /* debugfs directory */
  8047. +static struct dentry *debug_max; /* maximum TSC delta */
  8048. +static struct dentry *debug_count; /* total detect count */
  8049. +static struct dentry *debug_sample_width; /* sample width us */
  8050. +static struct dentry *debug_sample_window; /* sample window us */
  8051. +static struct dentry *debug_sample; /* raw samples us */
  8052. +static struct dentry *debug_threshold; /* threshold us */
  8053. +static struct dentry *debug_enable; /* enable/disable */
  8054. +
  8055. +/* Individual samples and global state */
  8056. +
  8057. +struct sample; /* latency sample */
  8058. +struct data; /* Global state */
  8059. +
  8060. +/* Sampling functions */
  8061. +static int __buffer_add_sample(struct sample *sample);
  8062. +static struct sample *buffer_get_sample(struct sample *sample);
  8063. +
  8064. +/* Threading and state */
  8065. +static int kthread_fn(void *unused);
  8066. +static int start_kthread(void);
  8067. +static int stop_kthread(void);
  8068. +static void __reset_stats(void);
  8069. +static int init_stats(void);
  8070. +
  8071. +/* Debugfs interface */
  8072. +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
  8073. + size_t cnt, loff_t *ppos, const u64 *entry);
  8074. +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
  8075. + size_t cnt, loff_t *ppos, u64 *entry);
  8076. +static int debug_sample_fopen(struct inode *inode, struct file *filp);
  8077. +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
  8078. + size_t cnt, loff_t *ppos);
  8079. +static int debug_sample_release(struct inode *inode, struct file *filp);
  8080. +static int debug_enable_fopen(struct inode *inode, struct file *filp);
  8081. +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
  8082. + size_t cnt, loff_t *ppos);
  8083. +static ssize_t debug_enable_fwrite(struct file *file,
  8084. + const char __user *user_buffer,
  8085. + size_t user_size, loff_t *offset);
  8086. +
  8087. +/* Initialization functions */
  8088. +static int init_debugfs(void);
  8089. +static void free_debugfs(void);
  8090. +static int detector_init(void);
  8091. +static void detector_exit(void);
  8092. +
  8093. +/* Individual latency samples are stored here when detected and packed into
  8094. + * the ring_buffer circular buffer, where they are overwritten when
  8095. + * more than buf_size/sizeof(sample) samples are received. */
  8096. +struct sample {
  8097. + u64 seqnum; /* unique sequence */
  8098. + u64 duration; /* ktime delta */
  8099. + u64 outer_duration; /* ktime delta (outer loop) */
  8100. + struct timespec timestamp; /* wall time */
  8101. + unsigned long lost;
  8102. +};
  8103. +
  8104. +/* keep the global state somewhere. */
  8105. +static struct data {
  8106. +
  8107. + struct mutex lock; /* protect changes */
  8108. +
  8109. + u64 count; /* total since reset */
  8110. + u64 max_sample; /* max hardware latency */
  8111. + u64 threshold; /* sample threshold level */
  8112. +
  8113. + u64 sample_window; /* total sampling window (on+off) */
  8114. + u64 sample_width; /* active sampling portion of window */
  8115. +
  8116. + atomic_t sample_open; /* whether the sample file is open */
  8117. +
8118. + wait_queue_head_t wq; /* waitqueue for new sample values */
  8119. +
  8120. +} data;
  8121. +
  8122. +/**
  8123. + * __buffer_add_sample - add a new latency sample recording to the ring buffer
  8124. + * @sample: The new latency sample value
  8125. + *
  8126. + * This receives a new latency sample and records it in a global ring buffer.
  8127. + * No additional locking is used in this case.
  8128. + */
  8129. +static int __buffer_add_sample(struct sample *sample)
  8130. +{
  8131. + return ring_buffer_write(ring_buffer,
  8132. + sizeof(struct sample), sample);
  8133. +}
  8134. +
  8135. +/**
  8136. + * buffer_get_sample - remove a hardware latency sample from the ring buffer
  8137. + * @sample: Pre-allocated storage for the sample
  8138. + *
  8139. + * This retrieves a hardware latency sample from the global circular buffer
  8140. + */
  8141. +static struct sample *buffer_get_sample(struct sample *sample)
  8142. +{
  8143. + struct ring_buffer_event *e = NULL;
  8144. + struct sample *s = NULL;
  8145. + unsigned int cpu = 0;
  8146. +
  8147. + if (!sample)
  8148. + return NULL;
  8149. +
  8150. + mutex_lock(&ring_buffer_mutex);
  8151. + for_each_online_cpu(cpu) {
  8152. + e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
  8153. + if (e)
  8154. + break;
  8155. + }
  8156. +
  8157. + if (e) {
  8158. + s = ring_buffer_event_data(e);
  8159. + memcpy(sample, s, sizeof(struct sample));
  8160. + } else
  8161. + sample = NULL;
  8162. + mutex_unlock(&ring_buffer_mutex);
  8163. +
  8164. + return sample;
  8165. +}
  8166. +
  8167. +#ifndef CONFIG_TRACING
  8168. +#define time_type ktime_t
  8169. +#define time_get() ktime_get()
  8170. +#define time_to_us(x) ktime_to_us(x)
  8171. +#define time_sub(a, b) ktime_sub(a, b)
  8172. +#define init_time(a, b) (a).tv64 = b
  8173. +#define time_u64(a) ((a).tv64)
  8174. +#else
  8175. +#define time_type u64
  8176. +#define time_get() trace_clock_local()
  8177. +#define time_to_us(x) div_u64(x, 1000)
  8178. +#define time_sub(a, b) ((a) - (b))
  8179. +#define init_time(a, b) (a = b)
  8180. +#define time_u64(a) a
  8181. +#endif
  8182. +/**
  8183. + * get_sample - sample the CPU TSC and look for likely hardware latencies
  8184. + *
  8185. + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
  8186. + * hardware-induced latency. Called with interrupts disabled and with
  8187. + * data.lock held.
  8188. + */
  8189. +static int get_sample(void)
  8190. +{
  8191. + time_type start, t1, t2, last_t2;
  8192. + s64 diff, total = 0;
  8193. + u64 sample = 0;
  8194. + u64 outer_sample = 0;
  8195. + int ret = -1;
  8196. +
  8197. + init_time(last_t2, 0);
  8198. + start = time_get(); /* start timestamp */
  8199. +
  8200. + do {
  8201. +
  8202. + t1 = time_get(); /* we'll look for a discontinuity */
  8203. + t2 = time_get();
  8204. +
  8205. + if (time_u64(last_t2)) {
  8206. + /* Check the delta from outer loop (t2 to next t1) */
  8207. + diff = time_to_us(time_sub(t1, last_t2));
  8208. + /* This shouldn't happen */
  8209. + if (diff < 0) {
  8210. + pr_err(BANNER "time running backwards\n");
  8211. + goto out;
  8212. + }
  8213. + if (diff > outer_sample)
  8214. + outer_sample = diff;
  8215. + }
  8216. + last_t2 = t2;
  8217. +
  8218. + total = time_to_us(time_sub(t2, start)); /* sample width */
  8219. +
  8220. + /* This checks the inner loop (t1 to t2) */
  8221. + diff = time_to_us(time_sub(t2, t1)); /* current diff */
  8222. +
  8223. + /* This shouldn't happen */
  8224. + if (diff < 0) {
  8225. + pr_err(BANNER "time running backwards\n");
  8226. + goto out;
  8227. + }
  8228. +
  8229. + if (diff > sample)
  8230. + sample = diff; /* only want highest value */
  8231. +
  8232. + } while (total <= data.sample_width);
  8233. +
  8234. + ret = 0;
  8235. +
  8236. + /* If we exceed the threshold value, we have found a hardware latency */
  8237. + if (sample > data.threshold || outer_sample > data.threshold) {
  8238. + struct sample s;
  8239. +
  8240. + ret = 1;
  8241. +
  8242. + data.count++;
  8243. + s.seqnum = data.count;
  8244. + s.duration = sample;
  8245. + s.outer_duration = outer_sample;
  8246. + s.timestamp = CURRENT_TIME;
  8247. + __buffer_add_sample(&s);
  8248. +
  8249. + /* Keep a running maximum ever recorded hardware latency */
  8250. + if (sample > data.max_sample)
  8251. + data.max_sample = sample;
  8252. + }
  8253. +
  8254. +out:
  8255. + return ret;
  8256. +}
  8257. +
  8258. +/*
  8259. + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
  8260. + * @unused: A required part of the kthread API.
  8261. + *
  8262. + * Used to periodically sample the CPU TSC via a call to get_sample. We
  8263. + * disable interrupts, which does (intentionally) introduce latency since we
  8264. + * need to ensure nothing else might be running (and thus pre-empting).
  8265. + * Obviously this should never be used in production environments.
  8266. + *
8267. + * Currently this runs on whichever CPU it was scheduled on, but most
8268. + * real-world hardware latency situations occur across several CPUs;
8269. + * we might later generalize this if we find there are any actual
8270. + * systems with alternate SMI delivery or other hardware latencies.
  8271. + */
  8272. +static int kthread_fn(void *unused)
  8273. +{
  8274. + int ret;
  8275. + u64 interval;
  8276. +
  8277. + while (!kthread_should_stop()) {
  8278. +
  8279. + mutex_lock(&data.lock);
  8280. +
  8281. + local_irq_disable();
  8282. + ret = get_sample();
  8283. + local_irq_enable();
  8284. +
  8285. + if (ret > 0)
  8286. + wake_up(&data.wq); /* wake up reader(s) */
  8287. +
  8288. + interval = data.sample_window - data.sample_width;
  8289. + do_div(interval, USEC_PER_MSEC); /* modifies interval value */
  8290. +
  8291. + mutex_unlock(&data.lock);
  8292. +
  8293. + if (msleep_interruptible(interval))
  8294. + break;
  8295. + }
  8296. +
  8297. + return 0;
  8298. +}
  8299. +
  8300. +/**
  8301. + * start_kthread - Kick off the hardware latency sampling/detector kthread
  8302. + *
  8303. + * This starts a kernel thread that will sit and sample the CPU timestamp
  8304. + * counter (TSC or similar) and look for potential hardware latencies.
  8305. + */
  8306. +static int start_kthread(void)
  8307. +{
  8308. + kthread = kthread_run(kthread_fn, NULL,
  8309. + DRVNAME);
  8310. + if (IS_ERR(kthread)) {
  8311. + pr_err(BANNER "could not start sampling thread\n");
  8312. + enabled = 0;
  8313. + return -ENOMEM;
  8314. + }
  8315. +
  8316. + return 0;
  8317. +}
  8318. +
  8319. +/**
8320. + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
  8321. + *
  8322. + * This kicks the running hardware latency sampling/detector kernel thread and
  8323. + * tells it to stop sampling now. Use this on unload and at system shutdown.
  8324. + */
  8325. +static int stop_kthread(void)
  8326. +{
  8327. + int ret;
  8328. +
  8329. + ret = kthread_stop(kthread);
  8330. +
  8331. + return ret;
  8332. +}
  8333. +
  8334. +/**
  8335. + * __reset_stats - Reset statistics for the hardware latency detector
  8336. + *
  8337. + * We use data to store various statistics and global state. We call this
  8338. + * function in order to reset those when "enable" is toggled on or off, and
  8339. + * also at initialization. Should be called with data.lock held.
  8340. + */
  8341. +static void __reset_stats(void)
  8342. +{
  8343. + data.count = 0;
  8344. + data.max_sample = 0;
  8345. + ring_buffer_reset(ring_buffer); /* flush out old sample entries */
  8346. +}
  8347. +
  8348. +/**
  8349. + * init_stats - Setup global state statistics for the hardware latency detector
  8350. + *
  8351. + * We use data to store various statistics and global state. We also use
  8352. + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
  8353. + * induced system latencies. This function initializes these structures and
  8354. + * allocates the global ring buffer also.
  8355. + */
  8356. +static int init_stats(void)
  8357. +{
  8358. + int ret = -ENOMEM;
  8359. +
  8360. + mutex_init(&data.lock);
  8361. + init_waitqueue_head(&data.wq);
  8362. + atomic_set(&data.sample_open, 0);
  8363. +
  8364. + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
  8365. +
  8366. + if (WARN(!ring_buffer, KERN_ERR BANNER
  8367. + "failed to allocate ring buffer!\n"))
  8368. + goto out;
  8369. +
  8370. + __reset_stats();
  8371. + data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
  8372. + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
  8373. + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */
  8374. +
  8375. + ret = 0;
  8376. +
  8377. +out:
  8378. + return ret;
  8379. +
  8380. +}
  8381. +
  8382. +/*
  8383. + * simple_data_read - Wrapper read function for global state debugfs entries
  8384. + * @filp: The active open file structure for the debugfs "file"
  8385. + * @ubuf: The userspace provided buffer to read value into
  8386. + * @cnt: The maximum number of bytes to read
  8387. + * @ppos: The current "file" position
  8388. + * @entry: The entry to read from
  8389. + *
  8390. + * This function provides a generic read implementation for the global state
  8391. + * "data" structure debugfs filesystem entries. It would be nice to use
  8392. + * simple_attr_read directly, but we need to make sure that the data.lock
  8393. + * is held during the actual read.
  8394. + */
  8395. +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
  8396. + size_t cnt, loff_t *ppos, const u64 *entry)
  8397. +{
  8398. + char buf[U64STR_SIZE];
  8399. + u64 val = 0;
  8400. + int len = 0;
  8401. +
  8402. + memset(buf, 0, sizeof(buf));
  8403. +
  8404. + if (!entry)
  8405. + return -EFAULT;
  8406. +
  8407. + mutex_lock(&data.lock);
  8408. + val = *entry;
  8409. + mutex_unlock(&data.lock);
  8410. +
  8411. + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
  8412. +
  8413. + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
  8414. +
  8415. +}
  8416. +
  8417. +/*
  8418. + * simple_data_write - Wrapper write function for global state debugfs entries
  8419. + * @filp: The active open file structure for the debugfs "file"
  8420. + * @ubuf: The userspace provided buffer to write value from
  8421. + * @cnt: The maximum number of bytes to write
  8422. + * @ppos: The current "file" position
  8423. + * @entry: The entry to write to
  8424. + *
  8425. + * This function provides a generic write implementation for the global state
  8426. + * "data" structure debugfs filesystem entries. It would be nice to use
  8427. + * simple_attr_write directly, but we need to make sure that the data.lock
  8428. + * is held during the actual write.
  8429. + */
  8430. +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
  8431. + size_t cnt, loff_t *ppos, u64 *entry)
  8432. +{
  8433. + char buf[U64STR_SIZE];
  8434. + int csize = min(cnt, sizeof(buf));
  8435. + u64 val = 0;
  8436. + int err = 0;
  8437. +
  8438. + memset(buf, '\0', sizeof(buf));
  8439. + if (copy_from_user(buf, ubuf, csize))
  8440. + return -EFAULT;
  8441. +
  8442. + buf[U64STR_SIZE-1] = '\0'; /* just in case */
  8443. + err = kstrtoull(buf, 10, &val);
  8444. + if (err)
  8445. + return -EINVAL;
  8446. +
  8447. + mutex_lock(&data.lock);
  8448. + *entry = val;
  8449. + mutex_unlock(&data.lock);
  8450. +
  8451. + return csize;
  8452. +}
  8453. +
  8454. +/**
  8455. + * debug_count_fopen - Open function for "count" debugfs entry
  8456. + * @inode: The in-kernel inode representation of the debugfs "file"
  8457. + * @filp: The active open file structure for the debugfs "file"
  8458. + *
  8459. + * This function provides an open implementation for the "count" debugfs
  8460. + * interface to the hardware latency detector.
  8461. + */
  8462. +static int debug_count_fopen(struct inode *inode, struct file *filp)
  8463. +{
  8464. + return 0;
  8465. +}
  8466. +
  8467. +/**
  8468. + * debug_count_fread - Read function for "count" debugfs entry
  8469. + * @filp: The active open file structure for the debugfs "file"
  8470. + * @ubuf: The userspace provided buffer to read value into
  8471. + * @cnt: The maximum number of bytes to read
  8472. + * @ppos: The current "file" position
  8473. + *
  8474. + * This function provides a read implementation for the "count" debugfs
  8475. + * interface to the hardware latency detector. Can be used to read the
  8476. + * number of latency readings exceeding the configured threshold since
  8477. + * the detector was last reset (e.g. by writing a zero into "count").
  8478. + */
  8479. +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
  8480. + size_t cnt, loff_t *ppos)
  8481. +{
  8482. + return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
  8483. +}
  8484. +
  8485. +/**
  8486. + * debug_count_fwrite - Write function for "count" debugfs entry
  8487. + * @filp: The active open file structure for the debugfs "file"
  8488. + * @ubuf: The user buffer that contains the value to write
  8489. + * @cnt: The maximum number of bytes to write to "file"
  8490. + * @ppos: The current position in the debugfs "file"
  8491. + *
  8492. + * This function provides a write implementation for the "count" debugfs
  8493. + * interface to the hardware latency detector. Can be used to write a
  8494. + * desired value, especially to zero the total count.
  8495. + */
  8496. +static ssize_t debug_count_fwrite(struct file *filp,
  8497. + const char __user *ubuf,
  8498. + size_t cnt,
  8499. + loff_t *ppos)
  8500. +{
  8501. + return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
  8502. +}
  8503. +
  8504. +/**
  8505. + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
  8506. + * @inode: The in-kernel inode representation of the debugfs "file"
  8507. + * @filp: The active open file structure for the debugfs "file"
  8508. + *
  8509. + * This function provides an open implementation for the "enable" debugfs
  8510. + * interface to the hardware latency detector.
  8511. + */
  8512. +static int debug_enable_fopen(struct inode *inode, struct file *filp)
  8513. +{
  8514. + return 0;
  8515. +}
  8516. +
  8517. +/**
  8518. + * debug_enable_fread - Read function for "enable" debugfs interface
  8519. + * @filp: The active open file structure for the debugfs "file"
  8520. + * @ubuf: The userspace provided buffer to read value into
  8521. + * @cnt: The maximum number of bytes to read
  8522. + * @ppos: The current "file" position
  8523. + *
  8524. + * This function provides a read implementation for the "enable" debugfs
  8525. + * interface to the hardware latency detector. Can be used to determine
  8526. + * whether the detector is currently enabled ("0\n" or "1\n" returned).
  8527. + */
  8528. +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
  8529. + size_t cnt, loff_t *ppos)
  8530. +{
  8531. + char buf[4];
  8532. +
  8533. + if ((cnt < sizeof(buf)) || (*ppos))
  8534. + return 0;
  8535. +
  8536. + buf[0] = enabled ? '1' : '0';
  8537. + buf[1] = '\n';
  8538. + buf[2] = '\0';
  8539. + if (copy_to_user(ubuf, buf, strlen(buf)))
  8540. + return -EFAULT;
  8541. + return *ppos = strlen(buf);
  8542. +}
  8543. +
  8544. +/**
  8545. + * debug_enable_fwrite - Write function for "enable" debugfs interface
  8546. + * @filp: The active open file structure for the debugfs "file"
  8547. + * @ubuf: The user buffer that contains the value to write
  8548. + * @cnt: The maximum number of bytes to write to "file"
  8549. + * @ppos: The current position in the debugfs "file"
  8550. + *
  8551. + * This function provides a write implementation for the "enable" debugfs
  8552. + * interface to the hardware latency detector. Can be used to enable or
  8553. + * disable the detector, which will have the side-effect of possibly
  8554. + * also resetting the global stats and kicking off the measuring
  8555. + * kthread (on an enable) or the converse (upon a disable).
  8556. + */
  8557. +static ssize_t debug_enable_fwrite(struct file *filp,
  8558. + const char __user *ubuf,
  8559. + size_t cnt,
  8560. + loff_t *ppos)
  8561. +{
  8562. + char buf[4];
  8563. + int csize = min(cnt, sizeof(buf));
  8564. + long val = 0;
  8565. + int err = 0;
  8566. +
  8567. + memset(buf, '\0', sizeof(buf));
  8568. + if (copy_from_user(buf, ubuf, csize))
  8569. + return -EFAULT;
  8570. +
  8571. + buf[sizeof(buf)-1] = '\0'; /* just in case */
  8572. + err = kstrtoul(buf, 10, &val);
  8573. + if (0 != err)
  8574. + return -EINVAL;
  8575. +
  8576. + if (val) {
  8577. + if (enabled)
  8578. + goto unlock;
  8579. + enabled = 1;
  8580. + __reset_stats();
  8581. + if (start_kthread())
  8582. + return -EFAULT;
  8583. + } else {
  8584. + if (!enabled)
  8585. + goto unlock;
  8586. + enabled = 0;
  8587. + err = stop_kthread();
  8588. + if (err) {
  8589. + pr_err(BANNER "cannot stop kthread\n");
  8590. + return -EFAULT;
  8591. + }
  8592. + wake_up(&data.wq); /* reader(s) should return */
  8593. + }
  8594. +unlock:
  8595. + return csize;
  8596. +}
  8597. +
  8598. +/**
  8599. + * debug_max_fopen - Open function for "max" debugfs entry
  8600. + * @inode: The in-kernel inode representation of the debugfs "file"
  8601. + * @filp: The active open file structure for the debugfs "file"
  8602. + *
  8603. + * This function provides an open implementation for the "max" debugfs
  8604. + * interface to the hardware latency detector.
  8605. + */
  8606. +static int debug_max_fopen(struct inode *inode, struct file *filp)
  8607. +{
  8608. + return 0;
  8609. +}
  8610. +
  8611. +/**
  8612. + * debug_max_fread - Read function for "max" debugfs entry
  8613. + * @filp: The active open file structure for the debugfs "file"
  8614. + * @ubuf: The userspace provided buffer to read value into
  8615. + * @cnt: The maximum number of bytes to read
  8616. + * @ppos: The current "file" position
  8617. + *
  8618. + * This function provides a read implementation for the "max" debugfs
  8619. + * interface to the hardware latency detector. Can be used to determine
  8620. + * the maximum latency value observed since it was last reset.
  8621. + */
  8622. +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
  8623. + size_t cnt, loff_t *ppos)
  8624. +{
  8625. + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
  8626. +}
  8627. +
  8628. +/**
  8629. + * debug_max_fwrite - Write function for "max" debugfs entry
  8630. + * @filp: The active open file structure for the debugfs "file"
  8631. + * @ubuf: The user buffer that contains the value to write
  8632. + * @cnt: The maximum number of bytes to write to "file"
  8633. + * @ppos: The current position in the debugfs "file"
  8634. + *
  8635. + * This function provides a write implementation for the "max" debugfs
  8636. + * interface to the hardware latency detector. Can be used to reset the
  8637. + * maximum or set it to some other desired value - if, then, subsequent
  8638. + * measurements exceed this value, the maximum will be updated.
  8639. + */
  8640. +static ssize_t debug_max_fwrite(struct file *filp,
  8641. + const char __user *ubuf,
  8642. + size_t cnt,
  8643. + loff_t *ppos)
  8644. +{
  8645. + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
  8646. +}
  8647. +
  8648. +
  8649. +/**
  8650. + * debug_sample_fopen - An open function for "sample" debugfs interface
  8651. + * @inode: The in-kernel inode representation of this debugfs "file"
  8652. + * @filp: The active open file structure for the debugfs "file"
  8653. + *
  8654. + * This function handles opening the "sample" file within the hardware
  8655. + * latency detector debugfs directory interface. This file is used to read
  8656. + * raw samples from the global ring_buffer and allows the user to see a
  8657. + * running latency history. Can be opened blocking or non-blocking,
  8658. + * affecting whether it behaves as a buffer read pipe, or does not.
  8659. + * Implements simple locking to prevent multiple simultaneous use.
  8660. + */
  8661. +static int debug_sample_fopen(struct inode *inode, struct file *filp)
  8662. +{
  8663. + if (!atomic_add_unless(&data.sample_open, 1, 1))
  8664. + return -EBUSY;
  8665. + else
  8666. + return 0;
  8667. +}
  8668. +
  8669. +/**
  8670. + * debug_sample_fread - A read function for "sample" debugfs interface
  8671. + * @filp: The active open file structure for the debugfs "file"
  8672. + * @ubuf: The user buffer that will contain the samples read
  8673. + * @cnt: The maximum bytes to read from the debugfs "file"
  8674. + * @ppos: The current position in the debugfs "file"
  8675. + *
  8676. + * This function handles reading from the "sample" file within the hardware
  8677. + * latency detector debugfs directory interface. This file is used to read
  8678. + * raw samples from the global ring_buffer and allows the user to see a
  8679. + * running latency history. By default this will block pending a new
  8680. + * value written into the sample buffer, unless there are already a
  8681. + * number of value(s) waiting in the buffer, or the sample file was
  8682. + * previously opened in a non-blocking mode of operation.
  8683. + */
  8684. +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
  8685. + size_t cnt, loff_t *ppos)
  8686. +{
  8687. + int len = 0;
  8688. + char buf[64];
  8689. + struct sample *sample = NULL;
  8690. +
  8691. + if (!enabled)
  8692. + return 0;
  8693. +
  8694. + sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
  8695. + if (!sample)
  8696. + return -ENOMEM;
  8697. +
  8698. + while (!buffer_get_sample(sample)) {
  8699. +
  8700. + DEFINE_WAIT(wait);
  8701. +
  8702. + if (filp->f_flags & O_NONBLOCK) {
  8703. + len = -EAGAIN;
  8704. + goto out;
  8705. + }
  8706. +
  8707. + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
  8708. + schedule();
  8709. + finish_wait(&data.wq, &wait);
  8710. +
  8711. + if (signal_pending(current)) {
  8712. + len = -EINTR;
  8713. + goto out;
  8714. + }
  8715. +
  8716. + if (!enabled) { /* enable was toggled */
  8717. + len = 0;
  8718. + goto out;
  8719. + }
  8720. + }
  8721. +
  8722. + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
  8723. + sample->timestamp.tv_sec,
  8724. + sample->timestamp.tv_nsec,
  8725. + sample->duration,
  8726. + sample->outer_duration);
  8727. +
  8728. +
  8729. + /* handling partial reads is more trouble than it's worth */
  8730. + if (len > cnt)
  8731. + goto out;
  8732. +
  8733. + if (copy_to_user(ubuf, buf, len))
  8734. + len = -EFAULT;
  8735. +
  8736. +out:
  8737. + kfree(sample);
  8738. + return len;
  8739. +}
  8740. +
  8741. +/**
  8742. + * debug_sample_release - Release function for "sample" debugfs interface
8743. + * @inode: The in-kernel inode representation of the debugfs "file"
  8744. + * @filp: The active open file structure for the debugfs "file"
  8745. + *
  8746. + * This function completes the close of the debugfs interface "sample" file.
  8747. + * Frees the sample_open "lock" so that other users may open the interface.
  8748. + */
  8749. +static int debug_sample_release(struct inode *inode, struct file *filp)
  8750. +{
  8751. + atomic_dec(&data.sample_open);
  8752. +
  8753. + return 0;
  8754. +}
  8755. +
  8756. +/**
  8757. + * debug_threshold_fopen - Open function for "threshold" debugfs entry
  8758. + * @inode: The in-kernel inode representation of the debugfs "file"
  8759. + * @filp: The active open file structure for the debugfs "file"
  8760. + *
  8761. + * This function provides an open implementation for the "threshold" debugfs
  8762. + * interface to the hardware latency detector.
  8763. + */
  8764. +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
  8765. +{
  8766. + return 0;
  8767. +}
  8768. +
  8769. +/**
  8770. + * debug_threshold_fread - Read function for "threshold" debugfs entry
  8771. + * @filp: The active open file structure for the debugfs "file"
  8772. + * @ubuf: The userspace provided buffer to read value into
  8773. + * @cnt: The maximum number of bytes to read
  8774. + * @ppos: The current "file" position
  8775. + *
  8776. + * This function provides a read implementation for the "threshold" debugfs
  8777. + * interface to the hardware latency detector. It can be used to determine
  8778. + * the current threshold level at which a latency will be recorded in the
  8779. + * global ring buffer, typically on the order of 10us.
  8780. + */
  8781. +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
  8782. + size_t cnt, loff_t *ppos)
  8783. +{
  8784. + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
  8785. +}
  8786. +
  8787. +/**
  8788. + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
  8789. + * @filp: The active open file structure for the debugfs "file"
  8790. + * @ubuf: The user buffer that contains the value to write
  8791. + * @cnt: The maximum number of bytes to write to "file"
  8792. + * @ppos: The current position in the debugfs "file"
  8793. + *
  8794. + * This function provides a write implementation for the "threshold" debugfs
  8795. + * interface to the hardware latency detector. It can be used to configure
  8796. + * the threshold level at which any subsequently detected latencies will
  8797. + * be recorded into the global ring buffer.
  8798. + */
  8799. +static ssize_t debug_threshold_fwrite(struct file *filp,
  8800. + const char __user *ubuf,
  8801. + size_t cnt,
  8802. + loff_t *ppos)
  8803. +{
  8804. + int ret;
  8805. +
  8806. + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
  8807. +
  8808. + if (enabled)
  8809. + wake_up_process(kthread);
  8810. +
  8811. + return ret;
  8812. +}
  8813. +
  8814. +/**
  8815. + * debug_width_fopen - Open function for "width" debugfs entry
  8816. + * @inode: The in-kernel inode representation of the debugfs "file"
  8817. + * @filp: The active open file structure for the debugfs "file"
  8818. + *
  8819. + * This function provides an open implementation for the "width" debugfs
  8820. + * interface to the hardware latency detector.
  8821. + */
  8822. +static int debug_width_fopen(struct inode *inode, struct file *filp)
  8823. +{
  8824. + return 0;
  8825. +}
  8826. +
  8827. +/**
  8828. + * debug_width_fread - Read function for "width" debugfs entry
  8829. + * @filp: The active open file structure for the debugfs "file"
  8830. + * @ubuf: The userspace provided buffer to read value into
  8831. + * @cnt: The maximum number of bytes to read
  8832. + * @ppos: The current "file" position
  8833. + *
  8834. + * This function provides a read implementation for the "width" debugfs
  8835. + * interface to the hardware latency detector. It can be used to determine
  8836. + * for how many us of the total window we will actively sample for any
  8837. + * hardware-induced latency periods. Obviously, it is not possible to
  8838. + * sample constantly and still have the system respond to a sample
  8839. + * reader or, worse, keep it from appearing to have gone out to lunch.
  8840. + */
  8841. +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
  8842. + size_t cnt, loff_t *ppos)
  8843. +{
  8844. + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
  8845. +}
  8846. +
  8847. +/**
  8848. + * debug_width_fwrite - Write function for "width" debugfs entry
  8849. + * @filp: The active open file structure for the debugfs "file"
  8850. + * @ubuf: The user buffer that contains the value to write
  8851. + * @cnt: The maximum number of bytes to write to "file"
  8852. + * @ppos: The current position in the debugfs "file"
  8853. + *
  8854. + * This function provides a write implementation for the "width" debugfs
  8855. + * interface to the hardware latency detector. It can be used to configure
  8856. + * for how many us of the total window we will actively sample for any
  8857. + * hardware-induced latency periods. Obviously, it is not possible to
  8858. + * sample constantly and still have the system respond to a sample
  8859. + * reader or, worse, keep it from appearing to have gone out to lunch. It
  8860. + * is enforced that the width is less than the total window size.
  8861. + */
  8862. +static ssize_t debug_width_fwrite(struct file *filp,
  8863. + const char __user *ubuf,
  8864. + size_t cnt,
  8865. + loff_t *ppos)
  8866. +{
  8867. + char buf[U64STR_SIZE];
  8868. + int csize = min(cnt, sizeof(buf));
  8869. + u64 val = 0;
  8870. + int err = 0;
  8871. +
  8872. + memset(buf, '\0', sizeof(buf));
  8873. + if (copy_from_user(buf, ubuf, csize))
  8874. + return -EFAULT;
  8875. +
  8876. + buf[U64STR_SIZE-1] = '\0'; /* just in case */
  8877. + err = kstrtoull(buf, 10, &val);
  8878. + if (0 != err)
  8879. + return -EINVAL;
  8880. +
  8881. + mutex_lock(&data.lock);
  8882. + if (val < data.sample_window)
  8883. + data.sample_width = val;
  8884. + else {
  8885. + mutex_unlock(&data.lock);
  8886. + return -EINVAL;
  8887. + }
  8888. + mutex_unlock(&data.lock);
  8889. +
  8890. + if (enabled)
  8891. + wake_up_process(kthread);
  8892. +
  8893. + return csize;
  8894. +}
  8895. +
  8896. +/**
  8897. + * debug_window_fopen - Open function for "window" debugfs entry
  8898. + * @inode: The in-kernel inode representation of the debugfs "file"
  8899. + * @filp: The active open file structure for the debugfs "file"
  8900. + *
  8901. + * This function provides an open implementation for the "window" debugfs
  8902. + * interface to the hardware latency detector. The window is the total time
  8903. + * in us that will be considered one sample period. Conceptually, windows
  8904. + * occur back-to-back and contain a sample width period during which
  8905. + * actual sampling occurs.
  8906. + */
  8907. +static int debug_window_fopen(struct inode *inode, struct file *filp)
  8908. +{
  8909. + return 0;
  8910. +}
  8911. +
  8912. +/**
  8913. + * debug_window_fread - Read function for "window" debugfs entry
  8914. + * @filp: The active open file structure for the debugfs "file"
  8915. + * @ubuf: The userspace provided buffer to read value into
  8916. + * @cnt: The maximum number of bytes to read
  8917. + * @ppos: The current "file" position
  8918. + *
  8919. + * This function provides a read implementation for the "window" debugfs
  8920. + * interface to the hardware latency detector. The window is the total time
  8921. + * in us that will be considered one sample period. Conceptually, windows
  8922. + * occur back-to-back and contain a sample width period during which
  8923. + * actual sampling occurs. Can be used to read the total window size.
  8924. + */
  8925. +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
  8926. + size_t cnt, loff_t *ppos)
  8927. +{
  8928. + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
  8929. +}
  8930. +
  8931. +/**
  8932. + * debug_window_fwrite - Write function for "window" debugfs entry
  8933. + * @filp: The active open file structure for the debugfs "file"
  8934. + * @ubuf: The user buffer that contains the value to write
  8935. + * @cnt: The maximum number of bytes to write to "file"
  8936. + * @ppos: The current position in the debugfs "file"
  8937. + *
  8938. + * This function provides a write implementation for the "window" debugfs
  8939. + * interface to the hardware latency detector. The window is the total time
  8940. + * in us that will be considered one sample period. Conceptually, windows
  8941. + * occur back-to-back and contain a sample width period during which
  8942. + * actual sampling occurs. Can be used to write a new total window size. It
  8943. + * is enforced that any value written must be greater than the sample width
  8944. + * size, or an error results.
  8945. + */
  8946. +static ssize_t debug_window_fwrite(struct file *filp,
  8947. + const char __user *ubuf,
  8948. + size_t cnt,
  8949. + loff_t *ppos)
  8950. +{
  8951. + char buf[U64STR_SIZE];
  8952. + int csize = min(cnt, sizeof(buf));
  8953. + u64 val = 0;
  8954. + int err = 0;
  8955. +
  8956. + memset(buf, '\0', sizeof(buf));
  8957. + if (copy_from_user(buf, ubuf, csize))
  8958. + return -EFAULT;
  8959. +
  8960. + buf[U64STR_SIZE-1] = '\0'; /* just in case */
  8961. + err = kstrtoull(buf, 10, &val);
  8962. + if (0 != err)
  8963. + return -EINVAL;
  8964. +
  8965. + mutex_lock(&data.lock);
  8966. + if (data.sample_width < val)
  8967. + data.sample_window = val;
  8968. + else {
  8969. + mutex_unlock(&data.lock);
  8970. + return -EINVAL;
  8971. + }
  8972. + mutex_unlock(&data.lock);
  8973. +
  8974. + return csize;
  8975. +}
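
Taken together, debug_width_fwrite() and debug_window_fwrite() enforce width < window, so the order of writes matters when both values are reconfigured. As a hedged sketch (the debugfs path and the chosen values are assumptions), growing the window before the width keeps both writes within the checks above:

/* Illustrative only: widen the sampling window to 2 s and the width to 1 s
 * (both files take values in microseconds). Writing the window first means
 * the later width write still satisfies the "width < window" check. */
#include <stdio.h>

static int write_value(const char *path, const char *value)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(value, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/hwlat_detector";
	char path[128];

	snprintf(path, sizeof(path), "%s/window", dir);
	if (write_value(path, "2000000"))	/* 2,000,000 us */
		return 1;
	snprintf(path, sizeof(path), "%s/width", dir);
	return write_value(path, "1000000") ? 1 : 0;	/* 1,000,000 us */
}

Shrinking both values would go the other way round: the width has to be reduced first so that the smaller window still exceeds it.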
  8976. +
  8977. +/*
  8978. + * Function pointers for the "count" debugfs file operations
  8979. + */
  8980. +static const struct file_operations count_fops = {
  8981. + .open = debug_count_fopen,
  8982. + .read = debug_count_fread,
  8983. + .write = debug_count_fwrite,
  8984. + .owner = THIS_MODULE,
  8985. +};
  8986. +
  8987. +/*
  8988. + * Function pointers for the "enable" debugfs file operations
  8989. + */
  8990. +static const struct file_operations enable_fops = {
  8991. + .open = debug_enable_fopen,
  8992. + .read = debug_enable_fread,
  8993. + .write = debug_enable_fwrite,
  8994. + .owner = THIS_MODULE,
  8995. +};
  8996. +
  8997. +/*
  8998. + * Function pointers for the "max" debugfs file operations
  8999. + */
  9000. +static const struct file_operations max_fops = {
  9001. + .open = debug_max_fopen,
  9002. + .read = debug_max_fread,
  9003. + .write = debug_max_fwrite,
  9004. + .owner = THIS_MODULE,
  9005. +};
  9006. +
  9007. +/*
  9008. + * Function pointers for the "sample" debugfs file operations
  9009. + */
  9010. +static const struct file_operations sample_fops = {
  9011. + .open = debug_sample_fopen,
  9012. + .read = debug_sample_fread,
  9013. + .release = debug_sample_release,
  9014. + .owner = THIS_MODULE,
  9015. +};
  9016. +
  9017. +/*
  9018. + * Function pointers for the "threshold" debugfs file operations
  9019. + */
  9020. +static const struct file_operations threshold_fops = {
  9021. + .open = debug_threshold_fopen,
  9022. + .read = debug_threshold_fread,
  9023. + .write = debug_threshold_fwrite,
  9024. + .owner = THIS_MODULE,
  9025. +};
  9026. +
  9027. +/*
  9028. + * Function pointers for the "width" debugfs file operations
  9029. + */
  9030. +static const struct file_operations width_fops = {
  9031. + .open = debug_width_fopen,
  9032. + .read = debug_width_fread,
  9033. + .write = debug_width_fwrite,
  9034. + .owner = THIS_MODULE,
  9035. +};
  9036. +
  9037. +/*
  9038. + * Function pointers for the "window" debugfs file operations
  9039. + */
  9040. +static const struct file_operations window_fops = {
  9041. + .open = debug_window_fopen,
  9042. + .read = debug_window_fread,
  9043. + .write = debug_window_fwrite,
  9044. + .owner = THIS_MODULE,
  9045. +};
  9046. +
  9047. +/**
  9048. + * init_debugfs - A function to initialize the debugfs interface files
  9049. + *
  9050. + * This function creates entries in debugfs for "hwlat_detector", including
  9051. + * files to read values from the detector, current samples, and the
  9052. + * maximum sample that has been captured since the hardware latency
  9053. + * detector was started.
  9054. + */
  9055. +static int init_debugfs(void)
  9056. +{
  9057. + int ret = -ENOMEM;
  9058. +
  9059. + debug_dir = debugfs_create_dir(DRVNAME, NULL);
  9060. + if (!debug_dir)
  9061. + goto err_debug_dir;
  9062. +
  9063. + debug_sample = debugfs_create_file("sample", 0444,
  9064. + debug_dir, NULL,
  9065. + &sample_fops);
  9066. + if (!debug_sample)
  9067. + goto err_sample;
  9068. +
  9069. + debug_count = debugfs_create_file("count", 0444,
  9070. + debug_dir, NULL,
  9071. + &count_fops);
  9072. + if (!debug_count)
  9073. + goto err_count;
  9074. +
  9075. + debug_max = debugfs_create_file("max", 0444,
  9076. + debug_dir, NULL,
  9077. + &max_fops);
  9078. + if (!debug_max)
  9079. + goto err_max;
  9080. +
  9081. + debug_sample_window = debugfs_create_file("window", 0644,
  9082. + debug_dir, NULL,
  9083. + &window_fops);
  9084. + if (!debug_sample_window)
  9085. + goto err_window;
  9086. +
  9087. + debug_sample_width = debugfs_create_file("width", 0644,
  9088. + debug_dir, NULL,
  9089. + &width_fops);
  9090. + if (!debug_sample_width)
  9091. + goto err_width;
  9092. +
  9093. + debug_threshold = debugfs_create_file("threshold", 0644,
  9094. + debug_dir, NULL,
  9095. + &threshold_fops);
  9096. + if (!debug_threshold)
  9097. + goto err_threshold;
  9098. +
  9099. + debug_enable = debugfs_create_file("enable", 0644,
  9100. + debug_dir, &enabled,
  9101. + &enable_fops);
  9102. + if (!debug_enable)
  9103. + goto err_enable;
  9104. +
  9105. + else {
  9106. + ret = 0;
  9107. + goto out;
  9108. + }
  9109. +
  9110. +err_enable:
  9111. + debugfs_remove(debug_threshold);
  9112. +err_threshold:
  9113. + debugfs_remove(debug_sample_width);
  9114. +err_width:
  9115. + debugfs_remove(debug_sample_window);
  9116. +err_window:
  9117. + debugfs_remove(debug_max);
  9118. +err_max:
  9119. + debugfs_remove(debug_count);
  9120. +err_count:
  9121. + debugfs_remove(debug_sample);
  9122. +err_sample:
  9123. + debugfs_remove(debug_dir);
  9124. +err_debug_dir:
  9125. +out:
  9126. + return ret;
  9127. +}
  9128. +
  9129. +/**
  9130. + * free_debugfs - A function to cleanup the debugfs file interface
  9131. + */
  9132. +static void free_debugfs(void)
  9133. +{
  9134. + /* could also use a debugfs_remove_recursive */
  9135. + debugfs_remove(debug_enable);
  9136. + debugfs_remove(debug_threshold);
  9137. + debugfs_remove(debug_sample_width);
  9138. + debugfs_remove(debug_sample_window);
  9139. + debugfs_remove(debug_max);
  9140. + debugfs_remove(debug_count);
  9141. + debugfs_remove(debug_sample);
  9142. + debugfs_remove(debug_dir);
  9143. +}
  9144. +
  9145. +/**
  9146. + * detector_init - Standard module initialization code
  9147. + */
  9148. +static int detector_init(void)
  9149. +{
  9150. + int ret = -ENOMEM;
  9151. +
  9152. + pr_info(BANNER "version %s\n", VERSION);
  9153. +
  9154. + ret = init_stats();
  9155. + if (0 != ret)
  9156. + goto out;
  9157. +
  9158. + ret = init_debugfs();
  9159. + if (0 != ret)
  9160. + goto err_stats;
  9161. +
  9162. + if (enabled)
  9163. + ret = start_kthread();
  9164. +
  9165. + goto out;
  9166. +
  9167. +err_stats:
  9168. + ring_buffer_free(ring_buffer);
  9169. +out:
  9170. + return ret;
  9171. +
  9172. +}
  9173. +
  9174. +/**
  9175. + * detector_exit - Standard module cleanup code
  9176. + */
  9177. +static void detector_exit(void)
  9178. +{
  9179. + int err;
  9180. +
  9181. + if (enabled) {
  9182. + enabled = 0;
  9183. + err = stop_kthread();
  9184. + if (err)
  9185. + pr_err(BANNER "cannot stop kthread\n");
  9186. + }
  9187. +
  9188. + free_debugfs();
  9189. + ring_buffer_free(ring_buffer); /* free up the ring buffer */
  9190. +
  9191. +}
  9192. +
  9193. +module_init(detector_init);
  9194. +module_exit(detector_exit);
  9195. diff -Nur linux-3.18.14.orig/drivers/misc/Kconfig linux-3.18.14-rt/drivers/misc/Kconfig
  9196. --- linux-3.18.14.orig/drivers/misc/Kconfig 2015-05-20 10:04:50.000000000 -0500
  9197. +++ linux-3.18.14-rt/drivers/misc/Kconfig 2015-05-31 15:32:47.297635378 -0500
  9198. @@ -54,6 +54,7 @@
  9199. config ATMEL_TCLIB
  9200. bool "Atmel AT32/AT91 Timer/Counter Library"
  9201. depends on (AVR32 || ARCH_AT91)
  9202. + default y if PREEMPT_RT_FULL
  9203. help
  9204. Select this if you want a library to allocate the Timer/Counter
  9205. blocks found on many Atmel processors. This facilitates using
  9206. @@ -69,8 +70,7 @@
  9207. are combined to make a single 32-bit timer.
  9208. When GENERIC_CLOCKEVENTS is defined, the third timer channel
  9209. - may be used as a clock event device supporting oneshot mode
  9210. - (delays of up to two seconds) based on the 32 KiHz clock.
  9211. + may be used as a clock event device supporting oneshot mode.
  9212. config ATMEL_TCB_CLKSRC_BLOCK
  9213. int
  9214. @@ -84,6 +84,15 @@
  9215. TC can be used for other purposes, such as PWM generation and
  9216. interval timing.
  9217. +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
  9218. + bool "TC Block use 32 KiHz clock"
  9219. + depends on ATMEL_TCB_CLKSRC
  9220. + default y if !PREEMPT_RT_FULL
  9221. + help
  9222. + Select this to use 32 KiHz base clock rate as TC block clock
  9223. + source for clock events.
  9224. +
  9225. +
  9226. config DUMMY_IRQ
  9227. tristate "Dummy IRQ handler"
  9228. default n
  9229. @@ -113,6 +122,35 @@
  9230. for information on the specific driver level and support statement
  9231. for your IBM server.
  9232. +config HWLAT_DETECTOR
  9233. + tristate "Testing module to detect hardware-induced latencies"
  9234. + depends on DEBUG_FS
  9235. + depends on RING_BUFFER
  9236. + default m
  9237. + ---help---
  9238. + A simple hardware latency detector. Use this module to detect
  9239. + large latencies introduced by the behavior of the underlying
  9240. + system firmware external to Linux. We do this using periodic
  9241. + use of stop_machine to grab all available CPUs and measure
  9242. + for unexplainable gaps in the CPU timestamp counter(s). By
  9243. + default, the module is not enabled until the "enable" file
  9244. + within the "hwlat_detector" debugfs directory is toggled.
  9245. +
  9246. + This module is often used to detect SMI (System Management
  9247. + Interrupts) on x86 systems, though it is not x86 specific. To
  9248. + this end, we default to using a sample window of 1 second,
  9249. + during which we will sample for 0.5 seconds. If an SMI or
  9250. + similar event occurs during that time, it is recorded
  9251. + into an 8K-sample global ring buffer until retrieved.
  9252. +
  9253. + WARNING: This software should never be enabled (it can be built
  9254. + but should not be turned on after it is loaded) in a production
  9255. + environment where high latencies are a concern since the
  9256. + sampling mechanism actually introduces latencies for
  9257. + regular tasks while the CPU(s) are being held.
  9258. +
  9259. + If unsure, say N
  9260. +
  9261. config PHANTOM
  9262. tristate "Sensable PHANToM (PCI)"
  9263. depends on PCI
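
As the Kconfig help text above notes, the module stays idle after loading until the "enable" file in the hwlat_detector debugfs directory is toggled. A minimal userspace sketch of toggling it (the debugfs mount point is an assumption, and the sleep only lets a few default 1-second windows elapse):

/* Illustrative only: enable the detector, let a few default 1 s windows
 * pass, then disable it again. */
#include <fcntl.h>
#include <unistd.h>

static int write_flag(const char *path, char flag)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, &flag, 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *enable = "/sys/kernel/debug/hwlat_detector/enable";

	if (write_flag(enable, '1'))	/* start sampling */
		return 1;
	sleep(5);			/* roughly five sample windows */
	return write_flag(enable, '0') ? 1 : 0;	/* stop sampling */
}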
  9264. diff -Nur linux-3.18.14.orig/drivers/misc/Makefile linux-3.18.14-rt/drivers/misc/Makefile
  9265. --- linux-3.18.14.orig/drivers/misc/Makefile 2015-05-20 10:04:50.000000000 -0500
  9266. +++ linux-3.18.14-rt/drivers/misc/Makefile 2015-05-31 15:32:47.349635377 -0500
  9267. @@ -38,6 +38,7 @@
  9268. obj-$(CONFIG_HMC6352) += hmc6352.o
  9269. obj-y += eeprom/
  9270. obj-y += cb710/
  9271. +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o
  9272. obj-$(CONFIG_SPEAR13XX_PCIE_GADGET) += spear13xx_pcie_gadget.o
  9273. obj-$(CONFIG_VMWARE_BALLOON) += vmw_balloon.o
  9274. obj-$(CONFIG_ARM_CHARLCD) += arm-charlcd.o
  9275. diff -Nur linux-3.18.14.orig/drivers/mmc/host/mmci.c linux-3.18.14-rt/drivers/mmc/host/mmci.c
  9276. --- linux-3.18.14.orig/drivers/mmc/host/mmci.c 2015-05-20 10:04:50.000000000 -0500
  9277. +++ linux-3.18.14-rt/drivers/mmc/host/mmci.c 2015-05-31 15:32:47.393635377 -0500
  9278. @@ -1153,15 +1153,12 @@
  9279. struct sg_mapping_iter *sg_miter = &host->sg_miter;
  9280. struct variant_data *variant = host->variant;
  9281. void __iomem *base = host->base;
  9282. - unsigned long flags;
  9283. u32 status;
  9284. status = readl(base + MMCISTATUS);
  9285. dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
  9286. - local_irq_save(flags);
  9287. -
  9288. do {
  9289. unsigned int remain, len;
  9290. char *buffer;
  9291. @@ -1201,8 +1198,6 @@
  9292. sg_miter_stop(sg_miter);
  9293. - local_irq_restore(flags);
  9294. -
  9295. /*
  9296. * If we have less than the fifo 'half-full' threshold to transfer,
  9297. * trigger a PIO interrupt as soon as any data is available.
  9298. diff -Nur linux-3.18.14.orig/drivers/mmc/host/sdhci.c linux-3.18.14-rt/drivers/mmc/host/sdhci.c
  9299. --- linux-3.18.14.orig/drivers/mmc/host/sdhci.c 2015-05-20 10:04:50.000000000 -0500
  9300. +++ linux-3.18.14-rt/drivers/mmc/host/sdhci.c 2015-05-31 15:32:47.397635376 -0500
  9301. @@ -2565,6 +2565,31 @@
  9302. return isr ? IRQ_HANDLED : IRQ_NONE;
  9303. }
  9304. +#ifdef CONFIG_PREEMPT_RT_BASE
  9305. +static irqreturn_t sdhci_rt_irq(int irq, void *dev_id)
  9306. +{
  9307. + irqreturn_t ret;
  9308. +
  9309. + local_bh_disable();
  9310. + ret = sdhci_irq(irq, dev_id);
  9311. + local_bh_enable();
  9312. + if (ret == IRQ_WAKE_THREAD)
  9313. + ret = sdhci_thread_irq(irq, dev_id);
  9314. + return ret;
  9315. +}
  9316. +#endif
  9317. +
  9318. +static int sdhci_req_irq(struct sdhci_host *host)
  9319. +{
  9320. +#ifdef CONFIG_PREEMPT_RT_BASE
  9321. + return request_threaded_irq(host->irq, NULL, sdhci_rt_irq,
  9322. + IRQF_SHARED, mmc_hostname(host->mmc), host);
  9323. +#else
  9324. + return request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq,
  9325. + IRQF_SHARED, mmc_hostname(host->mmc), host);
  9326. +#endif
  9327. +}
  9328. +
  9329. /*****************************************************************************\
  9330. * *
  9331. * Suspend/resume *
  9332. @@ -2632,9 +2657,7 @@
  9333. }
  9334. if (!device_may_wakeup(mmc_dev(host->mmc))) {
  9335. - ret = request_threaded_irq(host->irq, sdhci_irq,
  9336. - sdhci_thread_irq, IRQF_SHARED,
  9337. - mmc_hostname(host->mmc), host);
  9338. + ret = sdhci_req_irq(host);
  9339. if (ret)
  9340. return ret;
  9341. } else {
  9342. @@ -3253,8 +3276,7 @@
  9343. sdhci_init(host, 0);
  9344. - ret = request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq,
  9345. - IRQF_SHARED, mmc_hostname(mmc), host);
  9346. + ret = sdhci_req_irq(host);
  9347. if (ret) {
  9348. pr_err("%s: Failed to request IRQ %d: %d\n",
  9349. mmc_hostname(mmc), host->irq, ret);
  9350. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/3com/3c59x.c linux-3.18.14-rt/drivers/net/ethernet/3com/3c59x.c
  9351. --- linux-3.18.14.orig/drivers/net/ethernet/3com/3c59x.c 2015-05-20 10:04:50.000000000 -0500
  9352. +++ linux-3.18.14-rt/drivers/net/ethernet/3com/3c59x.c 2015-05-31 15:32:47.425635376 -0500
  9353. @@ -842,9 +842,9 @@
  9354. {
  9355. struct vortex_private *vp = netdev_priv(dev);
  9356. unsigned long flags;
  9357. - local_irq_save(flags);
  9358. + local_irq_save_nort(flags);
  9359. (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
  9360. - local_irq_restore(flags);
  9361. + local_irq_restore_nort(flags);
  9362. }
  9363. #endif
  9364. @@ -1916,12 +1916,12 @@
  9365. * Block interrupts because vortex_interrupt does a bare spin_lock()
  9366. */
  9367. unsigned long flags;
  9368. - local_irq_save(flags);
  9369. + local_irq_save_nort(flags);
  9370. if (vp->full_bus_master_tx)
  9371. boomerang_interrupt(dev->irq, dev);
  9372. else
  9373. vortex_interrupt(dev->irq, dev);
  9374. - local_irq_restore(flags);
  9375. + local_irq_restore_nort(flags);
  9376. }
  9377. }
  9378. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/atheros/atl1c/atl1c_main.c linux-3.18.14-rt/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
  9379. --- linux-3.18.14.orig/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 2015-05-20 10:04:50.000000000 -0500
  9380. +++ linux-3.18.14-rt/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 2015-05-31 15:32:47.437635376 -0500
  9381. @@ -2213,11 +2213,7 @@
  9382. }
  9383. tpd_req = atl1c_cal_tpd_req(skb);
  9384. - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
  9385. - if (netif_msg_pktdata(adapter))
  9386. - dev_info(&adapter->pdev->dev, "tx locked\n");
  9387. - return NETDEV_TX_LOCKED;
  9388. - }
  9389. + spin_lock_irqsave(&adapter->tx_lock, flags);
  9390. if (atl1c_tpd_avail(adapter, type) < tpd_req) {
  9391. /* no enough descriptor, just stop queue */
  9392. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/atheros/atl1e/atl1e_main.c linux-3.18.14-rt/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
  9393. --- linux-3.18.14.orig/drivers/net/ethernet/atheros/atl1e/atl1e_main.c 2015-05-20 10:04:50.000000000 -0500
  9394. +++ linux-3.18.14-rt/drivers/net/ethernet/atheros/atl1e/atl1e_main.c 2015-05-31 15:32:47.445635376 -0500
  9395. @@ -1880,8 +1880,7 @@
  9396. return NETDEV_TX_OK;
  9397. }
  9398. tpd_req = atl1e_cal_tdp_req(skb);
  9399. - if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
  9400. - return NETDEV_TX_LOCKED;
  9401. + spin_lock_irqsave(&adapter->tx_lock, flags);
  9402. if (atl1e_tpd_avail(adapter) < tpd_req) {
  9403. /* no enough descriptor, just stop queue */
  9404. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/chelsio/cxgb/sge.c linux-3.18.14-rt/drivers/net/ethernet/chelsio/cxgb/sge.c
  9405. --- linux-3.18.14.orig/drivers/net/ethernet/chelsio/cxgb/sge.c 2015-05-20 10:04:50.000000000 -0500
  9406. +++ linux-3.18.14-rt/drivers/net/ethernet/chelsio/cxgb/sge.c 2015-05-31 15:32:47.493635375 -0500
  9407. @@ -1663,8 +1663,7 @@
  9408. struct cmdQ *q = &sge->cmdQ[qid];
  9409. unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
  9410. - if (!spin_trylock(&q->lock))
  9411. - return NETDEV_TX_LOCKED;
  9412. + spin_lock(&q->lock);
  9413. reclaim_completed_tx(sge, q);
  9414. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/freescale/gianfar.c linux-3.18.14-rt/drivers/net/ethernet/freescale/gianfar.c
  9415. --- linux-3.18.14.orig/drivers/net/ethernet/freescale/gianfar.c 2015-05-20 10:04:50.000000000 -0500
  9416. +++ linux-3.18.14-rt/drivers/net/ethernet/freescale/gianfar.c 2015-05-31 15:32:47.525635375 -0500
  9417. @@ -1483,7 +1483,7 @@
  9418. if (netif_running(ndev)) {
  9419. - local_irq_save(flags);
  9420. + local_irq_save_nort(flags);
  9421. lock_tx_qs(priv);
  9422. gfar_halt_nodisable(priv);
  9423. @@ -1499,7 +1499,7 @@
  9424. gfar_write(&regs->maccfg1, tempval);
  9425. unlock_tx_qs(priv);
  9426. - local_irq_restore(flags);
  9427. + local_irq_restore_nort(flags);
  9428. disable_napi(priv);
  9429. @@ -1541,7 +1541,7 @@
  9430. /* Disable Magic Packet mode, in case something
  9431. * else woke us up.
  9432. */
  9433. - local_irq_save(flags);
  9434. + local_irq_save_nort(flags);
  9435. lock_tx_qs(priv);
  9436. tempval = gfar_read(&regs->maccfg2);
  9437. @@ -1551,7 +1551,7 @@
  9438. gfar_start(priv);
  9439. unlock_tx_qs(priv);
  9440. - local_irq_restore(flags);
  9441. + local_irq_restore_nort(flags);
  9442. netif_device_attach(ndev);
  9443. @@ -3307,14 +3307,14 @@
  9444. dev->stats.tx_dropped++;
  9445. atomic64_inc(&priv->extra_stats.tx_underrun);
  9446. - local_irq_save(flags);
  9447. + local_irq_save_nort(flags);
  9448. lock_tx_qs(priv);
  9449. /* Reactivate the Tx Queues */
  9450. gfar_write(&regs->tstat, gfargrp->tstat);
  9451. unlock_tx_qs(priv);
  9452. - local_irq_restore(flags);
  9453. + local_irq_restore_nort(flags);
  9454. }
  9455. netif_dbg(priv, tx_err, dev, "Transmit Error\n");
  9456. }
  9457. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/neterion/s2io.c linux-3.18.14-rt/drivers/net/ethernet/neterion/s2io.c
  9458. --- linux-3.18.14.orig/drivers/net/ethernet/neterion/s2io.c 2015-05-20 10:04:50.000000000 -0500
  9459. +++ linux-3.18.14-rt/drivers/net/ethernet/neterion/s2io.c 2015-05-31 15:32:47.537635375 -0500
  9460. @@ -4084,12 +4084,7 @@
  9461. [skb->priority & (MAX_TX_FIFOS - 1)];
  9462. fifo = &mac_control->fifos[queue];
  9463. - if (do_spin_lock)
  9464. - spin_lock_irqsave(&fifo->tx_lock, flags);
  9465. - else {
  9466. - if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
  9467. - return NETDEV_TX_LOCKED;
  9468. - }
  9469. + spin_lock_irqsave(&fifo->tx_lock, flags);
  9470. if (sp->config.multiq) {
  9471. if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
  9472. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c linux-3.18.14-rt/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
  9473. --- linux-3.18.14.orig/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c 2015-05-20 10:04:50.000000000 -0500
  9474. +++ linux-3.18.14-rt/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c 2015-05-31 15:32:47.549635375 -0500
  9475. @@ -2137,10 +2137,8 @@
  9476. struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
  9477. unsigned long flags;
  9478. - if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
  9479. - /* Collision - tell upper layer to requeue */
  9480. - return NETDEV_TX_LOCKED;
  9481. - }
  9482. + spin_lock_irqsave(&tx_ring->tx_lock, flags);
  9483. +
  9484. if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
  9485. netif_stop_queue(netdev);
  9486. spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
  9487. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/realtek/8139too.c linux-3.18.14-rt/drivers/net/ethernet/realtek/8139too.c
  9488. --- linux-3.18.14.orig/drivers/net/ethernet/realtek/8139too.c 2015-05-20 10:04:50.000000000 -0500
  9489. +++ linux-3.18.14-rt/drivers/net/ethernet/realtek/8139too.c 2015-05-31 15:32:47.557635375 -0500
  9490. @@ -2215,7 +2215,7 @@
  9491. struct rtl8139_private *tp = netdev_priv(dev);
  9492. const int irq = tp->pci_dev->irq;
  9493. - disable_irq(irq);
  9494. + disable_irq_nosync(irq);
  9495. rtl8139_interrupt(irq, dev);
  9496. enable_irq(irq);
  9497. }
  9498. diff -Nur linux-3.18.14.orig/drivers/net/ethernet/tehuti/tehuti.c linux-3.18.14-rt/drivers/net/ethernet/tehuti/tehuti.c
  9499. --- linux-3.18.14.orig/drivers/net/ethernet/tehuti/tehuti.c 2015-05-20 10:04:50.000000000 -0500
  9500. +++ linux-3.18.14-rt/drivers/net/ethernet/tehuti/tehuti.c 2015-05-31 15:32:47.581635375 -0500
  9501. @@ -1629,13 +1629,8 @@
  9502. unsigned long flags;
  9503. ENTER;
  9504. - local_irq_save(flags);
  9505. - if (!spin_trylock(&priv->tx_lock)) {
  9506. - local_irq_restore(flags);
  9507. - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
  9508. - BDX_DRV_NAME, ndev->name);
  9509. - return NETDEV_TX_LOCKED;
  9510. - }
  9511. +
  9512. + spin_lock_irqsave(&priv->tx_lock, flags);
  9513. /* build tx descriptor */
  9514. BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */
  9515. diff -Nur linux-3.18.14.orig/drivers/net/rionet.c linux-3.18.14-rt/drivers/net/rionet.c
  9516. --- linux-3.18.14.orig/drivers/net/rionet.c 2015-05-20 10:04:50.000000000 -0500
  9517. +++ linux-3.18.14-rt/drivers/net/rionet.c 2015-05-31 15:32:47.597635374 -0500
  9518. @@ -174,11 +174,7 @@
  9519. unsigned long flags;
  9520. int add_num = 1;
  9521. - local_irq_save(flags);
  9522. - if (!spin_trylock(&rnet->tx_lock)) {
  9523. - local_irq_restore(flags);
  9524. - return NETDEV_TX_LOCKED;
  9525. - }
  9526. + spin_lock_irqsave(&rnet->tx_lock, flags);
  9527. if (is_multicast_ether_addr(eth->h_dest))
  9528. add_num = nets[rnet->mport->id].nact;
  9529. diff -Nur linux-3.18.14.orig/drivers/net/wireless/orinoco/orinoco_usb.c linux-3.18.14-rt/drivers/net/wireless/orinoco/orinoco_usb.c
  9530. --- linux-3.18.14.orig/drivers/net/wireless/orinoco/orinoco_usb.c 2015-05-20 10:04:50.000000000 -0500
  9531. +++ linux-3.18.14-rt/drivers/net/wireless/orinoco/orinoco_usb.c 2015-05-31 15:32:47.613635374 -0500
  9532. @@ -699,7 +699,7 @@
  9533. while (!ctx->done.done && msecs--)
  9534. udelay(1000);
  9535. } else {
  9536. - wait_event_interruptible(ctx->done.wait,
  9537. + swait_event_interruptible(ctx->done.wait,
  9538. ctx->done.done);
  9539. }
  9540. break;
  9541. diff -Nur linux-3.18.14.orig/drivers/pci/access.c linux-3.18.14-rt/drivers/pci/access.c
  9542. --- linux-3.18.14.orig/drivers/pci/access.c 2015-05-20 10:04:50.000000000 -0500
  9543. +++ linux-3.18.14-rt/drivers/pci/access.c 2015-05-31 15:32:47.665635374 -0500
  9544. @@ -434,7 +434,7 @@
  9545. WARN_ON(!dev->block_cfg_access);
  9546. dev->block_cfg_access = 0;
  9547. - wake_up_all(&pci_cfg_wait);
  9548. + wake_up_all_locked(&pci_cfg_wait);
  9549. raw_spin_unlock_irqrestore(&pci_lock, flags);
  9550. }
  9551. EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
  9552. diff -Nur linux-3.18.14.orig/drivers/scsi/fcoe/fcoe.c linux-3.18.14-rt/drivers/scsi/fcoe/fcoe.c
  9553. --- linux-3.18.14.orig/drivers/scsi/fcoe/fcoe.c 2015-05-20 10:04:50.000000000 -0500
  9554. +++ linux-3.18.14-rt/drivers/scsi/fcoe/fcoe.c 2015-05-31 15:32:47.677635374 -0500
  9555. @@ -1286,7 +1286,7 @@
  9556. struct sk_buff *skb;
  9557. #ifdef CONFIG_SMP
  9558. struct fcoe_percpu_s *p0;
  9559. - unsigned targ_cpu = get_cpu();
  9560. + unsigned targ_cpu = get_cpu_light();
  9561. #endif /* CONFIG_SMP */
  9562. FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
  9563. @@ -1342,7 +1342,7 @@
  9564. kfree_skb(skb);
  9565. spin_unlock_bh(&p->fcoe_rx_list.lock);
  9566. }
  9567. - put_cpu();
  9568. + put_cpu_light();
  9569. #else
  9570. /*
  9571. * This a non-SMP scenario where the singular Rx thread is
  9572. @@ -1566,11 +1566,11 @@
  9573. static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
  9574. {
  9575. struct fcoe_percpu_s *fps;
  9576. - int rc;
  9577. + int rc, cpu = get_cpu_light();
  9578. - fps = &get_cpu_var(fcoe_percpu);
  9579. + fps = &per_cpu(fcoe_percpu, cpu);
  9580. rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
  9581. - put_cpu_var(fcoe_percpu);
  9582. + put_cpu_light();
  9583. return rc;
  9584. }
  9585. @@ -1768,11 +1768,11 @@
  9586. return 0;
  9587. }
  9588. - stats = per_cpu_ptr(lport->stats, get_cpu());
  9589. + stats = per_cpu_ptr(lport->stats, get_cpu_light());
  9590. stats->InvalidCRCCount++;
  9591. if (stats->InvalidCRCCount < 5)
  9592. printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
  9593. - put_cpu();
  9594. + put_cpu_light();
  9595. return -EINVAL;
  9596. }
  9597. @@ -1848,13 +1848,13 @@
  9598. goto drop;
  9599. if (!fcoe_filter_frames(lport, fp)) {
  9600. - put_cpu();
  9601. + put_cpu_light();
  9602. fc_exch_recv(lport, fp);
  9603. return;
  9604. }
  9605. drop:
  9606. stats->ErrorFrames++;
  9607. - put_cpu();
  9608. + put_cpu_light();
  9609. kfree_skb(skb);
  9610. }
  9611. diff -Nur linux-3.18.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-3.18.14-rt/drivers/scsi/fcoe/fcoe_ctlr.c
  9612. --- linux-3.18.14.orig/drivers/scsi/fcoe/fcoe_ctlr.c 2015-05-20 10:04:50.000000000 -0500
  9613. +++ linux-3.18.14-rt/drivers/scsi/fcoe/fcoe_ctlr.c 2015-05-31 15:32:47.681635374 -0500
  9614. @@ -831,7 +831,7 @@
  9615. INIT_LIST_HEAD(&del_list);
  9616. - stats = per_cpu_ptr(fip->lp->stats, get_cpu());
  9617. + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
  9618. list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
  9619. deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
  9620. @@ -867,7 +867,7 @@
  9621. sel_time = fcf->time;
  9622. }
  9623. }
  9624. - put_cpu();
  9625. + put_cpu_light();
  9626. list_for_each_entry_safe(fcf, next, &del_list, list) {
  9627. /* Removes fcf from current list */
  9628. diff -Nur linux-3.18.14.orig/drivers/scsi/libfc/fc_exch.c linux-3.18.14-rt/drivers/scsi/libfc/fc_exch.c
  9629. --- linux-3.18.14.orig/drivers/scsi/libfc/fc_exch.c 2015-05-20 10:04:50.000000000 -0500
  9630. +++ linux-3.18.14-rt/drivers/scsi/libfc/fc_exch.c 2015-05-31 15:32:47.689635374 -0500
  9631. @@ -816,10 +816,10 @@
  9632. }
  9633. memset(ep, 0, sizeof(*ep));
  9634. - cpu = get_cpu();
  9635. + cpu = get_cpu_light();
  9636. pool = per_cpu_ptr(mp->pool, cpu);
  9637. spin_lock_bh(&pool->lock);
  9638. - put_cpu();
  9639. + put_cpu_light();
  9640. /* peek cache of free slot */
  9641. if (pool->left != FC_XID_UNKNOWN) {
  9642. diff -Nur linux-3.18.14.orig/drivers/scsi/libsas/sas_ata.c linux-3.18.14-rt/drivers/scsi/libsas/sas_ata.c
  9643. --- linux-3.18.14.orig/drivers/scsi/libsas/sas_ata.c 2015-05-20 10:04:50.000000000 -0500
  9644. +++ linux-3.18.14-rt/drivers/scsi/libsas/sas_ata.c 2015-05-31 15:32:47.689635374 -0500
  9645. @@ -191,7 +191,7 @@
  9646. /* TODO: audit callers to ensure they are ready for qc_issue to
  9647. * unconditionally re-enable interrupts
  9648. */
  9649. - local_irq_save(flags);
  9650. + local_irq_save_nort(flags);
  9651. spin_unlock(ap->lock);
  9652. /* If the device fell off, no sense in issuing commands */
  9653. @@ -261,7 +261,7 @@
  9654. out:
  9655. spin_lock(ap->lock);
  9656. - local_irq_restore(flags);
  9657. + local_irq_restore_nort(flags);
  9658. return ret;
  9659. }
  9660. diff -Nur linux-3.18.14.orig/drivers/scsi/qla2xxx/qla_inline.h linux-3.18.14-rt/drivers/scsi/qla2xxx/qla_inline.h
  9661. --- linux-3.18.14.orig/drivers/scsi/qla2xxx/qla_inline.h 2015-05-20 10:04:50.000000000 -0500
  9662. +++ linux-3.18.14-rt/drivers/scsi/qla2xxx/qla_inline.h 2015-05-31 15:32:47.693635374 -0500
  9663. @@ -59,12 +59,12 @@
  9664. {
  9665. unsigned long flags;
  9666. struct qla_hw_data *ha = rsp->hw;
  9667. - local_irq_save(flags);
  9668. + local_irq_save_nort(flags);
  9669. if (IS_P3P_TYPE(ha))
  9670. qla82xx_poll(0, rsp);
  9671. else
  9672. ha->isp_ops->intr_handler(0, rsp);
  9673. - local_irq_restore(flags);
  9674. + local_irq_restore_nort(flags);
  9675. }
  9676. static inline uint8_t *
  9677. diff -Nur linux-3.18.14.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-3.18.14-rt/drivers/thermal/x86_pkg_temp_thermal.c
  9678. --- linux-3.18.14.orig/drivers/thermal/x86_pkg_temp_thermal.c 2015-05-20 10:04:50.000000000 -0500
  9679. +++ linux-3.18.14-rt/drivers/thermal/x86_pkg_temp_thermal.c 2015-05-31 15:32:47.701635374 -0500
  9680. @@ -29,6 +29,7 @@
  9681. #include <linux/pm.h>
  9682. #include <linux/thermal.h>
  9683. #include <linux/debugfs.h>
  9684. +#include <linux/work-simple.h>
  9685. #include <asm/cpu_device_id.h>
  9686. #include <asm/mce.h>
  9687. @@ -352,7 +353,7 @@
  9688. }
  9689. }
  9690. -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
  9691. +static void platform_thermal_notify_work(struct swork_event *event)
  9692. {
  9693. unsigned long flags;
  9694. int cpu = smp_processor_id();
  9695. @@ -369,7 +370,7 @@
  9696. pkg_work_scheduled[phy_id]) {
  9697. disable_pkg_thres_interrupt();
  9698. spin_unlock_irqrestore(&pkg_work_lock, flags);
  9699. - return -EINVAL;
  9700. + return;
  9701. }
  9702. pkg_work_scheduled[phy_id] = 1;
  9703. spin_unlock_irqrestore(&pkg_work_lock, flags);
  9704. @@ -378,9 +379,48 @@
  9705. schedule_delayed_work_on(cpu,
  9706. &per_cpu(pkg_temp_thermal_threshold_work, cpu),
  9707. msecs_to_jiffies(notify_delay_ms));
  9708. +}
  9709. +
  9710. +#ifdef CONFIG_PREEMPT_RT_FULL
  9711. +static struct swork_event notify_work;
  9712. +
  9713. +static int thermal_notify_work_init(void)
  9714. +{
  9715. + int err;
  9716. +
  9717. + err = swork_get();
  9718. + if (err)
  9719. + return err;
  9720. +
  9721. + INIT_SWORK(&notify_work, platform_thermal_notify_work);
  9722. return 0;
  9723. }
  9724. +static void thermal_notify_work_cleanup(void)
  9725. +{
  9726. + swork_put();
  9727. +}
  9728. +
  9729. +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
  9730. +{
  9731. + swork_queue(&notify_work);
  9732. + return 0;
  9733. +}
  9734. +
  9735. +#else /* !CONFIG_PREEMPT_RT_FULL */
  9736. +
  9737. +static int thermal_notify_work_init(void) { return 0; }
  9738. +
  9739. +static void thermal_notify_work_cleanup(void) { }
  9740. +
  9741. +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
  9742. +{
  9743. + platform_thermal_notify_work(NULL);
  9744. +
  9745. + return 0;
  9746. +}
  9747. +#endif /* CONFIG_PREEMPT_RT_FULL */
  9748. +
  9749. static int find_siblings_cpu(int cpu)
  9750. {
  9751. int i;
  9752. @@ -584,6 +624,9 @@
  9753. if (!x86_match_cpu(pkg_temp_thermal_ids))
  9754. return -ENODEV;
  9755. + if (thermal_notify_work_init())
  9756. + return -ENODEV;
  9757. +
  9758. spin_lock_init(&pkg_work_lock);
  9759. platform_thermal_package_notify =
  9760. pkg_temp_thermal_platform_thermal_notify;
  9761. @@ -608,7 +651,7 @@
  9762. kfree(pkg_work_scheduled);
  9763. platform_thermal_package_notify = NULL;
  9764. platform_thermal_package_rate_control = NULL;
  9765. -
  9766. + thermal_notify_work_cleanup();
  9767. return -ENODEV;
  9768. }
  9769. @@ -633,6 +676,7 @@
  9770. mutex_unlock(&phy_dev_list_mutex);
  9771. platform_thermal_package_notify = NULL;
  9772. platform_thermal_package_rate_control = NULL;
  9773. + thermal_notify_work_cleanup();
  9774. for_each_online_cpu(i)
  9775. cancel_delayed_work_sync(
  9776. &per_cpu(pkg_temp_thermal_threshold_work, i));
  9777. diff -Nur linux-3.18.14.orig/drivers/tty/serial/8250/8250_core.c linux-3.18.14-rt/drivers/tty/serial/8250/8250_core.c
  9778. --- linux-3.18.14.orig/drivers/tty/serial/8250/8250_core.c 2015-05-20 10:04:50.000000000 -0500
  9779. +++ linux-3.18.14-rt/drivers/tty/serial/8250/8250_core.c 2015-05-31 15:32:47.753635373 -0500
  9780. @@ -37,6 +37,7 @@
  9781. #include <linux/nmi.h>
  9782. #include <linux/mutex.h>
  9783. #include <linux/slab.h>
  9784. +#include <linux/kdb.h>
  9785. #include <linux/uaccess.h>
  9786. #include <linux/pm_runtime.h>
  9787. #ifdef CONFIG_SPARC
  9788. @@ -81,7 +82,16 @@
  9789. #define DEBUG_INTR(fmt...) do { } while (0)
  9790. #endif
  9791. -#define PASS_LIMIT 512
  9792. +/*
  9793. + * On -rt we can have more delays, and legitimately
  9794. + * so - so don't drop work spuriously and spam the
  9795. + * syslog:
  9796. + */
  9797. +#ifdef CONFIG_PREEMPT_RT_FULL
  9798. +# define PASS_LIMIT 1000000
  9799. +#else
  9800. +# define PASS_LIMIT 512
  9801. +#endif
  9802. #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
  9803. @@ -3197,7 +3207,7 @@
  9804. serial8250_rpm_get(up);
  9805. - if (port->sysrq || oops_in_progress)
  9806. + if (port->sysrq || oops_in_progress || in_kdb_printk())
  9807. locked = spin_trylock_irqsave(&port->lock, flags);
  9808. else
  9809. spin_lock_irqsave(&port->lock, flags);
  9810. diff -Nur linux-3.18.14.orig/drivers/tty/serial/amba-pl011.c linux-3.18.14-rt/drivers/tty/serial/amba-pl011.c
  9811. --- linux-3.18.14.orig/drivers/tty/serial/amba-pl011.c 2015-05-20 10:04:50.000000000 -0500
  9812. +++ linux-3.18.14-rt/drivers/tty/serial/amba-pl011.c 2015-05-31 15:32:47.777635373 -0500
  9813. @@ -1935,13 +1935,19 @@
  9814. clk_enable(uap->clk);
  9815. - local_irq_save(flags);
  9816. + /*
  9817. + * local_irq_save(flags);
  9818. + *
  9819. + * This local_irq_save() is nonsense. If we come in via sysrq
  9820. + * handling then interrupts are already disabled. Aside of
  9821. + * that the port.sysrq check is racy on SMP regardless.
  9822. + */
  9823. if (uap->port.sysrq)
  9824. locked = 0;
  9825. else if (oops_in_progress)
  9826. - locked = spin_trylock(&uap->port.lock);
  9827. + locked = spin_trylock_irqsave(&uap->port.lock, flags);
  9828. else
  9829. - spin_lock(&uap->port.lock);
  9830. + spin_lock_irqsave(&uap->port.lock, flags);
  9831. /*
  9832. * First save the CR then disable the interrupts
  9833. @@ -1963,8 +1969,7 @@
  9834. writew(old_cr, uap->port.membase + UART011_CR);
  9835. if (locked)
  9836. - spin_unlock(&uap->port.lock);
  9837. - local_irq_restore(flags);
  9838. + spin_unlock_irqrestore(&uap->port.lock, flags);
  9839. clk_disable(uap->clk);
  9840. }
  9841. diff -Nur linux-3.18.14.orig/drivers/tty/serial/omap-serial.c linux-3.18.14-rt/drivers/tty/serial/omap-serial.c
  9842. --- linux-3.18.14.orig/drivers/tty/serial/omap-serial.c 2015-05-20 10:04:50.000000000 -0500
  9843. +++ linux-3.18.14-rt/drivers/tty/serial/omap-serial.c 2015-05-31 15:32:47.781635373 -0500
  9844. @@ -1270,13 +1270,10 @@
  9845. pm_runtime_get_sync(up->dev);
  9846. - local_irq_save(flags);
  9847. - if (up->port.sysrq)
  9848. - locked = 0;
  9849. - else if (oops_in_progress)
  9850. - locked = spin_trylock(&up->port.lock);
  9851. + if (up->port.sysrq || oops_in_progress)
  9852. + locked = spin_trylock_irqsave(&up->port.lock, flags);
  9853. else
  9854. - spin_lock(&up->port.lock);
  9855. + spin_lock_irqsave(&up->port.lock, flags);
  9856. /*
  9857. * First save the IER then disable the interrupts
  9858. @@ -1305,8 +1302,7 @@
  9859. pm_runtime_mark_last_busy(up->dev);
  9860. pm_runtime_put_autosuspend(up->dev);
  9861. if (locked)
  9862. - spin_unlock(&up->port.lock);
  9863. - local_irq_restore(flags);
  9864. + spin_unlock_irqrestore(&up->port.lock, flags);
  9865. }
  9866. static int __init
  9867. diff -Nur linux-3.18.14.orig/drivers/usb/core/hcd.c linux-3.18.14-rt/drivers/usb/core/hcd.c
  9868. --- linux-3.18.14.orig/drivers/usb/core/hcd.c 2015-05-20 10:04:50.000000000 -0500
  9869. +++ linux-3.18.14-rt/drivers/usb/core/hcd.c 2015-05-31 15:32:47.785635373 -0500
  9870. @@ -1681,9 +1681,9 @@
  9871. * and no one may trigger the above deadlock situation when
  9872. * running complete() in tasklet.
  9873. */
  9874. - local_irq_save(flags);
  9875. + local_irq_save_nort(flags);
  9876. urb->complete(urb);
  9877. - local_irq_restore(flags);
  9878. + local_irq_restore_nort(flags);
  9879. usb_anchor_resume_wakeups(anchor);
  9880. atomic_dec(&urb->use_count);
  9881. diff -Nur linux-3.18.14.orig/drivers/usb/gadget/function/f_fs.c linux-3.18.14-rt/drivers/usb/gadget/function/f_fs.c
  9882. --- linux-3.18.14.orig/drivers/usb/gadget/function/f_fs.c 2015-05-20 10:04:50.000000000 -0500
  9883. +++ linux-3.18.14-rt/drivers/usb/gadget/function/f_fs.c 2015-05-31 15:32:47.809635373 -0500
  9884. @@ -1428,7 +1428,7 @@
  9885. pr_info("%s(): freeing\n", __func__);
  9886. ffs_data_clear(ffs);
  9887. BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
  9888. - waitqueue_active(&ffs->ep0req_completion.wait));
  9889. + swaitqueue_active(&ffs->ep0req_completion.wait));
  9890. kfree(ffs->dev_name);
  9891. kfree(ffs);
  9892. }
  9893. diff -Nur linux-3.18.14.orig/drivers/usb/gadget/legacy/inode.c linux-3.18.14-rt/drivers/usb/gadget/legacy/inode.c
  9894. --- linux-3.18.14.orig/drivers/usb/gadget/legacy/inode.c 2015-05-20 10:04:50.000000000 -0500
  9895. +++ linux-3.18.14-rt/drivers/usb/gadget/legacy/inode.c 2015-05-31 15:32:47.837635372 -0500
  9896. @@ -339,7 +339,7 @@
  9897. spin_unlock_irq (&epdata->dev->lock);
  9898. if (likely (value == 0)) {
  9899. - value = wait_event_interruptible (done.wait, done.done);
  9900. + value = swait_event_interruptible (done.wait, done.done);
  9901. if (value != 0) {
  9902. spin_lock_irq (&epdata->dev->lock);
  9903. if (likely (epdata->ep != NULL)) {
  9904. @@ -348,7 +348,7 @@
  9905. usb_ep_dequeue (epdata->ep, epdata->req);
  9906. spin_unlock_irq (&epdata->dev->lock);
  9907. - wait_event (done.wait, done.done);
  9908. + swait_event (done.wait, done.done);
  9909. if (epdata->status == -ECONNRESET)
  9910. epdata->status = -EINTR;
  9911. } else {
  9912. diff -Nur linux-3.18.14.orig/fs/aio.c linux-3.18.14-rt/fs/aio.c
  9913. --- linux-3.18.14.orig/fs/aio.c 2015-05-20 10:04:50.000000000 -0500
  9914. +++ linux-3.18.14-rt/fs/aio.c 2015-05-31 15:32:47.853635372 -0500
  9915. @@ -40,6 +40,7 @@
  9916. #include <linux/ramfs.h>
  9917. #include <linux/percpu-refcount.h>
  9918. #include <linux/mount.h>
  9919. +#include <linux/work-simple.h>
  9920. #include <asm/kmap_types.h>
  9921. #include <asm/uaccess.h>
  9922. @@ -110,7 +111,7 @@
  9923. struct page **ring_pages;
  9924. long nr_pages;
  9925. - struct work_struct free_work;
  9926. + struct swork_event free_work;
  9927. /*
  9928. * signals when all in-flight requests are done
  9929. @@ -226,6 +227,7 @@
  9930. .mount = aio_mount,
  9931. .kill_sb = kill_anon_super,
  9932. };
  9933. + BUG_ON(swork_get());
  9934. aio_mnt = kern_mount(&aio_fs);
  9935. if (IS_ERR(aio_mnt))
  9936. panic("Failed to create aio fs mount.");
  9937. @@ -505,9 +507,9 @@
  9938. return cancel(kiocb);
  9939. }
  9940. -static void free_ioctx(struct work_struct *work)
  9941. +static void free_ioctx(struct swork_event *sev)
  9942. {
  9943. - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
  9944. + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
  9945. pr_debug("freeing %p\n", ctx);
  9946. @@ -526,8 +528,8 @@
  9947. if (ctx->requests_done)
  9948. complete(ctx->requests_done);
  9949. - INIT_WORK(&ctx->free_work, free_ioctx);
  9950. - schedule_work(&ctx->free_work);
  9951. + INIT_SWORK(&ctx->free_work, free_ioctx);
  9952. + swork_queue(&ctx->free_work);
  9953. }
  9954. /*
  9955. @@ -535,9 +537,9 @@
  9956. * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  9957. * now it's safe to cancel any that need to be.
  9958. */
  9959. -static void free_ioctx_users(struct percpu_ref *ref)
  9960. +static void free_ioctx_users_work(struct swork_event *sev)
  9961. {
  9962. - struct kioctx *ctx = container_of(ref, struct kioctx, users);
  9963. + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
  9964. struct kiocb *req;
  9965. spin_lock_irq(&ctx->ctx_lock);
  9966. @@ -556,6 +558,14 @@
  9967. percpu_ref_put(&ctx->reqs);
  9968. }
  9969. +static void free_ioctx_users(struct percpu_ref *ref)
  9970. +{
  9971. + struct kioctx *ctx = container_of(ref, struct kioctx, users);
  9972. +
  9973. + INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
  9974. + swork_queue(&ctx->free_work);
  9975. +}
  9976. +
  9977. static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
  9978. {
  9979. unsigned i, new_nr;
  9980. diff -Nur linux-3.18.14.orig/fs/autofs4/autofs_i.h linux-3.18.14-rt/fs/autofs4/autofs_i.h
  9981. --- linux-3.18.14.orig/fs/autofs4/autofs_i.h 2015-05-20 10:04:50.000000000 -0500
  9982. +++ linux-3.18.14-rt/fs/autofs4/autofs_i.h 2015-05-31 15:32:47.865635372 -0500
  9983. @@ -34,6 +34,7 @@
  9984. #include <linux/sched.h>
  9985. #include <linux/mount.h>
  9986. #include <linux/namei.h>
  9987. +#include <linux/delay.h>
  9988. #include <asm/current.h>
  9989. #include <asm/uaccess.h>
  9990. diff -Nur linux-3.18.14.orig/fs/autofs4/expire.c linux-3.18.14-rt/fs/autofs4/expire.c
  9991. --- linux-3.18.14.orig/fs/autofs4/expire.c 2015-05-20 10:04:50.000000000 -0500
  9992. +++ linux-3.18.14-rt/fs/autofs4/expire.c 2015-05-31 15:32:47.897635372 -0500
  9993. @@ -151,7 +151,7 @@
  9994. parent = p->d_parent;
  9995. if (!spin_trylock(&parent->d_lock)) {
  9996. spin_unlock(&p->d_lock);
  9997. - cpu_relax();
  9998. + cpu_chill();
  9999. goto relock;
  10000. }
  10001. spin_unlock(&p->d_lock);
  10002. diff -Nur linux-3.18.14.orig/fs/buffer.c linux-3.18.14-rt/fs/buffer.c
  10003. --- linux-3.18.14.orig/fs/buffer.c 2015-05-20 10:04:50.000000000 -0500
  10004. +++ linux-3.18.14-rt/fs/buffer.c 2015-05-31 15:32:47.905635372 -0500
  10005. @@ -301,8 +301,7 @@
  10006. * decide that the page is now completely done.
  10007. */
  10008. first = page_buffers(page);
  10009. - local_irq_save(flags);
  10010. - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
  10011. + flags = bh_uptodate_lock_irqsave(first);
  10012. clear_buffer_async_read(bh);
  10013. unlock_buffer(bh);
  10014. tmp = bh;
  10015. @@ -315,8 +314,7 @@
  10016. }
  10017. tmp = tmp->b_this_page;
  10018. } while (tmp != bh);
  10019. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  10020. - local_irq_restore(flags);
  10021. + bh_uptodate_unlock_irqrestore(first, flags);
  10022. /*
  10023. * If none of the buffers had errors and they are all
  10024. @@ -328,9 +326,7 @@
  10025. return;
  10026. still_busy:
  10027. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  10028. - local_irq_restore(flags);
  10029. - return;
  10030. + bh_uptodate_unlock_irqrestore(first, flags);
  10031. }
  10032. /*
  10033. @@ -358,8 +354,7 @@
  10034. }
  10035. first = page_buffers(page);
  10036. - local_irq_save(flags);
  10037. - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
  10038. + flags = bh_uptodate_lock_irqsave(first);
  10039. clear_buffer_async_write(bh);
  10040. unlock_buffer(bh);
  10041. @@ -371,15 +366,12 @@
  10042. }
  10043. tmp = tmp->b_this_page;
  10044. }
  10045. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  10046. - local_irq_restore(flags);
  10047. + bh_uptodate_unlock_irqrestore(first, flags);
  10048. end_page_writeback(page);
  10049. return;
  10050. still_busy:
  10051. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  10052. - local_irq_restore(flags);
  10053. - return;
  10054. + bh_uptodate_unlock_irqrestore(first, flags);
  10055. }
  10056. EXPORT_SYMBOL(end_buffer_async_write);
  10057. @@ -3325,6 +3317,7 @@
  10058. struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
  10059. if (ret) {
  10060. INIT_LIST_HEAD(&ret->b_assoc_buffers);
  10061. + buffer_head_init_locks(ret);
  10062. preempt_disable();
  10063. __this_cpu_inc(bh_accounting.nr);
  10064. recalc_bh_state();
  10065. diff -Nur linux-3.18.14.orig/fs/dcache.c linux-3.18.14-rt/fs/dcache.c
  10066. --- linux-3.18.14.orig/fs/dcache.c 2015-05-20 10:04:50.000000000 -0500
  10067. +++ linux-3.18.14-rt/fs/dcache.c 2015-05-31 15:32:47.929635371 -0500
  10068. @@ -19,6 +19,7 @@
  10069. #include <linux/mm.h>
  10070. #include <linux/fs.h>
  10071. #include <linux/fsnotify.h>
  10072. +#include <linux/delay.h>
  10073. #include <linux/slab.h>
  10074. #include <linux/init.h>
  10075. #include <linux/hash.h>
  10076. @@ -552,7 +553,7 @@
  10077. failed:
  10078. spin_unlock(&dentry->d_lock);
  10079. - cpu_relax();
  10080. + cpu_chill();
  10081. return dentry; /* try again with same dentry */
  10082. }
  10083. @@ -2285,7 +2286,7 @@
  10084. if (dentry->d_lockref.count == 1) {
  10085. if (!spin_trylock(&inode->i_lock)) {
  10086. spin_unlock(&dentry->d_lock);
  10087. - cpu_relax();
  10088. + cpu_chill();
  10089. goto again;
  10090. }
  10091. dentry->d_flags &= ~DCACHE_CANT_MOUNT;
  10092. diff -Nur linux-3.18.14.orig/fs/eventpoll.c linux-3.18.14-rt/fs/eventpoll.c
  10093. --- linux-3.18.14.orig/fs/eventpoll.c 2015-05-20 10:04:50.000000000 -0500
  10094. +++ linux-3.18.14-rt/fs/eventpoll.c 2015-05-31 15:32:47.945635371 -0500
  10095. @@ -505,12 +505,12 @@
  10096. */
  10097. static void ep_poll_safewake(wait_queue_head_t *wq)
  10098. {
  10099. - int this_cpu = get_cpu();
  10100. + int this_cpu = get_cpu_light();
  10101. ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
  10102. ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
  10103. - put_cpu();
  10104. + put_cpu_light();
  10105. }
  10106. static void ep_remove_wait_queue(struct eppoll_entry *pwq)
  10107. diff -Nur linux-3.18.14.orig/fs/exec.c linux-3.18.14-rt/fs/exec.c
  10108. --- linux-3.18.14.orig/fs/exec.c 2015-05-20 10:04:50.000000000 -0500
  10109. +++ linux-3.18.14-rt/fs/exec.c 2015-05-31 15:32:47.945635371 -0500
  10110. @@ -841,12 +841,14 @@
  10111. }
  10112. }
  10113. task_lock(tsk);
  10114. + preempt_disable_rt();
  10115. active_mm = tsk->active_mm;
  10116. tsk->mm = mm;
  10117. tsk->active_mm = mm;
  10118. activate_mm(active_mm, mm);
  10119. tsk->mm->vmacache_seqnum = 0;
  10120. vmacache_flush(tsk);
  10121. + preempt_enable_rt();
  10122. task_unlock(tsk);
  10123. if (old_mm) {
  10124. up_read(&old_mm->mmap_sem);
  10125. diff -Nur linux-3.18.14.orig/fs/jbd/checkpoint.c linux-3.18.14-rt/fs/jbd/checkpoint.c
  10126. --- linux-3.18.14.orig/fs/jbd/checkpoint.c 2015-05-20 10:04:50.000000000 -0500
  10127. +++ linux-3.18.14-rt/fs/jbd/checkpoint.c 2015-05-31 15:32:47.957635371 -0500
  10128. @@ -129,6 +129,8 @@
  10129. if (journal->j_flags & JFS_ABORT)
  10130. return;
  10131. spin_unlock(&journal->j_state_lock);
  10132. + if (current->plug)
  10133. + io_schedule();
  10134. mutex_lock(&journal->j_checkpoint_mutex);
  10135. /*
  10136. diff -Nur linux-3.18.14.orig/fs/jbd2/checkpoint.c linux-3.18.14-rt/fs/jbd2/checkpoint.c
  10137. --- linux-3.18.14.orig/fs/jbd2/checkpoint.c 2015-05-20 10:04:50.000000000 -0500
  10138. +++ linux-3.18.14-rt/fs/jbd2/checkpoint.c 2015-05-31 15:32:47.969635371 -0500
  10139. @@ -116,6 +116,8 @@
  10140. nblocks = jbd2_space_needed(journal);
  10141. while (jbd2_log_space_left(journal) < nblocks) {
  10142. write_unlock(&journal->j_state_lock);
  10143. + if (current->plug)
  10144. + io_schedule();
  10145. mutex_lock(&journal->j_checkpoint_mutex);
  10146. /*
  10147. diff -Nur linux-3.18.14.orig/fs/namespace.c linux-3.18.14-rt/fs/namespace.c
  10148. --- linux-3.18.14.orig/fs/namespace.c 2015-05-20 10:04:50.000000000 -0500
  10149. +++ linux-3.18.14-rt/fs/namespace.c 2015-05-31 15:32:47.969635371 -0500
  10150. @@ -14,6 +14,7 @@
  10151. #include <linux/mnt_namespace.h>
  10152. #include <linux/user_namespace.h>
  10153. #include <linux/namei.h>
  10154. +#include <linux/delay.h>
  10155. #include <linux/security.h>
  10156. #include <linux/idr.h>
  10157. #include <linux/init.h> /* init_rootfs */
  10158. @@ -344,8 +345,11 @@
  10159. * incremented count after it has set MNT_WRITE_HOLD.
  10160. */
  10161. smp_mb();
  10162. - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
  10163. - cpu_relax();
  10164. + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
  10165. + preempt_enable();
  10166. + cpu_chill();
  10167. + preempt_disable();
  10168. + }
  10169. /*
  10170. * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
  10171. * be set to match its requirements. So we must not load that until
  10172. diff -Nur linux-3.18.14.orig/fs/ntfs/aops.c linux-3.18.14-rt/fs/ntfs/aops.c
  10173. --- linux-3.18.14.orig/fs/ntfs/aops.c 2015-05-20 10:04:50.000000000 -0500
  10174. +++ linux-3.18.14-rt/fs/ntfs/aops.c 2015-05-31 15:32:47.969635371 -0500
  10175. @@ -107,8 +107,7 @@
  10176. "0x%llx.", (unsigned long long)bh->b_blocknr);
  10177. }
  10178. first = page_buffers(page);
  10179. - local_irq_save(flags);
  10180. - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
  10181. + flags = bh_uptodate_lock_irqsave(first);
  10182. clear_buffer_async_read(bh);
  10183. unlock_buffer(bh);
  10184. tmp = bh;
  10185. @@ -123,8 +122,7 @@
  10186. }
  10187. tmp = tmp->b_this_page;
  10188. } while (tmp != bh);
  10189. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  10190. - local_irq_restore(flags);
  10191. + bh_uptodate_unlock_irqrestore(first, flags);
  10192. /*
  10193. * If none of the buffers had errors then we can set the page uptodate,
  10194. * but we first have to perform the post read mst fixups, if the
  10195. @@ -145,13 +143,13 @@
  10196. recs = PAGE_CACHE_SIZE / rec_size;
  10197. /* Should have been verified before we got here... */
  10198. BUG_ON(!recs);
  10199. - local_irq_save(flags);
  10200. + local_irq_save_nort(flags);
  10201. kaddr = kmap_atomic(page);
  10202. for (i = 0; i < recs; i++)
  10203. post_read_mst_fixup((NTFS_RECORD*)(kaddr +
  10204. i * rec_size), rec_size);
  10205. kunmap_atomic(kaddr);
  10206. - local_irq_restore(flags);
  10207. + local_irq_restore_nort(flags);
  10208. flush_dcache_page(page);
  10209. if (likely(page_uptodate && !PageError(page)))
  10210. SetPageUptodate(page);
  10211. @@ -159,9 +157,7 @@
  10212. unlock_page(page);
  10213. return;
  10214. still_busy:
  10215. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  10216. - local_irq_restore(flags);
  10217. - return;
  10218. + bh_uptodate_unlock_irqrestore(first, flags);
  10219. }
  10220. /**
  10221. diff -Nur linux-3.18.14.orig/fs/timerfd.c linux-3.18.14-rt/fs/timerfd.c
  10222. --- linux-3.18.14.orig/fs/timerfd.c 2015-05-20 10:04:50.000000000 -0500
  10223. +++ linux-3.18.14-rt/fs/timerfd.c 2015-05-31 15:32:47.969635371 -0500
  10224. @@ -449,7 +449,10 @@
  10225. break;
  10226. }
  10227. spin_unlock_irq(&ctx->wqh.lock);
  10228. - cpu_relax();
  10229. + if (isalarm(ctx))
  10230. + hrtimer_wait_for_timer(&ctx->t.alarm.timer);
  10231. + else
  10232. + hrtimer_wait_for_timer(&ctx->t.tmr);
  10233. }
  10234. /*
  10235. diff -Nur linux-3.18.14.orig/fs/xfs/xfs_linux.h linux-3.18.14-rt/fs/xfs/xfs_linux.h
  10236. --- linux-3.18.14.orig/fs/xfs/xfs_linux.h 2015-05-20 10:04:50.000000000 -0500
  10237. +++ linux-3.18.14-rt/fs/xfs/xfs_linux.h 2015-05-31 15:32:47.989635371 -0500
  10238. @@ -119,7 +119,7 @@
  10239. /*
  10240. * Feature macros (disable/enable)
  10241. */
  10242. -#ifdef CONFIG_SMP
  10243. +#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT_FULL)
  10244. #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
  10245. #else
  10246. #undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
  10247. diff -Nur linux-3.18.14.orig/include/acpi/platform/aclinux.h linux-3.18.14-rt/include/acpi/platform/aclinux.h
  10248. --- linux-3.18.14.orig/include/acpi/platform/aclinux.h 2015-05-20 10:04:50.000000000 -0500
  10249. +++ linux-3.18.14-rt/include/acpi/platform/aclinux.h 2015-05-31 15:32:48.013635371 -0500
  10250. @@ -123,6 +123,7 @@
  10251. #define acpi_cache_t struct kmem_cache
  10252. #define acpi_spinlock spinlock_t *
  10253. +#define acpi_raw_spinlock raw_spinlock_t *
  10254. #define acpi_cpu_flags unsigned long
  10255. /* Use native linux version of acpi_os_allocate_zeroed */
  10256. @@ -141,6 +142,20 @@
  10257. #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
  10258. #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
  10259. +#define acpi_os_create_raw_lock(__handle) \
  10260. +({ \
  10261. + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
  10262. + \
  10263. + if (lock) { \
  10264. + *(__handle) = lock; \
  10265. + raw_spin_lock_init(*(__handle)); \
  10266. + } \
  10267. + lock ? AE_OK : AE_NO_MEMORY; \
  10268. + })
  10269. +
  10270. +#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
  10271. +
  10272. +
  10273. /*
  10274. * OSL interfaces used by debugger/disassembler
  10275. */
  10276. diff -Nur linux-3.18.14.orig/include/asm-generic/bug.h linux-3.18.14-rt/include/asm-generic/bug.h
  10277. --- linux-3.18.14.orig/include/asm-generic/bug.h 2015-05-20 10:04:50.000000000 -0500
  10278. +++ linux-3.18.14-rt/include/asm-generic/bug.h 2015-05-31 15:32:48.037635370 -0500
  10279. @@ -206,6 +206,20 @@
  10280. # define WARN_ON_SMP(x) ({0;})
  10281. #endif
  10282. +#ifdef CONFIG_PREEMPT_RT_BASE
  10283. +# define BUG_ON_RT(c) BUG_ON(c)
  10284. +# define BUG_ON_NONRT(c) do { } while (0)
  10285. +# define WARN_ON_RT(condition) WARN_ON(condition)
  10286. +# define WARN_ON_NONRT(condition) do { } while (0)
  10287. +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
  10288. +#else
  10289. +# define BUG_ON_RT(c) do { } while (0)
  10290. +# define BUG_ON_NONRT(c) BUG_ON(c)
  10291. +# define WARN_ON_RT(condition) do { } while (0)
  10292. +# define WARN_ON_NONRT(condition) WARN_ON(condition)
  10293. +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
  10294. +#endif
  10295. +
  10296. #endif /* __ASSEMBLY__ */
  10297. #endif
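/*
 * Illustrative sketch (not part of the patch): typical use of the
 * RT-conditional assertions added above. A path that runs with hard
 * interrupts disabled on !RT but is preemptible on RT keeps the check
 * only where it is meaningful; WARN_ON_NONRT() compiles away on RT.
 * my_flush_queue() is a hypothetical function name.
 */
#include <linux/bug.h>
#include <linux/irqflags.h>
static void my_flush_queue(void)
{
	WARN_ON_NONRT(!irqs_disabled());	/* only checked on !RT kernels */
	/* ... work that is irq-safe on !RT and lock-protected on RT ... */
}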
  10298. diff -Nur linux-3.18.14.orig/include/linux/blkdev.h linux-3.18.14-rt/include/linux/blkdev.h
  10299. --- linux-3.18.14.orig/include/linux/blkdev.h 2015-05-20 10:04:50.000000000 -0500
  10300. +++ linux-3.18.14-rt/include/linux/blkdev.h 2015-05-31 15:32:48.077635370 -0500
  10301. @@ -101,6 +101,7 @@
  10302. struct list_head queuelist;
  10303. union {
  10304. struct call_single_data csd;
  10305. + struct work_struct work;
  10306. unsigned long fifo_time;
  10307. };
  10308. @@ -478,7 +479,7 @@
  10309. struct throtl_data *td;
  10310. #endif
  10311. struct rcu_head rcu_head;
  10312. - wait_queue_head_t mq_freeze_wq;
  10313. + struct swait_head mq_freeze_wq;
  10314. struct percpu_ref mq_usage_counter;
  10315. struct list_head all_q_node;
  10316. diff -Nur linux-3.18.14.orig/include/linux/blk-mq.h linux-3.18.14-rt/include/linux/blk-mq.h
  10317. --- linux-3.18.14.orig/include/linux/blk-mq.h 2015-05-20 10:04:50.000000000 -0500
  10318. +++ linux-3.18.14-rt/include/linux/blk-mq.h 2015-05-31 15:32:48.069635370 -0500
  10319. @@ -169,6 +169,7 @@
  10320. struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
  10321. struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
  10322. +void __blk_mq_complete_request_remote_work(struct work_struct *work);
  10323. void blk_mq_start_request(struct request *rq);
  10324. void blk_mq_end_request(struct request *rq, int error);
  10325. diff -Nur linux-3.18.14.orig/include/linux/bottom_half.h linux-3.18.14-rt/include/linux/bottom_half.h
  10326. --- linux-3.18.14.orig/include/linux/bottom_half.h 2015-05-20 10:04:50.000000000 -0500
  10327. +++ linux-3.18.14-rt/include/linux/bottom_half.h 2015-05-31 15:32:48.081635370 -0500
  10328. @@ -4,6 +4,17 @@
  10329. #include <linux/preempt.h>
  10330. #include <linux/preempt_mask.h>
  10331. +#ifdef CONFIG_PREEMPT_RT_FULL
  10332. +
  10333. +extern void local_bh_disable(void);
  10334. +extern void _local_bh_enable(void);
  10335. +extern void local_bh_enable(void);
  10336. +extern void local_bh_enable_ip(unsigned long ip);
  10337. +extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
  10338. +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);
  10339. +
  10340. +#else
  10341. +
  10342. #ifdef CONFIG_TRACE_IRQFLAGS
  10343. extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
  10344. #else
  10345. @@ -31,5 +42,6 @@
  10346. {
  10347. __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
  10348. }
  10349. +#endif
  10350. #endif /* _LINUX_BH_H */
  10351. diff -Nur linux-3.18.14.orig/include/linux/buffer_head.h linux-3.18.14-rt/include/linux/buffer_head.h
  10352. --- linux-3.18.14.orig/include/linux/buffer_head.h 2015-05-20 10:04:50.000000000 -0500
  10353. +++ linux-3.18.14-rt/include/linux/buffer_head.h 2015-05-31 15:32:48.109635370 -0500
  10354. @@ -75,8 +75,52 @@
  10355. struct address_space *b_assoc_map; /* mapping this buffer is
  10356. associated with */
  10357. atomic_t b_count; /* users using this buffer_head */
  10358. +#ifdef CONFIG_PREEMPT_RT_BASE
  10359. + spinlock_t b_uptodate_lock;
  10360. +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \
  10361. + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE)
  10362. + spinlock_t b_state_lock;
  10363. + spinlock_t b_journal_head_lock;
  10364. +#endif
  10365. +#endif
  10366. };
  10367. +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
  10368. +{
  10369. + unsigned long flags;
  10370. +
  10371. +#ifndef CONFIG_PREEMPT_RT_BASE
  10372. + local_irq_save(flags);
  10373. + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
  10374. +#else
  10375. + spin_lock_irqsave(&bh->b_uptodate_lock, flags);
  10376. +#endif
  10377. + return flags;
  10378. +}
  10379. +
  10380. +static inline void
  10381. +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
  10382. +{
  10383. +#ifndef CONFIG_PREEMPT_RT_BASE
  10384. + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
  10385. + local_irq_restore(flags);
  10386. +#else
  10387. + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
  10388. +#endif
  10389. +}
  10390. +
  10391. +static inline void buffer_head_init_locks(struct buffer_head *bh)
  10392. +{
  10393. +#ifdef CONFIG_PREEMPT_RT_BASE
  10394. + spin_lock_init(&bh->b_uptodate_lock);
  10395. +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \
  10396. + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE)
  10397. + spin_lock_init(&bh->b_state_lock);
  10398. + spin_lock_init(&bh->b_journal_head_lock);
  10399. +#endif
  10400. +#endif
  10401. +}
  10402. +
  10403. /*
  10404. * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
  10405. * and buffer_foo() functions.
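/*
 * Illustrative sketch (not part of the patch): an end_io-style handler using
 * the bh_uptodate_lock_irqsave()/bh_uptodate_unlock_irqrestore() helpers
 * added above instead of open-coding local_irq_save() + bit_spin_lock(),
 * mirroring the fs/ntfs/aops.c conversion earlier in this patch.
 * my_end_buffer_read() is a hypothetical function name.
 */
#include <linux/buffer_head.h>
static void my_end_buffer_read(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *first = page_buffers(bh->b_page);
	unsigned long flags;
	flags = bh_uptodate_lock_irqsave(first);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	bh_uptodate_unlock_irqrestore(first, flags);
}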
  10406. diff -Nur linux-3.18.14.orig/include/linux/cgroup.h linux-3.18.14-rt/include/linux/cgroup.h
  10407. --- linux-3.18.14.orig/include/linux/cgroup.h 2015-05-20 10:04:50.000000000 -0500
  10408. +++ linux-3.18.14-rt/include/linux/cgroup.h 2015-05-31 15:32:48.117635370 -0500
  10409. @@ -22,6 +22,7 @@
  10410. #include <linux/seq_file.h>
  10411. #include <linux/kernfs.h>
  10412. #include <linux/wait.h>
  10413. +#include <linux/work-simple.h>
  10414. #ifdef CONFIG_CGROUPS
  10415. @@ -91,6 +92,7 @@
  10416. /* percpu_ref killing and RCU release */
  10417. struct rcu_head rcu_head;
  10418. struct work_struct destroy_work;
  10419. + struct swork_event destroy_swork;
  10420. };
  10421. /* bits in struct cgroup_subsys_state flags field */
  10422. diff -Nur linux-3.18.14.orig/include/linux/completion.h linux-3.18.14-rt/include/linux/completion.h
  10423. --- linux-3.18.14.orig/include/linux/completion.h 2015-05-20 10:04:50.000000000 -0500
  10424. +++ linux-3.18.14-rt/include/linux/completion.h 2015-05-31 15:32:48.117635370 -0500
  10425. @@ -7,8 +7,7 @@
  10426. * Atomic wait-for-completion handler data structures.
  10427. * See kernel/sched/completion.c for details.
  10428. */
  10429. -
  10430. -#include <linux/wait.h>
  10431. +#include <linux/wait-simple.h>
  10432. /*
  10433. * struct completion - structure used to maintain state for a "completion"
  10434. @@ -24,11 +23,11 @@
  10435. */
  10436. struct completion {
  10437. unsigned int done;
  10438. - wait_queue_head_t wait;
  10439. + struct swait_head wait;
  10440. };
  10441. #define COMPLETION_INITIALIZER(work) \
  10442. - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
  10443. + { 0, SWAIT_HEAD_INITIALIZER((work).wait) }
  10444. #define COMPLETION_INITIALIZER_ONSTACK(work) \
  10445. ({ init_completion(&work); work; })
  10446. @@ -73,7 +72,7 @@
  10447. static inline void init_completion(struct completion *x)
  10448. {
  10449. x->done = 0;
  10450. - init_waitqueue_head(&x->wait);
  10451. + init_swait_head(&x->wait);
  10452. }
  10453. /**
  10454. diff -Nur linux-3.18.14.orig/include/linux/cpu.h linux-3.18.14-rt/include/linux/cpu.h
  10455. --- linux-3.18.14.orig/include/linux/cpu.h 2015-05-20 10:04:50.000000000 -0500
  10456. +++ linux-3.18.14-rt/include/linux/cpu.h 2015-05-31 15:32:48.129635370 -0500
  10457. @@ -217,6 +217,8 @@
  10458. extern void put_online_cpus(void);
  10459. extern void cpu_hotplug_disable(void);
  10460. extern void cpu_hotplug_enable(void);
  10461. +extern void pin_current_cpu(void);
  10462. +extern void unpin_current_cpu(void);
  10463. #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
  10464. #define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri)
  10465. #define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
  10466. @@ -235,6 +237,8 @@
  10467. #define put_online_cpus() do { } while (0)
  10468. #define cpu_hotplug_disable() do { } while (0)
  10469. #define cpu_hotplug_enable() do { } while (0)
  10470. +static inline void pin_current_cpu(void) { }
  10471. +static inline void unpin_current_cpu(void) { }
  10472. #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
  10473. #define __hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
  10474. /* These aren't inline functions due to a GCC bug. */
  10475. diff -Nur linux-3.18.14.orig/include/linux/delay.h linux-3.18.14-rt/include/linux/delay.h
  10476. --- linux-3.18.14.orig/include/linux/delay.h 2015-05-20 10:04:50.000000000 -0500
  10477. +++ linux-3.18.14-rt/include/linux/delay.h 2015-05-31 15:32:48.129635370 -0500
  10478. @@ -52,4 +52,10 @@
  10479. msleep(seconds * 1000);
  10480. }
  10481. +#ifdef CONFIG_PREEMPT_RT_FULL
  10482. +extern void cpu_chill(void);
  10483. +#else
  10484. +# define cpu_chill() cpu_relax()
  10485. +#endif
  10486. +
  10487. #endif /* defined(_LINUX_DELAY_H) */
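/*
 * Illustrative sketch (not part of the patch): converting a cpu_relax()
 * busy-wait into an RT-friendly retry loop with cpu_chill(), as done for
 * fs/namespace.c above. On !RT cpu_chill() still maps to cpu_relax(), so
 * non-RT behaviour is unchanged. my_flag is a hypothetical variable.
 */
#include <linux/compiler.h>
#include <linux/delay.h>
static void my_wait_for_flag(unsigned long *my_flag)
{
	while (ACCESS_ONCE(*my_flag))
		cpu_chill();	/* sleeps briefly on RT instead of spinning */
}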
  10488. diff -Nur linux-3.18.14.orig/include/linux/ftrace_event.h linux-3.18.14-rt/include/linux/ftrace_event.h
  10489. --- linux-3.18.14.orig/include/linux/ftrace_event.h 2015-05-20 10:04:50.000000000 -0500
  10490. +++ linux-3.18.14-rt/include/linux/ftrace_event.h 2015-05-31 15:32:48.157635370 -0500
  10491. @@ -61,6 +61,9 @@
  10492. unsigned char flags;
  10493. unsigned char preempt_count;
  10494. int pid;
  10495. + unsigned short migrate_disable;
  10496. + unsigned short padding;
  10497. + unsigned char preempt_lazy_count;
  10498. };
  10499. #define FTRACE_MAX_EVENT \
  10500. diff -Nur linux-3.18.14.orig/include/linux/highmem.h linux-3.18.14-rt/include/linux/highmem.h
  10501. --- linux-3.18.14.orig/include/linux/highmem.h 2015-05-20 10:04:50.000000000 -0500
  10502. +++ linux-3.18.14-rt/include/linux/highmem.h 2015-05-31 15:32:48.157635370 -0500
  10503. @@ -7,6 +7,7 @@
  10504. #include <linux/mm.h>
  10505. #include <linux/uaccess.h>
  10506. #include <linux/hardirq.h>
  10507. +#include <linux/sched.h>
  10508. #include <asm/cacheflush.h>
  10509. @@ -85,32 +86,51 @@
  10510. #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
  10511. +#ifndef CONFIG_PREEMPT_RT_FULL
  10512. DECLARE_PER_CPU(int, __kmap_atomic_idx);
  10513. +#endif
  10514. static inline int kmap_atomic_idx_push(void)
  10515. {
  10516. +#ifndef CONFIG_PREEMPT_RT_FULL
  10517. int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
  10518. -#ifdef CONFIG_DEBUG_HIGHMEM
  10519. +# ifdef CONFIG_DEBUG_HIGHMEM
  10520. WARN_ON_ONCE(in_irq() && !irqs_disabled());
  10521. BUG_ON(idx >= KM_TYPE_NR);
  10522. -#endif
  10523. +# endif
  10524. return idx;
  10525. +#else
  10526. + current->kmap_idx++;
  10527. + BUG_ON(current->kmap_idx > KM_TYPE_NR);
  10528. + return current->kmap_idx - 1;
  10529. +#endif
  10530. }
  10531. static inline int kmap_atomic_idx(void)
  10532. {
  10533. +#ifndef CONFIG_PREEMPT_RT_FULL
  10534. return __this_cpu_read(__kmap_atomic_idx) - 1;
  10535. +#else
  10536. + return current->kmap_idx - 1;
  10537. +#endif
  10538. }
  10539. static inline void kmap_atomic_idx_pop(void)
  10540. {
  10541. -#ifdef CONFIG_DEBUG_HIGHMEM
  10542. +#ifndef CONFIG_PREEMPT_RT_FULL
  10543. +# ifdef CONFIG_DEBUG_HIGHMEM
  10544. int idx = __this_cpu_dec_return(__kmap_atomic_idx);
  10545. BUG_ON(idx < 0);
  10546. -#else
  10547. +# else
  10548. __this_cpu_dec(__kmap_atomic_idx);
  10549. +# endif
  10550. +#else
  10551. + current->kmap_idx--;
  10552. +# ifdef CONFIG_DEBUG_HIGHMEM
  10553. + BUG_ON(current->kmap_idx < 0);
  10554. +# endif
  10555. #endif
  10556. }
  10557. diff -Nur linux-3.18.14.orig/include/linux/hrtimer.h linux-3.18.14-rt/include/linux/hrtimer.h
  10558. --- linux-3.18.14.orig/include/linux/hrtimer.h 2015-05-20 10:04:50.000000000 -0500
  10559. +++ linux-3.18.14-rt/include/linux/hrtimer.h 2015-05-31 15:32:48.161635369 -0500
  10560. @@ -111,6 +111,11 @@
  10561. enum hrtimer_restart (*function)(struct hrtimer *);
  10562. struct hrtimer_clock_base *base;
  10563. unsigned long state;
  10564. + struct list_head cb_entry;
  10565. + int irqsafe;
  10566. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  10567. + ktime_t praecox;
  10568. +#endif
  10569. #ifdef CONFIG_TIMER_STATS
  10570. int start_pid;
  10571. void *start_site;
  10572. @@ -147,6 +152,7 @@
  10573. int index;
  10574. clockid_t clockid;
  10575. struct timerqueue_head active;
  10576. + struct list_head expired;
  10577. ktime_t resolution;
  10578. ktime_t (*get_time)(void);
  10579. ktime_t softirq_time;
  10580. @@ -192,6 +198,9 @@
  10581. unsigned long nr_hangs;
  10582. ktime_t max_hang_time;
  10583. #endif
  10584. +#ifdef CONFIG_PREEMPT_RT_BASE
  10585. + wait_queue_head_t wait;
  10586. +#endif
  10587. struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
  10588. };
  10589. @@ -379,6 +388,13 @@
  10590. return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  10591. }
  10592. +/* Softirq preemption could deadlock timer removal */
  10593. +#ifdef CONFIG_PREEMPT_RT_BASE
  10594. + extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
  10595. +#else
  10596. +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
  10597. +#endif
  10598. +
  10599. /* Query timers: */
  10600. extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
  10601. extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
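/*
 * Illustrative sketch (not part of the patch): a cancel-and-wait loop built
 * on the hrtimer_wait_for_timer() helper declared above, mirroring the
 * fs/timerfd.c change earlier in this patch. On !RT the helper degrades to
 * cpu_relax(). my_cancel_timer() is a hypothetical function name.
 */
#include <linux/hrtimer.h>
static void my_cancel_timer(struct hrtimer *timer)
{
	while (hrtimer_try_to_cancel(timer) < 0)
		hrtimer_wait_for_timer(timer);	/* callback still running */
}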
  10602. diff -Nur linux-3.18.14.orig/include/linux/idr.h linux-3.18.14-rt/include/linux/idr.h
  10603. --- linux-3.18.14.orig/include/linux/idr.h 2015-05-20 10:04:50.000000000 -0500
  10604. +++ linux-3.18.14-rt/include/linux/idr.h 2015-05-31 15:32:48.161635369 -0500
  10605. @@ -95,10 +95,14 @@
  10606. * Each idr_preload() should be matched with an invocation of this
  10607. * function. See idr_preload() for details.
  10608. */
  10609. +#ifdef CONFIG_PREEMPT_RT_FULL
  10610. +void idr_preload_end(void);
  10611. +#else
  10612. static inline void idr_preload_end(void)
  10613. {
  10614. preempt_enable();
  10615. }
  10616. +#endif
  10617. /**
  10618. * idr_find - return pointer for given id
  10619. diff -Nur linux-3.18.14.orig/include/linux/init_task.h linux-3.18.14-rt/include/linux/init_task.h
  10620. --- linux-3.18.14.orig/include/linux/init_task.h 2015-05-20 10:04:50.000000000 -0500
  10621. +++ linux-3.18.14-rt/include/linux/init_task.h 2015-05-31 15:32:48.177635369 -0500
  10622. @@ -147,9 +147,16 @@
  10623. # define INIT_PERF_EVENTS(tsk)
  10624. #endif
  10625. +#ifdef CONFIG_PREEMPT_RT_BASE
  10626. +# define INIT_TIMER_LIST .posix_timer_list = NULL,
  10627. +#else
  10628. +# define INIT_TIMER_LIST
  10629. +#endif
  10630. +
  10631. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  10632. # define INIT_VTIME(tsk) \
  10633. - .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
  10634. + .vtime_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.vtime_lock), \
  10635. + .vtime_seq = SEQCNT_ZERO(tsk.vtime_seq), \
  10636. .vtime_snap = 0, \
  10637. .vtime_snap_whence = VTIME_SYS,
  10638. #else
  10639. @@ -219,6 +226,7 @@
  10640. .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
  10641. .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
  10642. .timer_slack_ns = 50000, /* 50 usec default slack */ \
  10643. + INIT_TIMER_LIST \
  10644. .pids = { \
  10645. [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
  10646. [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
  10647. diff -Nur linux-3.18.14.orig/include/linux/interrupt.h linux-3.18.14-rt/include/linux/interrupt.h
  10648. --- linux-3.18.14.orig/include/linux/interrupt.h 2015-05-20 10:04:50.000000000 -0500
  10649. +++ linux-3.18.14-rt/include/linux/interrupt.h 2015-05-31 15:32:48.181635369 -0500
  10650. @@ -57,6 +57,7 @@
  10651. * IRQF_NO_THREAD - Interrupt cannot be threaded
  10652. * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
  10653. * resume time.
  10654. + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
  10655. */
  10656. #define IRQF_DISABLED 0x00000020
  10657. #define IRQF_SHARED 0x00000080
  10658. @@ -70,6 +71,7 @@
  10659. #define IRQF_FORCE_RESUME 0x00008000
  10660. #define IRQF_NO_THREAD 0x00010000
  10661. #define IRQF_EARLY_RESUME 0x00020000
  10662. +#define IRQF_NO_SOFTIRQ_CALL 0x00080000
  10663. #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
  10664. @@ -180,7 +182,7 @@
  10665. #ifdef CONFIG_LOCKDEP
  10666. # define local_irq_enable_in_hardirq() do { } while (0)
  10667. #else
  10668. -# define local_irq_enable_in_hardirq() local_irq_enable()
  10669. +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
  10670. #endif
  10671. extern void disable_irq_nosync(unsigned int irq);
  10672. @@ -210,6 +212,7 @@
  10673. unsigned int irq;
  10674. struct kref kref;
  10675. struct work_struct work;
  10676. + struct list_head list;
  10677. void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
  10678. void (*release)(struct kref *ref);
  10679. };
  10680. @@ -358,9 +361,13 @@
  10681. #ifdef CONFIG_IRQ_FORCED_THREADING
  10682. +# ifndef CONFIG_PREEMPT_RT_BASE
  10683. extern bool force_irqthreads;
  10684. +# else
  10685. +# define force_irqthreads (true)
  10686. +# endif
  10687. #else
  10688. -#define force_irqthreads (0)
  10689. +#define force_irqthreads (false)
  10690. #endif
  10691. #ifndef __ARCH_SET_SOFTIRQ_PENDING
  10692. @@ -416,9 +423,10 @@
  10693. void (*action)(struct softirq_action *);
  10694. };
  10695. +#ifndef CONFIG_PREEMPT_RT_FULL
  10696. asmlinkage void do_softirq(void);
  10697. asmlinkage void __do_softirq(void);
  10698. -
  10699. +static inline void thread_do_softirq(void) { do_softirq(); }
  10700. #ifdef __ARCH_HAS_DO_SOFTIRQ
  10701. void do_softirq_own_stack(void);
  10702. #else
  10703. @@ -427,6 +435,9 @@
  10704. __do_softirq();
  10705. }
  10706. #endif
  10707. +#else
  10708. +extern void thread_do_softirq(void);
  10709. +#endif
  10710. extern void open_softirq(int nr, void (*action)(struct softirq_action *));
  10711. extern void softirq_init(void);
  10712. @@ -434,6 +445,7 @@
  10713. extern void raise_softirq_irqoff(unsigned int nr);
  10714. extern void raise_softirq(unsigned int nr);
  10715. +extern void softirq_check_pending_idle(void);
  10716. DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
  10717. @@ -455,8 +467,9 @@
  10718. to be executed on some cpu at least once after this.
  10719. * If the tasklet is already scheduled, but its execution is still not
  10720. started, it will be executed only once.
  10721. - * If this tasklet is already running on another CPU (or schedule is called
  10722. - from tasklet itself), it is rescheduled for later.
  10723. + * If this tasklet is already running on another CPU, it is rescheduled
  10724. + for later.
  10725. + * Schedule must not be called from the tasklet itself (a lockup occurs)
  10726. * Tasklet is strictly serialized wrt itself, but not
  10727. wrt another tasklets. If client needs some intertask synchronization,
  10728. he makes it with spinlocks.
  10729. @@ -481,27 +494,36 @@
  10730. enum
  10731. {
  10732. TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
  10733. - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
  10734. + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
  10735. + TASKLET_STATE_PENDING /* Tasklet is pending */
  10736. };
  10737. -#ifdef CONFIG_SMP
  10738. +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
  10739. +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
  10740. +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
  10741. +
  10742. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  10743. static inline int tasklet_trylock(struct tasklet_struct *t)
  10744. {
  10745. return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
  10746. }
  10747. +static inline int tasklet_tryunlock(struct tasklet_struct *t)
  10748. +{
  10749. + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
  10750. +}
  10751. +
  10752. static inline void tasklet_unlock(struct tasklet_struct *t)
  10753. {
  10754. smp_mb__before_atomic();
  10755. clear_bit(TASKLET_STATE_RUN, &(t)->state);
  10756. }
  10757. -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
  10758. -{
  10759. - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
  10760. -}
  10761. +extern void tasklet_unlock_wait(struct tasklet_struct *t);
  10762. +
  10763. #else
  10764. #define tasklet_trylock(t) 1
  10765. +#define tasklet_tryunlock(t) 1
  10766. #define tasklet_unlock_wait(t) do { } while (0)
  10767. #define tasklet_unlock(t) do { } while (0)
  10768. #endif
  10769. @@ -550,17 +572,8 @@
  10770. smp_mb();
  10771. }
  10772. -static inline void tasklet_enable(struct tasklet_struct *t)
  10773. -{
  10774. - smp_mb__before_atomic();
  10775. - atomic_dec(&t->count);
  10776. -}
  10777. -
  10778. -static inline void tasklet_hi_enable(struct tasklet_struct *t)
  10779. -{
  10780. - smp_mb__before_atomic();
  10781. - atomic_dec(&t->count);
  10782. -}
  10783. +extern void tasklet_enable(struct tasklet_struct *t);
  10784. +extern void tasklet_hi_enable(struct tasklet_struct *t);
  10785. extern void tasklet_kill(struct tasklet_struct *t);
  10786. extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
  10787. @@ -592,6 +605,12 @@
  10788. tasklet_kill(&ttimer->tasklet);
  10789. }
  10790. +#ifdef CONFIG_PREEMPT_RT_FULL
  10791. +extern void softirq_early_init(void);
  10792. +#else
  10793. +static inline void softirq_early_init(void) { }
  10794. +#endif
  10795. +
  10796. /*
  10797. * Autoprobing for irqs:
  10798. *
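/*
 * Illustrative sketch (not part of the patch): requesting an interrupt with
 * the IRQF_NO_SOFTIRQ_CALL flag added above so that, per the flag's
 * description, softirqs are not processed in this irq's thread context on
 * RT. The handler and the "my_dev" name are hypothetical.
 */
#include <linux/interrupt.h>
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* ... acknowledge the device ... */
	return IRQ_HANDLED;
}
static int my_setup_irq(unsigned int irq, void *dev)
{
	return request_irq(irq, my_irq_handler,
			   IRQF_SHARED | IRQF_NO_SOFTIRQ_CALL,
			   "my_dev", dev);
}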
  10799. diff -Nur linux-3.18.14.orig/include/linux/irqdesc.h linux-3.18.14-rt/include/linux/irqdesc.h
  10800. --- linux-3.18.14.orig/include/linux/irqdesc.h 2015-05-20 10:04:50.000000000 -0500
  10801. +++ linux-3.18.14-rt/include/linux/irqdesc.h 2015-05-31 15:32:48.217635369 -0500
  10802. @@ -63,6 +63,7 @@
  10803. unsigned int irqs_unhandled;
  10804. atomic_t threads_handled;
  10805. int threads_handled_last;
  10806. + u64 random_ip;
  10807. raw_spinlock_t lock;
  10808. struct cpumask *percpu_enabled;
  10809. #ifdef CONFIG_SMP
  10810. diff -Nur linux-3.18.14.orig/include/linux/irqflags.h linux-3.18.14-rt/include/linux/irqflags.h
  10811. --- linux-3.18.14.orig/include/linux/irqflags.h 2015-05-20 10:04:50.000000000 -0500
  10812. +++ linux-3.18.14-rt/include/linux/irqflags.h 2015-05-31 15:32:48.233635369 -0500
  10813. @@ -25,8 +25,6 @@
  10814. # define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
  10815. # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
  10816. # define trace_hardirq_exit() do { current->hardirq_context--; } while (0)
  10817. -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
  10818. -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
  10819. # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
  10820. #else
  10821. # define trace_hardirqs_on() do { } while (0)
  10822. @@ -39,9 +37,15 @@
  10823. # define trace_softirqs_enabled(p) 0
  10824. # define trace_hardirq_enter() do { } while (0)
  10825. # define trace_hardirq_exit() do { } while (0)
  10826. +# define INIT_TRACE_IRQFLAGS
  10827. +#endif
  10828. +
  10829. +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
  10830. +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
  10831. +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
  10832. +#else
  10833. # define lockdep_softirq_enter() do { } while (0)
  10834. # define lockdep_softirq_exit() do { } while (0)
  10835. -# define INIT_TRACE_IRQFLAGS
  10836. #endif
  10837. #if defined(CONFIG_IRQSOFF_TRACER) || \
  10838. @@ -147,4 +151,23 @@
  10839. #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */
  10840. +/*
  10841. + * local_irq* variants depending on RT/!RT
  10842. + */
  10843. +#ifdef CONFIG_PREEMPT_RT_FULL
  10844. +# define local_irq_disable_nort() do { } while (0)
  10845. +# define local_irq_enable_nort() do { } while (0)
  10846. +# define local_irq_save_nort(flags) local_save_flags(flags)
  10847. +# define local_irq_restore_nort(flags) (void)(flags)
  10848. +# define local_irq_disable_rt() local_irq_disable()
  10849. +# define local_irq_enable_rt() local_irq_enable()
  10850. +#else
  10851. +# define local_irq_disable_nort() local_irq_disable()
  10852. +# define local_irq_enable_nort() local_irq_enable()
  10853. +# define local_irq_save_nort(flags) local_irq_save(flags)
  10854. +# define local_irq_restore_nort(flags) local_irq_restore(flags)
  10855. +# define local_irq_disable_rt() do { } while (0)
  10856. +# define local_irq_enable_rt() do { } while (0)
  10857. +#endif
  10858. +
  10859. #endif
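/*
 * Illustrative sketch (not part of the patch): using the _nort variants
 * defined above where disabling interrupts is only required on !RT,
 * mirroring the fs/ntfs/aops.c conversion earlier in this patch.
 * my_fixup_page() is a hypothetical function name.
 */
#include <linux/irqflags.h>
#include <linux/highmem.h>
static void my_fixup_page(struct page *page)
{
	unsigned long flags;
	void *kaddr;
	local_irq_save_nort(flags);	/* real irq-off only on !RT */
	kaddr = kmap_atomic(page);
	/* ... touch the mapping ... */
	kunmap_atomic(kaddr);
	local_irq_restore_nort(flags);
}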
  10860. diff -Nur linux-3.18.14.orig/include/linux/irq.h linux-3.18.14-rt/include/linux/irq.h
  10861. --- linux-3.18.14.orig/include/linux/irq.h 2015-05-20 10:04:50.000000000 -0500
  10862. +++ linux-3.18.14-rt/include/linux/irq.h 2015-05-31 15:32:48.185635369 -0500
  10863. @@ -73,6 +73,7 @@
  10864. * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
  10865. * it from the spurious interrupt detection
  10866. * mechanism and from core side polling.
  10867. + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
  10868. */
  10869. enum {
  10870. IRQ_TYPE_NONE = 0x00000000,
  10871. @@ -98,13 +99,14 @@
  10872. IRQ_NOTHREAD = (1 << 16),
  10873. IRQ_PER_CPU_DEVID = (1 << 17),
  10874. IRQ_IS_POLLED = (1 << 18),
  10875. + IRQ_NO_SOFTIRQ_CALL = (1 << 19),
  10876. };
  10877. #define IRQF_MODIFY_MASK \
  10878. (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
  10879. IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
  10880. IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
  10881. - IRQ_IS_POLLED)
  10882. + IRQ_IS_POLLED | IRQ_NO_SOFTIRQ_CALL)
  10883. #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
  10884. diff -Nur linux-3.18.14.orig/include/linux/irq_work.h linux-3.18.14-rt/include/linux/irq_work.h
  10885. --- linux-3.18.14.orig/include/linux/irq_work.h 2015-05-20 10:04:50.000000000 -0500
  10886. +++ linux-3.18.14-rt/include/linux/irq_work.h 2015-05-31 15:32:48.217635369 -0500
  10887. @@ -16,6 +16,7 @@
  10888. #define IRQ_WORK_BUSY 2UL
  10889. #define IRQ_WORK_FLAGS 3UL
  10890. #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
  10891. +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
  10892. struct irq_work {
  10893. unsigned long flags;
  10894. diff -Nur linux-3.18.14.orig/include/linux/jbd_common.h linux-3.18.14-rt/include/linux/jbd_common.h
  10895. --- linux-3.18.14.orig/include/linux/jbd_common.h 2015-05-20 10:04:50.000000000 -0500
  10896. +++ linux-3.18.14-rt/include/linux/jbd_common.h 2015-05-31 15:32:48.237635369 -0500
  10897. @@ -15,32 +15,56 @@
  10898. static inline void jbd_lock_bh_state(struct buffer_head *bh)
  10899. {
  10900. +#ifndef CONFIG_PREEMPT_RT_BASE
  10901. bit_spin_lock(BH_State, &bh->b_state);
  10902. +#else
  10903. + spin_lock(&bh->b_state_lock);
  10904. +#endif
  10905. }
  10906. static inline int jbd_trylock_bh_state(struct buffer_head *bh)
  10907. {
  10908. +#ifndef CONFIG_PREEMPT_RT_BASE
  10909. return bit_spin_trylock(BH_State, &bh->b_state);
  10910. +#else
  10911. + return spin_trylock(&bh->b_state_lock);
  10912. +#endif
  10913. }
  10914. static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
  10915. {
  10916. +#ifndef CONFIG_PREEMPT_RT_BASE
  10917. return bit_spin_is_locked(BH_State, &bh->b_state);
  10918. +#else
  10919. + return spin_is_locked(&bh->b_state_lock);
  10920. +#endif
  10921. }
  10922. static inline void jbd_unlock_bh_state(struct buffer_head *bh)
  10923. {
  10924. +#ifndef CONFIG_PREEMPT_RT_BASE
  10925. bit_spin_unlock(BH_State, &bh->b_state);
  10926. +#else
  10927. + spin_unlock(&bh->b_state_lock);
  10928. +#endif
  10929. }
  10930. static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
  10931. {
  10932. +#ifndef CONFIG_PREEMPT_RT_BASE
  10933. bit_spin_lock(BH_JournalHead, &bh->b_state);
  10934. +#else
  10935. + spin_lock(&bh->b_journal_head_lock);
  10936. +#endif
  10937. }
  10938. static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
  10939. {
  10940. +#ifndef CONFIG_PREEMPT_RT_BASE
  10941. bit_spin_unlock(BH_JournalHead, &bh->b_state);
  10942. +#else
  10943. + spin_unlock(&bh->b_journal_head_lock);
  10944. +#endif
  10945. }
  10946. #endif
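/*
 * Illustrative sketch (not part of the patch): callers of the jbd lock
 * helpers above are unchanged; the bit-spinlock vs. spinlock selection
 * happens inside the inlines. my_update_bh_state() is a hypothetical
 * function name.
 */
#include <linux/jbd_common.h>
static void my_update_bh_state(struct buffer_head *bh)
{
	jbd_lock_bh_state(bh);
	/* ... examine or modify journal-related buffer state ... */
	jbd_unlock_bh_state(bh);
}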
  10947. diff -Nur linux-3.18.14.orig/include/linux/jump_label.h linux-3.18.14-rt/include/linux/jump_label.h
  10948. --- linux-3.18.14.orig/include/linux/jump_label.h 2015-05-20 10:04:50.000000000 -0500
  10949. +++ linux-3.18.14-rt/include/linux/jump_label.h 2015-05-31 15:32:48.237635369 -0500
  10950. @@ -55,7 +55,8 @@
  10951. "%s used before call to jump_label_init", \
  10952. __func__)
  10953. -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
  10954. +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) && \
  10955. + !defined(CONFIG_PREEMPT_BASE)
  10956. struct static_key {
  10957. atomic_t enabled;
  10958. diff -Nur linux-3.18.14.orig/include/linux/kdb.h linux-3.18.14-rt/include/linux/kdb.h
  10959. --- linux-3.18.14.orig/include/linux/kdb.h 2015-05-20 10:04:50.000000000 -0500
  10960. +++ linux-3.18.14-rt/include/linux/kdb.h 2015-05-31 15:32:48.245635369 -0500
  10961. @@ -116,7 +116,7 @@
  10962. extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args);
  10963. extern __printf(1, 2) int kdb_printf(const char *, ...);
  10964. typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
  10965. -
  10966. +#define in_kdb_printk() (kdb_trap_printk)
  10967. extern void kdb_init(int level);
  10968. /* Access to kdb specific polling devices */
  10969. @@ -151,6 +151,7 @@
  10970. extern int kdb_unregister(char *);
  10971. #else /* ! CONFIG_KGDB_KDB */
  10972. static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
  10973. +#define in_kdb_printk() (0)
  10974. static inline void kdb_init(int level) {}
  10975. static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
  10976. char *help, short minlen) { return 0; }
  10977. diff -Nur linux-3.18.14.orig/include/linux/kernel.h linux-3.18.14-rt/include/linux/kernel.h
  10978. --- linux-3.18.14.orig/include/linux/kernel.h 2015-05-20 10:04:50.000000000 -0500
  10979. +++ linux-3.18.14-rt/include/linux/kernel.h 2015-05-31 15:32:48.245635369 -0500
  10980. @@ -451,6 +451,7 @@
  10981. SYSTEM_HALT,
  10982. SYSTEM_POWER_OFF,
  10983. SYSTEM_RESTART,
  10984. + SYSTEM_SUSPEND,
  10985. } system_state;
  10986. #define TAINT_PROPRIETARY_MODULE 0
  10987. diff -Nur linux-3.18.14.orig/include/linux/kvm_host.h linux-3.18.14-rt/include/linux/kvm_host.h
  10988. --- linux-3.18.14.orig/include/linux/kvm_host.h 2015-05-20 10:04:50.000000000 -0500
  10989. +++ linux-3.18.14-rt/include/linux/kvm_host.h 2015-05-31 15:32:48.253635368 -0500
  10990. @@ -245,7 +245,7 @@
  10991. int fpu_active;
  10992. int guest_fpu_loaded, guest_xcr0_loaded;
  10993. - wait_queue_head_t wq;
  10994. + struct swait_head wq;
  10995. struct pid *pid;
  10996. int sigset_active;
  10997. sigset_t sigset;
  10998. @@ -688,7 +688,7 @@
  10999. }
  11000. #endif
  11001. -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
  11002. +static inline struct swait_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
  11003. {
  11004. #ifdef __KVM_HAVE_ARCH_WQP
  11005. return vcpu->arch.wqp;
  11006. diff -Nur linux-3.18.14.orig/include/linux/kvm_host.h.orig linux-3.18.14-rt/include/linux/kvm_host.h.orig
  11007. --- linux-3.18.14.orig/include/linux/kvm_host.h.orig 1969-12-31 18:00:00.000000000 -0600
  11008. +++ linux-3.18.14-rt/include/linux/kvm_host.h.orig 2015-05-20 10:04:50.000000000 -0500
  11009. @@ -0,0 +1,1111 @@
  11010. +#ifndef __KVM_HOST_H
  11011. +#define __KVM_HOST_H
  11012. +
  11013. +/*
  11014. + * This work is licensed under the terms of the GNU GPL, version 2. See
  11015. + * the COPYING file in the top-level directory.
  11016. + */
  11017. +
  11018. +#include <linux/types.h>
  11019. +#include <linux/hardirq.h>
  11020. +#include <linux/list.h>
  11021. +#include <linux/mutex.h>
  11022. +#include <linux/spinlock.h>
  11023. +#include <linux/signal.h>
  11024. +#include <linux/sched.h>
  11025. +#include <linux/bug.h>
  11026. +#include <linux/mm.h>
  11027. +#include <linux/mmu_notifier.h>
  11028. +#include <linux/preempt.h>
  11029. +#include <linux/msi.h>
  11030. +#include <linux/slab.h>
  11031. +#include <linux/rcupdate.h>
  11032. +#include <linux/ratelimit.h>
  11033. +#include <linux/err.h>
  11034. +#include <linux/irqflags.h>
  11035. +#include <linux/context_tracking.h>
  11036. +#include <asm/signal.h>
  11037. +
  11038. +#include <linux/kvm.h>
  11039. +#include <linux/kvm_para.h>
  11040. +
  11041. +#include <linux/kvm_types.h>
  11042. +
  11043. +#include <asm/kvm_host.h>
  11044. +
  11045. +#ifndef KVM_MMIO_SIZE
  11046. +#define KVM_MMIO_SIZE 8
  11047. +#endif
  11048. +
  11049. +/*
  11050. + * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
  11051. + * in kvm, other bits are visible for userspace which are defined in
  11052. + * include/linux/kvm_h.
  11053. + */
  11054. +#define KVM_MEMSLOT_INVALID (1UL << 16)
  11055. +#define KVM_MEMSLOT_INCOHERENT (1UL << 17)
  11056. +
  11057. +/* Two fragments for cross MMIO pages. */
  11058. +#define KVM_MAX_MMIO_FRAGMENTS 2
  11059. +
  11060. +/*
  11061. + * For the normal pfn, the highest 12 bits should be zero,
  11062. + * so we can mask bit 62 ~ bit 52 to indicate the error pfn,
  11063. + * mask bit 63 to indicate the noslot pfn.
  11064. + */
  11065. +#define KVM_PFN_ERR_MASK (0x7ffULL << 52)
  11066. +#define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52)
  11067. +#define KVM_PFN_NOSLOT (0x1ULL << 63)
  11068. +
  11069. +#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK)
  11070. +#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1)
  11071. +#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2)
  11072. +
  11073. +/*
  11074. + * error pfns indicate that the gfn is in slot but faild to
  11075. + * translate it to pfn on host.
  11076. + */
  11077. +static inline bool is_error_pfn(pfn_t pfn)
  11078. +{
  11079. + return !!(pfn & KVM_PFN_ERR_MASK);
  11080. +}
  11081. +
  11082. +/*
  11083. + * error_noslot pfns indicate that the gfn can not be
  11084. + * translated to pfn - it is not in slot or failed to
  11085. + * translate it to pfn.
  11086. + */
  11087. +static inline bool is_error_noslot_pfn(pfn_t pfn)
  11088. +{
  11089. + return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK);
  11090. +}
  11091. +
  11092. +/* noslot pfn indicates that the gfn is not in slot. */
  11093. +static inline bool is_noslot_pfn(pfn_t pfn)
  11094. +{
  11095. + return pfn == KVM_PFN_NOSLOT;
  11096. +}
  11097. +
  11098. +/*
  11099. + * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390)
  11100. + * provide own defines and kvm_is_error_hva
  11101. + */
  11102. +#ifndef KVM_HVA_ERR_BAD
  11103. +
  11104. +#define KVM_HVA_ERR_BAD (PAGE_OFFSET)
  11105. +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE)
  11106. +
  11107. +static inline bool kvm_is_error_hva(unsigned long addr)
  11108. +{
  11109. + return addr >= PAGE_OFFSET;
  11110. +}
  11111. +
  11112. +#endif
  11113. +
  11114. +#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT))
  11115. +
  11116. +static inline bool is_error_page(struct page *page)
  11117. +{
  11118. + return IS_ERR(page);
  11119. +}
  11120. +
  11121. +/*
  11122. + * vcpu->requests bit members
  11123. + */
  11124. +#define KVM_REQ_TLB_FLUSH 0
  11125. +#define KVM_REQ_MIGRATE_TIMER 1
  11126. +#define KVM_REQ_REPORT_TPR_ACCESS 2
  11127. +#define KVM_REQ_MMU_RELOAD 3
  11128. +#define KVM_REQ_TRIPLE_FAULT 4
  11129. +#define KVM_REQ_PENDING_TIMER 5
  11130. +#define KVM_REQ_UNHALT 6
  11131. +#define KVM_REQ_MMU_SYNC 7
  11132. +#define KVM_REQ_CLOCK_UPDATE 8
  11133. +#define KVM_REQ_KICK 9
  11134. +#define KVM_REQ_DEACTIVATE_FPU 10
  11135. +#define KVM_REQ_EVENT 11
  11136. +#define KVM_REQ_APF_HALT 12
  11137. +#define KVM_REQ_STEAL_UPDATE 13
  11138. +#define KVM_REQ_NMI 14
  11139. +#define KVM_REQ_PMU 15
  11140. +#define KVM_REQ_PMI 16
  11141. +#define KVM_REQ_WATCHDOG 17
  11142. +#define KVM_REQ_MASTERCLOCK_UPDATE 18
  11143. +#define KVM_REQ_MCLOCK_INPROGRESS 19
  11144. +#define KVM_REQ_EPR_EXIT 20
  11145. +#define KVM_REQ_SCAN_IOAPIC 21
  11146. +#define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
  11147. +#define KVM_REQ_ENABLE_IBS 23
  11148. +#define KVM_REQ_DISABLE_IBS 24
  11149. +#define KVM_REQ_APIC_PAGE_RELOAD 25
  11150. +
  11151. +#define KVM_USERSPACE_IRQ_SOURCE_ID 0
  11152. +#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1
  11153. +
  11154. +extern struct kmem_cache *kvm_vcpu_cache;
  11155. +
  11156. +extern spinlock_t kvm_lock;
  11157. +extern struct list_head vm_list;
  11158. +
  11159. +struct kvm_io_range {
  11160. + gpa_t addr;
  11161. + int len;
  11162. + struct kvm_io_device *dev;
  11163. +};
  11164. +
  11165. +#define NR_IOBUS_DEVS 1000
  11166. +
  11167. +struct kvm_io_bus {
  11168. + int dev_count;
  11169. + int ioeventfd_count;
  11170. + struct kvm_io_range range[];
  11171. +};
  11172. +
  11173. +enum kvm_bus {
  11174. + KVM_MMIO_BUS,
  11175. + KVM_PIO_BUS,
  11176. + KVM_VIRTIO_CCW_NOTIFY_BUS,
  11177. + KVM_FAST_MMIO_BUS,
  11178. + KVM_NR_BUSES
  11179. +};
  11180. +
  11181. +int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
  11182. + int len, const void *val);
  11183. +int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
  11184. + int len, const void *val, long cookie);
  11185. +int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
  11186. + void *val);
  11187. +int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
  11188. + int len, struct kvm_io_device *dev);
  11189. +int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
  11190. + struct kvm_io_device *dev);
  11191. +
  11192. +#ifdef CONFIG_KVM_ASYNC_PF
  11193. +struct kvm_async_pf {
  11194. + struct work_struct work;
  11195. + struct list_head link;
  11196. + struct list_head queue;
  11197. + struct kvm_vcpu *vcpu;
  11198. + struct mm_struct *mm;
  11199. + gva_t gva;
  11200. + unsigned long addr;
  11201. + struct kvm_arch_async_pf arch;
  11202. + bool wakeup_all;
  11203. +};
  11204. +
  11205. +void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
  11206. +void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
  11207. +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
  11208. + struct kvm_arch_async_pf *arch);
  11209. +int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
  11210. +#endif
  11211. +
  11212. +/*
  11213. + * Carry out a gup that requires IO. Allow the mm to relinquish the mmap
  11214. + * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL
  11215. + * controls whether we retry the gup one more time to completion in that case.
  11216. + * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp
  11217. + * handler.
  11218. + */
  11219. +int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm,
  11220. + unsigned long addr, bool write_fault,
  11221. + struct page **pagep);
  11222. +
  11223. +enum {
  11224. + OUTSIDE_GUEST_MODE,
  11225. + IN_GUEST_MODE,
  11226. + EXITING_GUEST_MODE,
  11227. + READING_SHADOW_PAGE_TABLES,
  11228. +};
  11229. +
  11230. +/*
  11231. + * Sometimes a large or cross-page mmio needs to be broken up into separate
  11232. + * exits for userspace servicing.
  11233. + */
  11234. +struct kvm_mmio_fragment {
  11235. + gpa_t gpa;
  11236. + void *data;
  11237. + unsigned len;
  11238. +};
  11239. +
  11240. +struct kvm_vcpu {
  11241. + struct kvm *kvm;
  11242. +#ifdef CONFIG_PREEMPT_NOTIFIERS
  11243. + struct preempt_notifier preempt_notifier;
  11244. +#endif
  11245. + int cpu;
  11246. + int vcpu_id;
  11247. + int srcu_idx;
  11248. + int mode;
  11249. + unsigned long requests;
  11250. + unsigned long guest_debug;
  11251. +
  11252. + struct mutex mutex;
  11253. + struct kvm_run *run;
  11254. +
  11255. + int fpu_active;
  11256. + int guest_fpu_loaded, guest_xcr0_loaded;
  11257. + wait_queue_head_t wq;
  11258. + struct pid *pid;
  11259. + int sigset_active;
  11260. + sigset_t sigset;
  11261. + struct kvm_vcpu_stat stat;
  11262. +
  11263. +#ifdef CONFIG_HAS_IOMEM
  11264. + int mmio_needed;
  11265. + int mmio_read_completed;
  11266. + int mmio_is_write;
  11267. + int mmio_cur_fragment;
  11268. + int mmio_nr_fragments;
  11269. + struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
  11270. +#endif
  11271. +
  11272. +#ifdef CONFIG_KVM_ASYNC_PF
  11273. + struct {
  11274. + u32 queued;
  11275. + struct list_head queue;
  11276. + struct list_head done;
  11277. + spinlock_t lock;
  11278. + } async_pf;
  11279. +#endif
  11280. +
  11281. +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
  11282. + /*
  11283. + * Cpu relax intercept or pause loop exit optimization
  11284. + * in_spin_loop: set when a vcpu does a pause loop exit
  11285. + * or cpu relax intercepted.
  11286. + * dy_eligible: indicates whether vcpu is eligible for directed yield.
  11287. + */
  11288. + struct {
  11289. + bool in_spin_loop;
  11290. + bool dy_eligible;
  11291. + } spin_loop;
  11292. +#endif
  11293. + bool preempted;
  11294. + struct kvm_vcpu_arch arch;
  11295. +};
  11296. +
  11297. +static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
  11298. +{
  11299. + return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
  11300. +}
  11301. +
  11302. +/*
  11303. + * Some of the bitops functions do not support too long bitmaps.
  11304. + * This number must be determined not to exceed such limits.
  11305. + */
  11306. +#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)
  11307. +
  11308. +struct kvm_memory_slot {
  11309. + gfn_t base_gfn;
  11310. + unsigned long npages;
  11311. + unsigned long *dirty_bitmap;
  11312. + struct kvm_arch_memory_slot arch;
  11313. + unsigned long userspace_addr;
  11314. + u32 flags;
  11315. + short id;
  11316. +};
  11317. +
  11318. +static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
  11319. +{
  11320. + return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
  11321. +}
  11322. +
  11323. +struct kvm_s390_adapter_int {
  11324. + u64 ind_addr;
  11325. + u64 summary_addr;
  11326. + u64 ind_offset;
  11327. + u32 summary_offset;
  11328. + u32 adapter_id;
  11329. +};
  11330. +
  11331. +struct kvm_kernel_irq_routing_entry {
  11332. + u32 gsi;
  11333. + u32 type;
  11334. + int (*set)(struct kvm_kernel_irq_routing_entry *e,
  11335. + struct kvm *kvm, int irq_source_id, int level,
  11336. + bool line_status);
  11337. + union {
  11338. + struct {
  11339. + unsigned irqchip;
  11340. + unsigned pin;
  11341. + } irqchip;
  11342. + struct msi_msg msi;
  11343. + struct kvm_s390_adapter_int adapter;
  11344. + };
  11345. + struct hlist_node link;
  11346. +};
  11347. +
  11348. +#ifndef KVM_PRIVATE_MEM_SLOTS
  11349. +#define KVM_PRIVATE_MEM_SLOTS 0
  11350. +#endif
  11351. +
  11352. +#ifndef KVM_MEM_SLOTS_NUM
  11353. +#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
  11354. +#endif
  11355. +
  11356. +/*
  11357. + * Note:
  11358. + * memslots are not sorted by id anymore, please use id_to_memslot()
  11359. + * to get the memslot by its id.
  11360. + */
  11361. +struct kvm_memslots {
  11362. + u64 generation;
  11363. + struct kvm_memory_slot memslots[KVM_MEM_SLOTS_NUM];
  11364. + /* The mapping table from slot id to the index in memslots[]. */
  11365. + short id_to_index[KVM_MEM_SLOTS_NUM];
  11366. +};
  11367. +
  11368. +struct kvm {
  11369. + spinlock_t mmu_lock;
  11370. + struct mutex slots_lock;
  11371. + struct mm_struct *mm; /* userspace tied to this vm */
  11372. + struct kvm_memslots *memslots;
  11373. + struct srcu_struct srcu;
  11374. + struct srcu_struct irq_srcu;
  11375. +#ifdef CONFIG_KVM_APIC_ARCHITECTURE
  11376. + u32 bsp_vcpu_id;
  11377. +#endif
  11378. + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
  11379. + atomic_t online_vcpus;
  11380. + int last_boosted_vcpu;
  11381. + struct list_head vm_list;
  11382. + struct mutex lock;
  11383. + struct kvm_io_bus *buses[KVM_NR_BUSES];
  11384. +#ifdef CONFIG_HAVE_KVM_EVENTFD
  11385. + struct {
  11386. + spinlock_t lock;
  11387. + struct list_head items;
  11388. + struct list_head resampler_list;
  11389. + struct mutex resampler_lock;
  11390. + } irqfds;
  11391. + struct list_head ioeventfds;
  11392. +#endif
  11393. + struct kvm_vm_stat stat;
  11394. + struct kvm_arch arch;
  11395. + atomic_t users_count;
  11396. +#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
  11397. + struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
  11398. + spinlock_t ring_lock;
  11399. + struct list_head coalesced_zones;
  11400. +#endif
  11401. +
  11402. + struct mutex irq_lock;
  11403. +#ifdef CONFIG_HAVE_KVM_IRQCHIP
  11404. + /*
  11405. + * Update side is protected by irq_lock.
  11406. + */
  11407. + struct kvm_irq_routing_table __rcu *irq_routing;
  11408. + struct hlist_head mask_notifier_list;
  11409. +#endif
  11410. +#ifdef CONFIG_HAVE_KVM_IRQFD
  11411. + struct hlist_head irq_ack_notifier_list;
  11412. +#endif
  11413. +
  11414. +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
  11415. + struct mmu_notifier mmu_notifier;
  11416. + unsigned long mmu_notifier_seq;
  11417. + long mmu_notifier_count;
  11418. +#endif
  11419. + long tlbs_dirty;
  11420. + struct list_head devices;
  11421. +};
  11422. +
  11423. +#define kvm_err(fmt, ...) \
  11424. + pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
  11425. +#define kvm_info(fmt, ...) \
  11426. + pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
  11427. +#define kvm_debug(fmt, ...) \
  11428. + pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
  11429. +#define kvm_pr_unimpl(fmt, ...) \
  11430. + pr_err_ratelimited("kvm [%i]: " fmt, \
  11431. + task_tgid_nr(current), ## __VA_ARGS__)
  11432. +
  11433. +/* The guest did something we don't support. */
  11434. +#define vcpu_unimpl(vcpu, fmt, ...) \
  11435. + kvm_pr_unimpl("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
  11436. +
  11437. +static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
  11438. +{
  11439. + smp_rmb();
  11440. + return kvm->vcpus[i];
  11441. +}
  11442. +
  11443. +#define kvm_for_each_vcpu(idx, vcpup, kvm) \
  11444. + for (idx = 0; \
  11445. + idx < atomic_read(&kvm->online_vcpus) && \
  11446. + (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \
  11447. + idx++)
  11448. +
  11449. +#define kvm_for_each_memslot(memslot, slots) \
  11450. + for (memslot = &slots->memslots[0]; \
  11451. + memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\
  11452. + memslot++)
  11453. +
  11454. +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
  11455. +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
  11456. +
  11457. +int __must_check vcpu_load(struct kvm_vcpu *vcpu);
  11458. +void vcpu_put(struct kvm_vcpu *vcpu);
  11459. +
  11460. +#ifdef CONFIG_HAVE_KVM_IRQFD
  11461. +int kvm_irqfd_init(void);
  11462. +void kvm_irqfd_exit(void);
  11463. +#else
  11464. +static inline int kvm_irqfd_init(void)
  11465. +{
  11466. + return 0;
  11467. +}
  11468. +
  11469. +static inline void kvm_irqfd_exit(void)
  11470. +{
  11471. +}
  11472. +#endif
  11473. +int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
  11474. + struct module *module);
  11475. +void kvm_exit(void);
  11476. +
  11477. +void kvm_get_kvm(struct kvm *kvm);
  11478. +void kvm_put_kvm(struct kvm *kvm);
  11479. +
  11480. +static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
  11481. +{
  11482. + return rcu_dereference_check(kvm->memslots,
  11483. + srcu_read_lock_held(&kvm->srcu)
  11484. + || lockdep_is_held(&kvm->slots_lock));
  11485. +}
  11486. +
  11487. +static inline struct kvm_memory_slot *
  11488. +id_to_memslot(struct kvm_memslots *slots, int id)
  11489. +{
  11490. + int index = slots->id_to_index[id];
  11491. + struct kvm_memory_slot *slot;
  11492. +
  11493. + slot = &slots->memslots[index];
  11494. +
  11495. + WARN_ON(slot->id != id);
  11496. + return slot;
  11497. +}
  11498. +
  11499. +/*
  11500. + * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
  11501. + * - create a new memory slot
  11502. + * - delete an existing memory slot
  11503. + * - modify an existing memory slot
  11504. + * -- move it in the guest physical memory space
  11505. + * -- just change its flags
  11506. + *
  11507. + * Since flags can be changed by some of these operations, the following
  11508. + * differentiation is the best we can do for __kvm_set_memory_region():
  11509. + */
  11510. +enum kvm_mr_change {
  11511. + KVM_MR_CREATE,
  11512. + KVM_MR_DELETE,
  11513. + KVM_MR_MOVE,
  11514. + KVM_MR_FLAGS_ONLY,
  11515. +};
  11516. +
  11517. +int kvm_set_memory_region(struct kvm *kvm,
  11518. + struct kvm_userspace_memory_region *mem);
  11519. +int __kvm_set_memory_region(struct kvm *kvm,
  11520. + struct kvm_userspace_memory_region *mem);
  11521. +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
  11522. + struct kvm_memory_slot *dont);
  11523. +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
  11524. + unsigned long npages);
  11525. +void kvm_arch_memslots_updated(struct kvm *kvm);
  11526. +int kvm_arch_prepare_memory_region(struct kvm *kvm,
  11527. + struct kvm_memory_slot *memslot,
  11528. + struct kvm_userspace_memory_region *mem,
  11529. + enum kvm_mr_change change);
  11530. +void kvm_arch_commit_memory_region(struct kvm *kvm,
  11531. + struct kvm_userspace_memory_region *mem,
  11532. + const struct kvm_memory_slot *old,
  11533. + enum kvm_mr_change change);
  11534. +bool kvm_largepages_enabled(void);
  11535. +void kvm_disable_largepages(void);
  11536. +/* flush all memory translations */
  11537. +void kvm_arch_flush_shadow_all(struct kvm *kvm);
  11538. +/* flush memory translations pointing to 'slot' */
  11539. +void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
  11540. + struct kvm_memory_slot *slot);
  11541. +
  11542. +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
  11543. + int nr_pages);
  11544. +
  11545. +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
  11546. +unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
  11547. +unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable);
  11548. +unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
  11549. +unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
  11550. + bool *writable);
  11551. +void kvm_release_page_clean(struct page *page);
  11552. +void kvm_release_page_dirty(struct page *page);
  11553. +void kvm_set_page_accessed(struct page *page);
  11554. +
  11555. +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
  11556. +pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
  11557. + bool write_fault, bool *writable);
  11558. +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
  11559. +pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
  11560. + bool *writable);
  11561. +pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
  11562. +pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
  11563. +
  11564. +void kvm_release_pfn_clean(pfn_t pfn);
  11565. +void kvm_set_pfn_dirty(pfn_t pfn);
  11566. +void kvm_set_pfn_accessed(pfn_t pfn);
  11567. +void kvm_get_pfn(pfn_t pfn);
  11568. +
  11569. +int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
  11570. + int len);
  11571. +int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
  11572. + unsigned long len);
  11573. +int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
  11574. +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  11575. + void *data, unsigned long len);
  11576. +int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
  11577. + int offset, int len);
  11578. +int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
  11579. + unsigned long len);
  11580. +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  11581. + void *data, unsigned long len);
  11582. +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
  11583. + gpa_t gpa, unsigned long len);
  11584. +int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
  11585. +int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
  11586. +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
  11587. +int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
  11588. +unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn);
  11589. +void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
  11590. +
  11591. +void kvm_vcpu_block(struct kvm_vcpu *vcpu);
  11592. +void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
  11593. +int kvm_vcpu_yield_to(struct kvm_vcpu *target);
  11594. +void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
  11595. +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
  11596. +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
  11597. +
  11598. +void kvm_flush_remote_tlbs(struct kvm *kvm);
  11599. +void kvm_reload_remote_mmus(struct kvm *kvm);
  11600. +void kvm_make_mclock_inprogress_request(struct kvm *kvm);
  11601. +void kvm_make_scan_ioapic_request(struct kvm *kvm);
  11602. +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
  11603. +
  11604. +long kvm_arch_dev_ioctl(struct file *filp,
  11605. + unsigned int ioctl, unsigned long arg);
  11606. +long kvm_arch_vcpu_ioctl(struct file *filp,
  11607. + unsigned int ioctl, unsigned long arg);
  11608. +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);
  11609. +
  11610. +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);
  11611. +
  11612. +int kvm_get_dirty_log(struct kvm *kvm,
  11613. + struct kvm_dirty_log *log, int *is_dirty);
  11614. +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
  11615. + struct kvm_dirty_log *log);
  11616. +
  11617. +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
  11618. + bool line_status);
  11619. +long kvm_arch_vm_ioctl(struct file *filp,
  11620. + unsigned int ioctl, unsigned long arg);
  11621. +
  11622. +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
  11623. +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
  11624. +
  11625. +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
  11626. + struct kvm_translation *tr);
  11627. +
  11628. +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
  11629. +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
  11630. +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
  11631. + struct kvm_sregs *sregs);
  11632. +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
  11633. + struct kvm_sregs *sregs);
  11634. +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
  11635. + struct kvm_mp_state *mp_state);
  11636. +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
  11637. + struct kvm_mp_state *mp_state);
  11638. +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
  11639. + struct kvm_guest_debug *dbg);
  11640. +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
  11641. +
  11642. +int kvm_arch_init(void *opaque);
  11643. +void kvm_arch_exit(void);
  11644. +
  11645. +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
  11646. +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
  11647. +
  11648. +void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu);
  11649. +
  11650. +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
  11651. +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
  11652. +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
  11653. +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
  11654. +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
  11655. +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
  11656. +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
  11657. +
  11658. +int kvm_arch_hardware_enable(void);
  11659. +void kvm_arch_hardware_disable(void);
  11660. +int kvm_arch_hardware_setup(void);
  11661. +void kvm_arch_hardware_unsetup(void);
  11662. +void kvm_arch_check_processor_compat(void *rtn);
  11663. +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
  11664. +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
  11665. +
  11666. +void *kvm_kvzalloc(unsigned long size);
  11667. +void kvm_kvfree(const void *addr);
  11668. +
  11669. +#ifndef __KVM_HAVE_ARCH_VM_ALLOC
  11670. +static inline struct kvm *kvm_arch_alloc_vm(void)
  11671. +{
  11672. + return kzalloc(sizeof(struct kvm), GFP_KERNEL);
  11673. +}
  11674. +
  11675. +static inline void kvm_arch_free_vm(struct kvm *kvm)
  11676. +{
  11677. + kfree(kvm);
  11678. +}
  11679. +#endif
  11680. +
  11681. +#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
  11682. +void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
  11683. +void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
  11684. +bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
  11685. +#else
  11686. +static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
  11687. +{
  11688. +}
  11689. +
  11690. +static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
  11691. +{
  11692. +}
  11693. +
  11694. +static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
  11695. +{
  11696. + return false;
  11697. +}
  11698. +#endif
  11699. +
  11700. +static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
  11701. +{
  11702. +#ifdef __KVM_HAVE_ARCH_WQP
  11703. + return vcpu->arch.wqp;
  11704. +#else
  11705. + return &vcpu->wq;
  11706. +#endif
  11707. +}
  11708. +
  11709. +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
  11710. +void kvm_arch_destroy_vm(struct kvm *kvm);
  11711. +void kvm_arch_sync_events(struct kvm *kvm);
  11712. +
  11713. +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
  11714. +void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
  11715. +
  11716. +bool kvm_is_reserved_pfn(pfn_t pfn);
  11717. +
  11718. +struct kvm_irq_ack_notifier {
  11719. + struct hlist_node link;
  11720. + unsigned gsi;
  11721. + void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
  11722. +};
  11723. +
  11724. +struct kvm_assigned_dev_kernel {
  11725. + struct kvm_irq_ack_notifier ack_notifier;
  11726. + struct list_head list;
  11727. + int assigned_dev_id;
  11728. + int host_segnr;
  11729. + int host_busnr;
  11730. + int host_devfn;
  11731. + unsigned int entries_nr;
  11732. + int host_irq;
  11733. + bool host_irq_disabled;
  11734. + bool pci_2_3;
  11735. + struct msix_entry *host_msix_entries;
  11736. + int guest_irq;
  11737. + struct msix_entry *guest_msix_entries;
  11738. + unsigned long irq_requested_type;
  11739. + int irq_source_id;
  11740. + int flags;
  11741. + struct pci_dev *dev;
  11742. + struct kvm *kvm;
  11743. + spinlock_t intx_lock;
  11744. + spinlock_t intx_mask_lock;
  11745. + char irq_name[32];
  11746. + struct pci_saved_state *pci_saved_state;
  11747. +};
  11748. +
  11749. +struct kvm_irq_mask_notifier {
  11750. + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
  11751. + int irq;
  11752. + struct hlist_node link;
  11753. +};
  11754. +
  11755. +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
  11756. + struct kvm_irq_mask_notifier *kimn);
  11757. +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
  11758. + struct kvm_irq_mask_notifier *kimn);
  11759. +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
  11760. + bool mask);
  11761. +
  11762. +int kvm_irq_map_gsi(struct kvm *kvm,
  11763. + struct kvm_kernel_irq_routing_entry *entries, int gsi);
  11764. +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);
  11765. +
  11766. +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
  11767. + bool line_status);
  11768. +int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
  11769. +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
  11770. + int irq_source_id, int level, bool line_status);
  11771. +bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
  11772. +void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
  11773. +void kvm_register_irq_ack_notifier(struct kvm *kvm,
  11774. + struct kvm_irq_ack_notifier *kian);
  11775. +void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
  11776. + struct kvm_irq_ack_notifier *kian);
  11777. +int kvm_request_irq_source_id(struct kvm *kvm);
  11778. +void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
  11779. +
  11780. +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
  11781. +int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
  11782. +void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
  11783. +int kvm_iommu_map_guest(struct kvm *kvm);
  11784. +int kvm_iommu_unmap_guest(struct kvm *kvm);
  11785. +int kvm_assign_device(struct kvm *kvm,
  11786. + struct kvm_assigned_dev_kernel *assigned_dev);
  11787. +int kvm_deassign_device(struct kvm *kvm,
  11788. + struct kvm_assigned_dev_kernel *assigned_dev);
  11789. +#else
  11790. +static inline int kvm_iommu_map_pages(struct kvm *kvm,
  11791. + struct kvm_memory_slot *slot)
  11792. +{
  11793. + return 0;
  11794. +}
  11795. +
  11796. +static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
  11797. + struct kvm_memory_slot *slot)
  11798. +{
  11799. +}
  11800. +
  11801. +static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
  11802. +{
  11803. + return 0;
  11804. +}
  11805. +#endif
  11806. +
  11807. +static inline void kvm_guest_enter(void)
  11808. +{
  11809. + unsigned long flags;
  11810. +
  11811. + BUG_ON(preemptible());
  11812. +
  11813. + local_irq_save(flags);
  11814. + guest_enter();
  11815. + local_irq_restore(flags);
  11816. +
  11817. + /* KVM does not hold any references to RCU-protected data when it
  11818. + * switches the CPU into guest mode. In fact, switching to guest mode
  11819. + * is very similar to exiting to userspace from RCU's point of view. In
  11820. + * addition, the CPU may stay in guest mode for quite a long time (up to
  11821. + * one time slice). Let's treat guest mode as a quiescent state, just as
  11822. + * we do for user-mode execution.
  11823. + */
  11824. + rcu_virt_note_context_switch(smp_processor_id());
  11825. +}
  11826. +
  11827. +static inline void kvm_guest_exit(void)
  11828. +{
  11829. + unsigned long flags;
  11830. +
  11831. + local_irq_save(flags);
  11832. + guest_exit();
  11833. + local_irq_restore(flags);
  11834. +}
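As a rough illustration of where this pair sits, a hypothetical arch-side run path might bracket the hardware world switch as follows; run_hw_guest() is a made-up stand-in for the architecture's real entry routine:

    #include <linux/kvm_host.h>
    #include <linux/preempt.h>

    extern int run_hw_guest(struct kvm_vcpu *vcpu);   /* hypothetical */

    static int demo_enter_guest(struct kvm_vcpu *vcpu)
    {
            int exit_reason;

            preempt_disable();              /* kvm_guest_enter() requires !preemptible() */
            kvm_guest_enter();              /* guest-time accounting + RCU quiescent note */

            exit_reason = run_hw_guest(vcpu);       /* hypothetical hardware entry/exit */

            kvm_guest_exit();
            preempt_enable();

            return exit_reason;
    }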
  11835. +
  11836. +/*
  11837. + * search_memslots() and __gfn_to_memslot() are here because they are
  11838. + * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
  11839. + * gfn_to_memslot() itself isn't here as an inline because that would
  11840. + * bloat other code too much.
  11841. + */
  11842. +static inline struct kvm_memory_slot *
  11843. +search_memslots(struct kvm_memslots *slots, gfn_t gfn)
  11844. +{
  11845. + struct kvm_memory_slot *memslot;
  11846. +
  11847. + kvm_for_each_memslot(memslot, slots)
  11848. + if (gfn >= memslot->base_gfn &&
  11849. + gfn < memslot->base_gfn + memslot->npages)
  11850. + return memslot;
  11851. +
  11852. + return NULL;
  11853. +}
  11854. +
  11855. +static inline struct kvm_memory_slot *
  11856. +__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
  11857. +{
  11858. + return search_memslots(slots, gfn);
  11859. +}
  11860. +
  11861. +static inline unsigned long
  11862. +__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
  11863. +{
  11864. + return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
  11865. +}
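Composed together, these helpers give the common gfn-to-host-virtual-address translation. A sketch only (the real gfn_to_hva() in virt/kvm/kvm_main.c also validates the slot, which is skipped here, and KVM_HVA_ERR_BAD is assumed to be defined earlier in this header):

    static unsigned long demo_gfn_to_hva(struct kvm_memslots *slots, gfn_t gfn)
    {
            struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);

            if (!slot)
                    return KVM_HVA_ERR_BAD;         /* assumed error cookie */

            return __gfn_to_hva_memslot(slot, gfn);
    }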
  11866. +
  11867. +static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
  11868. +{
  11869. + return gfn_to_memslot(kvm, gfn)->id;
  11870. +}
  11871. +
  11872. +static inline gfn_t
  11873. +hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
  11874. +{
  11875. + gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
  11876. +
  11877. + return slot->base_gfn + gfn_offset;
  11878. +}
  11879. +
  11880. +static inline gpa_t gfn_to_gpa(gfn_t gfn)
  11881. +{
  11882. + return (gpa_t)gfn << PAGE_SHIFT;
  11883. +}
  11884. +
  11885. +static inline gfn_t gpa_to_gfn(gpa_t gpa)
  11886. +{
  11887. + return (gfn_t)(gpa >> PAGE_SHIFT);
  11888. +}
  11889. +
  11890. +static inline hpa_t pfn_to_hpa(pfn_t pfn)
  11891. +{
  11892. + return (hpa_t)pfn << PAGE_SHIFT;
  11893. +}
  11894. +
  11895. +static inline bool kvm_is_error_gpa(struct kvm *kvm, gpa_t gpa)
  11896. +{
  11897. + unsigned long hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
  11898. +
  11899. + return kvm_is_error_hva(hva);
  11900. +}
  11901. +
  11902. +static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
  11903. +{
  11904. + set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
  11905. +}
  11906. +
  11907. +enum kvm_stat_kind {
  11908. + KVM_STAT_VM,
  11909. + KVM_STAT_VCPU,
  11910. +};
  11911. +
  11912. +struct kvm_stats_debugfs_item {
  11913. + const char *name;
  11914. + int offset;
  11915. + enum kvm_stat_kind kind;
  11916. + struct dentry *dentry;
  11917. +};
  11918. +extern struct kvm_stats_debugfs_item debugfs_entries[];
  11919. +extern struct dentry *kvm_debugfs_dir;
  11920. +
  11921. +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
  11922. +static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
  11923. +{
  11924. + if (unlikely(kvm->mmu_notifier_count))
  11925. + return 1;
  11926. + /*
  11927. + * Ensure the read of mmu_notifier_count happens before the read
  11928. + * of mmu_notifier_seq. This interacts with the smp_wmb() in
  11929. + * mmu_notifier_invalidate_range_end to make sure that the caller
  11930. + * either sees the old (non-zero) value of mmu_notifier_count or
  11931. + * the new (incremented) value of mmu_notifier_seq.
  11932. + * PowerPC Book3s HV KVM calls this under a per-page lock
  11933. + * rather than under kvm->mmu_lock, for scalability, so it
  11934. + * cannot rely on kvm->mmu_lock to keep things ordered.
  11935. + */
  11936. + smp_rmb();
  11937. + if (kvm->mmu_notifier_seq != mmu_seq)
  11938. + return 1;
  11939. + return 0;
  11940. +}
  11941. +#endif
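The comment above describes the usual fault-path idiom: sample mmu_notifier_seq before the sleepable pfn lookup, then recheck under mmu_lock before installing a mapping. A sketch of that pattern, modeled loosely on the arch MMU code and simplified (error handling is hypothetical):

    #include <linux/kvm_host.h>

    static int demo_map_gfn(struct kvm *kvm, gfn_t gfn)
    {
            unsigned long mmu_seq;
            pfn_t pfn;

            mmu_seq = kvm->mmu_notifier_seq;
            smp_rmb();                      /* pairs with the smp_wmb() in the notifier */

            pfn = gfn_to_pfn(kvm, gfn);     /* may sleep / fault the page in */

            spin_lock(&kvm->mmu_lock);
            if (mmu_notifier_retry(kvm, mmu_seq)) {
                    spin_unlock(&kvm->mmu_lock);
                    kvm_release_pfn_clean(pfn);
                    return -EAGAIN;         /* an invalidation raced; caller retries */
            }
            /* ... install the gfn -> pfn translation here ... */
            spin_unlock(&kvm->mmu_lock);
            kvm_release_pfn_clean(pfn);
            return 0;
    }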
  11942. +
  11943. +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
  11944. +
  11945. +#ifdef CONFIG_S390
  11946. +#define KVM_MAX_IRQ_ROUTES 4096 //FIXME: we can have more than that...
  11947. +#else
  11948. +#define KVM_MAX_IRQ_ROUTES 1024
  11949. +#endif
  11950. +
  11951. +int kvm_setup_default_irq_routing(struct kvm *kvm);
  11952. +int kvm_set_irq_routing(struct kvm *kvm,
  11953. + const struct kvm_irq_routing_entry *entries,
  11954. + unsigned nr,
  11955. + unsigned flags);
  11956. +int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
  11957. + const struct kvm_irq_routing_entry *ue);
  11958. +void kvm_free_irq_routing(struct kvm *kvm);
  11959. +
  11960. +#else
  11961. +
  11962. +static inline void kvm_free_irq_routing(struct kvm *kvm) {}
  11963. +
  11964. +#endif
  11965. +
  11966. +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
  11967. +
  11968. +#ifdef CONFIG_HAVE_KVM_EVENTFD
  11969. +
  11970. +void kvm_eventfd_init(struct kvm *kvm);
  11971. +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
  11972. +
  11973. +#ifdef CONFIG_HAVE_KVM_IRQFD
  11974. +int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
  11975. +void kvm_irqfd_release(struct kvm *kvm);
  11976. +void kvm_irq_routing_update(struct kvm *);
  11977. +#else
  11978. +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
  11979. +{
  11980. + return -EINVAL;
  11981. +}
  11982. +
  11983. +static inline void kvm_irqfd_release(struct kvm *kvm) {}
  11984. +#endif
  11985. +
  11986. +#else
  11987. +
  11988. +static inline void kvm_eventfd_init(struct kvm *kvm) {}
  11989. +
  11990. +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
  11991. +{
  11992. + return -EINVAL;
  11993. +}
  11994. +
  11995. +static inline void kvm_irqfd_release(struct kvm *kvm) {}
  11996. +
  11997. +#ifdef CONFIG_HAVE_KVM_IRQCHIP
  11998. +static inline void kvm_irq_routing_update(struct kvm *kvm)
  11999. +{
  12000. +}
  12001. +#endif
  12002. +
  12003. +static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
  12004. +{
  12005. + return -ENOSYS;
  12006. +}
  12007. +
  12008. +#endif /* CONFIG_HAVE_KVM_EVENTFD */
  12009. +
  12010. +#ifdef CONFIG_KVM_APIC_ARCHITECTURE
  12011. +static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
  12012. +{
  12013. + return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
  12014. +}
  12015. +
  12016. +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
  12017. +
  12018. +#else
  12019. +
  12020. +static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
  12021. +
  12022. +#endif
  12023. +
  12024. +#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
  12025. +
  12026. +long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
  12027. + unsigned long arg);
  12028. +
  12029. +void kvm_free_all_assigned_devices(struct kvm *kvm);
  12030. +
  12031. +#else
  12032. +
  12033. +static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
  12034. + unsigned long arg)
  12035. +{
  12036. + return -ENOTTY;
  12037. +}
  12038. +
  12039. +static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
  12040. +
  12041. +#endif
  12042. +
  12043. +static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
  12044. +{
  12045. + set_bit(req, &vcpu->requests);
  12046. +}
  12047. +
  12048. +static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
  12049. +{
  12050. + if (test_bit(req, &vcpu->requests)) {
  12051. + clear_bit(req, &vcpu->requests);
  12052. + return true;
  12053. + } else {
  12054. + return false;
  12055. + }
  12056. +}
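A short sketch of the intended pairing: some other context posts a request bit and kicks the vcpu, and the vcpu loop consumes it with kvm_check_request() before re-entering the guest. KVM_REQ_MIGRATE_TIMER, used by kvm_migrate_timers() above, serves as the example bit:

    #include <linux/kvm_host.h>

    static void demo_post_request(struct kvm_vcpu *vcpu)
    {
            kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
            kvm_vcpu_kick(vcpu);            /* make the vcpu notice the request promptly */
    }

    static void demo_service_requests(struct kvm_vcpu *vcpu)
    {
            if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) {
                    /* ... migrate the timers; the bit is already cleared ... */
            }
    }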
  12057. +
  12058. +extern bool kvm_rebooting;
  12059. +
  12060. +struct kvm_device {
  12061. + struct kvm_device_ops *ops;
  12062. + struct kvm *kvm;
  12063. + void *private;
  12064. + struct list_head vm_node;
  12065. +};
  12066. +
  12067. +/* create, destroy, and name are mandatory */
  12068. +struct kvm_device_ops {
  12069. + const char *name;
  12070. + int (*create)(struct kvm_device *dev, u32 type);
  12071. +
  12072. + /*
  12073. + * Destroy is responsible for freeing dev.
  12074. + *
  12075. + * Destroy may be called before or after destructors are called
  12076. + * on emulated I/O regions, depending on whether a reference is
  12077. + * held by a vcpu or other kvm component that gets destroyed
  12078. + * after the emulated I/O.
  12079. + */
  12080. + void (*destroy)(struct kvm_device *dev);
  12081. +
  12082. + int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
  12083. + int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
  12084. + int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
  12085. + long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
  12086. + unsigned long arg);
  12087. +};
  12088. +
  12089. +void kvm_device_get(struct kvm_device *dev);
  12090. +void kvm_device_put(struct kvm_device *dev);
  12091. +struct kvm_device *kvm_device_from_filp(struct file *filp);
  12092. +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type);
  12093. +void kvm_unregister_device_ops(u32 type);
  12094. +
  12095. +extern struct kvm_device_ops kvm_mpic_ops;
  12096. +extern struct kvm_device_ops kvm_xics_ops;
  12097. +
  12098. +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
  12099. +
  12100. +static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
  12101. +{
  12102. + vcpu->spin_loop.in_spin_loop = val;
  12103. +}
  12104. +static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
  12105. +{
  12106. + vcpu->spin_loop.dy_eligible = val;
  12107. +}
  12108. +
  12109. +#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
  12110. +
  12111. +static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
  12112. +{
  12113. +}
  12114. +
  12115. +static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
  12116. +{
  12117. +}
  12118. +#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
  12119. +#endif
  12120. +
  12121. diff -Nur linux-3.18.14.orig/include/linux/lglock.h linux-3.18.14-rt/include/linux/lglock.h
  12122. --- linux-3.18.14.orig/include/linux/lglock.h 2015-05-20 10:04:50.000000000 -0500
  12123. +++ linux-3.18.14-rt/include/linux/lglock.h 2015-05-31 15:32:48.261635369 -0500
  12124. @@ -34,22 +34,39 @@
  12125. #endif
  12126. struct lglock {
  12127. +#ifndef CONFIG_PREEMPT_RT_FULL
  12128. arch_spinlock_t __percpu *lock;
  12129. +#else
  12130. + struct rt_mutex __percpu *lock;
  12131. +#endif
  12132. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  12133. struct lock_class_key lock_key;
  12134. struct lockdep_map lock_dep_map;
  12135. #endif
  12136. };
  12137. -#define DEFINE_LGLOCK(name) \
  12138. +#ifndef CONFIG_PREEMPT_RT_FULL
  12139. +# define DEFINE_LGLOCK(name) \
  12140. static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
  12141. = __ARCH_SPIN_LOCK_UNLOCKED; \
  12142. struct lglock name = { .lock = &name ## _lock }
  12143. -#define DEFINE_STATIC_LGLOCK(name) \
  12144. +# define DEFINE_STATIC_LGLOCK(name) \
  12145. static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
  12146. = __ARCH_SPIN_LOCK_UNLOCKED; \
  12147. static struct lglock name = { .lock = &name ## _lock }
  12148. +#else
  12149. +
  12150. +# define DEFINE_LGLOCK(name) \
  12151. + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock) \
  12152. + = __RT_MUTEX_INITIALIZER( name ## _lock); \
  12153. + struct lglock name = { .lock = &name ## _lock }
  12154. +
  12155. +# define DEFINE_STATIC_LGLOCK(name) \
  12156. + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock) \
  12157. + = __RT_MUTEX_INITIALIZER( name ## _lock); \
  12158. + static struct lglock name = { .lock = &name ## _lock }
  12159. +#endif
  12160. void lg_lock_init(struct lglock *lg, char *name);
  12161. void lg_local_lock(struct lglock *lg);
  12162. @@ -59,6 +76,12 @@
  12163. void lg_global_lock(struct lglock *lg);
  12164. void lg_global_unlock(struct lglock *lg);
  12165. +#ifndef CONFIG_PREEMPT_RT_FULL
  12166. +#define lg_global_trylock_relax(name) lg_global_lock(name)
  12167. +#else
  12168. +void lg_global_trylock_relax(struct lglock *lg);
  12169. +#endif
  12170. +
  12171. #else
  12172. /* When !CONFIG_SMP, map lglock to spinlock */
  12173. #define lglock spinlock
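For reference, the caller-side API that this hunk leaves untouched; only the lock type behind it changes on PREEMPT_RT_FULL. A minimal sketch with a hypothetical lglock:

    #include <linux/lglock.h>

    DEFINE_STATIC_LGLOCK(demo_lglock);

    static void demo_fast_path(void)
    {
            lg_local_lock(&demo_lglock);    /* this CPU's lock only */
            /* ... touch this CPU's share of the data ... */
            lg_local_unlock(&demo_lglock);
    }

    static void demo_slow_path(void)
    {
            lg_global_lock(&demo_lglock);   /* every CPU's lock */
            /* ... walk all per-CPU data ... */
            lg_global_unlock(&demo_lglock);
    }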
  12174. diff -Nur linux-3.18.14.orig/include/linux/list_bl.h linux-3.18.14-rt/include/linux/list_bl.h
  12175. --- linux-3.18.14.orig/include/linux/list_bl.h 2015-05-20 10:04:50.000000000 -0500
  12176. +++ linux-3.18.14-rt/include/linux/list_bl.h 2015-05-31 15:32:48.265635369 -0500
  12177. @@ -2,6 +2,7 @@
  12178. #define _LINUX_LIST_BL_H
  12179. #include <linux/list.h>
  12180. +#include <linux/spinlock.h>
  12181. #include <linux/bit_spinlock.h>
  12182. /*
  12183. @@ -32,13 +33,22 @@
  12184. struct hlist_bl_head {
  12185. struct hlist_bl_node *first;
  12186. +#ifdef CONFIG_PREEMPT_RT_BASE
  12187. + raw_spinlock_t lock;
  12188. +#endif
  12189. };
  12190. struct hlist_bl_node {
  12191. struct hlist_bl_node *next, **pprev;
  12192. };
  12193. -#define INIT_HLIST_BL_HEAD(ptr) \
  12194. - ((ptr)->first = NULL)
  12195. +
  12196. +static inline void INIT_HLIST_BL_HEAD(struct hlist_bl_head *h)
  12197. +{
  12198. + h->first = NULL;
  12199. +#ifdef CONFIG_PREEMPT_RT_BASE
  12200. + raw_spin_lock_init(&h->lock);
  12201. +#endif
  12202. +}
  12203. static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
  12204. {
  12205. @@ -117,12 +127,26 @@
  12206. static inline void hlist_bl_lock(struct hlist_bl_head *b)
  12207. {
  12208. +#ifndef CONFIG_PREEMPT_RT_BASE
  12209. bit_spin_lock(0, (unsigned long *)b);
  12210. +#else
  12211. + raw_spin_lock(&b->lock);
  12212. +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
  12213. + __set_bit(0, (unsigned long *)b);
  12214. +#endif
  12215. +#endif
  12216. }
  12217. static inline void hlist_bl_unlock(struct hlist_bl_head *b)
  12218. {
  12219. +#ifndef CONFIG_PREEMPT_RT_BASE
  12220. __bit_spin_unlock(0, (unsigned long *)b);
  12221. +#else
  12222. +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
  12223. + __clear_bit(0, (unsigned long *)b);
  12224. +#endif
  12225. + raw_spin_unlock(&b->lock);
  12226. +#endif
  12227. }
  12228. static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
  12229. diff -Nur linux-3.18.14.orig/include/linux/locallock.h linux-3.18.14-rt/include/linux/locallock.h
  12230. --- linux-3.18.14.orig/include/linux/locallock.h 1969-12-31 18:00:00.000000000 -0600
  12231. +++ linux-3.18.14-rt/include/linux/locallock.h 2015-05-31 15:32:48.273635368 -0500
  12232. @@ -0,0 +1,270 @@
  12233. +#ifndef _LINUX_LOCALLOCK_H
  12234. +#define _LINUX_LOCALLOCK_H
  12235. +
  12236. +#include <linux/percpu.h>
  12237. +#include <linux/spinlock.h>
  12238. +
  12239. +#ifdef CONFIG_PREEMPT_RT_BASE
  12240. +
  12241. +#ifdef CONFIG_DEBUG_SPINLOCK
  12242. +# define LL_WARN(cond) WARN_ON(cond)
  12243. +#else
  12244. +# define LL_WARN(cond) do { } while (0)
  12245. +#endif
  12246. +
  12247. +/*
  12248. + * per cpu lock based substitute for local_irq_*()
  12249. + */
  12250. +struct local_irq_lock {
  12251. + spinlock_t lock;
  12252. + struct task_struct *owner;
  12253. + int nestcnt;
  12254. + unsigned long flags;
  12255. +};
  12256. +
  12257. +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
  12258. + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
  12259. + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
  12260. +
  12261. +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
  12262. + DECLARE_PER_CPU(struct local_irq_lock, lvar)
  12263. +
  12264. +#define local_irq_lock_init(lvar) \
  12265. + do { \
  12266. + int __cpu; \
  12267. + for_each_possible_cpu(__cpu) \
  12268. + spin_lock_init(&per_cpu(lvar, __cpu).lock); \
  12269. + } while (0)
  12270. +
  12271. +/*
  12272. + * spin_lock|trylock|unlock_local flavour that does not migrate-disable;
  12273. + * used for __local_lock|trylock|unlock, where get_local_var/put_local_var
  12274. + * already take care of the migrate_disable/enable.
  12275. + * For CONFIG_PREEMPT_BASE these map to the normal spin_* calls.
  12276. + */
  12277. +#ifdef CONFIG_PREEMPT_RT_FULL
  12278. +# define spin_lock_local(lock) rt_spin_lock(lock)
  12279. +# define spin_trylock_local(lock) rt_spin_trylock(lock)
  12280. +# define spin_unlock_local(lock) rt_spin_unlock(lock)
  12281. +#else
  12282. +# define spin_lock_local(lock) spin_lock(lock)
  12283. +# define spin_trylock_local(lock) spin_trylock(lock)
  12284. +# define spin_unlock_local(lock) spin_unlock(lock)
  12285. +#endif
  12286. +
  12287. +static inline void __local_lock(struct local_irq_lock *lv)
  12288. +{
  12289. + if (lv->owner != current) {
  12290. + spin_lock_local(&lv->lock);
  12291. + LL_WARN(lv->owner);
  12292. + LL_WARN(lv->nestcnt);
  12293. + lv->owner = current;
  12294. + }
  12295. + lv->nestcnt++;
  12296. +}
  12297. +
  12298. +#define local_lock(lvar) \
  12299. + do { __local_lock(&get_local_var(lvar)); } while (0)
  12300. +
  12301. +static inline int __local_trylock(struct local_irq_lock *lv)
  12302. +{
  12303. + if (lv->owner != current && spin_trylock_local(&lv->lock)) {
  12304. + LL_WARN(lv->owner);
  12305. + LL_WARN(lv->nestcnt);
  12306. + lv->owner = current;
  12307. + lv->nestcnt = 1;
  12308. + return 1;
  12309. + }
  12310. + return 0;
  12311. +}
  12312. +
  12313. +#define local_trylock(lvar) \
  12314. + ({ \
  12315. + int __locked; \
  12316. + __locked = __local_trylock(&get_local_var(lvar)); \
  12317. + if (!__locked) \
  12318. + put_local_var(lvar); \
  12319. + __locked; \
  12320. + })
  12321. +
  12322. +static inline void __local_unlock(struct local_irq_lock *lv)
  12323. +{
  12324. + LL_WARN(lv->nestcnt == 0);
  12325. + LL_WARN(lv->owner != current);
  12326. + if (--lv->nestcnt)
  12327. + return;
  12328. +
  12329. + lv->owner = NULL;
  12330. + spin_unlock_local(&lv->lock);
  12331. +}
  12332. +
  12333. +#define local_unlock(lvar) \
  12334. + do { \
  12335. + __local_unlock(&__get_cpu_var(lvar)); \
  12336. + put_local_var(lvar); \
  12337. + } while (0)
  12338. +
  12339. +static inline void __local_lock_irq(struct local_irq_lock *lv)
  12340. +{
  12341. + spin_lock_irqsave(&lv->lock, lv->flags);
  12342. + LL_WARN(lv->owner);
  12343. + LL_WARN(lv->nestcnt);
  12344. + lv->owner = current;
  12345. + lv->nestcnt = 1;
  12346. +}
  12347. +
  12348. +#define local_lock_irq(lvar) \
  12349. + do { __local_lock_irq(&get_local_var(lvar)); } while (0)
  12350. +
  12351. +#define local_lock_irq_on(lvar, cpu) \
  12352. + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
  12353. +
  12354. +static inline void __local_unlock_irq(struct local_irq_lock *lv)
  12355. +{
  12356. + LL_WARN(!lv->nestcnt);
  12357. + LL_WARN(lv->owner != current);
  12358. + lv->owner = NULL;
  12359. + lv->nestcnt = 0;
  12360. + spin_unlock_irq(&lv->lock);
  12361. +}
  12362. +
  12363. +#define local_unlock_irq(lvar) \
  12364. + do { \
  12365. + __local_unlock_irq(&__get_cpu_var(lvar)); \
  12366. + put_local_var(lvar); \
  12367. + } while (0)
  12368. +
  12369. +#define local_unlock_irq_on(lvar, cpu) \
  12370. + do { \
  12371. + __local_unlock_irq(&per_cpu(lvar, cpu)); \
  12372. + } while (0)
  12373. +
  12374. +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
  12375. +{
  12376. + if (lv->owner != current) {
  12377. + __local_lock_irq(lv);
  12378. + return 0;
  12379. + } else {
  12380. + lv->nestcnt++;
  12381. + return 1;
  12382. + }
  12383. +}
  12384. +
  12385. +#define local_lock_irqsave(lvar, _flags) \
  12386. + do { \
  12387. + if (__local_lock_irqsave(&get_local_var(lvar))) \
  12388. + put_local_var(lvar); \
  12389. + _flags = __get_cpu_var(lvar).flags; \
  12390. + } while (0)
  12391. +
  12392. +#define local_lock_irqsave_on(lvar, _flags, cpu) \
  12393. + do { \
  12394. + __local_lock_irqsave(&per_cpu(lvar, cpu)); \
  12395. + _flags = per_cpu(lvar, cpu).flags; \
  12396. + } while (0)
  12397. +
  12398. +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
  12399. + unsigned long flags)
  12400. +{
  12401. + LL_WARN(!lv->nestcnt);
  12402. + LL_WARN(lv->owner != current);
  12403. + if (--lv->nestcnt)
  12404. + return 0;
  12405. +
  12406. + lv->owner = NULL;
  12407. + spin_unlock_irqrestore(&lv->lock, lv->flags);
  12408. + return 1;
  12409. +}
  12410. +
  12411. +#define local_unlock_irqrestore(lvar, flags) \
  12412. + do { \
  12413. + if (__local_unlock_irqrestore(&__get_cpu_var(lvar), flags)) \
  12414. + put_local_var(lvar); \
  12415. + } while (0)
  12416. +
  12417. +#define local_unlock_irqrestore_on(lvar, flags, cpu) \
  12418. + do { \
  12419. + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
  12420. + } while (0)
  12421. +
  12422. +#define local_spin_trylock_irq(lvar, lock) \
  12423. + ({ \
  12424. + int __locked; \
  12425. + local_lock_irq(lvar); \
  12426. + __locked = spin_trylock(lock); \
  12427. + if (!__locked) \
  12428. + local_unlock_irq(lvar); \
  12429. + __locked; \
  12430. + })
  12431. +
  12432. +#define local_spin_lock_irq(lvar, lock) \
  12433. + do { \
  12434. + local_lock_irq(lvar); \
  12435. + spin_lock(lock); \
  12436. + } while (0)
  12437. +
  12438. +#define local_spin_unlock_irq(lvar, lock) \
  12439. + do { \
  12440. + spin_unlock(lock); \
  12441. + local_unlock_irq(lvar); \
  12442. + } while (0)
  12443. +
  12444. +#define local_spin_lock_irqsave(lvar, lock, flags) \
  12445. + do { \
  12446. + local_lock_irqsave(lvar, flags); \
  12447. + spin_lock(lock); \
  12448. + } while (0)
  12449. +
  12450. +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
  12451. + do { \
  12452. + spin_unlock(lock); \
  12453. + local_unlock_irqrestore(lvar, flags); \
  12454. + } while (0)
  12455. +
  12456. +#define get_locked_var(lvar, var) \
  12457. + (*({ \
  12458. + local_lock(lvar); \
  12459. + &__get_cpu_var(var); \
  12460. + }))
  12461. +
  12462. +#define put_locked_var(lvar, var) local_unlock(lvar);
  12463. +
  12464. +#define local_lock_cpu(lvar) \
  12465. + ({ \
  12466. + local_lock(lvar); \
  12467. + smp_processor_id(); \
  12468. + })
  12469. +
  12470. +#define local_unlock_cpu(lvar) local_unlock(lvar)
  12471. +
  12472. +#else /* PREEMPT_RT_BASE */
  12473. +
  12474. +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
  12475. +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
  12476. +
  12477. +static inline void local_irq_lock_init(int lvar) { }
  12478. +
  12479. +#define local_lock(lvar) preempt_disable()
  12480. +#define local_unlock(lvar) preempt_enable()
  12481. +#define local_lock_irq(lvar) local_irq_disable()
  12482. +#define local_unlock_irq(lvar) local_irq_enable()
  12483. +#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
  12484. +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
  12485. +
  12486. +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
  12487. +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
  12488. +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
  12489. +#define local_spin_lock_irqsave(lvar, lock, flags) \
  12490. + spin_lock_irqsave(lock, flags)
  12491. +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
  12492. + spin_unlock_irqrestore(lock, flags)
  12493. +
  12494. +#define get_locked_var(lvar, var) get_cpu_var(var)
  12495. +#define put_locked_var(lvar, var) put_cpu_var(var)
  12496. +
  12497. +#define local_lock_cpu(lvar) get_cpu()
  12498. +#define local_unlock_cpu(lvar) put_cpu()
  12499. +
  12500. +#endif
  12501. +
  12502. +#endif
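A minimal usage sketch of the new primitive with a hypothetical per-CPU counter: on non-RT configurations this degrades to plain local_irq_save()/restore(), while on PREEMPT_RT_BASE it becomes a per-CPU sleeping lock and the critical section stays preemptible:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, demo_events);     /* hypothetical per-CPU data */
    static DEFINE_LOCAL_IRQ_LOCK(demo_lock);

    static void demo_count_event(void)
    {
            unsigned long flags;

            local_lock_irqsave(demo_lock, flags);
            __this_cpu_inc(demo_events);
            local_unlock_irqrestore(demo_lock, flags);
    }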
  12503. diff -Nur linux-3.18.14.orig/include/linux/mm_types.h linux-3.18.14-rt/include/linux/mm_types.h
  12504. --- linux-3.18.14.orig/include/linux/mm_types.h 2015-05-20 10:04:50.000000000 -0500
  12505. +++ linux-3.18.14-rt/include/linux/mm_types.h 2015-05-31 15:32:48.273635368 -0500
  12506. @@ -11,6 +11,7 @@
  12507. #include <linux/completion.h>
  12508. #include <linux/cpumask.h>
  12509. #include <linux/page-debug-flags.h>
  12510. +#include <linux/rcupdate.h>
  12511. #include <linux/uprobes.h>
  12512. #include <linux/page-flags-layout.h>
  12513. #include <asm/page.h>
  12514. @@ -454,6 +455,9 @@
  12515. bool tlb_flush_pending;
  12516. #endif
  12517. struct uprobes_state uprobes_state;
  12518. +#ifdef CONFIG_PREEMPT_RT_BASE
  12519. + struct rcu_head delayed_drop;
  12520. +#endif
  12521. };
  12522. static inline void mm_init_cpumask(struct mm_struct *mm)
  12523. diff -Nur linux-3.18.14.orig/include/linux/mutex.h linux-3.18.14-rt/include/linux/mutex.h
  12524. --- linux-3.18.14.orig/include/linux/mutex.h 2015-05-20 10:04:50.000000000 -0500
  12525. +++ linux-3.18.14-rt/include/linux/mutex.h 2015-05-31 15:32:48.273635368 -0500
  12526. @@ -19,6 +19,17 @@
  12527. #include <asm/processor.h>
  12528. #include <linux/osq_lock.h>
  12529. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  12530. +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
  12531. + , .dep_map = { .name = #lockname }
  12532. +#else
  12533. +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
  12534. +#endif
  12535. +
  12536. +#ifdef CONFIG_PREEMPT_RT_FULL
  12537. +# include <linux/mutex_rt.h>
  12538. +#else
  12539. +
  12540. /*
  12541. * Simple, straightforward mutexes with strict semantics:
  12542. *
  12543. @@ -100,13 +111,6 @@
  12544. static inline void mutex_destroy(struct mutex *lock) {}
  12545. #endif
  12546. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  12547. -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
  12548. - , .dep_map = { .name = #lockname }
  12549. -#else
  12550. -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
  12551. -#endif
  12552. -
  12553. #define __MUTEX_INITIALIZER(lockname) \
  12554. { .count = ATOMIC_INIT(1) \
  12555. , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
  12556. @@ -174,6 +178,8 @@
  12557. extern int mutex_trylock(struct mutex *lock);
  12558. extern void mutex_unlock(struct mutex *lock);
  12559. +#endif /* !PREEMPT_RT_FULL */
  12560. +
  12561. extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
  12562. #endif /* __LINUX_MUTEX_H */
  12563. diff -Nur linux-3.18.14.orig/include/linux/mutex_rt.h linux-3.18.14-rt/include/linux/mutex_rt.h
  12564. --- linux-3.18.14.orig/include/linux/mutex_rt.h 1969-12-31 18:00:00.000000000 -0600
  12565. +++ linux-3.18.14-rt/include/linux/mutex_rt.h 2015-05-31 15:32:48.273635368 -0500
  12566. @@ -0,0 +1,84 @@
  12567. +#ifndef __LINUX_MUTEX_RT_H
  12568. +#define __LINUX_MUTEX_RT_H
  12569. +
  12570. +#ifndef __LINUX_MUTEX_H
  12571. +#error "Please include mutex.h"
  12572. +#endif
  12573. +
  12574. +#include <linux/rtmutex.h>
  12575. +
  12576. +/* FIXME: Just for __lockfunc */
  12577. +#include <linux/spinlock.h>
  12578. +
  12579. +struct mutex {
  12580. + struct rt_mutex lock;
  12581. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  12582. + struct lockdep_map dep_map;
  12583. +#endif
  12584. +};
  12585. +
  12586. +#define __MUTEX_INITIALIZER(mutexname) \
  12587. + { \
  12588. + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
  12589. + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
  12590. + }
  12591. +
  12592. +#define DEFINE_MUTEX(mutexname) \
  12593. + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
  12594. +
  12595. +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
  12596. +extern void __lockfunc _mutex_lock(struct mutex *lock);
  12597. +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
  12598. +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
  12599. +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
  12600. +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
  12601. +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
  12602. +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
  12603. +extern int __lockfunc _mutex_trylock(struct mutex *lock);
  12604. +extern void __lockfunc _mutex_unlock(struct mutex *lock);
  12605. +
  12606. +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
  12607. +#define mutex_lock(l) _mutex_lock(l)
  12608. +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
  12609. +#define mutex_lock_killable(l) _mutex_lock_killable(l)
  12610. +#define mutex_trylock(l) _mutex_trylock(l)
  12611. +#define mutex_unlock(l) _mutex_unlock(l)
  12612. +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
  12613. +
  12614. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  12615. +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
  12616. +# define mutex_lock_interruptible_nested(l, s) \
  12617. + _mutex_lock_interruptible_nested(l, s)
  12618. +# define mutex_lock_killable_nested(l, s) \
  12619. + _mutex_lock_killable_nested(l, s)
  12620. +
  12621. +# define mutex_lock_nest_lock(lock, nest_lock) \
  12622. +do { \
  12623. + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
  12624. + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
  12625. +} while (0)
  12626. +
  12627. +#else
  12628. +# define mutex_lock_nested(l, s) _mutex_lock(l)
  12629. +# define mutex_lock_interruptible_nested(l, s) \
  12630. + _mutex_lock_interruptible(l)
  12631. +# define mutex_lock_killable_nested(l, s) \
  12632. + _mutex_lock_killable(l)
  12633. +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
  12634. +#endif
  12635. +
  12636. +# define mutex_init(mutex) \
  12637. +do { \
  12638. + static struct lock_class_key __key; \
  12639. + \
  12640. + rt_mutex_init(&(mutex)->lock); \
  12641. + __mutex_do_init((mutex), #mutex, &__key); \
  12642. +} while (0)
  12643. +
  12644. +# define __mutex_init(mutex, name, key) \
  12645. +do { \
  12646. + rt_mutex_init(&(mutex)->lock); \
  12647. + __mutex_do_init((mutex), name, key); \
  12648. +} while (0)
  12649. +
  12650. +#endif
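Call sites are unaffected by the substitution: the same mutex code builds on both configurations, and only the substrate changes to an rt_mutex. A trivial sketch with a hypothetical piece of state:

    #include <linux/mutex.h>

    static DEFINE_MUTEX(demo_mutex);
    static int demo_state;                  /* hypothetical shared state */

    static int demo_set_state(int val)
    {
            int ret = mutex_lock_interruptible(&demo_mutex);

            if (ret)
                    return ret;             /* interrupted by a signal */
            demo_state = val;
            mutex_unlock(&demo_mutex);
            return 0;
    }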
  12651. diff -Nur linux-3.18.14.orig/include/linux/netdevice.h linux-3.18.14-rt/include/linux/netdevice.h
  12652. --- linux-3.18.14.orig/include/linux/netdevice.h 2015-05-20 10:04:50.000000000 -0500
  12653. +++ linux-3.18.14-rt/include/linux/netdevice.h 2015-05-31 15:32:48.305635368 -0500
  12654. @@ -2351,6 +2351,7 @@
  12655. unsigned int dropped;
  12656. struct sk_buff_head input_pkt_queue;
  12657. struct napi_struct backlog;
  12658. + struct sk_buff_head tofree_queue;
  12659. #ifdef CONFIG_NET_FLOW_LIMIT
  12660. struct sd_flow_limit __rcu *flow_limit;
  12661. diff -Nur linux-3.18.14.orig/include/linux/netfilter/x_tables.h linux-3.18.14-rt/include/linux/netfilter/x_tables.h
  12662. --- linux-3.18.14.orig/include/linux/netfilter/x_tables.h 2015-05-20 10:04:50.000000000 -0500
  12663. +++ linux-3.18.14-rt/include/linux/netfilter/x_tables.h 2015-05-31 15:32:48.305635368 -0500
  12664. @@ -3,6 +3,7 @@
  12665. #include <linux/netdevice.h>
  12666. +#include <linux/locallock.h>
  12667. #include <uapi/linux/netfilter/x_tables.h>
  12668. /**
  12669. @@ -282,6 +283,8 @@
  12670. */
  12671. DECLARE_PER_CPU(seqcount_t, xt_recseq);
  12672. +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
  12673. +
  12674. /**
  12675. * xt_write_recseq_begin - start of a write section
  12676. *
  12677. @@ -296,6 +299,9 @@
  12678. {
  12679. unsigned int addend;
  12680. + /* RT protection */
  12681. + local_lock(xt_write_lock);
  12682. +
  12683. /*
  12684. * Low order bit of sequence is set if we already
  12685. * called xt_write_recseq_begin().
  12686. @@ -326,6 +332,7 @@
  12687. /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
  12688. smp_wmb();
  12689. __this_cpu_add(xt_recseq.sequence, addend);
  12690. + local_unlock(xt_write_lock);
  12691. }
  12692. /*
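For reference, the caller-side pattern in the iptables evaluation loop stays the same; with this hunk the begin/end pair now also takes the per-CPU xt_write_lock, which serializes the seqcount section on RT. A sketch:

    #include <linux/netfilter/x_tables.h>

    static void demo_rule_walk(void)
    {
            unsigned int addend;

            local_bh_disable();
            addend = xt_write_recseq_begin();
            /* ... evaluate rules and update per-CPU counters ... */
            xt_write_recseq_end(addend);
            local_bh_enable();
    }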
  12693. diff -Nur linux-3.18.14.orig/include/linux/notifier.h linux-3.18.14-rt/include/linux/notifier.h
  12694. --- linux-3.18.14.orig/include/linux/notifier.h 2015-05-20 10:04:50.000000000 -0500
  12695. +++ linux-3.18.14-rt/include/linux/notifier.h 2015-05-31 15:32:48.305635368 -0500
  12696. @@ -6,7 +6,7 @@
  12697. *
  12698. * Alan Cox <Alan.Cox@linux.org>
  12699. */
  12700. -
  12701. +
  12702. #ifndef _LINUX_NOTIFIER_H
  12703. #define _LINUX_NOTIFIER_H
  12704. #include <linux/errno.h>
  12705. @@ -42,9 +42,7 @@
  12706. * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
  12707. * As compensation, srcu_notifier_chain_unregister() is rather expensive.
  12708. * SRCU notifier chains should be used when the chain will be called very
  12709. - * often but notifier_blocks will seldom be removed. Also, SRCU notifier
  12710. - * chains are slightly more difficult to use because they require special
  12711. - * runtime initialization.
  12712. + * often but notifier_blocks will seldom be removed.
  12713. */
  12714. typedef int (*notifier_fn_t)(struct notifier_block *nb,
  12715. @@ -88,7 +86,7 @@
  12716. (name)->head = NULL; \
  12717. } while (0)
  12718. -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
  12719. +/* srcu_notifier_heads must be cleaned up dynamically */
  12720. extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
  12721. #define srcu_cleanup_notifier_head(name) \
  12722. cleanup_srcu_struct(&(name)->srcu);
  12723. @@ -101,7 +99,13 @@
  12724. .head = NULL }
  12725. #define RAW_NOTIFIER_INIT(name) { \
  12726. .head = NULL }
  12727. -/* srcu_notifier_heads cannot be initialized statically */
  12728. +
  12729. +#define SRCU_NOTIFIER_INIT(name, pcpu) \
  12730. + { \
  12731. + .mutex = __MUTEX_INITIALIZER(name.mutex), \
  12732. + .head = NULL, \
  12733. + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
  12734. + }
  12735. #define ATOMIC_NOTIFIER_HEAD(name) \
  12736. struct atomic_notifier_head name = \
  12737. @@ -113,6 +117,18 @@
  12738. struct raw_notifier_head name = \
  12739. RAW_NOTIFIER_INIT(name)
  12740. +#define _SRCU_NOTIFIER_HEAD(name, mod) \
  12741. + static DEFINE_PER_CPU(struct srcu_struct_array, \
  12742. + name##_head_srcu_array); \
  12743. + mod struct srcu_notifier_head name = \
  12744. + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
  12745. +
  12746. +#define SRCU_NOTIFIER_HEAD(name) \
  12747. + _SRCU_NOTIFIER_HEAD(name, )
  12748. +
  12749. +#define SRCU_NOTIFIER_HEAD_STATIC(name) \
  12750. + _SRCU_NOTIFIER_HEAD(name, static)
  12751. +
  12752. #ifdef __KERNEL__
  12753. extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
  12754. @@ -182,12 +198,12 @@
  12755. /*
  12756. * Declared notifiers so far. I can imagine quite a few more chains
  12757. - * over time (eg laptop power reset chains, reboot chain (to clean
  12758. + * over time (eg laptop power reset chains, reboot chain (to clean
  12759. * device units up), device [un]mount chain, module load/unload chain,
  12760. - * low memory chain, screenblank chain (for plug in modular screenblankers)
  12761. + * low memory chain, screenblank chain (for plug in modular screenblankers)
  12762. * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
  12763. */
  12764. -
  12765. +
  12766. /* CPU notifiers are defined in include/linux/cpu.h. */
  12767. /* netdevice notifiers are defined in include/linux/netdevice.h */
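With the new macros an SRCU notifier chain can be defined entirely at compile time. A sketch with a hypothetical chain and callback (the register/call/unregister calls are the existing srcu_notifier_* API):

    #include <linux/notifier.h>

    static int demo_callback(struct notifier_block *nb, unsigned long action,
                             void *data)
    {
            /* ... react to the event ... */
            return NOTIFY_OK;
    }

    static struct notifier_block demo_nb = {
            .notifier_call = demo_callback,
    };

    SRCU_NOTIFIER_HEAD_STATIC(demo_chain);  /* no srcu_init_notifier_head() needed */

    static void demo_fire(void)
    {
            srcu_notifier_chain_register(&demo_chain, &demo_nb);
            srcu_notifier_call_chain(&demo_chain, 0, NULL);
            srcu_notifier_chain_unregister(&demo_chain, &demo_nb);
    }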
  12768. diff -Nur linux-3.18.14.orig/include/linux/percpu.h linux-3.18.14-rt/include/linux/percpu.h
  12769. --- linux-3.18.14.orig/include/linux/percpu.h 2015-05-20 10:04:50.000000000 -0500
  12770. +++ linux-3.18.14-rt/include/linux/percpu.h 2015-05-31 15:32:48.305635368 -0500
  12771. @@ -23,6 +23,35 @@
  12772. PERCPU_MODULE_RESERVE)
  12773. #endif
  12774. +#ifdef CONFIG_PREEMPT_RT_FULL
  12775. +
  12776. +#define get_local_var(var) (*({ \
  12777. + migrate_disable(); \
  12778. + &__get_cpu_var(var); }))
  12779. +
  12780. +#define put_local_var(var) do { \
  12781. + (void)&(var); \
  12782. + migrate_enable(); \
  12783. +} while (0)
  12784. +
  12785. +# define get_local_ptr(var) ({ \
  12786. + migrate_disable(); \
  12787. + this_cpu_ptr(var); })
  12788. +
  12789. +# define put_local_ptr(var) do { \
  12790. + (void)(var); \
  12791. + migrate_enable(); \
  12792. +} while (0)
  12793. +
  12794. +#else
  12795. +
  12796. +#define get_local_var(var) get_cpu_var(var)
  12797. +#define put_local_var(var) put_cpu_var(var)
  12798. +#define get_local_ptr(var) get_cpu_ptr(var)
  12799. +#define put_local_ptr(var) put_cpu_ptr(var)
  12800. +
  12801. +#endif
  12802. +
  12803. /* minimum unit size, also is the maximum supported allocation size */
  12804. #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
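A small sketch of the accessor pair with a hypothetical per-CPU statistic. On PREEMPT_RT_FULL only migration is disabled, so writers that need mutual exclusion against other tasks on the same CPU still want a local lock (see locallock.h above); a plain read like this needs nothing more:

    #include <linux/percpu.h>

    static DEFINE_PER_CPU(unsigned long, demo_stat);        /* hypothetical statistic */

    static unsigned long demo_read_stat(void)
    {
            unsigned long val;

            val = get_local_var(demo_stat); /* !RT: preemption off; RT: migration off */
            put_local_var(demo_stat);
            return val;
    }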
  12805. diff -Nur linux-3.18.14.orig/include/linux/pid.h linux-3.18.14-rt/include/linux/pid.h
  12806. --- linux-3.18.14.orig/include/linux/pid.h 2015-05-20 10:04:50.000000000 -0500
  12807. +++ linux-3.18.14-rt/include/linux/pid.h 2015-05-31 15:32:48.341635368 -0500
  12808. @@ -2,6 +2,7 @@
  12809. #define _LINUX_PID_H
  12810. #include <linux/rcupdate.h>
  12811. +#include <linux/atomic.h>
  12812. enum pid_type
  12813. {
  12814. diff -Nur linux-3.18.14.orig/include/linux/preempt.h linux-3.18.14-rt/include/linux/preempt.h
  12815. --- linux-3.18.14.orig/include/linux/preempt.h 2015-05-20 10:04:50.000000000 -0500
  12816. +++ linux-3.18.14-rt/include/linux/preempt.h 2015-05-31 15:32:48.341635368 -0500
  12817. @@ -33,6 +33,20 @@
  12818. #define preempt_count_inc() preempt_count_add(1)
  12819. #define preempt_count_dec() preempt_count_sub(1)
  12820. +#ifdef CONFIG_PREEMPT_LAZY
  12821. +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
  12822. +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
  12823. +#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
  12824. +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
  12825. +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
  12826. +#else
  12827. +#define add_preempt_lazy_count(val) do { } while (0)
  12828. +#define sub_preempt_lazy_count(val) do { } while (0)
  12829. +#define inc_preempt_lazy_count() do { } while (0)
  12830. +#define dec_preempt_lazy_count() do { } while (0)
  12831. +#define preempt_lazy_count() (0)
  12832. +#endif
  12833. +
  12834. #ifdef CONFIG_PREEMPT_COUNT
  12835. #define preempt_disable() \
  12836. @@ -41,13 +55,25 @@
  12837. barrier(); \
  12838. } while (0)
  12839. +#define preempt_lazy_disable() \
  12840. +do { \
  12841. + inc_preempt_lazy_count(); \
  12842. + barrier(); \
  12843. +} while (0)
  12844. +
  12845. #define sched_preempt_enable_no_resched() \
  12846. do { \
  12847. barrier(); \
  12848. preempt_count_dec(); \
  12849. } while (0)
  12850. -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
  12851. +#ifdef CONFIG_PREEMPT_RT_BASE
  12852. +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
  12853. +# define preempt_check_resched_rt() preempt_check_resched()
  12854. +#else
  12855. +# define preempt_enable_no_resched() preempt_enable()
  12856. +# define preempt_check_resched_rt() barrier();
  12857. +#endif
  12858. #ifdef CONFIG_PREEMPT
  12859. #define preempt_enable() \
  12860. @@ -63,6 +89,13 @@
  12861. __preempt_schedule(); \
  12862. } while (0)
  12863. +#define preempt_lazy_enable() \
  12864. +do { \
  12865. + dec_preempt_lazy_count(); \
  12866. + barrier(); \
  12867. + preempt_check_resched(); \
  12868. +} while (0)
  12869. +
  12870. #else
  12871. #define preempt_enable() \
  12872. do { \
  12873. @@ -121,6 +154,7 @@
  12874. #define preempt_disable_notrace() barrier()
  12875. #define preempt_enable_no_resched_notrace() barrier()
  12876. #define preempt_enable_notrace() barrier()
  12877. +#define preempt_check_resched_rt() barrier()
  12878. #endif /* CONFIG_PREEMPT_COUNT */
  12879. @@ -140,10 +174,31 @@
  12880. } while (0)
  12881. #define preempt_fold_need_resched() \
  12882. do { \
  12883. - if (tif_need_resched()) \
  12884. + if (tif_need_resched_now()) \
  12885. set_preempt_need_resched(); \
  12886. } while (0)
  12887. +#ifdef CONFIG_PREEMPT_RT_FULL
  12888. +# define preempt_disable_rt() preempt_disable()
  12889. +# define preempt_enable_rt() preempt_enable()
  12890. +# define preempt_disable_nort() barrier()
  12891. +# define preempt_enable_nort() barrier()
  12892. +# ifdef CONFIG_SMP
  12893. + extern void migrate_disable(void);
  12894. + extern void migrate_enable(void);
  12895. +# else /* CONFIG_SMP */
  12896. +# define migrate_disable() barrier()
  12897. +# define migrate_enable() barrier()
  12898. +# endif /* CONFIG_SMP */
  12899. +#else
  12900. +# define preempt_disable_rt() barrier()
  12901. +# define preempt_enable_rt() barrier()
  12902. +# define preempt_disable_nort() preempt_disable()
  12903. +# define preempt_enable_nort() preempt_enable()
  12904. +# define migrate_disable() preempt_disable()
  12905. +# define migrate_enable() preempt_enable()
  12906. +#endif
  12907. +
  12908. #ifdef CONFIG_PREEMPT_NOTIFIERS
  12909. struct preempt_notifier;
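A sketch of the annotation style these helpers enable, with a hypothetical per-CPU job: migrate_disable() keeps smp_processor_id() stable without turning off preemption on RT, and falls back to preempt_disable() elsewhere:

    #include <linux/kernel.h>
    #include <linux/preempt.h>
    #include <linux/smp.h>

    static void demo_per_cpu_job(void)
    {
            migrate_disable();              /* stable CPU, still preemptible on RT */
            pr_debug("demo job on cpu %d\n", smp_processor_id());
            migrate_enable();
    }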
  12910. diff -Nur linux-3.18.14.orig/include/linux/preempt_mask.h linux-3.18.14-rt/include/linux/preempt_mask.h
  12911. --- linux-3.18.14.orig/include/linux/preempt_mask.h 2015-05-20 10:04:50.000000000 -0500
  12912. +++ linux-3.18.14-rt/include/linux/preempt_mask.h 2015-05-31 15:32:48.341635368 -0500
  12913. @@ -44,16 +44,26 @@
  12914. #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
  12915. #define NMI_OFFSET (1UL << NMI_SHIFT)
  12916. -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
  12917. +#ifndef CONFIG_PREEMPT_RT_FULL
  12918. +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
  12919. +#else
  12920. +# define SOFTIRQ_DISABLE_OFFSET (0)
  12921. +#endif
  12922. #define PREEMPT_ACTIVE_BITS 1
  12923. #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
  12924. #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
  12925. #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
  12926. -#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
  12927. #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
  12928. | NMI_MASK))
  12929. +#ifndef CONFIG_PREEMPT_RT_FULL
  12930. +# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
  12931. +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
  12932. +#else
  12933. +# define softirq_count() (0UL)
  12934. +extern int in_serving_softirq(void);
  12935. +#endif
  12936. /*
  12937. * Are we doing bottom half or hardware interrupt processing?
  12938. @@ -64,7 +74,6 @@
  12939. #define in_irq() (hardirq_count())
  12940. #define in_softirq() (softirq_count())
  12941. #define in_interrupt() (irq_count())
  12942. -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
  12943. /*
  12944. * Are we in NMI context?
  12945. diff -Nur linux-3.18.14.orig/include/linux/printk.h linux-3.18.14-rt/include/linux/printk.h
  12946. --- linux-3.18.14.orig/include/linux/printk.h 2015-05-20 10:04:50.000000000 -0500
  12947. +++ linux-3.18.14-rt/include/linux/printk.h 2015-05-31 15:32:48.341635368 -0500
  12948. @@ -119,9 +119,11 @@
  12949. extern asmlinkage __printf(1, 2)
  12950. void early_printk(const char *fmt, ...);
  12951. void early_vprintk(const char *fmt, va_list ap);
  12952. +extern void printk_kill(void);
  12953. #else
  12954. static inline __printf(1, 2) __cold
  12955. void early_printk(const char *s, ...) { }
  12956. +static inline void printk_kill(void) { }
  12957. #endif
  12958. #ifdef CONFIG_PRINTK
  12959. @@ -155,7 +157,6 @@
  12960. #define printk_ratelimit() __printk_ratelimit(__func__)
  12961. extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
  12962. unsigned int interval_msec);
  12963. -
  12964. extern int printk_delay_msec;
  12965. extern int dmesg_restrict;
  12966. extern int kptr_restrict;
  12967. diff -Nur linux-3.18.14.orig/include/linux/radix-tree.h linux-3.18.14-rt/include/linux/radix-tree.h
  12968. --- linux-3.18.14.orig/include/linux/radix-tree.h 2015-05-20 10:04:50.000000000 -0500
  12969. +++ linux-3.18.14-rt/include/linux/radix-tree.h 2015-05-31 15:32:48.341635368 -0500
  12970. @@ -277,8 +277,13 @@
  12971. unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
  12972. void ***results, unsigned long *indices,
  12973. unsigned long first_index, unsigned int max_items);
  12974. +#ifndef CONFIG_PREEMPT_RT_FULL
  12975. int radix_tree_preload(gfp_t gfp_mask);
  12976. int radix_tree_maybe_preload(gfp_t gfp_mask);
  12977. +#else
  12978. +static inline int radix_tree_preload(gfp_t gm) { return 0; }
  12979. +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
  12980. +#endif
  12981. void radix_tree_init(void);
  12982. void *radix_tree_tag_set(struct radix_tree_root *root,
  12983. unsigned long index, unsigned int tag);
  12984. @@ -303,7 +308,7 @@
  12985. static inline void radix_tree_preload_end(void)
  12986. {
  12987. - preempt_enable();
  12988. + preempt_enable_nort();
  12989. }
  12990. /**
  12991. diff -Nur linux-3.18.14.orig/include/linux/random.h linux-3.18.14-rt/include/linux/random.h
  12992. --- linux-3.18.14.orig/include/linux/random.h 2015-05-20 10:04:50.000000000 -0500
  12993. +++ linux-3.18.14-rt/include/linux/random.h 2015-05-31 15:32:48.341635368 -0500
  12994. @@ -11,7 +11,7 @@
  12995. extern void add_device_randomness(const void *, unsigned int);
  12996. extern void add_input_randomness(unsigned int type, unsigned int code,
  12997. unsigned int value);
  12998. -extern void add_interrupt_randomness(int irq, int irq_flags);
  12999. +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
  13000. extern void get_random_bytes(void *buf, int nbytes);
  13001. extern void get_random_bytes_arch(void *buf, int nbytes);
  13002. diff -Nur linux-3.18.14.orig/include/linux/rcupdate.h linux-3.18.14-rt/include/linux/rcupdate.h
  13003. --- linux-3.18.14.orig/include/linux/rcupdate.h 2015-05-20 10:04:50.000000000 -0500
  13004. +++ linux-3.18.14-rt/include/linux/rcupdate.h 2015-05-31 15:32:48.341635368 -0500
  13005. @@ -147,6 +147,9 @@
  13006. #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
  13007. +#ifdef CONFIG_PREEMPT_RT_FULL
  13008. +#define call_rcu_bh call_rcu
  13009. +#else
  13010. /**
  13011. * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
  13012. * @head: structure to be used for queueing the RCU updates.
  13013. @@ -170,6 +173,7 @@
  13014. */
  13015. void call_rcu_bh(struct rcu_head *head,
  13016. void (*func)(struct rcu_head *head));
  13017. +#endif
  13018. /**
  13019. * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
  13020. @@ -231,6 +235,11 @@
  13021. * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
  13022. */
  13023. #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
  13024. +#ifndef CONFIG_PREEMPT_RT_FULL
  13025. +#define sched_rcu_preempt_depth() rcu_preempt_depth()
  13026. +#else
  13027. +static inline int sched_rcu_preempt_depth(void) { return 0; }
  13028. +#endif
  13029. #else /* #ifdef CONFIG_PREEMPT_RCU */
  13030. @@ -254,6 +263,8 @@
  13031. return 0;
  13032. }
  13033. +#define sched_rcu_preempt_depth() rcu_preempt_depth()
  13034. +
  13035. #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
  13036. /* Internal to kernel */
  13037. @@ -430,7 +441,14 @@
  13038. int debug_lockdep_rcu_enabled(void);
  13039. int rcu_read_lock_held(void);
  13040. +#ifdef CONFIG_PREEMPT_RT_FULL
  13041. +static inline int rcu_read_lock_bh_held(void)
  13042. +{
  13043. + return rcu_read_lock_held();
  13044. +}
  13045. +#else
  13046. int rcu_read_lock_bh_held(void);
  13047. +#endif
  13048. /**
  13049. * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
  13050. @@ -955,10 +973,14 @@
  13051. static inline void rcu_read_lock_bh(void)
  13052. {
  13053. local_bh_disable();
  13054. +#ifdef CONFIG_PREEMPT_RT_FULL
  13055. + rcu_read_lock();
  13056. +#else
  13057. __acquire(RCU_BH);
  13058. rcu_lock_acquire(&rcu_bh_lock_map);
  13059. rcu_lockdep_assert(rcu_is_watching(),
  13060. "rcu_read_lock_bh() used illegally while idle");
  13061. +#endif
  13062. }
  13063. /*
  13064. @@ -968,10 +990,14 @@
  13065. */
  13066. static inline void rcu_read_unlock_bh(void)
  13067. {
  13068. +#ifdef CONFIG_PREEMPT_RT_FULL
  13069. + rcu_read_unlock();
  13070. +#else
  13071. rcu_lockdep_assert(rcu_is_watching(),
  13072. "rcu_read_unlock_bh() used illegally while idle");
  13073. rcu_lock_release(&rcu_bh_lock_map);
  13074. __release(RCU_BH);
  13075. +#endif
  13076. local_bh_enable();
  13077. }
  13078. diff -Nur linux-3.18.14.orig/include/linux/rcutree.h linux-3.18.14-rt/include/linux/rcutree.h
  13079. --- linux-3.18.14.orig/include/linux/rcutree.h 2015-05-20 10:04:50.000000000 -0500
  13080. +++ linux-3.18.14-rt/include/linux/rcutree.h 2015-05-31 15:32:48.361635367 -0500
  13081. @@ -46,7 +46,11 @@
  13082. rcu_note_context_switch(cpu);
  13083. }
  13084. +#ifdef CONFIG_PREEMPT_RT_FULL
  13085. +# define synchronize_rcu_bh synchronize_rcu
  13086. +#else
  13087. void synchronize_rcu_bh(void);
  13088. +#endif
  13089. void synchronize_sched_expedited(void);
  13090. void synchronize_rcu_expedited(void);
  13091. @@ -74,7 +78,11 @@
  13092. }
  13093. void rcu_barrier(void);
  13094. +#ifdef CONFIG_PREEMPT_RT_FULL
  13095. +# define rcu_barrier_bh rcu_barrier
  13096. +#else
  13097. void rcu_barrier_bh(void);
  13098. +#endif
  13099. void rcu_barrier_sched(void);
  13100. unsigned long get_state_synchronize_rcu(void);
  13101. void cond_synchronize_rcu(unsigned long oldstate);
  13102. @@ -82,12 +90,10 @@
  13103. extern unsigned long rcutorture_testseq;
  13104. extern unsigned long rcutorture_vernum;
  13105. long rcu_batches_completed(void);
  13106. -long rcu_batches_completed_bh(void);
  13107. long rcu_batches_completed_sched(void);
  13108. void show_rcu_gp_kthreads(void);
  13109. void rcu_force_quiescent_state(void);
  13110. -void rcu_bh_force_quiescent_state(void);
  13111. void rcu_sched_force_quiescent_state(void);
  13112. void exit_rcu(void);
  13113. @@ -97,4 +103,12 @@
  13114. bool rcu_is_watching(void);
  13115. +#ifndef CONFIG_PREEMPT_RT_FULL
  13116. +void rcu_bh_force_quiescent_state(void);
  13117. +long rcu_batches_completed_bh(void);
  13118. +#else
  13119. +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
  13120. +# define rcu_batches_completed_bh rcu_batches_completed
  13121. +#endif
  13122. +
  13123. #endif /* __LINUX_RCUTREE_H */
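The two RCU hunks above fold the bh flavor into the plain flavor when PREEMPT_RT_FULL is set (call_rcu_bh, synchronize_rcu_bh, rcu_barrier_bh and the bh read side all map to their non-bh counterparts). A minimal reader/updater sketch that works under either mapping; my_node, my_list and the helpers are invented for illustration, and the update side is assumed to be serialized by the caller:

#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/slab.h>

struct my_node {
        struct list_head link;
        int value;
        struct rcu_head rcu;
};

static LIST_HEAD(my_list);

static int my_sum(void)
{
        struct my_node *n;
        int sum = 0;

        /* local_bh_disable() + rcu_read_lock() on RT, per the hunk above. */
        rcu_read_lock_bh();
        list_for_each_entry_rcu(n, &my_list, link)
                sum += n->value;
        rcu_read_unlock_bh();

        return sum;
}

static void my_free_cb(struct rcu_head *head)
{
        kfree(container_of(head, struct my_node, rcu));
}

static void my_remove(struct my_node *n)
{
        list_del_rcu(&n->link);
        /* Plain call_rcu() on PREEMPT_RT_FULL, real call_rcu_bh() otherwise. */
        call_rcu_bh(&n->rcu, my_free_cb);
}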
  13124. diff -Nur linux-3.18.14.orig/include/linux/rtmutex.h linux-3.18.14-rt/include/linux/rtmutex.h
  13125. --- linux-3.18.14.orig/include/linux/rtmutex.h 2015-05-20 10:04:50.000000000 -0500
  13126. +++ linux-3.18.14-rt/include/linux/rtmutex.h 2015-05-31 15:32:48.377635367 -0500
  13127. @@ -14,10 +14,14 @@
  13128. #include <linux/linkage.h>
  13129. #include <linux/rbtree.h>
  13130. -#include <linux/spinlock_types.h>
  13131. +#include <linux/spinlock_types_raw.h>
  13132. extern int max_lock_depth; /* for sysctl */
  13133. +#ifdef CONFIG_DEBUG_MUTEXES
  13134. +#include <linux/debug_locks.h>
  13135. +#endif
  13136. +
  13137. /**
  13138. * The rt_mutex structure
  13139. *
  13140. @@ -31,8 +35,8 @@
  13141. struct rb_root waiters;
  13142. struct rb_node *waiters_leftmost;
  13143. struct task_struct *owner;
  13144. -#ifdef CONFIG_DEBUG_RT_MUTEXES
  13145. int save_state;
  13146. +#ifdef CONFIG_DEBUG_RT_MUTEXES
  13147. const char *name, *file;
  13148. int line;
  13149. void *magic;
  13150. @@ -55,22 +59,33 @@
  13151. # define rt_mutex_debug_check_no_locks_held(task) do { } while (0)
  13152. #endif
  13153. +# define rt_mutex_init(mutex) \
  13154. + do { \
  13155. + raw_spin_lock_init(&(mutex)->wait_lock); \
  13156. + __rt_mutex_init(mutex, #mutex); \
  13157. + } while (0)
  13158. +
  13159. #ifdef CONFIG_DEBUG_RT_MUTEXES
  13160. # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
  13161. , .name = #mutexname, .file = __FILE__, .line = __LINE__
  13162. -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__)
  13163. extern void rt_mutex_debug_task_free(struct task_struct *tsk);
  13164. #else
  13165. # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
  13166. -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL)
  13167. # define rt_mutex_debug_task_free(t) do { } while (0)
  13168. #endif
  13169. -#define __RT_MUTEX_INITIALIZER(mutexname) \
  13170. - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
  13171. +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
  13172. + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
  13173. , .waiters = RB_ROOT \
  13174. , .owner = NULL \
  13175. - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
  13176. + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
  13177. +
  13178. +#define __RT_MUTEX_INITIALIZER(mutexname) \
  13179. + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
  13180. +
  13181. +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
  13182. + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
  13183. + , .save_state = 1 }
  13184. #define DEFINE_RT_MUTEX(mutexname) \
  13185. struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
  13186. @@ -91,6 +106,7 @@
  13187. extern void rt_mutex_lock(struct rt_mutex *lock);
  13188. extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
  13189. +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
  13190. extern int rt_mutex_timed_lock(struct rt_mutex *lock,
  13191. struct hrtimer_sleeper *timeout);
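With rt_mutex_init() now unconditionally initializing the wait_lock and the initializer split into a _PLAIN and a _SAVE_STATE variant, basic users look as follows. A small sketch with invented names, not from the patch; the return-value handling of the new rt_mutex_lock_killable() is assumed to follow the usual 0-on-success convention:

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_static_lock);        /* __RT_MUTEX_INITIALIZER() */
static struct rt_mutex my_dynamic_lock;

static int my_setup(void)
{
        int ret;

        /* Sets up wait_lock, then calls __rt_mutex_init(), per the macro above. */
        rt_mutex_init(&my_dynamic_lock);

        /* Added by this patch: blocks, but fatal signals abort the wait. */
        ret = rt_mutex_lock_killable(&my_dynamic_lock);
        if (ret)
                return ret;

        rt_mutex_lock(&my_static_lock);
        /* ... critical section ... */
        rt_mutex_unlock(&my_static_lock);
        rt_mutex_unlock(&my_dynamic_lock);

        return 0;
}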
  13192. diff -Nur linux-3.18.14.orig/include/linux/rwlock_rt.h linux-3.18.14-rt/include/linux/rwlock_rt.h
  13193. --- linux-3.18.14.orig/include/linux/rwlock_rt.h 1969-12-31 18:00:00.000000000 -0600
  13194. +++ linux-3.18.14-rt/include/linux/rwlock_rt.h 2015-05-31 15:32:48.377635367 -0500
  13195. @@ -0,0 +1,99 @@
  13196. +#ifndef __LINUX_RWLOCK_RT_H
  13197. +#define __LINUX_RWLOCK_RT_H
  13198. +
  13199. +#ifndef __LINUX_SPINLOCK_H
  13200. +#error Do not include directly. Use spinlock.h
  13201. +#endif
  13202. +
  13203. +#define rwlock_init(rwl) \
  13204. +do { \
  13205. + static struct lock_class_key __key; \
  13206. + \
  13207. + rt_mutex_init(&(rwl)->lock); \
  13208. + __rt_rwlock_init(rwl, #rwl, &__key); \
  13209. +} while (0)
  13210. +
  13211. +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
  13212. +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
  13213. +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
  13214. +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
  13215. +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
  13216. +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
  13217. +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
  13218. +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
  13219. +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
  13220. +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
  13221. +
  13222. +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
  13223. +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
  13224. +
  13225. +#define write_trylock_irqsave(lock, flags) \
  13226. + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
  13227. +
  13228. +#define read_lock_irqsave(lock, flags) \
  13229. + do { \
  13230. + typecheck(unsigned long, flags); \
  13231. + flags = rt_read_lock_irqsave(lock); \
  13232. + } while (0)
  13233. +
  13234. +#define write_lock_irqsave(lock, flags) \
  13235. + do { \
  13236. + typecheck(unsigned long, flags); \
  13237. + flags = rt_write_lock_irqsave(lock); \
  13238. + } while (0)
  13239. +
  13240. +#define read_lock(lock) rt_read_lock(lock)
  13241. +
  13242. +#define read_lock_bh(lock) \
  13243. + do { \
  13244. + local_bh_disable(); \
  13245. + rt_read_lock(lock); \
  13246. + } while (0)
  13247. +
  13248. +#define read_lock_irq(lock) read_lock(lock)
  13249. +
  13250. +#define write_lock(lock) rt_write_lock(lock)
  13251. +
  13252. +#define write_lock_bh(lock) \
  13253. + do { \
  13254. + local_bh_disable(); \
  13255. + rt_write_lock(lock); \
  13256. + } while (0)
  13257. +
  13258. +#define write_lock_irq(lock) write_lock(lock)
  13259. +
  13260. +#define read_unlock(lock) rt_read_unlock(lock)
  13261. +
  13262. +#define read_unlock_bh(lock) \
  13263. + do { \
  13264. + rt_read_unlock(lock); \
  13265. + local_bh_enable(); \
  13266. + } while (0)
  13267. +
  13268. +#define read_unlock_irq(lock) read_unlock(lock)
  13269. +
  13270. +#define write_unlock(lock) rt_write_unlock(lock)
  13271. +
  13272. +#define write_unlock_bh(lock) \
  13273. + do { \
  13274. + rt_write_unlock(lock); \
  13275. + local_bh_enable(); \
  13276. + } while (0)
  13277. +
  13278. +#define write_unlock_irq(lock) write_unlock(lock)
  13279. +
  13280. +#define read_unlock_irqrestore(lock, flags) \
  13281. + do { \
  13282. + typecheck(unsigned long, flags); \
  13283. + (void) flags; \
  13284. + rt_read_unlock(lock); \
  13285. + } while (0)
  13286. +
  13287. +#define write_unlock_irqrestore(lock, flags) \
  13288. + do { \
  13289. + typecheck(unsigned long, flags); \
  13290. + (void) flags; \
  13291. + rt_write_unlock(lock); \
  13292. + } while (0)
  13293. +
  13294. +#endif
  13295. diff -Nur linux-3.18.14.orig/include/linux/rwlock_types.h linux-3.18.14-rt/include/linux/rwlock_types.h
  13296. --- linux-3.18.14.orig/include/linux/rwlock_types.h 2015-05-20 10:04:50.000000000 -0500
  13297. +++ linux-3.18.14-rt/include/linux/rwlock_types.h 2015-05-31 15:32:48.377635367 -0500
  13298. @@ -1,6 +1,10 @@
  13299. #ifndef __LINUX_RWLOCK_TYPES_H
  13300. #define __LINUX_RWLOCK_TYPES_H
  13301. +#if !defined(__LINUX_SPINLOCK_TYPES_H)
  13302. +# error "Do not include directly, include spinlock_types.h"
  13303. +#endif
  13304. +
  13305. /*
  13306. * include/linux/rwlock_types.h - generic rwlock type definitions
  13307. * and initializers
  13308. @@ -43,6 +47,7 @@
  13309. RW_DEP_MAP_INIT(lockname) }
  13310. #endif
  13311. -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x)
  13312. +#define DEFINE_RWLOCK(name) \
  13313. + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
  13314. #endif /* __LINUX_RWLOCK_TYPES_H */
  13315. diff -Nur linux-3.18.14.orig/include/linux/rwlock_types_rt.h linux-3.18.14-rt/include/linux/rwlock_types_rt.h
  13316. --- linux-3.18.14.orig/include/linux/rwlock_types_rt.h 1969-12-31 18:00:00.000000000 -0600
  13317. +++ linux-3.18.14-rt/include/linux/rwlock_types_rt.h 2015-05-31 15:32:48.377635367 -0500
  13318. @@ -0,0 +1,33 @@
  13319. +#ifndef __LINUX_RWLOCK_TYPES_RT_H
  13320. +#define __LINUX_RWLOCK_TYPES_RT_H
  13321. +
  13322. +#ifndef __LINUX_SPINLOCK_TYPES_H
  13323. +#error "Do not include directly. Include spinlock_types.h instead"
  13324. +#endif
  13325. +
  13326. +/*
  13327. + * rwlocks - rtmutex which allows single reader recursion
  13328. + */
  13329. +typedef struct {
  13330. + struct rt_mutex lock;
  13331. + int read_depth;
  13332. + unsigned int break_lock;
  13333. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  13334. + struct lockdep_map dep_map;
  13335. +#endif
  13336. +} rwlock_t;
  13337. +
  13338. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  13339. +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
  13340. +#else
  13341. +# define RW_DEP_MAP_INIT(lockname)
  13342. +#endif
  13343. +
  13344. +#define __RW_LOCK_UNLOCKED(name) \
  13345. + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
  13346. + RW_DEP_MAP_INIT(name) }
  13347. +
  13348. +#define DEFINE_RWLOCK(name) \
  13349. + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
  13350. +
  13351. +#endif
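Under PREEMPT_RT_FULL the rwlock API above is backed by an rt_mutex, so existing users compile unchanged; the *_irqsave forms no longer disable interrupts and only carry the flags argument for API compatibility. A short sketch with invented data (my_stats), not part of the patch:

#include <linux/spinlock.h>     /* brings in rwlock_rt.h on PREEMPT_RT_FULL */

static DEFINE_RWLOCK(my_stats_lock);
static unsigned long my_stats[4];

static unsigned long my_stats_read(int i)
{
        unsigned long flags, v;

        read_lock_irqsave(&my_stats_lock, flags);       /* rt_read_lock() underneath */
        v = my_stats[i];
        read_unlock_irqrestore(&my_stats_lock, flags);

        return v;
}

static void my_stats_bump(int i)
{
        write_lock(&my_stats_lock);                     /* rt_write_lock() underneath */
        my_stats[i]++;
        write_unlock(&my_stats_lock);
}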
  13352. diff -Nur linux-3.18.14.orig/include/linux/rwsem.h linux-3.18.14-rt/include/linux/rwsem.h
  13353. --- linux-3.18.14.orig/include/linux/rwsem.h 2015-05-20 10:04:50.000000000 -0500
  13354. +++ linux-3.18.14-rt/include/linux/rwsem.h 2015-05-31 15:32:48.377635367 -0500
  13355. @@ -18,6 +18,10 @@
  13356. #include <linux/osq_lock.h>
  13357. #endif
  13358. +#ifdef CONFIG_PREEMPT_RT_FULL
  13359. +#include <linux/rwsem_rt.h>
  13360. +#else /* PREEMPT_RT_FULL */
  13361. +
  13362. struct rw_semaphore;
  13363. #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
  13364. @@ -177,4 +181,6 @@
  13365. # define up_read_non_owner(sem) up_read(sem)
  13366. #endif
  13367. +#endif /* !PREEMPT_RT_FULL */
  13368. +
  13369. #endif /* _LINUX_RWSEM_H */
  13370. diff -Nur linux-3.18.14.orig/include/linux/rwsem_rt.h linux-3.18.14-rt/include/linux/rwsem_rt.h
  13371. --- linux-3.18.14.orig/include/linux/rwsem_rt.h 1969-12-31 18:00:00.000000000 -0600
  13372. +++ linux-3.18.14-rt/include/linux/rwsem_rt.h 2015-05-31 15:32:48.377635367 -0500
  13373. @@ -0,0 +1,134 @@
  13374. +#ifndef _LINUX_RWSEM_RT_H
  13375. +#define _LINUX_RWSEM_RT_H
  13376. +
  13377. +#ifndef _LINUX_RWSEM_H
  13378. +#error "Include rwsem.h"
  13379. +#endif
  13380. +
  13381. +/*
  13382. + * RW-semaphores are a spinlock plus a reader-depth count.
  13383. + *
  13384. + * Note that the semantics are different from the usual
  13385. + * Linux rw-sems, in PREEMPT_RT mode we do not allow
  13386. + * multiple readers to hold the lock at once, we only allow
  13387. + * a read-lock owner to read-lock recursively. This is
  13388. + * better for latency, makes the implementation inherently
  13389. + * fair and makes it simpler as well.
  13390. + */
  13391. +
  13392. +#include <linux/rtmutex.h>
  13393. +
  13394. +struct rw_semaphore {
  13395. + struct rt_mutex lock;
  13396. + int read_depth;
  13397. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  13398. + struct lockdep_map dep_map;
  13399. +#endif
  13400. +};
  13401. +
  13402. +#define __RWSEM_INITIALIZER(name) \
  13403. + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
  13404. + RW_DEP_MAP_INIT(name) }
  13405. +
  13406. +#define DECLARE_RWSEM(lockname) \
  13407. + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
  13408. +
  13409. +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
  13410. + struct lock_class_key *key);
  13411. +
  13412. +#define __rt_init_rwsem(sem, name, key) \
  13413. + do { \
  13414. + rt_mutex_init(&(sem)->lock); \
  13415. + __rt_rwsem_init((sem), (name), (key));\
  13416. + } while (0)
  13417. +
  13418. +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
  13419. +
  13420. +# define rt_init_rwsem(sem) \
  13421. +do { \
  13422. + static struct lock_class_key __key; \
  13423. + \
  13424. + __rt_init_rwsem((sem), #sem, &__key); \
  13425. +} while (0)
  13426. +
  13427. +extern void rt_down_write(struct rw_semaphore *rwsem);
  13428. +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
  13429. +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
  13430. +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
  13431. + struct lockdep_map *nest);
  13432. +extern void rt_down_read(struct rw_semaphore *rwsem);
  13433. +extern int rt_down_write_trylock(struct rw_semaphore *rwsem);
  13434. +extern int rt_down_read_trylock(struct rw_semaphore *rwsem);
  13435. +extern void rt_up_read(struct rw_semaphore *rwsem);
  13436. +extern void rt_up_write(struct rw_semaphore *rwsem);
  13437. +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
  13438. +
  13439. +#define init_rwsem(sem) rt_init_rwsem(sem)
  13440. +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
  13441. +
  13442. +static inline int rwsem_is_contended(struct rw_semaphore *sem)
  13443. +{
  13444. + /* rt_mutex_has_waiters() */
  13445. + return !RB_EMPTY_ROOT(&sem->lock.waiters);
  13446. +}
  13447. +
  13448. +static inline void down_read(struct rw_semaphore *sem)
  13449. +{
  13450. + rt_down_read(sem);
  13451. +}
  13452. +
  13453. +static inline int down_read_trylock(struct rw_semaphore *sem)
  13454. +{
  13455. + return rt_down_read_trylock(sem);
  13456. +}
  13457. +
  13458. +static inline void down_write(struct rw_semaphore *sem)
  13459. +{
  13460. + rt_down_write(sem);
  13461. +}
  13462. +
  13463. +static inline int down_write_trylock(struct rw_semaphore *sem)
  13464. +{
  13465. + return rt_down_write_trylock(sem);
  13466. +}
  13467. +
  13468. +static inline void up_read(struct rw_semaphore *sem)
  13469. +{
  13470. + rt_up_read(sem);
  13471. +}
  13472. +
  13473. +static inline void up_write(struct rw_semaphore *sem)
  13474. +{
  13475. + rt_up_write(sem);
  13476. +}
  13477. +
  13478. +static inline void downgrade_write(struct rw_semaphore *sem)
  13479. +{
  13480. + rt_downgrade_write(sem);
  13481. +}
  13482. +
  13483. +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
  13484. +{
  13485. + return rt_down_read_nested(sem, subclass);
  13486. +}
  13487. +
  13488. +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
  13489. +{
  13490. + rt_down_write_nested(sem, subclass);
  13491. +}
  13492. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  13493. +static inline void down_write_nest_lock(struct rw_semaphore *sem,
  13494. + struct rw_semaphore *nest_lock)
  13495. +{
  13496. + rt_down_write_nested_lock(sem, &nest_lock->dep_map);
  13497. +}
  13498. +
  13499. +#else
  13500. +
  13501. +static inline void down_write_nest_lock(struct rw_semaphore *sem,
  13502. + struct rw_semaphore *nest_lock)
  13503. +{
  13504. + rt_down_write_nested_lock(sem, NULL);
  13505. +}
  13506. +#endif
  13507. +#endif
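The RT rw_semaphore above keeps the normal rwsem interface while allowing only a single reader (which may recurse), as the header comment explains. A minimal sketch with invented names (my_cfg_sem, my_cfg_value), not from the patch:

#include <linux/rwsem.h>        /* selects rwsem_rt.h when PREEMPT_RT_FULL=y */

static DECLARE_RWSEM(my_cfg_sem);
static int my_cfg_value;

static int my_cfg_get(void)
{
        int v;

        down_read(&my_cfg_sem);         /* rt_down_read() on RT */
        v = my_cfg_value;
        up_read(&my_cfg_sem);

        return v;
}

static void my_cfg_set(int v)
{
        down_write(&my_cfg_sem);        /* rt_down_write() on RT */
        my_cfg_value = v;
        up_write(&my_cfg_sem);
}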
  13508. diff -Nur linux-3.18.14.orig/include/linux/sched.h linux-3.18.14-rt/include/linux/sched.h
  13509. --- linux-3.18.14.orig/include/linux/sched.h 2015-05-20 10:04:50.000000000 -0500
  13510. +++ linux-3.18.14-rt/include/linux/sched.h 2015-05-31 15:32:48.381635367 -0500
  13511. @@ -26,6 +26,7 @@
  13512. #include <linux/nodemask.h>
  13513. #include <linux/mm_types.h>
  13514. #include <linux/preempt_mask.h>
  13515. +#include <asm/kmap_types.h>
  13516. #include <asm/page.h>
  13517. #include <asm/ptrace.h>
  13518. @@ -56,6 +57,7 @@
  13519. #include <linux/cred.h>
  13520. #include <linux/llist.h>
  13521. #include <linux/uidgid.h>
  13522. +#include <linux/hardirq.h>
  13523. #include <linux/gfp.h>
  13524. #include <linux/magic.h>
  13525. @@ -235,10 +237,7 @@
  13526. TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
  13527. __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
  13528. -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
  13529. #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
  13530. -#define task_is_stopped_or_traced(task) \
  13531. - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  13532. #define task_contributes_to_load(task) \
  13533. ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
  13534. (task->flags & PF_FROZEN) == 0)
  13535. @@ -1234,6 +1233,7 @@
  13536. struct task_struct {
  13537. volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
  13538. + volatile long saved_state; /* saved state for "spinlock sleepers" */
  13539. void *stack;
  13540. atomic_t usage;
  13541. unsigned int flags; /* per process flags, defined below */
  13542. @@ -1270,6 +1270,12 @@
  13543. #endif
  13544. unsigned int policy;
  13545. +#ifdef CONFIG_PREEMPT_RT_FULL
  13546. + int migrate_disable;
  13547. +# ifdef CONFIG_SCHED_DEBUG
  13548. + int migrate_disable_atomic;
  13549. +# endif
  13550. +#endif
  13551. int nr_cpus_allowed;
  13552. cpumask_t cpus_allowed;
  13553. @@ -1371,7 +1377,8 @@
  13554. struct cputime prev_cputime;
  13555. #endif
  13556. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  13557. - seqlock_t vtime_seqlock;
  13558. + raw_spinlock_t vtime_lock;
  13559. + seqcount_t vtime_seq;
  13560. unsigned long long vtime_snap;
  13561. enum {
  13562. VTIME_SLEEPING = 0,
  13563. @@ -1387,6 +1394,9 @@
  13564. struct task_cputime cputime_expires;
  13565. struct list_head cpu_timers[3];
  13566. +#ifdef CONFIG_PREEMPT_RT_BASE
  13567. + struct task_struct *posix_timer_list;
  13568. +#endif
  13569. /* process credentials */
  13570. const struct cred __rcu *real_cred; /* objective and real subjective task
  13571. @@ -1419,10 +1429,15 @@
  13572. /* signal handlers */
  13573. struct signal_struct *signal;
  13574. struct sighand_struct *sighand;
  13575. + struct sigqueue *sigqueue_cache;
  13576. sigset_t blocked, real_blocked;
  13577. sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
  13578. struct sigpending pending;
  13579. +#ifdef CONFIG_PREEMPT_RT_FULL
  13580. + /* TODO: move me into ->restart_block ? */
  13581. + struct siginfo forced_info;
  13582. +#endif
  13583. unsigned long sas_ss_sp;
  13584. size_t sas_ss_size;
  13585. @@ -1460,6 +1475,9 @@
  13586. /* mutex deadlock detection */
  13587. struct mutex_waiter *blocked_on;
  13588. #endif
  13589. +#ifdef CONFIG_PREEMPT_RT_FULL
  13590. + int pagefault_disabled;
  13591. +#endif
  13592. #ifdef CONFIG_TRACE_IRQFLAGS
  13593. unsigned int irq_events;
  13594. unsigned long hardirq_enable_ip;
  13595. @@ -1644,6 +1662,12 @@
  13596. unsigned long trace;
  13597. /* bitmask and counter of trace recursion */
  13598. unsigned long trace_recursion;
  13599. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  13600. + u64 preempt_timestamp_hist;
  13601. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  13602. + long timer_offset;
  13603. +#endif
  13604. +#endif
  13605. #endif /* CONFIG_TRACING */
  13606. #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
  13607. unsigned int memcg_kmem_skip_account;
  13608. @@ -1661,11 +1685,19 @@
  13609. unsigned int sequential_io;
  13610. unsigned int sequential_io_avg;
  13611. #endif
  13612. +#ifdef CONFIG_PREEMPT_RT_BASE
  13613. + struct rcu_head put_rcu;
  13614. + int softirq_nestcnt;
  13615. + unsigned int softirqs_raised;
  13616. +#endif
  13617. +#ifdef CONFIG_PREEMPT_RT_FULL
  13618. +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
  13619. + int kmap_idx;
  13620. + pte_t kmap_pte[KM_TYPE_NR];
  13621. +# endif
  13622. +#endif
  13623. };
  13624. -/* Future-safe accessor for struct task_struct's cpus_allowed. */
  13625. -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
  13626. -
  13627. #define TNF_MIGRATED 0x01
  13628. #define TNF_NO_GROUP 0x02
  13629. #define TNF_SHARED 0x04
  13630. @@ -1700,6 +1732,17 @@
  13631. }
  13632. #endif
  13633. +#ifdef CONFIG_PREEMPT_RT_FULL
  13634. +static inline bool cur_pf_disabled(void) { return current->pagefault_disabled; }
  13635. +#else
  13636. +static inline bool cur_pf_disabled(void) { return false; }
  13637. +#endif
  13638. +
  13639. +static inline bool pagefault_disabled(void)
  13640. +{
  13641. + return in_atomic() || cur_pf_disabled();
  13642. +}
  13643. +
  13644. static inline struct pid *task_pid(struct task_struct *task)
  13645. {
  13646. return task->pids[PIDTYPE_PID].pid;
  13647. @@ -1853,6 +1896,15 @@
  13648. extern void free_task(struct task_struct *tsk);
  13649. #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
  13650. +#ifdef CONFIG_PREEMPT_RT_BASE
  13651. +extern void __put_task_struct_cb(struct rcu_head *rhp);
  13652. +
  13653. +static inline void put_task_struct(struct task_struct *t)
  13654. +{
  13655. + if (atomic_dec_and_test(&t->usage))
  13656. + call_rcu(&t->put_rcu, __put_task_struct_cb);
  13657. +}
  13658. +#else
  13659. extern void __put_task_struct(struct task_struct *t);
  13660. static inline void put_task_struct(struct task_struct *t)
  13661. @@ -1860,6 +1912,7 @@
  13662. if (atomic_dec_and_test(&t->usage))
  13663. __put_task_struct(t);
  13664. }
  13665. +#endif
  13666. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  13667. extern void task_cputime(struct task_struct *t,
  13668. @@ -1898,6 +1951,7 @@
  13669. /*
  13670. * Per process flags
  13671. */
  13672. +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
  13673. #define PF_EXITING 0x00000004 /* getting shut down */
  13674. #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
  13675. #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
  13676. @@ -2058,6 +2112,10 @@
  13677. extern int set_cpus_allowed_ptr(struct task_struct *p,
  13678. const struct cpumask *new_mask);
  13679. +int migrate_me(void);
  13680. +void tell_sched_cpu_down_begin(int cpu);
  13681. +void tell_sched_cpu_down_done(int cpu);
  13682. +
  13683. #else
  13684. static inline void do_set_cpus_allowed(struct task_struct *p,
  13685. const struct cpumask *new_mask)
  13686. @@ -2070,6 +2128,9 @@
  13687. return -EINVAL;
  13688. return 0;
  13689. }
  13690. +static inline int migrate_me(void) { return 0; }
  13691. +static inline void tell_sched_cpu_down_begin(int cpu) { }
  13692. +static inline void tell_sched_cpu_down_done(int cpu) { }
  13693. #endif
  13694. #ifdef CONFIG_NO_HZ_COMMON
  13695. @@ -2290,6 +2351,7 @@
  13696. extern int wake_up_state(struct task_struct *tsk, unsigned int state);
  13697. extern int wake_up_process(struct task_struct *tsk);
  13698. +extern int wake_up_lock_sleeper(struct task_struct * tsk);
  13699. extern void wake_up_new_task(struct task_struct *tsk);
  13700. #ifdef CONFIG_SMP
  13701. extern void kick_process(struct task_struct *tsk);
  13702. @@ -2406,12 +2468,24 @@
  13703. /* mmdrop drops the mm and the page tables */
  13704. extern void __mmdrop(struct mm_struct *);
  13705. +
  13706. static inline void mmdrop(struct mm_struct * mm)
  13707. {
  13708. if (unlikely(atomic_dec_and_test(&mm->mm_count)))
  13709. __mmdrop(mm);
  13710. }
  13711. +#ifdef CONFIG_PREEMPT_RT_BASE
  13712. +extern void __mmdrop_delayed(struct rcu_head *rhp);
  13713. +static inline void mmdrop_delayed(struct mm_struct *mm)
  13714. +{
  13715. + if (atomic_dec_and_test(&mm->mm_count))
  13716. + call_rcu(&mm->delayed_drop, __mmdrop_delayed);
  13717. +}
  13718. +#else
  13719. +# define mmdrop_delayed(mm) mmdrop(mm)
  13720. +#endif
  13721. +
  13722. /* mmput gets rid of the mappings and all user-space */
  13723. extern void mmput(struct mm_struct *);
  13724. /* Grab a reference to a task's mm, if it is not already going away */
  13725. @@ -2719,6 +2793,43 @@
  13726. return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
  13727. }
  13728. +#ifdef CONFIG_PREEMPT_LAZY
  13729. +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
  13730. +{
  13731. + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
  13732. +}
  13733. +
  13734. +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
  13735. +{
  13736. + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
  13737. +}
  13738. +
  13739. +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
  13740. +{
  13741. + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
  13742. +}
  13743. +
  13744. +static inline int need_resched_lazy(void)
  13745. +{
  13746. + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
  13747. +}
  13748. +
  13749. +static inline int need_resched_now(void)
  13750. +{
  13751. + return test_thread_flag(TIF_NEED_RESCHED);
  13752. +}
  13753. +
  13754. +#else
  13755. +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
  13756. +static inline int need_resched_lazy(void) { return 0; }
  13757. +
  13758. +static inline int need_resched_now(void)
  13759. +{
  13760. + return test_thread_flag(TIF_NEED_RESCHED);
  13761. +}
  13762. +
  13763. +#endif
  13764. +
  13765. static inline int restart_syscall(void)
  13766. {
  13767. set_tsk_thread_flag(current, TIF_SIGPENDING);
  13768. @@ -2750,6 +2861,51 @@
  13769. return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
  13770. }
  13771. +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
  13772. +{
  13773. + if (task->state & (__TASK_STOPPED | __TASK_TRACED))
  13774. + return true;
  13775. +#ifdef CONFIG_PREEMPT_RT_FULL
  13776. + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
  13777. + return true;
  13778. +#endif
  13779. + return false;
  13780. +}
  13781. +
  13782. +static inline bool task_is_stopped_or_traced(struct task_struct *task)
  13783. +{
  13784. + bool traced_stopped;
  13785. +
  13786. +#ifdef CONFIG_PREEMPT_RT_FULL
  13787. + unsigned long flags;
  13788. +
  13789. + raw_spin_lock_irqsave(&task->pi_lock, flags);
  13790. + traced_stopped = __task_is_stopped_or_traced(task);
  13791. + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  13792. +#else
  13793. + traced_stopped = __task_is_stopped_or_traced(task);
  13794. +#endif
  13795. + return traced_stopped;
  13796. +}
  13797. +
  13798. +static inline bool task_is_traced(struct task_struct *task)
  13799. +{
  13800. + bool traced = false;
  13801. +
  13802. + if (task->state & __TASK_TRACED)
  13803. + return true;
  13804. +#ifdef CONFIG_PREEMPT_RT_FULL
  13805. + /* in case the task is sleeping on tasklist_lock */
  13806. + raw_spin_lock_irq(&task->pi_lock);
  13807. + if (task->state & __TASK_TRACED)
  13808. + traced = true;
  13809. + else if (task->saved_state & __TASK_TRACED)
  13810. + traced = true;
  13811. + raw_spin_unlock_irq(&task->pi_lock);
  13812. +#endif
  13813. + return traced;
  13814. +}
  13815. +
  13816. /*
  13817. * cond_resched() and cond_resched_lock(): latency reduction via
  13818. * explicit rescheduling in places that are safe. The return
  13819. @@ -2766,7 +2922,7 @@
  13820. extern int __cond_resched_lock(spinlock_t *lock);
  13821. -#ifdef CONFIG_PREEMPT_COUNT
  13822. +#if defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT_FULL)
  13823. #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
  13824. #else
  13825. #define PREEMPT_LOCK_OFFSET 0
  13826. @@ -2777,12 +2933,16 @@
  13827. __cond_resched_lock(lock); \
  13828. })
  13829. +#ifndef CONFIG_PREEMPT_RT_FULL
  13830. extern int __cond_resched_softirq(void);
  13831. #define cond_resched_softirq() ({ \
  13832. __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
  13833. __cond_resched_softirq(); \
  13834. })
  13835. +#else
  13836. +# define cond_resched_softirq() cond_resched()
  13837. +#endif
  13838. static inline void cond_resched_rcu(void)
  13839. {
  13840. @@ -2949,6 +3109,26 @@
  13841. #endif /* CONFIG_SMP */
  13842. +static inline int __migrate_disabled(struct task_struct *p)
  13843. +{
  13844. +#ifdef CONFIG_PREEMPT_RT_FULL
  13845. + return p->migrate_disable;
  13846. +#else
  13847. + return 0;
  13848. +#endif
  13849. +}
  13850. +
  13851. +/* Future-safe accessor for struct task_struct's cpus_allowed. */
  13852. +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
  13853. +{
  13854. +#ifdef CONFIG_PREEMPT_RT_FULL
  13855. + if (p->migrate_disable)
  13856. + return cpumask_of(task_cpu(p));
  13857. +#endif
  13858. +
  13859. + return &p->cpus_allowed;
  13860. +}
  13861. +
  13862. extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
  13863. extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
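Since tsk_cpus_allowed() is now a function that returns cpumask_of(task_cpu(p)) while a task is migrate-disabled on PREEMPT_RT_FULL, callers that go through the accessor automatically see the narrowed mask. A small illustrative sketch (my_report_affinity is invented, not part of the patch):

#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/printk.h>

static void my_report_affinity(struct task_struct *p)
{
        /*
         * With the change above this is the single-CPU mask while
         * p->migrate_disable is set on RT, and &p->cpus_allowed otherwise.
         */
        const struct cpumask *mask = tsk_cpus_allowed(p);

        pr_info("%s/%d may run on %d cpu(s), first cpu %d\n",
                p->comm, task_pid_nr(p),
                cpumask_weight(mask), cpumask_first(mask));
}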
  13864. diff -Nur linux-3.18.14.orig/include/linux/seqlock.h linux-3.18.14-rt/include/linux/seqlock.h
  13865. --- linux-3.18.14.orig/include/linux/seqlock.h 2015-05-20 10:04:50.000000000 -0500
  13866. +++ linux-3.18.14-rt/include/linux/seqlock.h 2015-05-31 15:32:48.381635367 -0500
  13867. @@ -219,20 +219,30 @@
  13868. return __read_seqcount_retry(s, start);
  13869. }
  13870. -
  13871. -
  13872. -static inline void raw_write_seqcount_begin(seqcount_t *s)
  13873. +static inline void __raw_write_seqcount_begin(seqcount_t *s)
  13874. {
  13875. s->sequence++;
  13876. smp_wmb();
  13877. }
  13878. -static inline void raw_write_seqcount_end(seqcount_t *s)
  13879. +static inline void raw_write_seqcount_begin(seqcount_t *s)
  13880. +{
  13881. + preempt_disable_rt();
  13882. + __raw_write_seqcount_begin(s);
  13883. +}
  13884. +
  13885. +static inline void __raw_write_seqcount_end(seqcount_t *s)
  13886. {
  13887. smp_wmb();
  13888. s->sequence++;
  13889. }
  13890. +static inline void raw_write_seqcount_end(seqcount_t *s)
  13891. +{
  13892. + __raw_write_seqcount_end(s);
  13893. + preempt_enable_rt();
  13894. +}
  13895. +
  13896. /*
  13897. * raw_write_seqcount_latch - redirect readers to even/odd copy
  13898. * @s: pointer to seqcount_t
  13899. @@ -305,10 +315,32 @@
  13900. /*
  13901. * Read side functions for starting and finalizing a read side section.
  13902. */
  13903. +#ifndef CONFIG_PREEMPT_RT_FULL
  13904. static inline unsigned read_seqbegin(const seqlock_t *sl)
  13905. {
  13906. return read_seqcount_begin(&sl->seqcount);
  13907. }
  13908. +#else
  13909. +/*
  13910. + * Starvation safe read side for RT
  13911. + */
  13912. +static inline unsigned read_seqbegin(seqlock_t *sl)
  13913. +{
  13914. + unsigned ret;
  13915. +
  13916. +repeat:
  13917. + ret = ACCESS_ONCE(sl->seqcount.sequence);
  13918. + if (unlikely(ret & 1)) {
  13919. + /*
13920. + * Take the lock and let the writer proceed (i.e. possibly
13921. + * boost it), otherwise we could loop here forever.
  13922. + */
  13923. + spin_unlock_wait(&sl->lock);
  13924. + goto repeat;
  13925. + }
  13926. + return ret;
  13927. +}
  13928. +#endif
  13929. static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
  13930. {
  13931. @@ -323,36 +355,36 @@
  13932. static inline void write_seqlock(seqlock_t *sl)
  13933. {
  13934. spin_lock(&sl->lock);
  13935. - write_seqcount_begin(&sl->seqcount);
  13936. + __raw_write_seqcount_begin(&sl->seqcount);
  13937. }
  13938. static inline void write_sequnlock(seqlock_t *sl)
  13939. {
  13940. - write_seqcount_end(&sl->seqcount);
  13941. + __raw_write_seqcount_end(&sl->seqcount);
  13942. spin_unlock(&sl->lock);
  13943. }
  13944. static inline void write_seqlock_bh(seqlock_t *sl)
  13945. {
  13946. spin_lock_bh(&sl->lock);
  13947. - write_seqcount_begin(&sl->seqcount);
  13948. + __raw_write_seqcount_begin(&sl->seqcount);
  13949. }
  13950. static inline void write_sequnlock_bh(seqlock_t *sl)
  13951. {
  13952. - write_seqcount_end(&sl->seqcount);
  13953. + __raw_write_seqcount_end(&sl->seqcount);
  13954. spin_unlock_bh(&sl->lock);
  13955. }
  13956. static inline void write_seqlock_irq(seqlock_t *sl)
  13957. {
  13958. spin_lock_irq(&sl->lock);
  13959. - write_seqcount_begin(&sl->seqcount);
  13960. + __raw_write_seqcount_begin(&sl->seqcount);
  13961. }
  13962. static inline void write_sequnlock_irq(seqlock_t *sl)
  13963. {
  13964. - write_seqcount_end(&sl->seqcount);
  13965. + __raw_write_seqcount_end(&sl->seqcount);
  13966. spin_unlock_irq(&sl->lock);
  13967. }
  13968. @@ -361,7 +393,7 @@
  13969. unsigned long flags;
  13970. spin_lock_irqsave(&sl->lock, flags);
  13971. - write_seqcount_begin(&sl->seqcount);
  13972. + __raw_write_seqcount_begin(&sl->seqcount);
  13973. return flags;
  13974. }
  13975. @@ -371,7 +403,7 @@
  13976. static inline void
  13977. write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
  13978. {
  13979. - write_seqcount_end(&sl->seqcount);
  13980. + __raw_write_seqcount_end(&sl->seqcount);
  13981. spin_unlock_irqrestore(&sl->lock, flags);
  13982. }
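The seqlock rework above keeps the classic pattern intact: writers serialize on sl->lock and only bump the sequence count with the __raw helpers, while the RT read side waits on the lock when it sees an odd count so a preempted writer can be boosted instead of being spun against. A compact sketch with invented state (my_time_ns), not from the patch:

#include <linux/seqlock.h>

static DEFINE_SEQLOCK(my_time_lock);
static u64 my_time_ns;

static void my_time_update(u64 now)
{
        write_seqlock(&my_time_lock);   /* spin_lock() + __raw_write_seqcount_begin() */
        my_time_ns = now;
        write_sequnlock(&my_time_lock);
}

static u64 my_time_read(void)
{
        unsigned int seq;
        u64 v;

        do {
                seq = read_seqbegin(&my_time_lock);
                v = my_time_ns;
        } while (read_seqretry(&my_time_lock, seq));

        return v;
}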
  13983. diff -Nur linux-3.18.14.orig/include/linux/signal.h linux-3.18.14-rt/include/linux/signal.h
  13984. --- linux-3.18.14.orig/include/linux/signal.h 2015-05-20 10:04:50.000000000 -0500
  13985. +++ linux-3.18.14-rt/include/linux/signal.h 2015-05-31 15:32:48.381635367 -0500
  13986. @@ -218,6 +218,7 @@
  13987. }
  13988. extern void flush_sigqueue(struct sigpending *queue);
  13989. +extern void flush_task_sigqueue(struct task_struct *tsk);
  13990. /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
  13991. static inline int valid_signal(unsigned long sig)
  13992. diff -Nur linux-3.18.14.orig/include/linux/skbuff.h linux-3.18.14-rt/include/linux/skbuff.h
  13993. --- linux-3.18.14.orig/include/linux/skbuff.h 2015-05-20 10:04:50.000000000 -0500
  13994. +++ linux-3.18.14-rt/include/linux/skbuff.h 2015-05-31 15:32:48.405635367 -0500
  13995. @@ -172,6 +172,7 @@
  13996. __u32 qlen;
  13997. spinlock_t lock;
  13998. + raw_spinlock_t raw_lock;
  13999. };
  14000. struct sk_buff;
  14001. @@ -1328,6 +1329,12 @@
  14002. __skb_queue_head_init(list);
  14003. }
  14004. +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
  14005. +{
  14006. + raw_spin_lock_init(&list->raw_lock);
  14007. + __skb_queue_head_init(list);
  14008. +}
  14009. +
  14010. static inline void skb_queue_head_init_class(struct sk_buff_head *list,
  14011. struct lock_class_key *class)
  14012. {
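For the sk_buff_head change above, a one-function sketch (my_dev is invented) of the new raw initializer; the raw_lock field presumably exists so RT code can protect such queues with a non-sleeping raw spinlock:

#include <linux/skbuff.h>

struct my_dev {
        struct sk_buff_head rx_queue;
};

static void my_dev_init_queues(struct my_dev *dev)
{
        /* Initializes list->raw_lock (raw_spinlock_t) rather than list->lock. */
        skb_queue_head_init_raw(&dev->rx_queue);
}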
  14013. diff -Nur linux-3.18.14.orig/include/linux/skbuff.h.orig linux-3.18.14-rt/include/linux/skbuff.h.orig
  14014. --- linux-3.18.14.orig/include/linux/skbuff.h.orig 1969-12-31 18:00:00.000000000 -0600
  14015. +++ linux-3.18.14-rt/include/linux/skbuff.h.orig 2015-05-20 10:04:50.000000000 -0500
  14016. @@ -0,0 +1,3364 @@
  14017. +/*
  14018. + * Definitions for the 'struct sk_buff' memory handlers.
  14019. + *
  14020. + * Authors:
  14021. + * Alan Cox, <gw4pts@gw4pts.ampr.org>
  14022. + * Florian La Roche, <rzsfl@rz.uni-sb.de>
  14023. + *
  14024. + * This program is free software; you can redistribute it and/or
  14025. + * modify it under the terms of the GNU General Public License
  14026. + * as published by the Free Software Foundation; either version
  14027. + * 2 of the License, or (at your option) any later version.
  14028. + */
  14029. +
  14030. +#ifndef _LINUX_SKBUFF_H
  14031. +#define _LINUX_SKBUFF_H
  14032. +
  14033. +#include <linux/kernel.h>
  14034. +#include <linux/kmemcheck.h>
  14035. +#include <linux/compiler.h>
  14036. +#include <linux/time.h>
  14037. +#include <linux/bug.h>
  14038. +#include <linux/cache.h>
  14039. +
  14040. +#include <linux/atomic.h>
  14041. +#include <asm/types.h>
  14042. +#include <linux/spinlock.h>
  14043. +#include <linux/net.h>
  14044. +#include <linux/textsearch.h>
  14045. +#include <net/checksum.h>
  14046. +#include <linux/rcupdate.h>
  14047. +#include <linux/hrtimer.h>
  14048. +#include <linux/dma-mapping.h>
  14049. +#include <linux/netdev_features.h>
  14050. +#include <linux/sched.h>
  14051. +#include <net/flow_keys.h>
  14052. +
  14053. +/* A. Checksumming of received packets by device.
  14054. + *
  14055. + * CHECKSUM_NONE:
  14056. + *
  14057. + * Device failed to checksum this packet e.g. due to lack of capabilities.
  14058. + * The packet contains full (though not verified) checksum in packet but
  14059. + * not in skb->csum. Thus, skb->csum is undefined in this case.
  14060. + *
  14061. + * CHECKSUM_UNNECESSARY:
  14062. + *
  14063. + * The hardware you're dealing with doesn't calculate the full checksum
  14064. + * (as in CHECKSUM_COMPLETE), but it does parse headers and verify checksums
  14065. + * for specific protocols. For such packets it will set CHECKSUM_UNNECESSARY
  14066. + * if their checksums are okay. skb->csum is still undefined in this case
  14067. + * though. It is a bad option, but, unfortunately, nowadays most vendors do
  14068. + * this. Apparently with the secret goal to sell you new devices, when you
  14069. + * will add new protocol to your host, f.e. IPv6 8)
  14070. + *
  14071. + * CHECKSUM_UNNECESSARY is applicable to following protocols:
  14072. + * TCP: IPv6 and IPv4.
  14073. + * UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
  14074. + * zero UDP checksum for either IPv4 or IPv6, the networking stack
  14075. + * may perform further validation in this case.
  14076. + * GRE: only if the checksum is present in the header.
  14077. + * SCTP: indicates the CRC in SCTP header has been validated.
  14078. + *
  14079. + * skb->csum_level indicates the number of consecutive checksums found in
  14080. + * the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
  14081. + * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
  14082. + * and a device is able to verify the checksums for UDP (possibly zero),
  14083. + * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to
  14084. + * two. If the device were only able to verify the UDP checksum and not
  14085. + * GRE, either because it doesn't support GRE checksum of because GRE
  14086. + * checksum is bad, skb->csum_level would be set to zero (TCP checksum is
  14087. + * not considered in this case).
  14088. + *
  14089. + * CHECKSUM_COMPLETE:
  14090. + *
  14091. + * This is the most generic way. The device supplied checksum of the _whole_
  14092. + * packet as seen by netif_rx() and fills out in skb->csum. Meaning, the
  14093. + * hardware doesn't need to parse L3/L4 headers to implement this.
  14094. + *
  14095. + * Note: Even if device supports only some protocols, but is able to produce
  14096. + * skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
  14097. + *
  14098. + * CHECKSUM_PARTIAL:
  14099. + *
  14100. + * This is identical to the case for output below. This may occur on a packet
  14101. + * received directly from another Linux OS, e.g., a virtualized Linux kernel
  14102. + * on the same host. The packet can be treated in the same way as
  14103. + * CHECKSUM_UNNECESSARY, except that on output (i.e., forwarding) the
  14104. + * checksum must be filled in by the OS or the hardware.
  14105. + *
  14106. + * B. Checksumming on output.
  14107. + *
  14108. + * CHECKSUM_NONE:
  14109. + *
  14110. + * The skb was already checksummed by the protocol, or a checksum is not
  14111. + * required.
  14112. + *
  14113. + * CHECKSUM_PARTIAL:
  14114. + *
  14115. + * The device is required to checksum the packet as seen by hard_start_xmit()
  14116. + * from skb->csum_start up to the end, and to record/write the checksum at
  14117. + * offset skb->csum_start + skb->csum_offset.
  14118. + *
  14119. + * The device must show its capabilities in dev->features, set up at device
  14120. + * setup time, e.g. netdev_features.h:
  14121. + *
  14122. + * NETIF_F_HW_CSUM - It's a clever device, it's able to checksum everything.
  14123. + * NETIF_F_IP_CSUM - Device is dumb, it's able to checksum only TCP/UDP over
  14124. + * IPv4. Sigh. Vendors like this way for an unknown reason.
  14125. + * Though, see comment above about CHECKSUM_UNNECESSARY. 8)
  14126. + * NETIF_F_IPV6_CSUM - About as dumb as the last one but does IPv6 instead.
  14127. + * NETIF_F_... - Well, you get the picture.
  14128. + *
  14129. + * CHECKSUM_UNNECESSARY:
  14130. + *
  14131. + * Normally, the device will do per protocol specific checksumming. Protocol
  14132. + * implementations that do not want the NIC to perform the checksum
  14133. + * calculation should use this flag in their outgoing skbs.
  14134. + *
  14135. + * NETIF_F_FCOE_CRC - This indicates that the device can do FCoE FC CRC
  14136. + * offload. Correspondingly, the FCoE protocol driver
  14137. + * stack should use CHECKSUM_UNNECESSARY.
  14138. + *
  14139. + * Any questions? No questions, good. --ANK
  14140. + */
  14141. +
  14142. +/* Don't change this without changing skb_csum_unnecessary! */
  14143. +#define CHECKSUM_NONE 0
  14144. +#define CHECKSUM_UNNECESSARY 1
  14145. +#define CHECKSUM_COMPLETE 2
  14146. +#define CHECKSUM_PARTIAL 3
  14147. +
  14148. +/* Maximum value in skb->csum_level */
  14149. +#define SKB_MAX_CSUM_LEVEL 3
  14150. +
  14151. +#define SKB_DATA_ALIGN(X) ALIGN(X, SMP_CACHE_BYTES)
  14152. +#define SKB_WITH_OVERHEAD(X) \
  14153. + ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
  14154. +#define SKB_MAX_ORDER(X, ORDER) \
  14155. + SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
  14156. +#define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X), 0))
  14157. +#define SKB_MAX_ALLOC (SKB_MAX_ORDER(0, 2))
  14158. +
  14159. +/* return minimum truesize of one skb containing X bytes of data */
  14160. +#define SKB_TRUESIZE(X) ((X) + \
  14161. + SKB_DATA_ALIGN(sizeof(struct sk_buff)) + \
  14162. + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
  14163. +
  14164. +struct net_device;
  14165. +struct scatterlist;
  14166. +struct pipe_inode_info;
  14167. +
  14168. +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  14169. +struct nf_conntrack {
  14170. + atomic_t use;
  14171. +};
  14172. +#endif
  14173. +
  14174. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  14175. +struct nf_bridge_info {
  14176. + atomic_t use;
  14177. + unsigned int mask;
  14178. + struct net_device *physindev;
  14179. + struct net_device *physoutdev;
  14180. + unsigned long data[32 / sizeof(unsigned long)];
  14181. +};
  14182. +#endif
  14183. +
  14184. +struct sk_buff_head {
  14185. + /* These two members must be first. */
  14186. + struct sk_buff *next;
  14187. + struct sk_buff *prev;
  14188. +
  14189. + __u32 qlen;
  14190. + spinlock_t lock;
  14191. +};
  14192. +
  14193. +struct sk_buff;
  14194. +
  14195. +/* To allow 64K frame to be packed as single skb without frag_list we
  14196. + * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
  14197. + * buffers which do not start on a page boundary.
  14198. + *
  14199. + * Since GRO uses frags we allocate at least 16 regardless of page
  14200. + * size.
  14201. + */
  14202. +#if (65536/PAGE_SIZE + 1) < 16
  14203. +#define MAX_SKB_FRAGS 16UL
  14204. +#else
  14205. +#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
  14206. +#endif
  14207. +
  14208. +typedef struct skb_frag_struct skb_frag_t;
  14209. +
  14210. +struct skb_frag_struct {
  14211. + struct {
  14212. + struct page *p;
  14213. + } page;
  14214. +#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
  14215. + __u32 page_offset;
  14216. + __u32 size;
  14217. +#else
  14218. + __u16 page_offset;
  14219. + __u16 size;
  14220. +#endif
  14221. +};
  14222. +
  14223. +static inline unsigned int skb_frag_size(const skb_frag_t *frag)
  14224. +{
  14225. + return frag->size;
  14226. +}
  14227. +
  14228. +static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
  14229. +{
  14230. + frag->size = size;
  14231. +}
  14232. +
  14233. +static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
  14234. +{
  14235. + frag->size += delta;
  14236. +}
  14237. +
  14238. +static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
  14239. +{
  14240. + frag->size -= delta;
  14241. +}
  14242. +
  14243. +#define HAVE_HW_TIME_STAMP
  14244. +
  14245. +/**
  14246. + * struct skb_shared_hwtstamps - hardware time stamps
  14247. + * @hwtstamp: hardware time stamp transformed into duration
  14248. + * since arbitrary point in time
  14249. + *
  14250. + * Software time stamps generated by ktime_get_real() are stored in
  14251. + * skb->tstamp.
  14252. + *
  14253. + * hwtstamps can only be compared against other hwtstamps from
  14254. + * the same device.
  14255. + *
  14256. + * This structure is attached to packets as part of the
  14257. + * &skb_shared_info. Use skb_hwtstamps() to get a pointer.
  14258. + */
  14259. +struct skb_shared_hwtstamps {
  14260. + ktime_t hwtstamp;
  14261. +};
  14262. +
  14263. +/* Definitions for tx_flags in struct skb_shared_info */
  14264. +enum {
  14265. + /* generate hardware time stamp */
  14266. + SKBTX_HW_TSTAMP = 1 << 0,
  14267. +
  14268. + /* generate software time stamp when queueing packet to NIC */
  14269. + SKBTX_SW_TSTAMP = 1 << 1,
  14270. +
  14271. + /* device driver is going to provide hardware time stamp */
  14272. + SKBTX_IN_PROGRESS = 1 << 2,
  14273. +
  14274. + /* device driver supports TX zero-copy buffers */
  14275. + SKBTX_DEV_ZEROCOPY = 1 << 3,
  14276. +
  14277. + /* generate wifi status information (where possible) */
  14278. + SKBTX_WIFI_STATUS = 1 << 4,
  14279. +
  14280. + /* This indicates at least one fragment might be overwritten
  14281. + * (as in vmsplice(), sendfile() ...)
  14282. + * If we need to compute a TX checksum, we'll need to copy
  14283. + * all frags to avoid possible bad checksum
  14284. + */
  14285. + SKBTX_SHARED_FRAG = 1 << 5,
  14286. +
  14287. + /* generate software time stamp when entering packet scheduling */
  14288. + SKBTX_SCHED_TSTAMP = 1 << 6,
  14289. +
  14290. + /* generate software timestamp on peer data acknowledgment */
  14291. + SKBTX_ACK_TSTAMP = 1 << 7,
  14292. +};
  14293. +
  14294. +#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
  14295. + SKBTX_SCHED_TSTAMP | \
  14296. + SKBTX_ACK_TSTAMP)
  14297. +#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
  14298. +
  14299. +/*
  14300. + * The callback notifies userspace to release buffers when skb DMA is done in
  14301. + * lower device, the skb last reference should be 0 when calling this.
  14302. + * The zerocopy_success argument is true if zero copy transmit occurred,
  14303. + * false on data copy or out of memory error caused by data copy attempt.
  14304. + * The ctx field is used to track device context.
  14305. + * The desc field is used to track userspace buffer index.
  14306. + */
  14307. +struct ubuf_info {
  14308. + void (*callback)(struct ubuf_info *, bool zerocopy_success);
  14309. + void *ctx;
  14310. + unsigned long desc;
  14311. +};
  14312. +
  14313. +/* This data is invariant across clones and lives at
  14314. + * the end of the header data, ie. at skb->end.
  14315. + */
  14316. +struct skb_shared_info {
  14317. + unsigned char nr_frags;
  14318. + __u8 tx_flags;
  14319. + unsigned short gso_size;
  14320. + /* Warning: this field is not always filled in (UFO)! */
  14321. + unsigned short gso_segs;
  14322. + unsigned short gso_type;
  14323. + struct sk_buff *frag_list;
  14324. + struct skb_shared_hwtstamps hwtstamps;
  14325. + u32 tskey;
  14326. + __be32 ip6_frag_id;
  14327. +
  14328. + /*
  14329. + * Warning : all fields before dataref are cleared in __alloc_skb()
  14330. + */
  14331. + atomic_t dataref;
  14332. +
  14333. + /* Intermediate layers must ensure that destructor_arg
  14334. + * remains valid until skb destructor */
  14335. + void * destructor_arg;
  14336. +
  14337. + /* must be last field, see pskb_expand_head() */
  14338. + skb_frag_t frags[MAX_SKB_FRAGS];
  14339. +};
  14340. +
  14341. +/* We divide dataref into two halves. The higher 16 bits hold references
  14342. + * to the payload part of skb->data. The lower 16 bits hold references to
  14343. + * the entire skb->data. A clone of a headerless skb holds the length of
  14344. + * the header in skb->hdr_len.
  14345. + *
  14346. + * All users must obey the rule that the skb->data reference count must be
  14347. + * greater than or equal to the payload reference count.
  14348. + *
  14349. + * Holding a reference to the payload part means that the user does not
  14350. + * care about modifications to the header part of skb->data.
  14351. + */
  14352. +#define SKB_DATAREF_SHIFT 16
  14353. +#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)
  14354. +
  14355. +
  14356. +enum {
  14357. + SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache) */
  14358. + SKB_FCLONE_ORIG, /* orig skb (from fclone_cache) */
  14359. + SKB_FCLONE_CLONE, /* companion fclone skb (from fclone_cache) */
  14360. + SKB_FCLONE_FREE, /* this companion fclone skb is available */
  14361. +};
  14362. +
  14363. +enum {
  14364. + SKB_GSO_TCPV4 = 1 << 0,
  14365. + SKB_GSO_UDP = 1 << 1,
  14366. +
  14367. + /* This indicates the skb is from an untrusted source. */
  14368. + SKB_GSO_DODGY = 1 << 2,
  14369. +
  14370. + /* This indicates the tcp segment has CWR set. */
  14371. + SKB_GSO_TCP_ECN = 1 << 3,
  14372. +
  14373. + SKB_GSO_TCPV6 = 1 << 4,
  14374. +
  14375. + SKB_GSO_FCOE = 1 << 5,
  14376. +
  14377. + SKB_GSO_GRE = 1 << 6,
  14378. +
  14379. + SKB_GSO_GRE_CSUM = 1 << 7,
  14380. +
  14381. + SKB_GSO_IPIP = 1 << 8,
  14382. +
  14383. + SKB_GSO_SIT = 1 << 9,
  14384. +
  14385. + SKB_GSO_UDP_TUNNEL = 1 << 10,
  14386. +
  14387. + SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,
  14388. +
  14389. + SKB_GSO_MPLS = 1 << 12,
  14390. +
  14391. +};
  14392. +
  14393. +#if BITS_PER_LONG > 32
  14394. +#define NET_SKBUFF_DATA_USES_OFFSET 1
  14395. +#endif
  14396. +
  14397. +#ifdef NET_SKBUFF_DATA_USES_OFFSET
  14398. +typedef unsigned int sk_buff_data_t;
  14399. +#else
  14400. +typedef unsigned char *sk_buff_data_t;
  14401. +#endif
  14402. +
  14403. +/**
  14404. + * struct skb_mstamp - multi resolution time stamps
  14405. + * @stamp_us: timestamp in us resolution
  14406. + * @stamp_jiffies: timestamp in jiffies
  14407. + */
  14408. +struct skb_mstamp {
  14409. + union {
  14410. + u64 v64;
  14411. + struct {
  14412. + u32 stamp_us;
  14413. + u32 stamp_jiffies;
  14414. + };
  14415. + };
  14416. +};
  14417. +
  14418. +/**
  14419. + * skb_mstamp_get - get current timestamp
  14420. + * @cl: place to store timestamps
  14421. + */
  14422. +static inline void skb_mstamp_get(struct skb_mstamp *cl)
  14423. +{
  14424. + u64 val = local_clock();
  14425. +
  14426. + do_div(val, NSEC_PER_USEC);
  14427. + cl->stamp_us = (u32)val;
  14428. + cl->stamp_jiffies = (u32)jiffies;
  14429. +}
  14430. +
  14431. +/**
  14432. + * skb_mstamp_delta - compute the difference in usec between two skb_mstamp
  14433. + * @t1: pointer to newest sample
  14434. + * @t0: pointer to oldest sample
  14435. + */
  14436. +static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
  14437. + const struct skb_mstamp *t0)
  14438. +{
  14439. + s32 delta_us = t1->stamp_us - t0->stamp_us;
  14440. + u32 delta_jiffies = t1->stamp_jiffies - t0->stamp_jiffies;
  14441. +
  14442. + /* If delta_us is negative, this might be because interval is too big,
  14443. + * or local_clock() drift is too big : fallback using jiffies.
  14444. + */
  14445. + if (delta_us <= 0 ||
  14446. + delta_jiffies >= (INT_MAX / (USEC_PER_SEC / HZ)))
  14447. +
  14448. + delta_us = jiffies_to_usecs(delta_jiffies);
  14449. +
  14450. + return delta_us;
  14451. +}
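As a usage sketch (assumed, not from the patch): timing an interval with the two helpers above. measure_usecs() and the elided work are hypothetical.

#include <linux/skbuff.h>

/* Hypothetical example: measure an elapsed interval in microseconds. */
static u32 measure_usecs(void)
{
	struct skb_mstamp start, now;

	skb_mstamp_get(&start);
	/* ... the work being timed goes here ... */
	skb_mstamp_get(&now);

	/* newest sample first, oldest second */
	return skb_mstamp_us_delta(&now, &start);
}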
  14452. +
  14453. +
  14454. +/**
  14455. + * struct sk_buff - socket buffer
  14456. + * @next: Next buffer in list
  14457. + * @prev: Previous buffer in list
  14458. + * @tstamp: Time we arrived/left
  14459. + * @sk: Socket we are owned by
  14460. + * @dev: Device we arrived on/are leaving by
  14461. + * @cb: Control buffer. Free for use by every layer. Put private vars here
  14462. + * @_skb_refdst: destination entry (with norefcount bit)
  14463. + * @sp: the security path, used for xfrm
  14464. + * @len: Length of actual data
  14465. + * @data_len: Data length
  14466. + * @mac_len: Length of link layer header
  14467. + * @hdr_len: writable header length of cloned skb
  14468. + * @csum: Checksum (must include start/offset pair)
  14469. + * @csum_start: Offset from skb->head where checksumming should start
  14470. + * @csum_offset: Offset from csum_start where checksum should be stored
  14471. + * @priority: Packet queueing priority
  14472. + * @ignore_df: allow local fragmentation
  14473. + * @cloned: Head may be cloned (check refcnt to be sure)
  14474. + * @ip_summed: Driver fed us an IP checksum
  14475. + * @nohdr: Payload reference only, must not modify header
  14476. + * @nfctinfo: Relationship of this skb to the connection
  14477. + * @pkt_type: Packet class
  14478. + * @fclone: skbuff clone status
  14479. + * @ipvs_property: skbuff is owned by ipvs
  14480. + * @peeked: this packet has been seen already, so stats have been
  14481. + * done for it, don't do them again
  14482. + * @nf_trace: netfilter packet trace flag
  14483. + * @protocol: Packet protocol from driver
  14484. + * @destructor: Destruct function
  14485. + * @nfct: Associated connection, if any
  14486. + * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  14487. + * @skb_iif: ifindex of device we arrived on
  14488. + * @tc_index: Traffic control index
  14489. + * @tc_verd: traffic control verdict
  14490. + * @hash: the packet hash
  14491. + * @queue_mapping: Queue mapping for multiqueue devices
  14492. + * @xmit_more: More SKBs are pending for this queue
  14493. + * @ndisc_nodetype: router type (from link layer)
  14494. + * @ooo_okay: allow the mapping of a socket to a queue to be changed
  14495. + * @l4_hash: indicates hash is a canonical 4-tuple hash over transport
  14496. + * ports.
  14497. + * @sw_hash: indicates hash was computed in software stack
  14498. + * @wifi_acked_valid: wifi_acked was set
  14499. + * @wifi_acked: whether frame was acked on wifi or not
  14500. + * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
  14501. + * @napi_id: id of the NAPI struct this skb came from
  14502. + * @secmark: security marking
  14503. + * @mark: Generic packet mark
  14504. + * @dropcount: total number of sk_receive_queue overflows
  14505. + * @vlan_proto: vlan encapsulation protocol
  14506. + * @vlan_tci: vlan tag control information
  14507. + * @inner_protocol: Protocol (encapsulation)
  14508. + * @inner_transport_header: Inner transport layer header (encapsulation)
  14509. + * @inner_network_header: Network layer header (encapsulation)
  14510. + * @inner_mac_header: Link layer header (encapsulation)
  14511. + * @transport_header: Transport layer header
  14512. + * @network_header: Network layer header
  14513. + * @mac_header: Link layer header
  14514. + * @tail: Tail pointer
  14515. + * @end: End pointer
  14516. + * @head: Head of buffer
  14517. + * @data: Data head pointer
  14518. + * @truesize: Buffer size
  14519. + * @users: User count - see {datagram,tcp}.c
  14520. + */
  14521. +
  14522. +struct sk_buff {
  14523. + /* These two members must be first. */
  14524. + struct sk_buff *next;
  14525. + struct sk_buff *prev;
  14526. +
  14527. + union {
  14528. + ktime_t tstamp;
  14529. + struct skb_mstamp skb_mstamp;
  14530. + };
  14531. +
  14532. + struct sock *sk;
  14533. + struct net_device *dev;
  14534. +
  14535. + /*
  14536. + * This is the control buffer. It is free to use for every
  14537. + * layer. Please put your private variables there. If you
  14538. + * want to keep them across layers you have to do a skb_clone()
  14539. + * first. This is owned by whoever has the skb queued ATM.
  14540. + */
  14541. + char cb[48] __aligned(8);
  14542. +
  14543. + unsigned long _skb_refdst;
  14544. + void (*destructor)(struct sk_buff *skb);
  14545. +#ifdef CONFIG_XFRM
  14546. + struct sec_path *sp;
  14547. +#endif
  14548. +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  14549. + struct nf_conntrack *nfct;
  14550. +#endif
  14551. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  14552. + struct nf_bridge_info *nf_bridge;
  14553. +#endif
  14554. + unsigned int len,
  14555. + data_len;
  14556. + __u16 mac_len,
  14557. + hdr_len;
  14558. +
  14559. + /* Following fields are _not_ copied in __copy_skb_header()
  14560. + * Note that queue_mapping is here mostly to fill a hole.
  14561. + */
  14562. + kmemcheck_bitfield_begin(flags1);
  14563. + __u16 queue_mapping;
  14564. + __u8 cloned:1,
  14565. + nohdr:1,
  14566. + fclone:2,
  14567. + peeked:1,
  14568. + head_frag:1,
  14569. + xmit_more:1;
  14570. + /* one bit hole */
  14571. + kmemcheck_bitfield_end(flags1);
  14572. +
  14573. + /* fields enclosed in headers_start/headers_end are copied
  14574. + * using a single memcpy() in __copy_skb_header()
  14575. + */
  14576. + /* private: */
  14577. + __u32 headers_start[0];
  14578. + /* public: */
  14579. +
  14580. +/* if you move pkt_type around you also must adapt those constants */
  14581. +#ifdef __BIG_ENDIAN_BITFIELD
  14582. +#define PKT_TYPE_MAX (7 << 5)
  14583. +#else
  14584. +#define PKT_TYPE_MAX 7
  14585. +#endif
  14586. +#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
  14587. +
  14588. + __u8 __pkt_type_offset[0];
  14589. + __u8 pkt_type:3;
  14590. + __u8 pfmemalloc:1;
  14591. + __u8 ignore_df:1;
  14592. + __u8 nfctinfo:3;
  14593. +
  14594. + __u8 nf_trace:1;
  14595. + __u8 ip_summed:2;
  14596. + __u8 ooo_okay:1;
  14597. + __u8 l4_hash:1;
  14598. + __u8 sw_hash:1;
  14599. + __u8 wifi_acked_valid:1;
  14600. + __u8 wifi_acked:1;
  14601. +
  14602. + __u8 no_fcs:1;
  14603. + /* Indicates the inner headers are valid in the skbuff. */
  14604. + __u8 encapsulation:1;
  14605. + __u8 encap_hdr_csum:1;
  14606. + __u8 csum_valid:1;
  14607. + __u8 csum_complete_sw:1;
  14608. + __u8 csum_level:2;
  14609. + __u8 csum_bad:1;
  14610. +
  14611. +#ifdef CONFIG_IPV6_NDISC_NODETYPE
  14612. + __u8 ndisc_nodetype:2;
  14613. +#endif
  14614. + __u8 ipvs_property:1;
  14615. + __u8 inner_protocol_type:1;
  14616. + /* 4 or 6 bit hole */
  14617. +
  14618. +#ifdef CONFIG_NET_SCHED
  14619. + __u16 tc_index; /* traffic control index */
  14620. +#ifdef CONFIG_NET_CLS_ACT
  14621. + __u16 tc_verd; /* traffic control verdict */
  14622. +#endif
  14623. +#endif
  14624. +
  14625. + union {
  14626. + __wsum csum;
  14627. + struct {
  14628. + __u16 csum_start;
  14629. + __u16 csum_offset;
  14630. + };
  14631. + };
  14632. + __u32 priority;
  14633. + int skb_iif;
  14634. + __u32 hash;
  14635. + __be16 vlan_proto;
  14636. + __u16 vlan_tci;
  14637. +#ifdef CONFIG_NET_RX_BUSY_POLL
  14638. + unsigned int napi_id;
  14639. +#endif
  14640. +#ifdef CONFIG_NETWORK_SECMARK
  14641. + __u32 secmark;
  14642. +#endif
  14643. + union {
  14644. + __u32 mark;
  14645. + __u32 dropcount;
  14646. + __u32 reserved_tailroom;
  14647. + };
  14648. +
  14649. + union {
  14650. + __be16 inner_protocol;
  14651. + __u8 inner_ipproto;
  14652. + };
  14653. +
  14654. + __u16 inner_transport_header;
  14655. + __u16 inner_network_header;
  14656. + __u16 inner_mac_header;
  14657. +
  14658. + __be16 protocol;
  14659. + __u16 transport_header;
  14660. + __u16 network_header;
  14661. + __u16 mac_header;
  14662. +
  14663. + /* private: */
  14664. + __u32 headers_end[0];
  14665. + /* public: */
  14666. +
  14667. + /* These elements must be at the end, see alloc_skb() for details. */
  14668. + sk_buff_data_t tail;
  14669. + sk_buff_data_t end;
  14670. + unsigned char *head,
  14671. + *data;
  14672. + unsigned int truesize;
  14673. + atomic_t users;
  14674. +};
  14675. +
  14676. +#ifdef __KERNEL__
  14677. +/*
  14678. + * Handling routines are only of interest to the kernel
  14679. + */
  14680. +#include <linux/slab.h>
  14681. +
  14682. +
  14683. +#define SKB_ALLOC_FCLONE 0x01
  14684. +#define SKB_ALLOC_RX 0x02
  14685. +
  14686. +/* Returns true if the skb was allocated from PFMEMALLOC reserves */
  14687. +static inline bool skb_pfmemalloc(const struct sk_buff *skb)
  14688. +{
  14689. + return unlikely(skb->pfmemalloc);
  14690. +}
  14691. +
  14692. +/*
  14693. + * skb might have a dst pointer attached, refcounted or not.
  14694. + * _skb_refdst low order bit is set if refcount was _not_ taken
  14695. + */
  14696. +#define SKB_DST_NOREF 1UL
  14697. +#define SKB_DST_PTRMASK ~(SKB_DST_NOREF)
  14698. +
  14699. +/**
  14700. + * skb_dst - returns skb dst_entry
  14701. + * @skb: buffer
  14702. + *
  14703. + * Returns skb dst_entry, regardless of reference taken or not.
  14704. + */
  14705. +static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
  14706. +{
  14707. + /* If refdst was not refcounted, check we still are in a
  14708. + * rcu_read_lock section
  14709. + */
  14710. + WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
  14711. + !rcu_read_lock_held() &&
  14712. + !rcu_read_lock_bh_held());
  14713. + return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
  14714. +}
  14715. +
  14716. +/**
  14717. + * skb_dst_set - sets skb dst
  14718. + * @skb: buffer
  14719. + * @dst: dst entry
  14720. + *
  14721. + * Sets skb dst, assuming a reference was taken on dst and should
  14722. + * be released by skb_dst_drop()
  14723. + */
  14724. +static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
  14725. +{
  14726. + skb->_skb_refdst = (unsigned long)dst;
  14727. +}
  14728. +
  14729. +void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst,
  14730. + bool force);
  14731. +
  14732. +/**
  14733. + * skb_dst_set_noref - sets skb dst, hopefully, without taking reference
  14734. + * @skb: buffer
  14735. + * @dst: dst entry
  14736. + *
  14737. + * Sets skb dst, assuming a reference was not taken on dst.
  14738. + * If dst entry is cached, we do not take reference and dst_release
  14739. + * will be avoided by refdst_drop. If dst entry is not cached, we take
  14740. + * reference, so that last dst_release can destroy the dst immediately.
  14741. + */
  14742. +static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
  14743. +{
  14744. + __skb_dst_set_noref(skb, dst, false);
  14745. +}
  14746. +
  14747. +/**
  14748. + * skb_dst_set_noref_force - sets skb dst, without taking reference
  14749. + * @skb: buffer
  14750. + * @dst: dst entry
  14751. + *
  14752. + * Sets skb dst, assuming a reference was not taken on dst.
  14753. + * No reference is taken and no dst_release will be called. While for
  14754. + * cached dsts deferred reclaim is a basic feature, for entries that are
  14755. + * not cached it is caller's job to guarantee that last dst_release for
  14756. + * provided dst happens when nobody uses it, eg. after a RCU grace period.
  14757. + */
  14758. +static inline void skb_dst_set_noref_force(struct sk_buff *skb,
  14759. + struct dst_entry *dst)
  14760. +{
  14761. + __skb_dst_set_noref(skb, dst, true);
  14762. +}
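A minimal sketch (not part of the patch) contrasting the refcounted and noref setters above; attach_dst_refcounted() and attach_dst_rcu() are hypothetical names, and dst_hold() comes from <net/dst.h>.

#include <linux/skbuff.h>
#include <net/dst.h>

/* Illustrative only: the two ways of attaching a dst described above. */
static void attach_dst_refcounted(struct sk_buff *skb, struct dst_entry *dst)
{
	dst_hold(dst);		/* take a reference ... */
	skb_dst_set(skb, dst);	/* ... which skb_dst_drop() will release */
}

static void attach_dst_rcu(struct sk_buff *skb, struct dst_entry *dst)
{
	/* caller must be inside rcu_read_lock(); for cached entries no
	 * reference is taken, see skb_dst_set_noref() above
	 */
	skb_dst_set_noref(skb, dst);
}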
  14763. +
  14764. +/**
  14765. + * skb_dst_is_noref - Test if skb dst isn't refcounted
  14766. + * @skb: buffer
  14767. + */
  14768. +static inline bool skb_dst_is_noref(const struct sk_buff *skb)
  14769. +{
  14770. + return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
  14771. +}
  14772. +
  14773. +static inline struct rtable *skb_rtable(const struct sk_buff *skb)
  14774. +{
  14775. + return (struct rtable *)skb_dst(skb);
  14776. +}
  14777. +
  14778. +void kfree_skb(struct sk_buff *skb);
  14779. +void kfree_skb_list(struct sk_buff *segs);
  14780. +void skb_tx_error(struct sk_buff *skb);
  14781. +void consume_skb(struct sk_buff *skb);
  14782. +void __kfree_skb(struct sk_buff *skb);
  14783. +extern struct kmem_cache *skbuff_head_cache;
  14784. +
  14785. +void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
  14786. +bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
  14787. + bool *fragstolen, int *delta_truesize);
  14788. +
  14789. +struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
  14790. + int node);
  14791. +struct sk_buff *__build_skb(void *data, unsigned int frag_size);
  14792. +struct sk_buff *build_skb(void *data, unsigned int frag_size);
  14793. +static inline struct sk_buff *alloc_skb(unsigned int size,
  14794. + gfp_t priority)
  14795. +{
  14796. + return __alloc_skb(size, priority, 0, NUMA_NO_NODE);
  14797. +}
  14798. +
  14799. +struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
  14800. + unsigned long data_len,
  14801. + int max_page_order,
  14802. + int *errcode,
  14803. + gfp_t gfp_mask);
  14804. +
  14805. +/* Layout of fast clones : [skb1][skb2][fclone_ref] */
  14806. +struct sk_buff_fclones {
  14807. + struct sk_buff skb1;
  14808. +
  14809. + struct sk_buff skb2;
  14810. +
  14811. + atomic_t fclone_ref;
  14812. +};
  14813. +
  14814. +/**
  14815. + * skb_fclone_busy - check if fclone is busy
  14816. + * @skb: buffer
  14817. + *
  14818. + * Returns true if skb is a fast clone, and its clone is not freed.
  14819. + * Some drivers call skb_orphan() in their ndo_start_xmit(),
  14820. + * so we also check that this didn't happen.
  14821. + */
  14822. +static inline bool skb_fclone_busy(const struct sock *sk,
  14823. + const struct sk_buff *skb)
  14824. +{
  14825. + const struct sk_buff_fclones *fclones;
  14826. +
  14827. + fclones = container_of(skb, struct sk_buff_fclones, skb1);
  14828. +
  14829. + return skb->fclone == SKB_FCLONE_ORIG &&
  14830. + fclones->skb2.fclone == SKB_FCLONE_CLONE &&
  14831. + fclones->skb2.sk == sk;
  14832. +}
  14833. +
  14834. +static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
  14835. + gfp_t priority)
  14836. +{
  14837. + return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
  14838. +}
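For illustration, a hedged sketch of the fast-clone path described by the sk_buff_fclones layout above; fclone_demo() is a hypothetical name and the sizes are arbitrary.

#include <linux/skbuff.h>

/* Hypothetical demo: an skb from fclone_cache can later be cloned
 * out of its companion slot instead of hitting skbuff_head_cache.
 */
static void fclone_demo(void)
{
	struct sk_buff *skb, *clone;

	skb = alloc_skb_fclone(128, GFP_KERNEL);
	if (!skb)
		return;

	/* the clone may be served from fclones->skb2 (SKB_FCLONE_CLONE) */
	clone = skb_clone(skb, GFP_KERNEL);

	kfree_skb(clone);	/* makes the companion slot reusable */
	kfree_skb(skb);		/* last reference frees the whole pair */
}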
  14839. +
  14840. +struct sk_buff *__alloc_skb_head(gfp_t priority, int node);
  14841. +static inline struct sk_buff *alloc_skb_head(gfp_t priority)
  14842. +{
  14843. + return __alloc_skb_head(priority, -1);
  14844. +}
  14845. +
  14846. +struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
  14847. +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
  14848. +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
  14849. +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
  14850. +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
  14851. + gfp_t gfp_mask, bool fclone);
  14852. +static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
  14853. + gfp_t gfp_mask)
  14854. +{
  14855. + return __pskb_copy_fclone(skb, headroom, gfp_mask, false);
  14856. +}
  14857. +
  14858. +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
  14859. +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
  14860. + unsigned int headroom);
  14861. +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
  14862. + int newtailroom, gfp_t priority);
  14863. +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
  14864. + int offset, int len);
  14865. +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset,
  14866. + int len);
  14867. +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
  14868. +int skb_pad(struct sk_buff *skb, int pad);
  14869. +#define dev_kfree_skb(a) consume_skb(a)
  14870. +
  14871. +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
  14872. + int getfrag(void *from, char *to, int offset,
  14873. + int len, int odd, struct sk_buff *skb),
  14874. + void *from, int length);
  14875. +
  14876. +struct skb_seq_state {
  14877. + __u32 lower_offset;
  14878. + __u32 upper_offset;
  14879. + __u32 frag_idx;
  14880. + __u32 stepped_offset;
  14881. + struct sk_buff *root_skb;
  14882. + struct sk_buff *cur_skb;
  14883. + __u8 *frag_data;
  14884. +};
  14885. +
  14886. +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
  14887. + unsigned int to, struct skb_seq_state *st);
  14888. +unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
  14889. + struct skb_seq_state *st);
  14890. +void skb_abort_seq_read(struct skb_seq_state *st);
  14891. +
  14892. +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
  14893. + unsigned int to, struct ts_config *config,
  14894. + struct ts_state *state);
  14895. +
  14896. +/*
  14897. + * Packet hash types specify the type of hash in skb_set_hash.
  14898. + *
  14899. + * Hash types refer to the protocol layer addresses which are used to
  14900. + * construct a packet's hash. The hashes are used to differentiate or identify
  14901. + * flows of the protocol layer for the hash type. Hash types are either
  14902. + * layer-2 (L2), layer-3 (L3), or layer-4 (L4).
  14903. + *
  14904. + * Properties of hashes:
  14905. + *
  14906. + * 1) Two packets in different flows have different hash values
  14907. + * 2) Two packets in the same flow should have the same hash value
  14908. + *
  14909. + * A hash at a higher layer is considered to be more specific. A driver should
  14910. + * set the most specific hash possible.
  14911. + *
  14912. + * A driver cannot indicate a more specific hash than the layer at which a hash
  14913. + * was computed. For instance an L3 hash cannot be set as an L4 hash.
  14914. + *
  14915. + * A driver may indicate a hash level which is less specific than the
  14916. + * actual layer the hash was computed on. For instance, a hash computed
  14917. + * at L4 may be considered an L3 hash. This should only be done if the
  14918. + * driver can't unambiguously determine that the HW computed the hash at
  14919. + * the higher layer. Note that the "should" in the second property above
  14920. + * permits this.
  14921. + */
  14922. +enum pkt_hash_types {
  14923. + PKT_HASH_TYPE_NONE, /* Undefined type */
  14924. + PKT_HASH_TYPE_L2, /* Input: src_MAC, dest_MAC */
  14925. + PKT_HASH_TYPE_L3, /* Input: src_IP, dst_IP */
  14926. + PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */
  14927. +};
  14928. +
  14929. +static inline void
  14930. +skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
  14931. +{
  14932. + skb->l4_hash = (type == PKT_HASH_TYPE_L4);
  14933. + skb->sw_hash = 0;
  14934. + skb->hash = hash;
  14935. +}
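A short driver-side sketch (assumed, not from the patch) of the rule above; rx_record_hash() and the hw_hash/hw_is_l4 values are hypothetical stand-ins for whatever the RX descriptor reports.

#include <linux/skbuff.h>

/* Hypothetical driver RX snippet: record a HW-computed hash. */
static void rx_record_hash(struct sk_buff *skb, u32 hw_hash, bool hw_is_l4)
{
	/* only claim L4 when the HW unambiguously hashed over ports */
	skb_set_hash(skb, hw_hash,
		     hw_is_l4 ? PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3);
}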
  14936. +
  14937. +void __skb_get_hash(struct sk_buff *skb);
  14938. +static inline __u32 skb_get_hash(struct sk_buff *skb)
  14939. +{
  14940. + if (!skb->l4_hash && !skb->sw_hash)
  14941. + __skb_get_hash(skb);
  14942. +
  14943. + return skb->hash;
  14944. +}
  14945. +
  14946. +static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
  14947. +{
  14948. + return skb->hash;
  14949. +}
  14950. +
  14951. +static inline void skb_clear_hash(struct sk_buff *skb)
  14952. +{
  14953. + skb->hash = 0;
  14954. + skb->sw_hash = 0;
  14955. + skb->l4_hash = 0;
  14956. +}
  14957. +
  14958. +static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
  14959. +{
  14960. + if (!skb->l4_hash)
  14961. + skb_clear_hash(skb);
  14962. +}
  14963. +
  14964. +static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
  14965. +{
  14966. + to->hash = from->hash;
  14967. + to->sw_hash = from->sw_hash;
  14968. + to->l4_hash = from->l4_hash;
  14969. +};
  14970. +
  14971. +#ifdef NET_SKBUFF_DATA_USES_OFFSET
  14972. +static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
  14973. +{
  14974. + return skb->head + skb->end;
  14975. +}
  14976. +
  14977. +static inline unsigned int skb_end_offset(const struct sk_buff *skb)
  14978. +{
  14979. + return skb->end;
  14980. +}
  14981. +#else
  14982. +static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
  14983. +{
  14984. + return skb->end;
  14985. +}
  14986. +
  14987. +static inline unsigned int skb_end_offset(const struct sk_buff *skb)
  14988. +{
  14989. + return skb->end - skb->head;
  14990. +}
  14991. +#endif
  14992. +
  14993. +/* Internal */
  14994. +#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
  14995. +
  14996. +static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
  14997. +{
  14998. + return &skb_shinfo(skb)->hwtstamps;
  14999. +}
  15000. +
  15001. +/**
  15002. + * skb_queue_empty - check if a queue is empty
  15003. + * @list: queue head
  15004. + *
  15005. + * Returns true if the queue is empty, false otherwise.
  15006. + */
  15007. +static inline int skb_queue_empty(const struct sk_buff_head *list)
  15008. +{
  15009. + return list->next == (const struct sk_buff *) list;
  15010. +}
  15011. +
  15012. +/**
  15013. + * skb_queue_is_last - check if skb is the last entry in the queue
  15014. + * @list: queue head
  15015. + * @skb: buffer
  15016. + *
  15017. + * Returns true if @skb is the last buffer on the list.
  15018. + */
  15019. +static inline bool skb_queue_is_last(const struct sk_buff_head *list,
  15020. + const struct sk_buff *skb)
  15021. +{
  15022. + return skb->next == (const struct sk_buff *) list;
  15023. +}
  15024. +
  15025. +/**
  15026. + * skb_queue_is_first - check if skb is the first entry in the queue
  15027. + * @list: queue head
  15028. + * @skb: buffer
  15029. + *
  15030. + * Returns true if @skb is the first buffer on the list.
  15031. + */
  15032. +static inline bool skb_queue_is_first(const struct sk_buff_head *list,
  15033. + const struct sk_buff *skb)
  15034. +{
  15035. + return skb->prev == (const struct sk_buff *) list;
  15036. +}
  15037. +
  15038. +/**
  15039. + * skb_queue_next - return the next packet in the queue
  15040. + * @list: queue head
  15041. + * @skb: current buffer
  15042. + *
  15043. + * Return the next packet in @list after @skb. It is only valid to
  15044. + * call this if skb_queue_is_last() evaluates to false.
  15045. + */
  15046. +static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
  15047. + const struct sk_buff *skb)
  15048. +{
  15049. + /* This BUG_ON may seem severe, but if we just return then we
  15050. + * are going to dereference garbage.
  15051. + */
  15052. + BUG_ON(skb_queue_is_last(list, skb));
  15053. + return skb->next;
  15054. +}
  15055. +
  15056. +/**
  15057. + * skb_queue_prev - return the prev packet in the queue
  15058. + * @list: queue head
  15059. + * @skb: current buffer
  15060. + *
  15061. + * Return the prev packet in @list before @skb. It is only valid to
  15062. + * call this if skb_queue_is_first() evaluates to false.
  15063. + */
  15064. +static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
  15065. + const struct sk_buff *skb)
  15066. +{
  15067. + /* This BUG_ON may seem severe, but if we just return then we
  15068. + * are going to dereference garbage.
  15069. + */
  15070. + BUG_ON(skb_queue_is_first(list, skb));
  15071. + return skb->prev;
  15072. +}
  15073. +
  15074. +/**
  15075. + * skb_get - reference buffer
  15076. + * @skb: buffer to reference
  15077. + *
  15078. + * Makes another reference to a socket buffer and returns a pointer
  15079. + * to the buffer.
  15080. + */
  15081. +static inline struct sk_buff *skb_get(struct sk_buff *skb)
  15082. +{
  15083. + atomic_inc(&skb->users);
  15084. + return skb;
  15085. +}
  15086. +
  15087. +/*
  15088. + * If users == 1, we are the only owner and can avoid redundant
  15089. + * atomic changes.
  15090. + */
  15091. +
  15092. +/**
  15093. + * skb_cloned - is the buffer a clone
  15094. + * @skb: buffer to check
  15095. + *
  15096. + * Returns true if the buffer was generated with skb_clone() and is
  15097. + * one of multiple shared copies of the buffer. Cloned buffers are
  15098. + * shared data so must not be written to under normal circumstances.
  15099. + */
  15100. +static inline int skb_cloned(const struct sk_buff *skb)
  15101. +{
  15102. + return skb->cloned &&
  15103. + (atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK) != 1;
  15104. +}
  15105. +
  15106. +static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
  15107. +{
  15108. + might_sleep_if(pri & __GFP_WAIT);
  15109. +
  15110. + if (skb_cloned(skb))
  15111. + return pskb_expand_head(skb, 0, 0, pri);
  15112. +
  15113. + return 0;
  15114. +}
  15115. +
  15116. +/**
  15117. + * skb_header_cloned - is the header a clone
  15118. + * @skb: buffer to check
  15119. + *
  15120. + * Returns true if modifying the header part of the buffer requires
  15121. + * the data to be copied.
  15122. + */
  15123. +static inline int skb_header_cloned(const struct sk_buff *skb)
  15124. +{
  15125. + int dataref;
  15126. +
  15127. + if (!skb->cloned)
  15128. + return 0;
  15129. +
  15130. + dataref = atomic_read(&skb_shinfo(skb)->dataref);
  15131. + dataref = (dataref & SKB_DATAREF_MASK) - (dataref >> SKB_DATAREF_SHIFT);
  15132. + return dataref != 1;
  15133. +}
  15134. +
  15135. +/**
  15136. + * skb_header_release - release reference to header
  15137. + * @skb: buffer to operate on
  15138. + *
  15139. + * Drop a reference to the header part of the buffer. This is done
  15140. + * by acquiring a payload reference. You must not read from the header
  15141. + * part of skb->data after this.
  15142. + * Note : Check if you can use __skb_header_release() instead.
  15143. + */
  15144. +static inline void skb_header_release(struct sk_buff *skb)
  15145. +{
  15146. + BUG_ON(skb->nohdr);
  15147. + skb->nohdr = 1;
  15148. + atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref);
  15149. +}
  15150. +
  15151. +/**
  15152. + * __skb_header_release - release reference to header
  15153. + * @skb: buffer to operate on
  15154. + *
  15155. + * Variant of skb_header_release() assuming skb is private to caller.
  15156. + * We can avoid one atomic operation.
  15157. + */
  15158. +static inline void __skb_header_release(struct sk_buff *skb)
  15159. +{
  15160. + skb->nohdr = 1;
  15161. + atomic_set(&skb_shinfo(skb)->dataref, 1 + (1 << SKB_DATAREF_SHIFT));
  15162. +}
  15163. +
  15164. +
  15165. +/**
  15166. + * skb_shared - is the buffer shared
  15167. + * @skb: buffer to check
  15168. + *
  15169. + * Returns true if more than one person has a reference to this
  15170. + * buffer.
  15171. + */
  15172. +static inline int skb_shared(const struct sk_buff *skb)
  15173. +{
  15174. + return atomic_read(&skb->users) != 1;
  15175. +}
  15176. +
  15177. +/**
  15178. + * skb_share_check - check if buffer is shared and if so clone it
  15179. + * @skb: buffer to check
  15180. + * @pri: priority for memory allocation
  15181. + *
  15182. + * If the buffer is shared the buffer is cloned and the old copy
  15183. + * drops a reference. A new clone with a single reference is returned.
  15184. + * If the buffer is not shared the original buffer is returned. When
  15185. + * being called from interrupt context or with spinlocks held, @pri must
  15186. + * be %GFP_ATOMIC.
  15187. + *
  15188. + * NULL is returned on a memory allocation failure.
  15189. + */
  15190. +static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
  15191. +{
  15192. + might_sleep_if(pri & __GFP_WAIT);
  15193. + if (skb_shared(skb)) {
  15194. + struct sk_buff *nskb = skb_clone(skb, pri);
  15195. +
  15196. + if (likely(nskb))
  15197. + consume_skb(skb);
  15198. + else
  15199. + kfree_skb(skb);
  15200. + skb = nskb;
  15201. + }
  15202. + return skb;
  15203. +}
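A usage sketch under stated assumptions (not part of the patch): a receive handler that must own the skb before touching it. my_proto_rcv() is a hypothetical name.

#include <linux/errno.h>
#include <linux/skbuff.h>

/* Hypothetical protocol handler: make sure we own the skb before
 * modifying it.  Called in softirq context, hence GFP_ATOMIC.
 */
static int my_proto_rcv(struct sk_buff *skb)
{
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;	/* original was freed on clone failure */

	/* metadata writes are now safe; writes to skb->data still need
	 * skb_unclone()/skb_unshare()
	 */
	skb->priority = 0;

	kfree_skb(skb);
	return 0;
}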
  15204. +
  15205. +/*
  15206. + * Copy shared buffers into a new sk_buff. We effectively do COW on
  15207. + * packets to handle cases where we have a local reader and forward
  15208. + * and a couple of other messy ones. The normal one is tcpdumping
  15209. + * a packet that's being forwarded.
  15210. + */
  15211. +
  15212. +/**
  15213. + * skb_unshare - make a copy of a shared buffer
  15214. + * @skb: buffer to check
  15215. + * @pri: priority for memory allocation
  15216. + *
  15217. + * If the socket buffer is a clone then this function creates a new
  15218. + * copy of the data, drops a reference count on the old copy and returns
  15219. + * the new copy with the reference count at 1. If the buffer is not a clone
  15220. + * the original buffer is returned. When called with a spinlock held or
  15221. + * from interrupt context, @pri must be %GFP_ATOMIC.
  15222. + *
  15223. + * %NULL is returned on a memory allocation failure.
  15224. + */
  15225. +static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
  15226. + gfp_t pri)
  15227. +{
  15228. + might_sleep_if(pri & __GFP_WAIT);
  15229. + if (skb_cloned(skb)) {
  15230. + struct sk_buff *nskb = skb_copy(skb, pri);
  15231. +
  15232. + /* Free our shared copy */
  15233. + if (likely(nskb))
  15234. + consume_skb(skb);
  15235. + else
  15236. + kfree_skb(skb);
  15237. + skb = nskb;
  15238. + }
  15239. + return skb;
  15240. +}
  15241. +
  15242. +/**
  15243. + * skb_peek - peek at the head of an &sk_buff_head
  15244. + * @list_: list to peek at
  15245. + *
  15246. + * Peek an &sk_buff. Unlike most other operations you _MUST_
  15247. + * be careful with this one. A peek leaves the buffer on the
  15248. + * list and someone else may run off with it. You must hold
  15249. + * the appropriate locks or have a private queue to do this.
  15250. + *
  15251. + * Returns %NULL for an empty list or a pointer to the head element.
  15252. + * The reference count is not incremented and the reference is therefore
  15253. + * volatile. Use with caution.
  15254. + */
  15255. +static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
  15256. +{
  15257. + struct sk_buff *skb = list_->next;
  15258. +
  15259. + if (skb == (struct sk_buff *)list_)
  15260. + skb = NULL;
  15261. + return skb;
  15262. +}
  15263. +
  15264. +/**
  15265. + * skb_peek_next - peek skb following the given one from a queue
  15266. + * @skb: skb to start from
  15267. + * @list_: list to peek at
  15268. + *
  15269. + * Returns %NULL when the end of the list is met or a pointer to the
  15270. + * next element. The reference count is not incremented and the
  15271. + * reference is therefore volatile. Use with caution.
  15272. + */
  15273. +static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
  15274. + const struct sk_buff_head *list_)
  15275. +{
  15276. + struct sk_buff *next = skb->next;
  15277. +
  15278. + if (next == (struct sk_buff *)list_)
  15279. + next = NULL;
  15280. + return next;
  15281. +}
  15282. +
  15283. +/**
  15284. + * skb_peek_tail - peek at the tail of an &sk_buff_head
  15285. + * @list_: list to peek at
  15286. + *
  15287. + * Peek an &sk_buff. Unlike most other operations you _MUST_
  15288. + * be careful with this one. A peek leaves the buffer on the
  15289. + * list and someone else may run off with it. You must hold
  15290. + * the appropriate locks or have a private queue to do this.
  15291. + *
  15292. + * Returns %NULL for an empty list or a pointer to the tail element.
  15293. + * The reference count is not incremented and the reference is therefore
  15294. + * volatile. Use with caution.
  15295. + */
  15296. +static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
  15297. +{
  15298. + struct sk_buff *skb = list_->prev;
  15299. +
  15300. + if (skb == (struct sk_buff *)list_)
  15301. + skb = NULL;
  15302. + return skb;
  15303. +
  15304. +}
  15305. +
  15306. +/**
  15307. + * skb_queue_len - get queue length
  15308. + * @list_: list to measure
  15309. + *
  15310. + * Return the length of an &sk_buff queue.
  15311. + */
  15312. +static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
  15313. +{
  15314. + return list_->qlen;
  15315. +}
  15316. +
  15317. +/**
  15318. + * __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
  15319. + * @list: queue to initialize
  15320. + *
  15321. + * This initializes only the list and queue length aspects of
  15322. + * an sk_buff_head object. This allows the list aspects of an
  15323. + * sk_buff_head to be initialized without reinitializing things like
  15324. + * the spinlock. It can also be used for on-stack sk_buff_head
  15325. + * objects where the spinlock is known to not be used.
  15326. + */
  15327. +static inline void __skb_queue_head_init(struct sk_buff_head *list)
  15328. +{
  15329. + list->prev = list->next = (struct sk_buff *)list;
  15330. + list->qlen = 0;
  15331. +}
  15332. +
  15333. +/*
  15334. + * This function creates a split out lock class for each invocation;
  15335. + * this is needed for now since a whole lot of users of the skb-queue
  15336. + * infrastructure in drivers have different locking usage (in hardirq)
  15337. + * than the networking core (in softirq only). In the long run either the
  15338. + * network layer or drivers will need annotation to consolidate the
  15339. + * main types of usage into 3 classes.
  15340. + */
  15341. +static inline void skb_queue_head_init(struct sk_buff_head *list)
  15342. +{
  15343. + spin_lock_init(&list->lock);
  15344. + __skb_queue_head_init(list);
  15345. +}
  15346. +
  15347. +static inline void skb_queue_head_init_class(struct sk_buff_head *list,
  15348. + struct lock_class_key *class)
  15349. +{
  15350. + skb_queue_head_init(list);
  15351. + lockdep_set_class(&list->lock, class);
  15352. +}
  15353. +
  15354. +/*
  15355. + * Insert an sk_buff on a list.
  15356. + *
  15357. + * The "__skb_xxxx()" functions are the non-atomic ones that
  15358. + * can only be called with interrupts disabled.
  15359. + */
  15360. +void skb_insert(struct sk_buff *old, struct sk_buff *newsk,
  15361. + struct sk_buff_head *list);
  15362. +static inline void __skb_insert(struct sk_buff *newsk,
  15363. + struct sk_buff *prev, struct sk_buff *next,
  15364. + struct sk_buff_head *list)
  15365. +{
  15366. + newsk->next = next;
  15367. + newsk->prev = prev;
  15368. + next->prev = prev->next = newsk;
  15369. + list->qlen++;
  15370. +}
  15371. +
  15372. +static inline void __skb_queue_splice(const struct sk_buff_head *list,
  15373. + struct sk_buff *prev,
  15374. + struct sk_buff *next)
  15375. +{
  15376. + struct sk_buff *first = list->next;
  15377. + struct sk_buff *last = list->prev;
  15378. +
  15379. + first->prev = prev;
  15380. + prev->next = first;
  15381. +
  15382. + last->next = next;
  15383. + next->prev = last;
  15384. +}
  15385. +
  15386. +/**
  15387. + * skb_queue_splice - join two skb lists, this is designed for stacks
  15388. + * @list: the new list to add
  15389. + * @head: the place to add it in the first list
  15390. + */
  15391. +static inline void skb_queue_splice(const struct sk_buff_head *list,
  15392. + struct sk_buff_head *head)
  15393. +{
  15394. + if (!skb_queue_empty(list)) {
  15395. + __skb_queue_splice(list, (struct sk_buff *) head, head->next);
  15396. + head->qlen += list->qlen;
  15397. + }
  15398. +}
  15399. +
  15400. +/**
  15401. + * skb_queue_splice_init - join two skb lists and reinitialise the emptied list
  15402. + * @list: the new list to add
  15403. + * @head: the place to add it in the first list
  15404. + *
  15405. + * The list at @list is reinitialised
  15406. + */
  15407. +static inline void skb_queue_splice_init(struct sk_buff_head *list,
  15408. + struct sk_buff_head *head)
  15409. +{
  15410. + if (!skb_queue_empty(list)) {
  15411. + __skb_queue_splice(list, (struct sk_buff *) head, head->next);
  15412. + head->qlen += list->qlen;
  15413. + __skb_queue_head_init(list);
  15414. + }
  15415. +}
  15416. +
  15417. +/**
  15418. + * skb_queue_splice_tail - join two skb lists, each list being a queue
  15419. + * @list: the new list to add
  15420. + * @head: the place to add it in the first list
  15421. + */
  15422. +static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
  15423. + struct sk_buff_head *head)
  15424. +{
  15425. + if (!skb_queue_empty(list)) {
  15426. + __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
  15427. + head->qlen += list->qlen;
  15428. + }
  15429. +}
  15430. +
  15431. +/**
  15432. + * skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
  15433. + * @list: the new list to add
  15434. + * @head: the place to add it in the first list
  15435. + *
  15436. + * Each of the lists is a queue.
  15437. + * The list at @list is reinitialised
  15438. + */
  15439. +static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
  15440. + struct sk_buff_head *head)
  15441. +{
  15442. + if (!skb_queue_empty(list)) {
  15443. + __skb_queue_splice(list, head->prev, (struct sk_buff *) head);
  15444. + head->qlen += list->qlen;
  15445. + __skb_queue_head_init(list);
  15446. + }
  15447. +}
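A minimal sketch (assumed) of the common splice pattern: drain a shared queue onto an on-stack list under the queue lock, then process it lock-free. flush_queue() is a hypothetical name.

#include <linux/skbuff.h>
#include <linux/spinlock.h>

/* Hypothetical example of the splice helpers above. */
static void flush_queue(struct sk_buff_head *shared)
{
	struct sk_buff_head tmp;
	struct sk_buff *skb;

	__skb_queue_head_init(&tmp);	/* on-stack list, lock unused */

	spin_lock_bh(&shared->lock);
	skb_queue_splice_tail_init(shared, &tmp);	/* shared is now empty */
	spin_unlock_bh(&shared->lock);

	while ((skb = __skb_dequeue(&tmp)) != NULL)
		consume_skb(skb);
}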
  15448. +
  15449. +/**
  15450. + * __skb_queue_after - queue a buffer after the given buffer
  15451. + * @list: list to use
  15452. + * @prev: place after this buffer
  15453. + * @newsk: buffer to queue
  15454. + *
  15455. + * Queue a buffer in the middle of a list. This function takes no locks
  15456. + * and you must therefore hold required locks before calling it.
  15457. + *
  15458. + * A buffer cannot be placed on two lists at the same time.
  15459. + */
  15460. +static inline void __skb_queue_after(struct sk_buff_head *list,
  15461. + struct sk_buff *prev,
  15462. + struct sk_buff *newsk)
  15463. +{
  15464. + __skb_insert(newsk, prev, prev->next, list);
  15465. +}
  15466. +
  15467. +void skb_append(struct sk_buff *old, struct sk_buff *newsk,
  15468. + struct sk_buff_head *list);
  15469. +
  15470. +static inline void __skb_queue_before(struct sk_buff_head *list,
  15471. + struct sk_buff *next,
  15472. + struct sk_buff *newsk)
  15473. +{
  15474. + __skb_insert(newsk, next->prev, next, list);
  15475. +}
  15476. +
  15477. +/**
  15478. + * __skb_queue_head - queue a buffer at the list head
  15479. + * @list: list to use
  15480. + * @newsk: buffer to queue
  15481. + *
  15482. + * Queue a buffer at the start of a list. This function takes no locks
  15483. + * and you must therefore hold required locks before calling it.
  15484. + *
  15485. + * A buffer cannot be placed on two lists at the same time.
  15486. + */
  15487. +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);
  15488. +static inline void __skb_queue_head(struct sk_buff_head *list,
  15489. + struct sk_buff *newsk)
  15490. +{
  15491. + __skb_queue_after(list, (struct sk_buff *)list, newsk);
  15492. +}
  15493. +
  15494. +/**
  15495. + * __skb_queue_tail - queue a buffer at the list tail
  15496. + * @list: list to use
  15497. + * @newsk: buffer to queue
  15498. + *
  15499. + * Queue a buffer at the end of a list. This function takes no locks
  15500. + * and you must therefore hold required locks before calling it.
  15501. + *
  15502. + * A buffer cannot be placed on two lists at the same time.
  15503. + */
  15504. +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);
  15505. +static inline void __skb_queue_tail(struct sk_buff_head *list,
  15506. + struct sk_buff *newsk)
  15507. +{
  15508. + __skb_queue_before(list, (struct sk_buff *)list, newsk);
  15509. +}
  15510. +
  15511. +/*
  15512. + * remove sk_buff from list. _Must_ be called atomically, and with
  15513. + * the list known.
  15514. + */
  15515. +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
  15516. +static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
  15517. +{
  15518. + struct sk_buff *next, *prev;
  15519. +
  15520. + list->qlen--;
  15521. + next = skb->next;
  15522. + prev = skb->prev;
  15523. + skb->next = skb->prev = NULL;
  15524. + next->prev = prev;
  15525. + prev->next = next;
  15526. +}
  15527. +
  15528. +/**
  15529. + * __skb_dequeue - remove from the head of the queue
  15530. + * @list: list to dequeue from
  15531. + *
  15532. + * Remove the head of the list. This function does not take any locks
  15533. + * so must be used with appropriate locks held only. The head item is
  15534. + * returned or %NULL if the list is empty.
  15535. + */
  15536. +struct sk_buff *skb_dequeue(struct sk_buff_head *list);
  15537. +static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
  15538. +{
  15539. + struct sk_buff *skb = skb_peek(list);
  15540. + if (skb)
  15541. + __skb_unlink(skb, list);
  15542. + return skb;
  15543. +}
  15544. +
  15545. +/**
  15546. + * __skb_dequeue_tail - remove from the tail of the queue
  15547. + * @list: list to dequeue from
  15548. + *
  15549. + * Remove the tail of the list. This function does not take any locks
  15550. + * so must be used with appropriate locks held only. The tail item is
  15551. + * returned or %NULL if the list is empty.
  15552. + */
  15553. +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);
  15554. +static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
  15555. +{
  15556. + struct sk_buff *skb = skb_peek_tail(list);
  15557. + if (skb)
  15558. + __skb_unlink(skb, list);
  15559. + return skb;
  15560. +}
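For illustration (not part of the patch), the locked producer/consumer pairing of the helpers above; both function names are hypothetical and the queue is assumed to have been set up with skb_queue_head_init().

#include <linux/skbuff.h>

/* Hypothetical producer/consumer pair using the locked queue helpers. */
static void producer(struct sk_buff_head *rxq, struct sk_buff *skb)
{
	skb_queue_tail(rxq, skb);		/* takes rxq->lock itself */
}

static void consumer(struct sk_buff_head *rxq)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(rxq)) != NULL)	/* NULL when empty */
		consume_skb(skb);
}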
  15561. +
  15562. +
  15563. +static inline bool skb_is_nonlinear(const struct sk_buff *skb)
  15564. +{
  15565. + return skb->data_len;
  15566. +}
  15567. +
  15568. +static inline unsigned int skb_headlen(const struct sk_buff *skb)
  15569. +{
  15570. + return skb->len - skb->data_len;
  15571. +}
  15572. +
  15573. +static inline int skb_pagelen(const struct sk_buff *skb)
  15574. +{
  15575. + int i, len = 0;
  15576. +
  15577. + for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
  15578. + len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
  15579. + return len + skb_headlen(skb);
  15580. +}
  15581. +
  15582. +/**
  15583. + * __skb_fill_page_desc - initialise a paged fragment in an skb
  15584. + * @skb: buffer containing fragment to be initialised
  15585. + * @i: paged fragment index to initialise
  15586. + * @page: the page to use for this fragment
  15587. + * @off: the offset to the data within @page
  15588. + * @size: the length of the data
  15589. + *
  15590. + * Initialises the @i'th fragment of @skb to point to @size bytes at
  15591. + * offset @off within @page.
  15592. + *
  15593. + * Does not take any additional reference on the fragment.
  15594. + */
  15595. +static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
  15596. + struct page *page, int off, int size)
  15597. +{
  15598. + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  15599. +
  15600. + /*
  15601. + * Propagate page->pfmemalloc to the skb if we can. The problem is
  15602. + * that not all callers have unique ownership of the page. If
  15603. + * pfmemalloc is set, we check the mapping as a mapping implies
  15604. + * page->index is set (index and pfmemalloc share space).
  15605. + * If it's a valid mapping, we cannot use page->pfmemalloc but we
  15606. + * do not lose pfmemalloc information as the pages would not be
  15607. + * allocated using __GFP_MEMALLOC.
  15608. + */
  15609. + frag->page.p = page;
  15610. + frag->page_offset = off;
  15611. + skb_frag_size_set(frag, size);
  15612. +
  15613. + page = compound_head(page);
  15614. + if (page->pfmemalloc && !page->mapping)
  15615. + skb->pfmemalloc = true;
  15616. +}
  15617. +
  15618. +/**
  15619. + * skb_fill_page_desc - initialise a paged fragment in an skb
  15620. + * @skb: buffer containing fragment to be initialised
  15621. + * @i: paged fragment index to initialise
  15622. + * @page: the page to use for this fragment
  15623. + * @off: the offset to the data within @page
  15624. + * @size: the length of the data
  15625. + *
  15626. + * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
  15627. + * @skb to point to @size bytes at offset @off within @page. In
  15628. + * addition updates @skb such that @i is the last fragment.
  15629. + *
  15630. + * Does not take any additional reference on the fragment.
  15631. + */
  15632. +static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
  15633. + struct page *page, int off, int size)
  15634. +{
  15635. + __skb_fill_page_desc(skb, i, page, off, size);
  15636. + skb_shinfo(skb)->nr_frags = i + 1;
  15637. +}
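A hedged sketch of attaching paged data with the helper above; add_page_frag() is hypothetical, the caller is assumed to already hold a page reference, and the truesize accounting assumes the whole page belongs to this skb.

#include <linux/skbuff.h>

/* Hypothetical example: attach one page of payload as the next fragment. */
static void add_page_frag(struct sk_buff *skb, struct page *page,
			  int off, int size)
{
	skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, page, off, size);

	/* paged data counts toward both len and data_len */
	skb->len += size;
	skb->data_len += size;
	skb->truesize += PAGE_SIZE;	/* assumes the page is exclusive */
}

skb_add_rx_frag(), declared just below, wraps this len/data_len/truesize bookkeeping.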
  15638. +
  15639. +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
  15640. + int size, unsigned int truesize);
  15641. +
  15642. +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
  15643. + unsigned int truesize);
  15644. +
  15645. +#define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)->nr_frags)
  15646. +#define SKB_FRAG_ASSERT(skb) BUG_ON(skb_has_frag_list(skb))
  15647. +#define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb))
  15648. +
  15649. +#ifdef NET_SKBUFF_DATA_USES_OFFSET
  15650. +static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
  15651. +{
  15652. + return skb->head + skb->tail;
  15653. +}
  15654. +
  15655. +static inline void skb_reset_tail_pointer(struct sk_buff *skb)
  15656. +{
  15657. + skb->tail = skb->data - skb->head;
  15658. +}
  15659. +
  15660. +static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
  15661. +{
  15662. + skb_reset_tail_pointer(skb);
  15663. + skb->tail += offset;
  15664. +}
  15665. +
  15666. +#else /* NET_SKBUFF_DATA_USES_OFFSET */
  15667. +static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
  15668. +{
  15669. + return skb->tail;
  15670. +}
  15671. +
  15672. +static inline void skb_reset_tail_pointer(struct sk_buff *skb)
  15673. +{
  15674. + skb->tail = skb->data;
  15675. +}
  15676. +
  15677. +static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
  15678. +{
  15679. + skb->tail = skb->data + offset;
  15680. +}
  15681. +
  15682. +#endif /* NET_SKBUFF_DATA_USES_OFFSET */
  15683. +
  15684. +/*
  15685. + * Add data to an sk_buff
  15686. + */
  15687. +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
  15688. +unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
  15689. +static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len)
  15690. +{
  15691. + unsigned char *tmp = skb_tail_pointer(skb);
  15692. + SKB_LINEAR_ASSERT(skb);
  15693. + skb->tail += len;
  15694. + skb->len += len;
  15695. + return tmp;
  15696. +}
  15697. +
  15698. +unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
  15699. +static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len)
  15700. +{
  15701. + skb->data -= len;
  15702. + skb->len += len;
  15703. + return skb->data;
  15704. +}
  15705. +
  15706. +unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
  15707. +static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len)
  15708. +{
  15709. + skb->len -= len;
  15710. + BUG_ON(skb->len < skb->data_len);
  15711. + return skb->data += len;
  15712. +}
  15713. +
  15714. +static inline unsigned char *skb_pull_inline(struct sk_buff *skb, unsigned int len)
  15715. +{
  15716. + return unlikely(len > skb->len) ? NULL : __skb_pull(skb, len);
  15717. +}
  15718. +
  15719. +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta);
  15720. +
  15721. +static inline unsigned char *__pskb_pull(struct sk_buff *skb, unsigned int len)
  15722. +{
  15723. + if (len > skb_headlen(skb) &&
  15724. + !__pskb_pull_tail(skb, len - skb_headlen(skb)))
  15725. + return NULL;
  15726. + skb->len -= len;
  15727. + return skb->data += len;
  15728. +}
  15729. +
  15730. +static inline unsigned char *pskb_pull(struct sk_buff *skb, unsigned int len)
  15731. +{
  15732. + return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
  15733. +}
  15734. +
  15735. +static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
  15736. +{
  15737. + if (likely(len <= skb_headlen(skb)))
  15738. + return 1;
  15739. + if (unlikely(len > skb->len))
  15740. + return 0;
  15741. + return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
  15742. +}
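A typical parse step as a sketch (assumed, not from the patch): make a header linear before dereferencing it. parse_ipv4() is a hypothetical name and skb->data is assumed to sit at the IP header.

#include <linux/errno.h>
#include <linux/ip.h>
#include <linux/skbuff.h>

/* Hypothetical parse step for an IPv4 header at skb->data. */
static int parse_ipv4(struct sk_buff *skb)
{
	const struct iphdr *iph;
	unsigned int hlen;

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		return -EINVAL;			/* truncated packet */

	iph = (const struct iphdr *)skb->data;
	hlen = iph->ihl * 4;

	/* options, if any, must be linear too; this may reallocate the
	 * head, so pointers into skb->data must be re-read afterwards
	 */
	if (hlen < sizeof(struct iphdr) || !pskb_may_pull(skb, hlen))
		return -EINVAL;

	return hlen;
}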
  15743. +
  15744. +/**
  15745. + * skb_headroom - bytes at buffer head
  15746. + * @skb: buffer to check
  15747. + *
  15748. + * Return the number of bytes of free space at the head of an &sk_buff.
  15749. + */
  15750. +static inline unsigned int skb_headroom(const struct sk_buff *skb)
  15751. +{
  15752. + return skb->data - skb->head;
  15753. +}
  15754. +
  15755. +/**
  15756. + * skb_tailroom - bytes at buffer end
  15757. + * @skb: buffer to check
  15758. + *
  15759. + * Return the number of bytes of free space at the tail of an sk_buff
  15760. + */
  15761. +static inline int skb_tailroom(const struct sk_buff *skb)
  15762. +{
  15763. + return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail;
  15764. +}
  15765. +
  15766. +/**
  15767. + * skb_availroom - bytes at buffer end
  15768. + * @skb: buffer to check
  15769. + *
  15770. + * Return the number of bytes of free space at the tail of an sk_buff
  15771. + * allocated by sk_stream_alloc()
  15772. + */
  15773. +static inline int skb_availroom(const struct sk_buff *skb)
  15774. +{
  15775. + if (skb_is_nonlinear(skb))
  15776. + return 0;
  15777. +
  15778. + return skb->end - skb->tail - skb->reserved_tailroom;
  15779. +}
  15780. +
  15781. +/**
  15782. + * skb_reserve - adjust headroom
  15783. + * @skb: buffer to alter
  15784. + * @len: bytes to move
  15785. + *
  15786. + * Increase the headroom of an empty &sk_buff by reducing the tail
  15787. + * room. This is only allowed for an empty buffer.
  15788. + */
  15789. +static inline void skb_reserve(struct sk_buff *skb, int len)
  15790. +{
  15791. + skb->data += len;
  15792. + skb->tail += len;
  15793. +}
  15794. +
  15795. +#define ENCAP_TYPE_ETHER 0
  15796. +#define ENCAP_TYPE_IPPROTO 1
  15797. +
  15798. +static inline void skb_set_inner_protocol(struct sk_buff *skb,
  15799. + __be16 protocol)
  15800. +{
  15801. + skb->inner_protocol = protocol;
  15802. + skb->inner_protocol_type = ENCAP_TYPE_ETHER;
  15803. +}
  15804. +
  15805. +static inline void skb_set_inner_ipproto(struct sk_buff *skb,
  15806. + __u8 ipproto)
  15807. +{
  15808. + skb->inner_ipproto = ipproto;
  15809. + skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
  15810. +}
  15811. +
  15812. +static inline void skb_reset_inner_headers(struct sk_buff *skb)
  15813. +{
  15814. + skb->inner_mac_header = skb->mac_header;
  15815. + skb->inner_network_header = skb->network_header;
  15816. + skb->inner_transport_header = skb->transport_header;
  15817. +}
  15818. +
  15819. +static inline void skb_reset_mac_len(struct sk_buff *skb)
  15820. +{
  15821. + skb->mac_len = skb->network_header - skb->mac_header;
  15822. +}
  15823. +
  15824. +static inline unsigned char *skb_inner_transport_header(const struct sk_buff
  15825. + *skb)
  15826. +{
  15827. + return skb->head + skb->inner_transport_header;
  15828. +}
  15829. +
  15830. +static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
  15831. +{
  15832. + skb->inner_transport_header = skb->data - skb->head;
  15833. +}
  15834. +
  15835. +static inline void skb_set_inner_transport_header(struct sk_buff *skb,
  15836. + const int offset)
  15837. +{
  15838. + skb_reset_inner_transport_header(skb);
  15839. + skb->inner_transport_header += offset;
  15840. +}
  15841. +
  15842. +static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
  15843. +{
  15844. + return skb->head + skb->inner_network_header;
  15845. +}
  15846. +
  15847. +static inline void skb_reset_inner_network_header(struct sk_buff *skb)
  15848. +{
  15849. + skb->inner_network_header = skb->data - skb->head;
  15850. +}
  15851. +
  15852. +static inline void skb_set_inner_network_header(struct sk_buff *skb,
  15853. + const int offset)
  15854. +{
  15855. + skb_reset_inner_network_header(skb);
  15856. + skb->inner_network_header += offset;
  15857. +}
  15858. +
  15859. +static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
  15860. +{
  15861. + return skb->head + skb->inner_mac_header;
  15862. +}
  15863. +
  15864. +static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
  15865. +{
  15866. + skb->inner_mac_header = skb->data - skb->head;
  15867. +}
  15868. +
  15869. +static inline void skb_set_inner_mac_header(struct sk_buff *skb,
  15870. + const int offset)
  15871. +{
  15872. + skb_reset_inner_mac_header(skb);
  15873. + skb->inner_mac_header += offset;
  15874. +}
  15875. +static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
  15876. +{
  15877. + return skb->transport_header != (typeof(skb->transport_header))~0U;
  15878. +}
  15879. +
  15880. +static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
  15881. +{
  15882. + return skb->head + skb->transport_header;
  15883. +}
  15884. +
  15885. +static inline void skb_reset_transport_header(struct sk_buff *skb)
  15886. +{
  15887. + skb->transport_header = skb->data - skb->head;
  15888. +}
  15889. +
  15890. +static inline void skb_set_transport_header(struct sk_buff *skb,
  15891. + const int offset)
  15892. +{
  15893. + skb_reset_transport_header(skb);
  15894. + skb->transport_header += offset;
  15895. +}
  15896. +
  15897. +static inline unsigned char *skb_network_header(const struct sk_buff *skb)
  15898. +{
  15899. + return skb->head + skb->network_header;
  15900. +}
  15901. +
  15902. +static inline void skb_reset_network_header(struct sk_buff *skb)
  15903. +{
  15904. + skb->network_header = skb->data - skb->head;
  15905. +}
  15906. +
  15907. +static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
  15908. +{
  15909. + skb_reset_network_header(skb);
  15910. + skb->network_header += offset;
  15911. +}
  15912. +
  15913. +static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
  15914. +{
  15915. + return skb->head + skb->mac_header;
  15916. +}
  15917. +
  15918. +static inline int skb_mac_header_was_set(const struct sk_buff *skb)
  15919. +{
  15920. + return skb->mac_header != (typeof(skb->mac_header))~0U;
  15921. +}
  15922. +
  15923. +static inline void skb_reset_mac_header(struct sk_buff *skb)
  15924. +{
  15925. + skb->mac_header = skb->data - skb->head;
  15926. +}
  15927. +
  15928. +static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
  15929. +{
  15930. + skb_reset_mac_header(skb);
  15931. + skb->mac_header += offset;
  15932. +}
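A minimal RX-side sketch (assumed) of the offset setters above for a plain Ethernet/IPv4 frame; mark_headers() is hypothetical and error handling is omitted.

#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/skbuff.h>

/* Hypothetical RX walk recording header offsets with the setters above. */
static void mark_headers(struct sk_buff *skb)
{
	const struct iphdr *iph;

	skb_reset_mac_header(skb);	/* skb->data is at the MAC header */
	skb_pull(skb, ETH_HLEN);

	skb_reset_network_header(skb);	/* now at the IP header */
	iph = (const struct iphdr *)skb->data;
	skb_set_transport_header(skb, iph->ihl * 4);
}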
  15933. +
  15934. +static inline void skb_pop_mac_header(struct sk_buff *skb)
  15935. +{
  15936. + skb->mac_header = skb->network_header;
  15937. +}
  15938. +
  15939. +static inline void skb_probe_transport_header(struct sk_buff *skb,
  15940. + const int offset_hint)
  15941. +{
  15942. + struct flow_keys keys;
  15943. +
  15944. + if (skb_transport_header_was_set(skb))
  15945. + return;
  15946. + else if (skb_flow_dissect(skb, &keys))
  15947. + skb_set_transport_header(skb, keys.thoff);
  15948. + else
  15949. + skb_set_transport_header(skb, offset_hint);
  15950. +}
  15951. +
  15952. +static inline void skb_mac_header_rebuild(struct sk_buff *skb)
  15953. +{
  15954. + if (skb_mac_header_was_set(skb)) {
  15955. + const unsigned char *old_mac = skb_mac_header(skb);
  15956. +
  15957. + skb_set_mac_header(skb, -skb->mac_len);
  15958. + memmove(skb_mac_header(skb), old_mac, skb->mac_len);
  15959. + }
  15960. +}
  15961. +
  15962. +static inline int skb_checksum_start_offset(const struct sk_buff *skb)
  15963. +{
  15964. + return skb->csum_start - skb_headroom(skb);
  15965. +}
  15966. +
  15967. +static inline int skb_transport_offset(const struct sk_buff *skb)
  15968. +{
  15969. + return skb_transport_header(skb) - skb->data;
  15970. +}
  15971. +
  15972. +static inline u32 skb_network_header_len(const struct sk_buff *skb)
  15973. +{
  15974. + return skb->transport_header - skb->network_header;
  15975. +}
  15976. +
  15977. +static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
  15978. +{
  15979. + return skb->inner_transport_header - skb->inner_network_header;
  15980. +}
  15981. +
  15982. +static inline int skb_network_offset(const struct sk_buff *skb)
  15983. +{
  15984. + return skb_network_header(skb) - skb->data;
  15985. +}
  15986. +
  15987. +static inline int skb_inner_network_offset(const struct sk_buff *skb)
  15988. +{
  15989. + return skb_inner_network_header(skb) - skb->data;
  15990. +}
  15991. +
  15992. +static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
  15993. +{
  15994. + return pskb_may_pull(skb, skb_network_offset(skb) + len);
  15995. +}
  15996. +
  15997. +/*
  15998. + * CPUs often take a performance hit when accessing unaligned memory
  15999. + * locations. The actual performance hit varies, it can be small if the
  16000. + * hardware handles it or large if we have to take an exception and fix it
  16001. + * in software.
  16002. + *
  16003. + * Since an ethernet header is 14 bytes network drivers often end up with
  16004. + * the IP header at an unaligned offset. The IP header can be aligned by
  16005. + * shifting the start of the packet by 2 bytes. Drivers should do this
  16006. + * with:
  16007. + *
  16008. + * skb_reserve(skb, NET_IP_ALIGN);
  16009. + *
  16010. + * The downside to this alignment of the IP header is that the DMA is now
  16011. + * unaligned. On some architectures the cost of an unaligned DMA is high
  16012. + * and this cost outweighs the gains made by aligning the IP header.
  16013. + *
  16014. + * Since this trade off varies between architectures, we allow NET_IP_ALIGN
  16015. + * to be overridden.
  16016. + */
  16017. +#ifndef NET_IP_ALIGN
  16018. +#define NET_IP_ALIGN 2
  16019. +#endif
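
A quick illustration of the pattern described above, as a hypothetical driver
receive-allocation helper (my_rx_alloc and MY_RX_BUF_LEN are assumptions, not
part of this header); the reserve must happen before any data is written:

	static struct sk_buff *my_rx_alloc(struct net_device *dev)
	{
		/* Ask for NET_IP_ALIGN extra bytes so reserving them does not
		 * eat into the space needed for the frame itself.
		 */
		struct sk_buff *skb = netdev_alloc_skb(dev, MY_RX_BUF_LEN + NET_IP_ALIGN);

		if (!skb)
			return NULL;
		/* Shift skb->data so the IP header lands on a 4-byte boundary. */
		skb_reserve(skb, NET_IP_ALIGN);
		return skb;
	}

netdev_alloc_skb_ip_align(), defined further down, wraps essentially this
allocate-then-reserve sequence.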
  16020. +
  16021. +/*
  16022. + * The networking layer reserves some headroom in skb data (via
  16023. + * dev_alloc_skb). This is used to avoid having to reallocate skb data when
  16024. + * the header has to grow. In the default case, if the header has to grow
  16025. + * 32 bytes or less we avoid the reallocation.
  16026. + *
  16027. + * Unfortunately this headroom changes the DMA alignment of the resulting
  16028. + * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
  16029. + * on some architectures. An architecture can override this value,
  16030. + * perhaps setting it to a cacheline in size (since that will maintain
  16031. + * cacheline alignment of the DMA). It must be a power of 2.
  16032. + *
  16033. + * Various parts of the networking layer expect at least 32 bytes of
  16034. + * headroom, you should not reduce this.
  16035. + *
  16036. + * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
  16037. + * to reduce average number of cache lines per packet.
  16038. + * get_rps_cpus() for example only accesses one 64-byte aligned block:
  16039. + * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
  16040. + */
  16041. +#ifndef NET_SKB_PAD
  16042. +#define NET_SKB_PAD max(32, L1_CACHE_BYTES)
  16043. +#endif
  16044. +
  16045. +int ___pskb_trim(struct sk_buff *skb, unsigned int len);
  16046. +
  16047. +static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
  16048. +{
  16049. + if (unlikely(skb_is_nonlinear(skb))) {
  16050. + WARN_ON(1);
  16051. + return;
  16052. + }
  16053. + skb->len = len;
  16054. + skb_set_tail_pointer(skb, len);
  16055. +}
  16056. +
  16057. +void skb_trim(struct sk_buff *skb, unsigned int len);
  16058. +
  16059. +static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
  16060. +{
  16061. + if (skb->data_len)
  16062. + return ___pskb_trim(skb, len);
  16063. + __skb_trim(skb, len);
  16064. + return 0;
  16065. +}
  16066. +
  16067. +static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
  16068. +{
  16069. + return (len < skb->len) ? __pskb_trim(skb, len) : 0;
  16070. +}
  16071. +
  16072. +/**
  16073. + * pskb_trim_unique - remove end from a paged unique (not cloned) buffer
  16074. + * @skb: buffer to alter
  16075. + * @len: new length
  16076. + *
  16077. + * This is identical to pskb_trim except that the caller knows that
  16078. + * the skb is not cloned so we should never get an error due to out-
  16079. + * of-memory.
  16080. + */
  16081. +static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
  16082. +{
  16083. + int err = pskb_trim(skb, len);
  16084. + BUG_ON(err);
  16085. +}
  16086. +
  16087. +/**
  16088. + * skb_orphan - orphan a buffer
  16089. + * @skb: buffer to orphan
  16090. + *
  16091. + * If a buffer currently has an owner then we call the owner's
  16092. + * destructor function and make the @skb unowned. The buffer continues
  16093. + * to exist but is no longer charged to its former owner.
  16094. + */
  16095. +static inline void skb_orphan(struct sk_buff *skb)
  16096. +{
  16097. + if (skb->destructor) {
  16098. + skb->destructor(skb);
  16099. + skb->destructor = NULL;
  16100. + skb->sk = NULL;
  16101. + } else {
  16102. + BUG_ON(skb->sk);
  16103. + }
  16104. +}
  16105. +
  16106. +/**
  16107. + * skb_orphan_frags - orphan the frags contained in a buffer
  16108. + * @skb: buffer to orphan frags from
  16109. + * @gfp_mask: allocation mask for replacement pages
  16110. + *
  16111. + * For each frag in the SKB which needs a destructor (i.e. has an
  16112. + * owner) create a copy of that frag and release the original
  16113. + * page by calling the destructor.
  16114. + */
  16115. +static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
  16116. +{
  16117. + if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
  16118. + return 0;
  16119. + return skb_copy_ubufs(skb, gfp_mask);
  16120. +}
  16121. +
  16122. +/**
  16123. + * __skb_queue_purge - empty a list
  16124. + * @list: list to empty
  16125. + *
  16126. + * Delete all buffers on an &sk_buff list. Each buffer is removed from
  16127. + * the list and one reference dropped. This function does not take the
  16128. + * list lock and the caller must hold the relevant locks to use it.
  16129. + */
  16130. +void skb_queue_purge(struct sk_buff_head *list);
  16131. +static inline void __skb_queue_purge(struct sk_buff_head *list)
  16132. +{
  16133. + struct sk_buff *skb;
  16134. + while ((skb = __skb_dequeue(list)) != NULL)
  16135. + kfree_skb(skb);
  16136. +}
  16137. +
  16138. +#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768)
  16139. +#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER)
  16140. +#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE
  16141. +
  16142. +void *netdev_alloc_frag(unsigned int fragsz);
  16143. +
  16144. +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
  16145. + gfp_t gfp_mask);
  16146. +
  16147. +/**
  16148. + * netdev_alloc_skb - allocate an skbuff for rx on a specific device
  16149. + * @dev: network device to receive on
  16150. + * @length: length to allocate
  16151. + *
  16152. + * Allocate a new &sk_buff and assign it a usage count of one. The
  16153. + * buffer has unspecified headroom built in. Users should allocate
  16154. + * the headroom they think they need without accounting for the
  16155. + * built in space. The built in space is used for optimisations.
  16156. + *
  16157. + * %NULL is returned if there is no free memory. Although this function
  16158. + * allocates memory it can be called from an interrupt.
  16159. + */
  16160. +static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
  16161. + unsigned int length)
  16162. +{
  16163. + return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
  16164. +}
  16165. +
  16166. +/* legacy helper around __netdev_alloc_skb() */
  16167. +static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
  16168. + gfp_t gfp_mask)
  16169. +{
  16170. + return __netdev_alloc_skb(NULL, length, gfp_mask);
  16171. +}
  16172. +
  16173. +/* legacy helper around netdev_alloc_skb() */
  16174. +static inline struct sk_buff *dev_alloc_skb(unsigned int length)
  16175. +{
  16176. + return netdev_alloc_skb(NULL, length);
  16177. +}
  16178. +
  16179. +
  16180. +static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
  16181. + unsigned int length, gfp_t gfp)
  16182. +{
  16183. + struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
  16184. +
  16185. + if (NET_IP_ALIGN && skb)
  16186. + skb_reserve(skb, NET_IP_ALIGN);
  16187. + return skb;
  16188. +}
  16189. +
  16190. +static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
  16191. + unsigned int length)
  16192. +{
  16193. + return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
  16194. +}
  16195. +
  16196. +/**
  16197. + * __skb_alloc_pages - allocate pages for ps-rx on a skb and preserve pfmemalloc data
  16198. + * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
  16199. + * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
  16200. + * @order: size of the allocation
  16201. + *
  16202. + * Allocate a new page.
  16203. + *
  16204. + * %NULL is returned if there is no free memory.
  16205. +*/
  16206. +static inline struct page *__skb_alloc_pages(gfp_t gfp_mask,
  16207. + struct sk_buff *skb,
  16208. + unsigned int order)
  16209. +{
  16210. + struct page *page;
  16211. +
  16212. + gfp_mask |= __GFP_COLD;
  16213. +
  16214. + if (!(gfp_mask & __GFP_NOMEMALLOC))
  16215. + gfp_mask |= __GFP_MEMALLOC;
  16216. +
  16217. + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
  16218. + if (skb && page && page->pfmemalloc)
  16219. + skb->pfmemalloc = true;
  16220. +
  16221. + return page;
  16222. +}
  16223. +
  16224. +/**
  16225. + * __skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data
  16226. + * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX
  16227. + * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used
  16228. + *
  16229. + * Allocate a new page.
  16230. + *
  16231. + * %NULL is returned if there is no free memory.
  16232. + */
  16233. +static inline struct page *__skb_alloc_page(gfp_t gfp_mask,
  16234. + struct sk_buff *skb)
  16235. +{
  16236. + return __skb_alloc_pages(gfp_mask, skb, 0);
  16237. +}
  16238. +
  16239. +/**
  16240. + * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
  16241. + * @page: The page that was allocated from skb_alloc_page
  16242. + * @skb: The skb that may need pfmemalloc set
  16243. + */
  16244. +static inline void skb_propagate_pfmemalloc(struct page *page,
  16245. + struct sk_buff *skb)
  16246. +{
  16247. + if (page && page->pfmemalloc)
  16248. + skb->pfmemalloc = true;
  16249. +}
  16250. +
  16251. +/**
  16252. + * skb_frag_page - retrieve the page referred to by a paged fragment
  16253. + * @frag: the paged fragment
  16254. + *
  16255. + * Returns the &struct page associated with @frag.
  16256. + */
  16257. +static inline struct page *skb_frag_page(const skb_frag_t *frag)
  16258. +{
  16259. + return frag->page.p;
  16260. +}
  16261. +
  16262. +/**
  16263. + * __skb_frag_ref - take an additional reference on a paged fragment.
  16264. + * @frag: the paged fragment
  16265. + *
  16266. + * Takes an additional reference on the paged fragment @frag.
  16267. + */
  16268. +static inline void __skb_frag_ref(skb_frag_t *frag)
  16269. +{
  16270. + get_page(skb_frag_page(frag));
  16271. +}
  16272. +
  16273. +/**
  16274. + * skb_frag_ref - take an additional reference on a paged fragment of an skb.
  16275. + * @skb: the buffer
  16276. + * @f: the fragment offset.
  16277. + *
  16278. + * Takes an additional reference on the @f'th paged fragment of @skb.
  16279. + */
  16280. +static inline void skb_frag_ref(struct sk_buff *skb, int f)
  16281. +{
  16282. + __skb_frag_ref(&skb_shinfo(skb)->frags[f]);
  16283. +}
  16284. +
  16285. +/**
  16286. + * __skb_frag_unref - release a reference on a paged fragment.
  16287. + * @frag: the paged fragment
  16288. + *
  16289. + * Releases a reference on the paged fragment @frag.
  16290. + */
  16291. +static inline void __skb_frag_unref(skb_frag_t *frag)
  16292. +{
  16293. + put_page(skb_frag_page(frag));
  16294. +}
  16295. +
  16296. +/**
  16297. + * skb_frag_unref - release a reference on a paged fragment of an skb.
  16298. + * @skb: the buffer
  16299. + * @f: the fragment offset
  16300. + *
  16301. + * Releases a reference on the @f'th paged fragment of @skb.
  16302. + */
  16303. +static inline void skb_frag_unref(struct sk_buff *skb, int f)
  16304. +{
  16305. + __skb_frag_unref(&skb_shinfo(skb)->frags[f]);
  16306. +}
  16307. +
  16308. +/**
  16309. + * skb_frag_address - gets the address of the data contained in a paged fragment
  16310. + * @frag: the paged fragment buffer
  16311. + *
  16312. + * Returns the address of the data within @frag. The page must already
  16313. + * be mapped.
  16314. + */
  16315. +static inline void *skb_frag_address(const skb_frag_t *frag)
  16316. +{
  16317. + return page_address(skb_frag_page(frag)) + frag->page_offset;
  16318. +}
  16319. +
  16320. +/**
  16321. + * skb_frag_address_safe - gets the address of the data contained in a paged fragment
  16322. + * @frag: the paged fragment buffer
  16323. + *
  16324. + * Returns the address of the data within @frag. Checks that the page
  16325. + * is mapped and returns %NULL otherwise.
  16326. + */
  16327. +static inline void *skb_frag_address_safe(const skb_frag_t *frag)
  16328. +{
  16329. + void *ptr = page_address(skb_frag_page(frag));
  16330. + if (unlikely(!ptr))
  16331. + return NULL;
  16332. +
  16333. + return ptr + frag->page_offset;
  16334. +}
  16335. +
  16336. +/**
  16337. + * __skb_frag_set_page - sets the page contained in a paged fragment
  16338. + * @frag: the paged fragment
  16339. + * @page: the page to set
  16340. + *
  16341. + * Sets the fragment @frag to contain @page.
  16342. + */
  16343. +static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page)
  16344. +{
  16345. + frag->page.p = page;
  16346. +}
  16347. +
  16348. +/**
  16349. + * skb_frag_set_page - sets the page contained in a paged fragment of an skb
  16350. + * @skb: the buffer
  16351. + * @f: the fragment offset
  16352. + * @page: the page to set
  16353. + *
  16354. + * Sets the @f'th fragment of @skb to contain @page.
  16355. + */
  16356. +static inline void skb_frag_set_page(struct sk_buff *skb, int f,
  16357. + struct page *page)
  16358. +{
  16359. + __skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
  16360. +}
  16361. +
  16362. +bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
  16363. +
  16364. +/**
  16365. + * skb_frag_dma_map - maps a paged fragment via the DMA API
  16366. + * @dev: the device to map the fragment to
  16367. + * @frag: the paged fragment to map
  16368. + * @offset: the offset within the fragment (starting at the
  16369. + * fragment's own offset)
  16370. + * @size: the number of bytes to map
  16371. + * @dir: the direction of the mapping (%DMA_*)
  16372. + *
  16373. + * Maps the page associated with @frag to @device.
  16374. + */
  16375. +static inline dma_addr_t skb_frag_dma_map(struct device *dev,
  16376. + const skb_frag_t *frag,
  16377. + size_t offset, size_t size,
  16378. + enum dma_data_direction dir)
  16379. +{
  16380. + return dma_map_page(dev, skb_frag_page(frag),
  16381. + frag->page_offset + offset, size, dir);
  16382. +}
  16383. +
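For context, a sketch of the per-fragment TX mapping loop this helper is meant
for (my_map_frags and the descriptor handling are hypothetical; a real driver
would also unmap the already-mapped fragments on error):

	static int my_map_frags(struct device *dma_dev, struct sk_buff *skb)
	{
		int i;

		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t dma;

			dma = skb_frag_dma_map(dma_dev, frag, 0,
					       skb_frag_size(frag), DMA_TO_DEVICE);
			if (dma_mapping_error(dma_dev, dma))
				return -ENOMEM;
			/* write dma and skb_frag_size(frag) into the next TX descriptor */
		}
		return 0;
	}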
  16384. +static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
  16385. + gfp_t gfp_mask)
  16386. +{
  16387. + return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
  16388. +}
  16389. +
  16390. +
  16391. +static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
  16392. + gfp_t gfp_mask)
  16393. +{
  16394. + return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true);
  16395. +}
  16396. +
  16397. +
  16398. +/**
  16399. + * skb_clone_writable - is the header of a clone writable
  16400. + * @skb: buffer to check
  16401. + * @len: length up to which to write
  16402. + *
  16403. + * Returns true if modifying the header part of the cloned buffer
  16404. + * does not require the data to be copied.
  16405. + */
  16406. +static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
  16407. +{
  16408. + return !skb_header_cloned(skb) &&
  16409. + skb_headroom(skb) + len <= skb->hdr_len;
  16410. +}
  16411. +
  16412. +static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
  16413. + int cloned)
  16414. +{
  16415. + int delta = 0;
  16416. +
  16417. + if (headroom > skb_headroom(skb))
  16418. + delta = headroom - skb_headroom(skb);
  16419. +
  16420. + if (delta || cloned)
  16421. + return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
  16422. + GFP_ATOMIC);
  16423. + return 0;
  16424. +}
  16425. +
  16426. +/**
  16427. + * skb_cow - copy header of skb when it is required
  16428. + * @skb: buffer to cow
  16429. + * @headroom: needed headroom
  16430. + *
  16431. + * If the skb passed lacks sufficient headroom or its data part
  16432. + * is shared, data is reallocated. If reallocation fails, an error
  16433. + * is returned and original skb is not changed.
  16434. + *
  16435. + * The result is skb with writable area skb->head...skb->tail
  16436. + * and at least @headroom of space at head.
  16437. + */
  16438. +static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
  16439. +{
  16440. + return __skb_cow(skb, headroom, skb_cloned(skb));
  16441. +}
  16442. +
  16443. +/**
  16444. + * skb_cow_head - skb_cow but only making the head writable
  16445. + * @skb: buffer to cow
  16446. + * @headroom: needed headroom
  16447. + *
  16448. + * This function is identical to skb_cow except that we replace the
  16449. + * skb_cloned check by skb_header_cloned. It should be used when
  16450. + * you only need to push on some header and do not need to modify
  16451. + * the data.
  16452. + */
  16453. +static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
  16454. +{
  16455. + return __skb_cow(skb, headroom, skb_header_cloned(skb));
  16456. +}
  16457. +
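A common calling pattern, sketched for a hypothetical encapsulation that pushes
MY_ENCAP_HLEN bytes of outer header (the names are placeholders; skb_push() is
declared earlier in this header):

	static int my_push_encap_header(struct sk_buff *skb)
	{
		u8 *hdr;
		int err;

		/* Make the header area private and large enough before writing to it. */
		err = skb_cow_head(skb, MY_ENCAP_HLEN);
		if (err)
			return err;

		hdr = skb_push(skb, MY_ENCAP_HLEN);
		memset(hdr, 0, MY_ENCAP_HLEN);	/* fill in the real header here */
		return 0;
	}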
  16458. +/**
  16459. + * skb_padto - pad an skbuff up to a minimal size
  16460. + * @skb: buffer to pad
  16461. + * @len: minimal length
  16462. + *
  16463. + * Pads up a buffer to ensure the trailing bytes exist and are
  16464. + * blanked. If the buffer already contains sufficient data it
  16465. + * is untouched. Otherwise it is extended. Returns zero on
  16466. + * success. The skb is freed on error.
  16467. + */
  16468. +
  16469. +static inline int skb_padto(struct sk_buff *skb, unsigned int len)
  16470. +{
  16471. + unsigned int size = skb->len;
  16472. + if (likely(size >= len))
  16473. + return 0;
  16474. + return skb_pad(skb, len - size);
  16475. +}
  16476. +
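The classic caller is a driver transmit path padding runt Ethernet frames; note
that on failure the skb has already been freed, so it must not be touched again
(fragment of a hypothetical ndo_start_xmit(), with ETH_ZLEN as the minimum
frame length):

	if (skb_padto(skb, ETH_ZLEN))
		return NETDEV_TX_OK;	/* skb was freed by skb_padto() */
	/* from here on at least ETH_ZLEN zero-padded bytes can be transmitted */
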
  16477. +static inline int skb_add_data(struct sk_buff *skb,
  16478. + char __user *from, int copy)
  16479. +{
  16480. + const int off = skb->len;
  16481. +
  16482. + if (skb->ip_summed == CHECKSUM_NONE) {
  16483. + int err = 0;
  16484. + __wsum csum = csum_and_copy_from_user(from, skb_put(skb, copy),
  16485. + copy, 0, &err);
  16486. + if (!err) {
  16487. + skb->csum = csum_block_add(skb->csum, csum, off);
  16488. + return 0;
  16489. + }
  16490. + } else if (!copy_from_user(skb_put(skb, copy), from, copy))
  16491. + return 0;
  16492. +
  16493. + __skb_trim(skb, off);
  16494. + return -EFAULT;
  16495. +}
  16496. +
  16497. +static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
  16498. + const struct page *page, int off)
  16499. +{
  16500. + if (i) {
  16501. + const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1];
  16502. +
  16503. + return page == skb_frag_page(frag) &&
  16504. + off == frag->page_offset + skb_frag_size(frag);
  16505. + }
  16506. + return false;
  16507. +}
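
This is how send paths typically decide between growing the last fragment and
consuming a new fragment slot (my_append_page is a hypothetical sketch; real
callers also check against MAX_SKB_FRAGS first and update skb->len, data_len
and truesize afterwards):

	static void my_append_page(struct sk_buff *skb, struct page *page,
				   int offset, int copy)
	{
		int i = skb_shinfo(skb)->nr_frags;

		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}
	}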
  16508. +
  16509. +static inline int __skb_linearize(struct sk_buff *skb)
  16510. +{
  16511. + return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
  16512. +}
  16513. +
  16514. +/**
  16515. + * skb_linearize - convert paged skb to linear one
  16516. + * @skb: buffer to linearize
  16517. + *
  16518. + * If there is no free memory -ENOMEM is returned, otherwise zero
  16519. + * is returned and the old skb data released.
  16520. + */
  16521. +static inline int skb_linearize(struct sk_buff *skb)
  16522. +{
  16523. + return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0;
  16524. +}
  16525. +
  16526. +/**
  16527. + * skb_has_shared_frag - can any frag be overwritten
  16528. + * @skb: buffer to test
  16529. + *
  16530. + * Return true if the skb has at least one frag that might be modified
  16531. + * by an external entity (as in vmsplice()/sendfile())
  16532. + */
  16533. +static inline bool skb_has_shared_frag(const struct sk_buff *skb)
  16534. +{
  16535. + return skb_is_nonlinear(skb) &&
  16536. + skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
  16537. +}
  16538. +
  16539. +/**
  16540. + * skb_linearize_cow - make sure skb is linear and writable
  16541. + * @skb: buffer to process
  16542. + *
  16543. + * If there is no free memory -ENOMEM is returned, otherwise zero
  16544. + * is returned and the old skb data released.
  16545. + */
  16546. +static inline int skb_linearize_cow(struct sk_buff *skb)
  16547. +{
  16548. + return skb_is_nonlinear(skb) || skb_cloned(skb) ?
  16549. + __skb_linearize(skb) : 0;
  16550. +}
  16551. +
  16552. +/**
  16553. + * skb_postpull_rcsum - update checksum for received skb after pull
  16554. + * @skb: buffer to update
  16555. + * @start: start of data before pull
  16556. + * @len: length of data pulled
  16557. + *
  16558. + * After doing a pull on a received packet, you need to call this to
  16559. + * update the CHECKSUM_COMPLETE checksum, or set ip_summed to
  16560. + * CHECKSUM_NONE so that it can be recomputed from scratch.
  16561. + */
  16562. +
  16563. +static inline void skb_postpull_rcsum(struct sk_buff *skb,
  16564. + const void *start, unsigned int len)
  16565. +{
  16566. + if (skb->ip_summed == CHECKSUM_COMPLETE)
  16567. + skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0));
  16568. +}
  16569. +
  16570. +unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);
  16571. +
  16572. +/**
  16573. + * pskb_trim_rcsum - trim received skb and update checksum
  16574. + * @skb: buffer to trim
  16575. + * @len: new length
  16576. + *
  16577. + * This is exactly the same as pskb_trim except that it ensures the
  16578. + * checksum of received packets are still valid after the operation.
  16579. + */
  16580. +
  16581. +static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
  16582. +{
  16583. + if (likely(len >= skb->len))
  16584. + return 0;
  16585. + if (skb->ip_summed == CHECKSUM_COMPLETE)
  16586. + skb->ip_summed = CHECKSUM_NONE;
  16587. + return __pskb_trim(skb, len);
  16588. +}
  16589. +
  16590. +#define skb_queue_walk(queue, skb) \
  16591. + for (skb = (queue)->next; \
  16592. + skb != (struct sk_buff *)(queue); \
  16593. + skb = skb->next)
  16594. +
  16595. +#define skb_queue_walk_safe(queue, skb, tmp) \
  16596. + for (skb = (queue)->next, tmp = skb->next; \
  16597. + skb != (struct sk_buff *)(queue); \
  16598. + skb = tmp, tmp = skb->next)
  16599. +
  16600. +#define skb_queue_walk_from(queue, skb) \
  16601. + for (; skb != (struct sk_buff *)(queue); \
  16602. + skb = skb->next)
  16603. +
  16604. +#define skb_queue_walk_from_safe(queue, skb, tmp) \
  16605. + for (tmp = skb->next; \
  16606. + skb != (struct sk_buff *)(queue); \
  16607. + skb = tmp, tmp = skb->next)
  16608. +
  16609. +#define skb_queue_reverse_walk(queue, skb) \
  16610. + for (skb = (queue)->prev; \
  16611. + skb != (struct sk_buff *)(queue); \
  16612. + skb = skb->prev)
  16613. +
  16614. +#define skb_queue_reverse_walk_safe(queue, skb, tmp) \
  16615. + for (skb = (queue)->prev, tmp = skb->prev; \
  16616. + skb != (struct sk_buff *)(queue); \
  16617. + skb = tmp, tmp = skb->prev)
  16618. +
  16619. +#define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \
  16620. + for (tmp = skb->prev; \
  16621. + skb != (struct sk_buff *)(queue); \
  16622. + skb = tmp, tmp = skb->prev)
  16623. +
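A brief example of the _safe variant, which is required whenever the loop body
unlinks the current entry (my_should_drop() is a hypothetical predicate; the
caller is assumed to hold whatever lock protects the list):

	static void my_prune_queue(struct sk_buff_head *list)
	{
		struct sk_buff *skb, *tmp;

		skb_queue_walk_safe(list, skb, tmp) {
			if (my_should_drop(skb)) {
				__skb_unlink(skb, list);
				kfree_skb(skb);
			}
		}
	}
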
  16624. +static inline bool skb_has_frag_list(const struct sk_buff *skb)
  16625. +{
  16626. + return skb_shinfo(skb)->frag_list != NULL;
  16627. +}
  16628. +
  16629. +static inline void skb_frag_list_init(struct sk_buff *skb)
  16630. +{
  16631. + skb_shinfo(skb)->frag_list = NULL;
  16632. +}
  16633. +
  16634. +static inline void skb_frag_add_head(struct sk_buff *skb, struct sk_buff *frag)
  16635. +{
  16636. + frag->next = skb_shinfo(skb)->frag_list;
  16637. + skb_shinfo(skb)->frag_list = frag;
  16638. +}
  16639. +
  16640. +#define skb_walk_frags(skb, iter) \
  16641. + for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)
  16642. +
  16643. +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
  16644. + int *peeked, int *off, int *err);
  16645. +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
  16646. + int *err);
  16647. +unsigned int datagram_poll(struct file *file, struct socket *sock,
  16648. + struct poll_table_struct *wait);
  16649. +int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
  16650. + struct iovec *to, int size);
  16651. +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
  16652. + struct iovec *iov);
  16653. +int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
  16654. + const struct iovec *from, int from_offset,
  16655. + int len);
  16656. +int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
  16657. + int offset, size_t count);
  16658. +int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
  16659. + const struct iovec *to, int to_offset,
  16660. + int size);
  16661. +void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
  16662. +void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
  16663. +int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
  16664. +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
  16665. +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
  16666. +__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
  16667. + int len, __wsum csum);
  16668. +int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
  16669. + struct pipe_inode_info *pipe, unsigned int len,
  16670. + unsigned int flags);
  16671. +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
  16672. +unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
  16673. +int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
  16674. + int len, int hlen);
  16675. +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
  16676. +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
  16677. +void skb_scrub_packet(struct sk_buff *skb, bool xnet);
  16678. +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
  16679. +struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
  16680. +struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
  16681. +
  16682. +struct skb_checksum_ops {
  16683. + __wsum (*update)(const void *mem, int len, __wsum wsum);
  16684. + __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
  16685. +};
  16686. +
  16687. +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
  16688. + __wsum csum, const struct skb_checksum_ops *ops);
  16689. +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
  16690. + __wsum csum);
  16691. +
  16692. +static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset,
  16693. + int len, void *data, int hlen, void *buffer)
  16694. +{
  16695. + if (hlen - offset >= len)
  16696. + return data + offset;
  16697. +
  16698. + if (!skb ||
  16699. + skb_copy_bits(skb, offset, buffer, len) < 0)
  16700. + return NULL;
  16701. +
  16702. + return buffer;
  16703. +}
  16704. +
  16705. +static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
  16706. + int len, void *buffer)
  16707. +{
  16708. + return __skb_header_pointer(skb, offset, len, skb->data,
  16709. + skb_headlen(skb), buffer);
  16710. +}
  16711. +
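Typical usage is pulling a protocol header out of an skb regardless of whether
that header is linear, with an on-stack copy as the fallback storage (sketched
here for TCP; a NULL return means the packet is too short):

	static const struct tcphdr *my_tcp_hdr(const struct sk_buff *skb,
					       struct tcphdr *_buf)
	{
		return skb_header_pointer(skb, skb_transport_offset(skb),
					  sizeof(*_buf), _buf);
	}
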
  16712. +/**
  16713. + * skb_needs_linearize - check if we need to linearize a given skb
  16714. + * depending on the given device features.
  16715. + * @skb: socket buffer to check
  16716. + * @features: net device features
  16717. + *
  16718. + * Returns true if either:
  16719. + * 1. skb has frag_list and the device doesn't support FRAGLIST, or
  16720. + * 2. skb is fragmented and the device does not support SG.
  16721. + */
  16722. +static inline bool skb_needs_linearize(struct sk_buff *skb,
  16723. + netdev_features_t features)
  16724. +{
  16725. + return skb_is_nonlinear(skb) &&
  16726. + ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) ||
  16727. + (skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG)));
  16728. +}
  16729. +
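Transmit paths use this as a last-resort fallback right before handing the skb
to the hardware, roughly as below (fragment of a hypothetical xmit routine;
features would normally come from netif_skb_features()):

	if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) {
		kfree_skb(skb);		/* could not linearize, drop the packet */
		return NETDEV_TX_OK;
	}
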
  16730. +static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
  16731. + void *to,
  16732. + const unsigned int len)
  16733. +{
  16734. + memcpy(to, skb->data, len);
  16735. +}
  16736. +
  16737. +static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
  16738. + const int offset, void *to,
  16739. + const unsigned int len)
  16740. +{
  16741. + memcpy(to, skb->data + offset, len);
  16742. +}
  16743. +
  16744. +static inline void skb_copy_to_linear_data(struct sk_buff *skb,
  16745. + const void *from,
  16746. + const unsigned int len)
  16747. +{
  16748. + memcpy(skb->data, from, len);
  16749. +}
  16750. +
  16751. +static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
  16752. + const int offset,
  16753. + const void *from,
  16754. + const unsigned int len)
  16755. +{
  16756. + memcpy(skb->data + offset, from, len);
  16757. +}
  16758. +
  16759. +void skb_init(void);
  16760. +
  16761. +static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
  16762. +{
  16763. + return skb->tstamp;
  16764. +}
  16765. +
  16766. +/**
  16767. + * skb_get_timestamp - get timestamp from a skb
  16768. + * @skb: skb to get stamp from
  16769. + * @stamp: pointer to struct timeval to store stamp in
  16770. + *
  16771. + * Timestamps are stored in the skb as offsets to a base timestamp.
  16772. + * This function converts the offset back to a struct timeval and stores
  16773. + * it in stamp.
  16774. + */
  16775. +static inline void skb_get_timestamp(const struct sk_buff *skb,
  16776. + struct timeval *stamp)
  16777. +{
  16778. + *stamp = ktime_to_timeval(skb->tstamp);
  16779. +}
  16780. +
  16781. +static inline void skb_get_timestampns(const struct sk_buff *skb,
  16782. + struct timespec *stamp)
  16783. +{
  16784. + *stamp = ktime_to_timespec(skb->tstamp);
  16785. +}
  16786. +
  16787. +static inline void __net_timestamp(struct sk_buff *skb)
  16788. +{
  16789. + skb->tstamp = ktime_get_real();
  16790. +}
  16791. +
  16792. +static inline ktime_t net_timedelta(ktime_t t)
  16793. +{
  16794. + return ktime_sub(ktime_get_real(), t);
  16795. +}
  16796. +
  16797. +static inline ktime_t net_invalid_timestamp(void)
  16798. +{
  16799. + return ktime_set(0, 0);
  16800. +}
  16801. +
  16802. +struct sk_buff *skb_clone_sk(struct sk_buff *skb);
  16803. +
  16804. +#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
  16805. +
  16806. +void skb_clone_tx_timestamp(struct sk_buff *skb);
  16807. +bool skb_defer_rx_timestamp(struct sk_buff *skb);
  16808. +
  16809. +#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */
  16810. +
  16811. +static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
  16812. +{
  16813. +}
  16814. +
  16815. +static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
  16816. +{
  16817. + return false;
  16818. +}
  16819. +
  16820. +#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */
  16821. +
  16822. +/**
  16823. + * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
  16824. + *
  16825. + * PHY drivers may accept clones of transmitted packets for
  16826. + * timestamping via their phy_driver.txtstamp method. These drivers
  16827. + * must call this function to return the skb back to the stack, with
  16828. + * or without a timestamp.
  16829. + *
  16830. + * @skb: clone of the original outgoing packet
  16831. + * @hwtstamps: hardware time stamps, may be NULL if not available
  16832. + *
  16833. + */
  16834. +void skb_complete_tx_timestamp(struct sk_buff *skb,
  16835. + struct skb_shared_hwtstamps *hwtstamps);
  16836. +
  16837. +void __skb_tstamp_tx(struct sk_buff *orig_skb,
  16838. + struct skb_shared_hwtstamps *hwtstamps,
  16839. + struct sock *sk, int tstype);
  16840. +
  16841. +/**
  16842. + * skb_tstamp_tx - queue clone of skb with send time stamps
  16843. + * @orig_skb: the original outgoing packet
  16844. + * @hwtstamps: hardware time stamps, may be NULL if not available
  16845. + *
  16846. + * If the skb has a socket associated, then this function clones the
  16847. + * skb (thus sharing the actual data and optional structures), stores
  16848. + * the optional hardware time stamping information (if non NULL) or
  16849. + * generates a software time stamp (otherwise), then queues the clone
  16850. + * to the error queue of the socket. Errors are silently ignored.
  16851. + */
  16852. +void skb_tstamp_tx(struct sk_buff *orig_skb,
  16853. + struct skb_shared_hwtstamps *hwtstamps);
  16854. +
  16855. +static inline void sw_tx_timestamp(struct sk_buff *skb)
  16856. +{
  16857. + if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP &&
  16858. + !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
  16859. + skb_tstamp_tx(skb, NULL);
  16860. +}
  16861. +
  16862. +/**
  16863. + * skb_tx_timestamp() - Driver hook for transmit timestamping
  16864. + *
  16865. + * Ethernet MAC Drivers should call this function in their hard_xmit()
  16866. + * function immediately before giving the sk_buff to the MAC hardware.
  16867. + *
  16868. + * Specifically, one should make absolutely sure that this function is
  16869. + * called before TX completion of this packet can trigger. Otherwise
  16870. + * the packet could potentially already be freed.
  16871. + *
  16872. + * @skb: A socket buffer.
  16873. + */
  16874. +static inline void skb_tx_timestamp(struct sk_buff *skb)
  16875. +{
  16876. + skb_clone_tx_timestamp(skb);
  16877. + sw_tx_timestamp(skb);
  16878. +}
  16879. +
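A minimal sketch of where the call sits in a driver's ndo_start_xmit()
(everything other than skb_tx_timestamp() is a hypothetical placeholder):

	static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		/* ... map the skb and fill in the TX descriptors ... */

		/* Last point at which the skb is guaranteed to still exist:
		 * once the hardware is kicked, TX completion may free it.
		 */
		skb_tx_timestamp(skb);

		/* ... notify the hardware ... */
		return NETDEV_TX_OK;
	}
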
  16880. +/**
  16881. + * skb_complete_wifi_ack - deliver skb with wifi status
  16882. + *
  16883. + * @skb: the original outgoing packet
  16884. + * @acked: ack status
  16885. + *
  16886. + */
  16887. +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);
  16888. +
  16889. +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
  16890. +__sum16 __skb_checksum_complete(struct sk_buff *skb);
  16891. +
  16892. +static inline int skb_csum_unnecessary(const struct sk_buff *skb)
  16893. +{
  16894. + return ((skb->ip_summed & CHECKSUM_UNNECESSARY) || skb->csum_valid);
  16895. +}
  16896. +
  16897. +/**
  16898. + * skb_checksum_complete - Calculate checksum of an entire packet
  16899. + * @skb: packet to process
  16900. + *
  16901. + * This function calculates the checksum over the entire packet plus
  16902. + * the value of skb->csum. The latter can be used to supply the
  16903. + * checksum of a pseudo header as used by TCP/UDP. It returns the
  16904. + * checksum.
  16905. + *
  16906. + * For protocols that contain complete checksums such as ICMP/TCP/UDP,
  16907. + * this function can be used to verify the checksum on received
  16908. + * packets. In that case the function should return zero if the
  16909. + * checksum is correct. In particular, this function will return zero
  16910. + * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the
  16911. + * hardware has already verified the correctness of the checksum.
  16912. + */
  16913. +static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
  16914. +{
  16915. + return skb_csum_unnecessary(skb) ?
  16916. + 0 : __skb_checksum_complete(skb);
  16917. +}
  16918. +
  16919. +static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
  16920. +{
  16921. + if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
  16922. + if (skb->csum_level == 0)
  16923. + skb->ip_summed = CHECKSUM_NONE;
  16924. + else
  16925. + skb->csum_level--;
  16926. + }
  16927. +}
  16928. +
  16929. +static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
  16930. +{
  16931. + if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
  16932. + if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
  16933. + skb->csum_level++;
  16934. + } else if (skb->ip_summed == CHECKSUM_NONE) {
  16935. + skb->ip_summed = CHECKSUM_UNNECESSARY;
  16936. + skb->csum_level = 0;
  16937. + }
  16938. +}
  16939. +
  16940. +static inline void __skb_mark_checksum_bad(struct sk_buff *skb)
  16941. +{
  16942. + /* Mark current checksum as bad (typically called from GRO
  16943. + * path). In the case that ip_summed is CHECKSUM_NONE
  16944. + * this must be the first checksum encountered in the packet.
  16945. + * When ip_summed is CHECKSUM_UNNECESSARY, this is the first
  16946. + * checksum after the last one validated. For UDP, a zero
  16947. + * checksum can not be marked as bad.
  16948. + */
  16949. +
  16950. + if (skb->ip_summed == CHECKSUM_NONE ||
  16951. + skb->ip_summed == CHECKSUM_UNNECESSARY)
  16952. + skb->csum_bad = 1;
  16953. +}
  16954. +
  16955. +/* Check if we need to perform checksum complete validation.
  16956. + *
  16957. + * Returns true if checksum complete is needed, false otherwise
  16958. + * (either checksum is unnecessary or zero checksum is allowed).
  16959. + */
  16960. +static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
  16961. + bool zero_okay,
  16962. + __sum16 check)
  16963. +{
  16964. + if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
  16965. + skb->csum_valid = 1;
  16966. + __skb_decr_checksum_unnecessary(skb);
  16967. + return false;
  16968. + }
  16969. +
  16970. + return true;
  16971. +}
  16972. +
  16973. +/* For small packets <= CHECKSUM_BREAK perform checksum complete directly
  16974. + * in checksum_init.
  16975. + */
  16976. +#define CHECKSUM_BREAK 76
  16977. +
  16978. +/* Unset checksum-complete
  16979. + *
  16980. + * Unset checksum complete can be done when packet is being modified
  16981. + * (uncompressed for instance) and checksum-complete value is
  16982. + * invalidated.
  16983. + */
  16984. +static inline void skb_checksum_complete_unset(struct sk_buff *skb)
  16985. +{
  16986. + if (skb->ip_summed == CHECKSUM_COMPLETE)
  16987. + skb->ip_summed = CHECKSUM_NONE;
  16988. +}
  16989. +
  16990. +/* Validate (init) checksum based on checksum complete.
  16991. + *
  16992. + * Return values:
  16993. + * 0: checksum is validated or will be attempted in skb_checksum_complete. In the latter
  16994. + * case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo
  16995. + * checksum is stored in skb->csum for use in __skb_checksum_complete
  16996. + * non-zero: value of invalid checksum
  16997. + *
  16998. + */
  16999. +static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
  17000. + bool complete,
  17001. + __wsum psum)
  17002. +{
  17003. + if (skb->ip_summed == CHECKSUM_COMPLETE) {
  17004. + if (!csum_fold(csum_add(psum, skb->csum))) {
  17005. + skb->csum_valid = 1;
  17006. + return 0;
  17007. + }
  17008. + } else if (skb->csum_bad) {
  17009. + /* ip_summed == CHECKSUM_NONE in this case */
  17010. + return 1;
  17011. + }
  17012. +
  17013. + skb->csum = psum;
  17014. +
  17015. + if (complete || skb->len <= CHECKSUM_BREAK) {
  17016. + __sum16 csum;
  17017. +
  17018. + csum = __skb_checksum_complete(skb);
  17019. + skb->csum_valid = !csum;
  17020. + return csum;
  17021. + }
  17022. +
  17023. + return 0;
  17024. +}
  17025. +
  17026. +static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
  17027. +{
  17028. + return 0;
  17029. +}
  17030. +
  17031. +/* Perform checksum validate (init). Note that this is a macro since we only
  17032. + * want to compute the pseudo header (supplied as an input function) when necessary.
  17033. + * First we try to validate without any computation (checksum unnecessary) and
  17034. + * then calculate based on checksum complete calling the function to compute
  17035. + * pseudo header.
  17036. + *
  17037. + * Return values:
  17038. + * 0: checksum is validated or try to in skb_checksum_complete
  17039. + * non-zero: value of invalid checksum
  17040. + */
  17041. +#define __skb_checksum_validate(skb, proto, complete, \
  17042. + zero_okay, check, compute_pseudo) \
  17043. +({ \
  17044. + __sum16 __ret = 0; \
  17045. + skb->csum_valid = 0; \
  17046. + if (__skb_checksum_validate_needed(skb, zero_okay, check)) \
  17047. + __ret = __skb_checksum_validate_complete(skb, \
  17048. + complete, compute_pseudo(skb, proto)); \
  17049. + __ret; \
  17050. +})
  17051. +
  17052. +#define skb_checksum_init(skb, proto, compute_pseudo) \
  17053. + __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)
  17054. +
  17055. +#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo) \
  17056. + __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)
  17057. +
  17058. +#define skb_checksum_validate(skb, proto, compute_pseudo) \
  17059. + __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)
  17060. +
  17061. +#define skb_checksum_validate_zero_check(skb, proto, check, \
  17062. + compute_pseudo) \
  17063. + __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)
  17064. +
  17065. +#define skb_checksum_simple_validate(skb) \
  17066. + __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)
  17067. +
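For reference, a receive-side sketch of how a transport protocol might use
these macros (my_compute_pseudo() stands in for the protocol's real
pseudo-header routine, e.g. one built on csum_tcpudp_nofold() for IPv4):

	static __wsum my_compute_pseudo(struct sk_buff *skb, int proto)
	{
		/* placeholder: return the pseudo-header checksum for this packet */
		return 0;
	}

	static int my_l4_checksum_init(struct sk_buff *skb, int proto)
	{
		/* Non-zero means the checksum is already known to be bad. */
		if (skb_checksum_init(skb, proto, my_compute_pseudo))
			return -EINVAL;
		return 0;
	}
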
  17068. +static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
  17069. +{
  17070. + return (skb->ip_summed == CHECKSUM_NONE &&
  17071. + skb->csum_valid && !skb->csum_bad);
  17072. +}
  17073. +
  17074. +static inline void __skb_checksum_convert(struct sk_buff *skb,
  17075. + __sum16 check, __wsum pseudo)
  17076. +{
  17077. + skb->csum = ~pseudo;
  17078. + skb->ip_summed = CHECKSUM_COMPLETE;
  17079. +}
  17080. +
  17081. +#define skb_checksum_try_convert(skb, proto, check, compute_pseudo) \
  17082. +do { \
  17083. + if (__skb_checksum_convert_check(skb)) \
  17084. + __skb_checksum_convert(skb, check, \
  17085. + compute_pseudo(skb, proto)); \
  17086. +} while (0)
  17087. +
  17088. +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  17089. +void nf_conntrack_destroy(struct nf_conntrack *nfct);
  17090. +static inline void nf_conntrack_put(struct nf_conntrack *nfct)
  17091. +{
  17092. + if (nfct && atomic_dec_and_test(&nfct->use))
  17093. + nf_conntrack_destroy(nfct);
  17094. +}
  17095. +static inline void nf_conntrack_get(struct nf_conntrack *nfct)
  17096. +{
  17097. + if (nfct)
  17098. + atomic_inc(&nfct->use);
  17099. +}
  17100. +#endif
  17101. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  17102. +static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
  17103. +{
  17104. + if (nf_bridge && atomic_dec_and_test(&nf_bridge->use))
  17105. + kfree(nf_bridge);
  17106. +}
  17107. +static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
  17108. +{
  17109. + if (nf_bridge)
  17110. + atomic_inc(&nf_bridge->use);
  17111. +}
  17112. +#endif /* CONFIG_BRIDGE_NETFILTER */
  17113. +static inline void nf_reset(struct sk_buff *skb)
  17114. +{
  17115. +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  17116. + nf_conntrack_put(skb->nfct);
  17117. + skb->nfct = NULL;
  17118. +#endif
  17119. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  17120. + nf_bridge_put(skb->nf_bridge);
  17121. + skb->nf_bridge = NULL;
  17122. +#endif
  17123. +}
  17124. +
  17125. +static inline void nf_reset_trace(struct sk_buff *skb)
  17126. +{
  17127. +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
  17128. + skb->nf_trace = 0;
  17129. +#endif
  17130. +}
  17131. +
  17132. +/* Note: This doesn't put any conntrack and bridge info in dst. */
  17133. +static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
  17134. + bool copy)
  17135. +{
  17136. +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  17137. + dst->nfct = src->nfct;
  17138. + nf_conntrack_get(src->nfct);
  17139. + if (copy)
  17140. + dst->nfctinfo = src->nfctinfo;
  17141. +#endif
  17142. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  17143. + dst->nf_bridge = src->nf_bridge;
  17144. + nf_bridge_get(src->nf_bridge);
  17145. +#endif
  17146. +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
  17147. + if (copy)
  17148. + dst->nf_trace = src->nf_trace;
  17149. +#endif
  17150. +}
  17151. +
  17152. +static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
  17153. +{
  17154. +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
  17155. + nf_conntrack_put(dst->nfct);
  17156. +#endif
  17157. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  17158. + nf_bridge_put(dst->nf_bridge);
  17159. +#endif
  17160. + __nf_copy(dst, src, true);
  17161. +}
  17162. +
  17163. +#ifdef CONFIG_NETWORK_SECMARK
  17164. +static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
  17165. +{
  17166. + to->secmark = from->secmark;
  17167. +}
  17168. +
  17169. +static inline void skb_init_secmark(struct sk_buff *skb)
  17170. +{
  17171. + skb->secmark = 0;
  17172. +}
  17173. +#else
  17174. +static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
  17175. +{ }
  17176. +
  17177. +static inline void skb_init_secmark(struct sk_buff *skb)
  17178. +{ }
  17179. +#endif
  17180. +
  17181. +static inline bool skb_irq_freeable(const struct sk_buff *skb)
  17182. +{
  17183. + return !skb->destructor &&
  17184. +#if IS_ENABLED(CONFIG_XFRM)
  17185. + !skb->sp &&
  17186. +#endif
  17187. +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
  17188. + !skb->nfct &&
  17189. +#endif
  17190. + !skb->_skb_refdst &&
  17191. + !skb_has_frag_list(skb);
  17192. +}
  17193. +
  17194. +static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
  17195. +{
  17196. + skb->queue_mapping = queue_mapping;
  17197. +}
  17198. +
  17199. +static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
  17200. +{
  17201. + return skb->queue_mapping;
  17202. +}
  17203. +
  17204. +static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
  17205. +{
  17206. + to->queue_mapping = from->queue_mapping;
  17207. +}
  17208. +
  17209. +static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
  17210. +{
  17211. + skb->queue_mapping = rx_queue + 1;
  17212. +}
  17213. +
  17214. +static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
  17215. +{
  17216. + return skb->queue_mapping - 1;
  17217. +}
  17218. +
  17219. +static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
  17220. +{
  17221. + return skb->queue_mapping != 0;
  17222. +}
  17223. +
  17224. +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
  17225. + unsigned int num_tx_queues);
  17226. +
  17227. +static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
  17228. +{
  17229. +#ifdef CONFIG_XFRM
  17230. + return skb->sp;
  17231. +#else
  17232. + return NULL;
  17233. +#endif
  17234. +}
  17235. +
  17236. +/* Keeps track of mac header offset relative to skb->head.
  17237. + * It is useful for TSO of tunneling protocols, e.g. GRE.
  17238. + * For non-tunnel skb it points to skb_mac_header() and for
  17239. + * tunnel skb it points to outer mac header.
  17240. + * Keeps track of level of encapsulation of network headers.
  17241. + */
  17242. +struct skb_gso_cb {
  17243. + int mac_offset;
  17244. + int encap_level;
  17245. + __u16 csum_start;
  17246. +};
  17247. +#define SKB_GSO_CB(skb) ((struct skb_gso_cb *)(skb)->cb)
  17248. +
  17249. +static inline int skb_tnl_header_len(const struct sk_buff *inner_skb)
  17250. +{
  17251. + return (skb_mac_header(inner_skb) - inner_skb->head) -
  17252. + SKB_GSO_CB(inner_skb)->mac_offset;
  17253. +}
  17254. +
  17255. +static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra)
  17256. +{
  17257. + int new_headroom, headroom;
  17258. + int ret;
  17259. +
  17260. + headroom = skb_headroom(skb);
  17261. + ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC);
  17262. + if (ret)
  17263. + return ret;
  17264. +
  17265. + new_headroom = skb_headroom(skb);
  17266. + SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom);
  17267. + return 0;
  17268. +}
  17269. +
  17270. +/* Compute the checksum for a gso segment. First compute the checksum value
  17271. + * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and
  17272. + * then add in skb->csum (checksum from csum_start to end of packet).
  17273. + * skb->csum and csum_start are then updated to reflect the checksum of the
  17274. + * resultant packet starting from the transport header-- the resultant checksum
  17275. + * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo
  17276. + * header).
  17277. + */
  17278. +static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res)
  17279. +{
  17280. + int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) -
  17281. + skb_transport_offset(skb);
  17282. + __u16 csum;
  17283. +
  17284. + csum = csum_fold(csum_partial(skb_transport_header(skb),
  17285. + plen, skb->csum));
  17286. + skb->csum = res;
  17287. + SKB_GSO_CB(skb)->csum_start -= plen;
  17288. +
  17289. + return csum;
  17290. +}
  17291. +
  17292. +static inline bool skb_is_gso(const struct sk_buff *skb)
  17293. +{
  17294. + return skb_shinfo(skb)->gso_size;
  17295. +}
  17296. +
  17297. +/* Note: Should be called only if skb_is_gso(skb) is true */
  17298. +static inline bool skb_is_gso_v6(const struct sk_buff *skb)
  17299. +{
  17300. + return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6;
  17301. +}
  17302. +
  17303. +void __skb_warn_lro_forwarding(const struct sk_buff *skb);
  17304. +
  17305. +static inline bool skb_warn_if_lro(const struct sk_buff *skb)
  17306. +{
  17307. + /* LRO sets gso_size but not gso_type, whereas if GSO is really
  17308. + * wanted then gso_type will be set. */
  17309. + const struct skb_shared_info *shinfo = skb_shinfo(skb);
  17310. +
  17311. + if (skb_is_nonlinear(skb) && shinfo->gso_size != 0 &&
  17312. + unlikely(shinfo->gso_type == 0)) {
  17313. + __skb_warn_lro_forwarding(skb);
  17314. + return true;
  17315. + }
  17316. + return false;
  17317. +}
  17318. +
  17319. +static inline void skb_forward_csum(struct sk_buff *skb)
  17320. +{
  17321. + /* Unfortunately we don't support this one. Any brave souls? */
  17322. + if (skb->ip_summed == CHECKSUM_COMPLETE)
  17323. + skb->ip_summed = CHECKSUM_NONE;
  17324. +}
  17325. +
  17326. +/**
  17327. + * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
  17328. + * @skb: skb to check
  17329. + *
  17330. + * fresh skbs have their ip_summed set to CHECKSUM_NONE.
  17331. + * Instead of forcing ip_summed to CHECKSUM_NONE, we can
  17332. + * use this helper, to document places where we make this assertion.
  17333. + */
  17334. +static inline void skb_checksum_none_assert(const struct sk_buff *skb)
  17335. +{
  17336. +#ifdef DEBUG
  17337. + BUG_ON(skb->ip_summed != CHECKSUM_NONE);
  17338. +#endif
  17339. +}
  17340. +
  17341. +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
  17342. +
  17343. +int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
  17344. +
  17345. +u32 skb_get_poff(const struct sk_buff *skb);
  17346. +u32 __skb_get_poff(const struct sk_buff *skb, void *data,
  17347. + const struct flow_keys *keys, int hlen);
  17348. +
  17349. +/**
  17350. + * skb_head_is_locked - Determine if the skb->head is locked down
  17351. + * @skb: skb to check
  17352. + *
  17353. + * The head on skbs built around a head frag can be removed if they are
  17354. + * not cloned. This function returns true if the skb head is locked down
  17355. + * due to either being allocated via kmalloc, or by being a clone with
  17356. + * multiple references to the head.
  17357. + */
  17358. +static inline bool skb_head_is_locked(const struct sk_buff *skb)
  17359. +{
  17360. + return !skb->head_frag || skb_cloned(skb);
  17361. +}
  17362. +
  17363. +/**
  17364. + * skb_gso_network_seglen - Return length of individual segments of a gso packet
  17365. + *
  17366. + * @skb: GSO skb
  17367. + *
  17368. + * skb_gso_network_seglen is used to determine the real size of the
  17369. + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
  17370. + *
  17371. + * The MAC/L2 header is not accounted for.
  17372. + */
  17373. +static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
  17374. +{
  17375. + unsigned int hdr_len = skb_transport_header(skb) -
  17376. + skb_network_header(skb);
  17377. + return hdr_len + skb_gso_transport_seglen(skb);
  17378. +}
  17379. +#endif /* __KERNEL__ */
  17380. +#endif /* _LINUX_SKBUFF_H */
  17381. diff -Nur linux-3.18.14.orig/include/linux/smp.h linux-3.18.14-rt/include/linux/smp.h
  17382. --- linux-3.18.14.orig/include/linux/smp.h 2015-05-20 10:04:50.000000000 -0500
  17383. +++ linux-3.18.14-rt/include/linux/smp.h 2015-05-31 15:32:48.405635367 -0500
  17384. @@ -178,6 +178,9 @@
  17385. #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
  17386. #define put_cpu() preempt_enable()
  17387. +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
  17388. +#define put_cpu_light() migrate_enable()
  17389. +
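A short sketch of the intended substitution on PREEMPT_RT (the per-CPU work is
a placeholder): unlike get_cpu(), the section is meant to stay preemptible and
is only guaranteed not to migrate to another CPU:

	int cpu;

	cpu = get_cpu_light();	/* disables migration, not preemption */
	/* ... touch per-CPU data, possibly taking sleeping locks under -rt ... */
	put_cpu_light();
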
  17390. /*
  17391. * Callback to arch code if there's nosmp or maxcpus=0 on the
  17392. * boot command line:
  17393. diff -Nur linux-3.18.14.orig/include/linux/spinlock_api_smp.h linux-3.18.14-rt/include/linux/spinlock_api_smp.h
  17394. --- linux-3.18.14.orig/include/linux/spinlock_api_smp.h 2015-05-20 10:04:50.000000000 -0500
  17395. +++ linux-3.18.14-rt/include/linux/spinlock_api_smp.h 2015-05-31 15:32:48.409635367 -0500
  17396. @@ -187,6 +187,8 @@
  17397. return 0;
  17398. }
  17399. -#include <linux/rwlock_api_smp.h>
  17400. +#ifndef CONFIG_PREEMPT_RT_FULL
  17401. +# include <linux/rwlock_api_smp.h>
  17402. +#endif
  17403. #endif /* __LINUX_SPINLOCK_API_SMP_H */
  17404. diff -Nur linux-3.18.14.orig/include/linux/spinlock.h linux-3.18.14-rt/include/linux/spinlock.h
  17405. --- linux-3.18.14.orig/include/linux/spinlock.h 2015-05-20 10:04:50.000000000 -0500
  17406. +++ linux-3.18.14-rt/include/linux/spinlock.h 2015-05-31 15:32:48.405635367 -0500
  17407. @@ -278,7 +278,11 @@
  17408. #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
  17409. /* Include rwlock functions */
  17410. -#include <linux/rwlock.h>
  17411. +#ifdef CONFIG_PREEMPT_RT_FULL
  17412. +# include <linux/rwlock_rt.h>
  17413. +#else
  17414. +# include <linux/rwlock.h>
  17415. +#endif
  17416. /*
  17417. * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
  17418. @@ -289,6 +293,10 @@
  17419. # include <linux/spinlock_api_up.h>
  17420. #endif
  17421. +#ifdef CONFIG_PREEMPT_RT_FULL
  17422. +# include <linux/spinlock_rt.h>
  17423. +#else /* PREEMPT_RT_FULL */
  17424. +
  17425. /*
  17426. * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  17427. */
  17428. @@ -418,4 +426,6 @@
  17429. #define atomic_dec_and_lock(atomic, lock) \
  17430. __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
  17431. +#endif /* !PREEMPT_RT_FULL */
  17432. +
  17433. #endif /* __LINUX_SPINLOCK_H */
  17434. diff -Nur linux-3.18.14.orig/include/linux/spinlock_rt.h linux-3.18.14-rt/include/linux/spinlock_rt.h
  17435. --- linux-3.18.14.orig/include/linux/spinlock_rt.h 1969-12-31 18:00:00.000000000 -0600
  17436. +++ linux-3.18.14-rt/include/linux/spinlock_rt.h 2015-05-31 15:32:48.413635367 -0500
  17437. @@ -0,0 +1,167 @@
  17438. +#ifndef __LINUX_SPINLOCK_RT_H
  17439. +#define __LINUX_SPINLOCK_RT_H
  17440. +
  17441. +#ifndef __LINUX_SPINLOCK_H
  17442. +#error Do not include directly. Use spinlock.h
  17443. +#endif
  17444. +
  17445. +#include <linux/bug.h>
  17446. +
  17447. +extern void
  17448. +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
  17449. +
  17450. +#define spin_lock_init(slock) \
  17451. +do { \
  17452. + static struct lock_class_key __key; \
  17453. + \
  17454. + rt_mutex_init(&(slock)->lock); \
  17455. + __rt_spin_lock_init(slock, #slock, &__key); \
  17456. +} while (0)
  17457. +
  17458. +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
  17459. +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
  17460. +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
  17461. +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
  17462. +extern void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock);
  17463. +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
  17464. +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
  17465. +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
  17466. +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
  17467. +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
  17468. +
  17469. +/*
  17470. + * lockdep-less calls, for derived types like rwlock:
  17471. + * (for trylock they can use rt_mutex_trylock() directly.
  17472. + */
  17473. +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
  17474. +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
  17475. +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
  17476. +
  17477. +#define spin_lock(lock) \
  17478. + do { \
  17479. + migrate_disable(); \
  17480. + rt_spin_lock(lock); \
  17481. + } while (0)
  17482. +
  17483. +#define spin_lock_bh(lock) \
  17484. + do { \
  17485. + local_bh_disable(); \
  17486. + migrate_disable(); \
  17487. + rt_spin_lock(lock); \
  17488. + } while (0)
  17489. +
  17490. +#define spin_lock_irq(lock) spin_lock(lock)
  17491. +
  17492. +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
  17493. +
  17494. +#define spin_trylock(lock) \
  17495. +({ \
  17496. + int __locked; \
  17497. + migrate_disable(); \
  17498. + __locked = spin_do_trylock(lock); \
  17499. + if (!__locked) \
  17500. + migrate_enable(); \
  17501. + __locked; \
  17502. +})
  17503. +
  17504. +#ifdef CONFIG_LOCKDEP
  17505. +# define spin_lock_nested(lock, subclass) \
  17506. + do { \
  17507. + migrate_disable(); \
  17508. + rt_spin_lock_nested(lock, subclass); \
  17509. + } while (0)
  17510. +
  17511. +# define spin_lock_irqsave_nested(lock, flags, subclass) \
  17512. + do { \
  17513. + typecheck(unsigned long, flags); \
  17514. + flags = 0; \
  17515. + migrate_disable(); \
  17516. + rt_spin_lock_nested(lock, subclass); \
  17517. + } while (0)
  17518. +#else
  17519. +# define spin_lock_nested(lock, subclass) spin_lock(lock)
  17520. +
  17521. +# define spin_lock_irqsave_nested(lock, flags, subclass) \
  17522. + do { \
  17523. + typecheck(unsigned long, flags); \
  17524. + flags = 0; \
  17525. + spin_lock(lock); \
  17526. + } while (0)
  17527. +#endif
  17528. +
  17529. +#define spin_lock_irqsave(lock, flags) \
  17530. + do { \
  17531. + typecheck(unsigned long, flags); \
  17532. + flags = 0; \
  17533. + spin_lock(lock); \
  17534. + } while (0)
  17535. +
  17536. +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
  17537. +{
  17538. + unsigned long flags = 0;
  17539. +#ifdef CONFIG_TRACE_IRQFLAGS
  17540. + flags = rt_spin_lock_trace_flags(lock);
  17541. +#else
  17542. + spin_lock(lock); /* lock_local */
  17543. +#endif
  17544. + return flags;
  17545. +}
  17546. +
  17547. +/* FIXME: we need rt_spin_lock_nest_lock */
  17548. +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
  17549. +
  17550. +#define spin_unlock(lock) \
  17551. + do { \
  17552. + rt_spin_unlock(lock); \
  17553. + migrate_enable(); \
  17554. + } while (0)
  17555. +
  17556. +#define spin_unlock_bh(lock) \
  17557. + do { \
  17558. + rt_spin_unlock(lock); \
  17559. + migrate_enable(); \
  17560. + local_bh_enable(); \
  17561. + } while (0)
  17562. +
  17563. +#define spin_unlock_irq(lock) spin_unlock(lock)
  17564. +
  17565. +#define spin_unlock_irqrestore(lock, flags) \
  17566. + do { \
  17567. + typecheck(unsigned long, flags); \
  17568. + (void) flags; \
  17569. + spin_unlock(lock); \
  17570. + } while (0)
  17571. +
  17572. +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
  17573. +#define spin_trylock_irq(lock) spin_trylock(lock)
  17574. +
  17575. +#define spin_trylock_irqsave(lock, flags) \
  17576. + rt_spin_trylock_irqsave(lock, &(flags))
  17577. +
  17578. +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
  17579. +
  17580. +#ifdef CONFIG_GENERIC_LOCKBREAK
  17581. +# define spin_is_contended(lock) ((lock)->break_lock)
  17582. +#else
  17583. +# define spin_is_contended(lock) (((void)(lock), 0))
  17584. +#endif
  17585. +
  17586. +static inline int spin_can_lock(spinlock_t *lock)
  17587. +{
  17588. + return !rt_mutex_is_locked(&lock->lock);
  17589. +}
  17590. +
  17591. +static inline int spin_is_locked(spinlock_t *lock)
  17592. +{
  17593. + return rt_mutex_is_locked(&lock->lock);
  17594. +}
  17595. +
  17596. +static inline void assert_spin_locked(spinlock_t *lock)
  17597. +{
  17598. + BUG_ON(!spin_is_locked(lock));
  17599. +}
  17600. +
  17601. +#define atomic_dec_and_lock(atomic, lock) \
  17602. + atomic_dec_and_spin_lock(atomic, lock)
  17603. +
  17604. +#endif
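
A minimal sketch (not part of the patch; the demo_* names are hypothetical) of what the mapping above means for ordinary spinlock users on PREEMPT_RT_FULL: the lock is an rt_mutex underneath, so contenders sleep with priority inheritance and the holder remains preemptible, while migrate_disable() keeps it on one CPU.

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(demo_lock);
static LIST_HEAD(demo_list);

static void demo_add(struct list_head *item)
{
	spin_lock(&demo_lock);		/* migrate_disable() + rt_spin_lock() */
	list_add_tail(item, &demo_list);
	spin_unlock(&demo_lock);	/* rt_spin_unlock() + migrate_enable() */
}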
  17605. diff -Nur linux-3.18.14.orig/include/linux/spinlock_types.h linux-3.18.14-rt/include/linux/spinlock_types.h
  17606. --- linux-3.18.14.orig/include/linux/spinlock_types.h 2015-05-20 10:04:50.000000000 -0500
  17607. +++ linux-3.18.14-rt/include/linux/spinlock_types.h 2015-05-31 15:32:48.413635367 -0500
  17608. @@ -9,80 +9,15 @@
  17609. * Released under the General Public License (GPL).
  17610. */
  17611. -#if defined(CONFIG_SMP)
  17612. -# include <asm/spinlock_types.h>
  17613. -#else
  17614. -# include <linux/spinlock_types_up.h>
  17615. -#endif
  17616. -
  17617. -#include <linux/lockdep.h>
  17618. -
  17619. -typedef struct raw_spinlock {
  17620. - arch_spinlock_t raw_lock;
  17621. -#ifdef CONFIG_GENERIC_LOCKBREAK
  17622. - unsigned int break_lock;
  17623. -#endif
  17624. -#ifdef CONFIG_DEBUG_SPINLOCK
  17625. - unsigned int magic, owner_cpu;
  17626. - void *owner;
  17627. -#endif
  17628. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17629. - struct lockdep_map dep_map;
  17630. -#endif
  17631. -} raw_spinlock_t;
  17632. -
  17633. -#define SPINLOCK_MAGIC 0xdead4ead
  17634. -
  17635. -#define SPINLOCK_OWNER_INIT ((void *)-1L)
  17636. -
  17637. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17638. -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
  17639. -#else
  17640. -# define SPIN_DEP_MAP_INIT(lockname)
  17641. -#endif
  17642. +#include <linux/spinlock_types_raw.h>
  17643. -#ifdef CONFIG_DEBUG_SPINLOCK
  17644. -# define SPIN_DEBUG_INIT(lockname) \
  17645. - .magic = SPINLOCK_MAGIC, \
  17646. - .owner_cpu = -1, \
  17647. - .owner = SPINLOCK_OWNER_INIT,
  17648. +#ifndef CONFIG_PREEMPT_RT_FULL
  17649. +# include <linux/spinlock_types_nort.h>
  17650. +# include <linux/rwlock_types.h>
  17651. #else
  17652. -# define SPIN_DEBUG_INIT(lockname)
  17653. +# include <linux/rtmutex.h>
  17654. +# include <linux/spinlock_types_rt.h>
  17655. +# include <linux/rwlock_types_rt.h>
  17656. #endif
  17657. -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
  17658. - { \
  17659. - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
  17660. - SPIN_DEBUG_INIT(lockname) \
  17661. - SPIN_DEP_MAP_INIT(lockname) }
  17662. -
  17663. -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
  17664. - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
  17665. -
  17666. -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
  17667. -
  17668. -typedef struct spinlock {
  17669. - union {
  17670. - struct raw_spinlock rlock;
  17671. -
  17672. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17673. -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
  17674. - struct {
  17675. - u8 __padding[LOCK_PADSIZE];
  17676. - struct lockdep_map dep_map;
  17677. - };
  17678. -#endif
  17679. - };
  17680. -} spinlock_t;
  17681. -
  17682. -#define __SPIN_LOCK_INITIALIZER(lockname) \
  17683. - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
  17684. -
  17685. -#define __SPIN_LOCK_UNLOCKED(lockname) \
  17686. - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
  17687. -
  17688. -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
  17689. -
  17690. -#include <linux/rwlock_types.h>
  17691. -
  17692. #endif /* __LINUX_SPINLOCK_TYPES_H */
  17693. diff -Nur linux-3.18.14.orig/include/linux/spinlock_types_nort.h linux-3.18.14-rt/include/linux/spinlock_types_nort.h
  17694. --- linux-3.18.14.orig/include/linux/spinlock_types_nort.h 1969-12-31 18:00:00.000000000 -0600
  17695. +++ linux-3.18.14-rt/include/linux/spinlock_types_nort.h 2015-05-31 15:32:48.413635367 -0500
  17696. @@ -0,0 +1,33 @@
  17697. +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
  17698. +#define __LINUX_SPINLOCK_TYPES_NORT_H
  17699. +
  17700. +#ifndef __LINUX_SPINLOCK_TYPES_H
  17701. +#error "Do not include directly. Include spinlock_types.h instead"
  17702. +#endif
  17703. +
  17704. +/*
  17705. + * The non-RT version maps spinlocks to raw_spinlocks
  17706. + */
  17707. +typedef struct spinlock {
  17708. + union {
  17709. + struct raw_spinlock rlock;
  17710. +
  17711. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17712. +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
  17713. + struct {
  17714. + u8 __padding[LOCK_PADSIZE];
  17715. + struct lockdep_map dep_map;
  17716. + };
  17717. +#endif
  17718. + };
  17719. +} spinlock_t;
  17720. +
  17721. +#define __SPIN_LOCK_INITIALIZER(lockname) \
  17722. + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
  17723. +
  17724. +#define __SPIN_LOCK_UNLOCKED(lockname) \
  17725. + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
  17726. +
  17727. +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
  17728. +
  17729. +#endif
  17730. diff -Nur linux-3.18.14.orig/include/linux/spinlock_types_raw.h linux-3.18.14-rt/include/linux/spinlock_types_raw.h
  17731. --- linux-3.18.14.orig/include/linux/spinlock_types_raw.h 1969-12-31 18:00:00.000000000 -0600
  17732. +++ linux-3.18.14-rt/include/linux/spinlock_types_raw.h 2015-05-31 15:32:48.413635367 -0500
  17733. @@ -0,0 +1,56 @@
  17734. +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
  17735. +#define __LINUX_SPINLOCK_TYPES_RAW_H
  17736. +
  17737. +#if defined(CONFIG_SMP)
  17738. +# include <asm/spinlock_types.h>
  17739. +#else
  17740. +# include <linux/spinlock_types_up.h>
  17741. +#endif
  17742. +
  17743. +#include <linux/lockdep.h>
  17744. +
  17745. +typedef struct raw_spinlock {
  17746. + arch_spinlock_t raw_lock;
  17747. +#ifdef CONFIG_GENERIC_LOCKBREAK
  17748. + unsigned int break_lock;
  17749. +#endif
  17750. +#ifdef CONFIG_DEBUG_SPINLOCK
  17751. + unsigned int magic, owner_cpu;
  17752. + void *owner;
  17753. +#endif
  17754. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17755. + struct lockdep_map dep_map;
  17756. +#endif
  17757. +} raw_spinlock_t;
  17758. +
  17759. +#define SPINLOCK_MAGIC 0xdead4ead
  17760. +
  17761. +#define SPINLOCK_OWNER_INIT ((void *)-1L)
  17762. +
  17763. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17764. +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
  17765. +#else
  17766. +# define SPIN_DEP_MAP_INIT(lockname)
  17767. +#endif
  17768. +
  17769. +#ifdef CONFIG_DEBUG_SPINLOCK
  17770. +# define SPIN_DEBUG_INIT(lockname) \
  17771. + .magic = SPINLOCK_MAGIC, \
  17772. + .owner_cpu = -1, \
  17773. + .owner = SPINLOCK_OWNER_INIT,
  17774. +#else
  17775. +# define SPIN_DEBUG_INIT(lockname)
  17776. +#endif
  17777. +
  17778. +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
  17779. + { \
  17780. + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
  17781. + SPIN_DEBUG_INIT(lockname) \
  17782. + SPIN_DEP_MAP_INIT(lockname) }
  17783. +
  17784. +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
  17785. + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
  17786. +
  17787. +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
  17788. +
  17789. +#endif
  17790. diff -Nur linux-3.18.14.orig/include/linux/spinlock_types_rt.h linux-3.18.14-rt/include/linux/spinlock_types_rt.h
  17791. --- linux-3.18.14.orig/include/linux/spinlock_types_rt.h 1969-12-31 18:00:00.000000000 -0600
  17792. +++ linux-3.18.14-rt/include/linux/spinlock_types_rt.h 2015-05-31 15:32:48.413635367 -0500
  17793. @@ -0,0 +1,51 @@
  17794. +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
  17795. +#define __LINUX_SPINLOCK_TYPES_RT_H
  17796. +
  17797. +#ifndef __LINUX_SPINLOCK_TYPES_H
  17798. +#error "Do not include directly. Include spinlock_types.h instead"
  17799. +#endif
  17800. +
  17801. +#include <linux/cache.h>
  17802. +
  17803. +/*
  17804. + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
  17805. + */
  17806. +typedef struct spinlock {
  17807. + struct rt_mutex lock;
  17808. + unsigned int break_lock;
  17809. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  17810. + struct lockdep_map dep_map;
  17811. +#endif
  17812. +} spinlock_t;
  17813. +
  17814. +#ifdef CONFIG_DEBUG_RT_MUTEXES
  17815. +# define __RT_SPIN_INITIALIZER(name) \
  17816. + { \
  17817. + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
  17818. + .save_state = 1, \
  17819. + .file = __FILE__, \
  17820. + .line = __LINE__ , \
  17821. + }
  17822. +#else
  17823. +# define __RT_SPIN_INITIALIZER(name) \
  17824. + { \
  17825. + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
  17826. + .save_state = 1, \
  17827. + }
  17828. +#endif
  17829. +
  17830. +/*
  17831. +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
  17832. +*/
  17833. +
  17834. +#define __SPIN_LOCK_UNLOCKED(name) \
  17835. + { .lock = __RT_SPIN_INITIALIZER(name.lock), \
  17836. + SPIN_DEP_MAP_INIT(name) }
  17837. +
  17838. +#define __DEFINE_SPINLOCK(name) \
  17839. + spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
  17840. +
  17841. +#define DEFINE_SPINLOCK(name) \
  17842. + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
  17843. +
  17844. +#endif
  17845. diff -Nur linux-3.18.14.orig/include/linux/srcu.h linux-3.18.14-rt/include/linux/srcu.h
  17846. --- linux-3.18.14.orig/include/linux/srcu.h 2015-05-20 10:04:50.000000000 -0500
  17847. +++ linux-3.18.14-rt/include/linux/srcu.h 2015-05-31 15:32:48.445635367 -0500
  17848. @@ -84,10 +84,10 @@
  17849. void process_srcu(struct work_struct *work);
  17850. -#define __SRCU_STRUCT_INIT(name) \
  17851. +#define __SRCU_STRUCT_INIT(name, pcpu_name) \
  17852. { \
  17853. .completed = -300, \
  17854. - .per_cpu_ref = &name##_srcu_array, \
  17855. + .per_cpu_ref = &pcpu_name, \
  17856. .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
  17857. .running = false, \
  17858. .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
  17859. @@ -104,11 +104,12 @@
  17860. */
  17861. #define DEFINE_SRCU(name) \
  17862. static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
  17863. - struct srcu_struct name = __SRCU_STRUCT_INIT(name);
  17864. + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array);
  17865. #define DEFINE_STATIC_SRCU(name) \
  17866. static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
  17867. - static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
  17868. + static struct srcu_struct name = __SRCU_STRUCT_INIT(\
  17869. + name, name##_srcu_array);
  17870. /**
  17871. * call_srcu() - Queue a callback for invocation after an SRCU grace period
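
A sketch (not part of the patch; demo_srcu is hypothetical) of roughly what DEFINE_STATIC_SRCU() expands to after this change, plus a reader: the point is that the per-CPU array name is now handed to __SRCU_STRUCT_INIT() explicitly instead of being token-pasted inside it.

#include <linux/srcu.h>

/* Approximate expansion with the change above:
 *   static DEFINE_PER_CPU(struct srcu_struct_array, demo_srcu_srcu_array);
 *   static struct srcu_struct demo_srcu =
 *           __SRCU_STRUCT_INIT(demo_srcu, demo_srcu_srcu_array);
 */
DEFINE_STATIC_SRCU(demo_srcu);

static void demo_srcu_reader(void)
{
	int idx = srcu_read_lock(&demo_srcu);

	/* ... read-side critical section ... */
	srcu_read_unlock(&demo_srcu, idx);
}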
  17872. diff -Nur linux-3.18.14.orig/include/linux/swap.h linux-3.18.14-rt/include/linux/swap.h
  17873. --- linux-3.18.14.orig/include/linux/swap.h 2015-05-20 10:04:50.000000000 -0500
  17874. +++ linux-3.18.14-rt/include/linux/swap.h 2015-05-31 15:32:48.449635367 -0500
  17875. @@ -11,6 +11,7 @@
  17876. #include <linux/fs.h>
  17877. #include <linux/atomic.h>
  17878. #include <linux/page-flags.h>
  17879. +#include <linux/locallock.h>
  17880. #include <asm/page.h>
  17881. struct notifier_block;
  17882. @@ -260,7 +261,8 @@
  17883. void *workingset_eviction(struct address_space *mapping, struct page *page);
  17884. bool workingset_refault(void *shadow);
  17885. void workingset_activation(struct page *page);
  17886. -extern struct list_lru workingset_shadow_nodes;
  17887. +extern struct list_lru __workingset_shadow_nodes;
  17888. +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
  17889. static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
  17890. {
  17891. diff -Nur linux-3.18.14.orig/include/linux/sysctl.h linux-3.18.14-rt/include/linux/sysctl.h
  17892. --- linux-3.18.14.orig/include/linux/sysctl.h 2015-05-20 10:04:50.000000000 -0500
  17893. +++ linux-3.18.14-rt/include/linux/sysctl.h 2015-05-31 15:32:48.449635367 -0500
  17894. @@ -25,6 +25,7 @@
  17895. #include <linux/rcupdate.h>
  17896. #include <linux/wait.h>
  17897. #include <linux/rbtree.h>
  17898. +#include <linux/atomic.h>
  17899. #include <uapi/linux/sysctl.h>
  17900. /* For the /proc/sys support */
  17901. diff -Nur linux-3.18.14.orig/include/linux/thread_info.h linux-3.18.14-rt/include/linux/thread_info.h
  17902. --- linux-3.18.14.orig/include/linux/thread_info.h 2015-05-20 10:04:50.000000000 -0500
  17903. +++ linux-3.18.14-rt/include/linux/thread_info.h 2015-05-31 15:32:48.449635367 -0500
  17904. @@ -102,7 +102,17 @@
  17905. #define test_thread_flag(flag) \
  17906. test_ti_thread_flag(current_thread_info(), flag)
  17907. -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
  17908. +#ifdef CONFIG_PREEMPT_LAZY
  17909. +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
  17910. + test_thread_flag(TIF_NEED_RESCHED_LAZY))
  17911. +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
  17912. +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
  17913. +
  17914. +#else
  17915. +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
  17916. +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
  17917. +#define tif_need_resched_lazy() 0
  17918. +#endif
  17919. #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
  17920. /*
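
A minimal sketch (not part of the patch; the helper is hypothetical) of how the three flavours above differ under CONFIG_PREEMPT_LAZY: _now() reports only the hard flag, _lazy() only the deferrable one, and tif_need_resched() either.

#include <linux/thread_info.h>
#include <linux/printk.h>

static void demo_report_resched(void)
{
	if (tif_need_resched_now())
		pr_info("demo: hard TIF_NEED_RESCHED set\n");
	else if (tif_need_resched_lazy())
		pr_info("demo: only TIF_NEED_RESCHED_LAZY set (deferrable)\n");

	if (!tif_need_resched())
		pr_info("demo: no reschedule requested\n");
}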
  17921. diff -Nur linux-3.18.14.orig/include/linux/timer.h linux-3.18.14-rt/include/linux/timer.h
  17922. --- linux-3.18.14.orig/include/linux/timer.h 2015-05-20 10:04:50.000000000 -0500
  17923. +++ linux-3.18.14-rt/include/linux/timer.h 2015-05-31 15:32:48.449635367 -0500
  17924. @@ -241,7 +241,7 @@
  17925. extern int try_to_del_timer_sync(struct timer_list *timer);
  17926. -#ifdef CONFIG_SMP
  17927. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  17928. extern int del_timer_sync(struct timer_list *timer);
  17929. #else
  17930. # define del_timer_sync(t) del_timer(t)
  17931. diff -Nur linux-3.18.14.orig/include/linux/uaccess.h linux-3.18.14-rt/include/linux/uaccess.h
  17932. --- linux-3.18.14.orig/include/linux/uaccess.h 2015-05-20 10:04:50.000000000 -0500
  17933. +++ linux-3.18.14-rt/include/linux/uaccess.h 2015-05-31 15:32:48.449635367 -0500
  17934. @@ -6,14 +6,9 @@
  17935. /*
  17936. * These routines enable/disable the pagefault handler in that
  17937. - * it will not take any locks and go straight to the fixup table.
  17938. - *
  17939. - * They have great resemblance to the preempt_disable/enable calls
  17940. - * and in fact they are identical; this is because currently there is
  17941. - * no other way to make the pagefault handlers do this. So we do
  17942. - * disable preemption but we don't necessarily care about that.
  17943. + * it will not take any MM locks and go straight to the fixup table.
  17944. */
  17945. -static inline void pagefault_disable(void)
  17946. +static inline void raw_pagefault_disable(void)
  17947. {
  17948. preempt_count_inc();
  17949. /*
  17950. @@ -23,7 +18,7 @@
  17951. barrier();
  17952. }
  17953. -static inline void pagefault_enable(void)
  17954. +static inline void raw_pagefault_enable(void)
  17955. {
  17956. #ifndef CONFIG_PREEMPT
  17957. /*
  17958. @@ -37,6 +32,21 @@
  17959. #endif
  17960. }
  17961. +#ifndef CONFIG_PREEMPT_RT_FULL
  17962. +static inline void pagefault_disable(void)
  17963. +{
  17964. + raw_pagefault_disable();
  17965. +}
  17966. +
  17967. +static inline void pagefault_enable(void)
  17968. +{
  17969. + raw_pagefault_enable();
  17970. +}
  17971. +#else
  17972. +extern void pagefault_disable(void);
  17973. +extern void pagefault_enable(void);
  17974. +#endif
  17975. +
  17976. #ifndef ARCH_HAS_NOCACHE_UACCESS
  17977. static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
  17978. @@ -76,9 +86,9 @@
  17979. mm_segment_t old_fs = get_fs(); \
  17980. \
  17981. set_fs(KERNEL_DS); \
  17982. - pagefault_disable(); \
  17983. + raw_pagefault_disable(); \
  17984. ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
  17985. - pagefault_enable(); \
  17986. + raw_pagefault_enable(); \
  17987. set_fs(old_fs); \
  17988. ret; \
  17989. })
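
A minimal sketch (not part of the patch; demo_copy_inatomic() is hypothetical) of the split above: regular users keep calling pagefault_disable()/pagefault_enable(), which become out-of-line functions on PREEMPT_RT_FULL, while the probe_kernel-style macro above sticks to the raw_ preempt-count variants.

#include <linux/uaccess.h>
#include <linux/errno.h>

static int demo_copy_inatomic(void *dst, const void __user *src, size_t len)
{
	unsigned long left;

	pagefault_disable();	/* faults go straight to the fixup table */
	left = __copy_from_user_inatomic(dst, src, len);
	pagefault_enable();

	return left ? -EFAULT : 0;
}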
  17990. diff -Nur linux-3.18.14.orig/include/linux/uprobes.h linux-3.18.14-rt/include/linux/uprobes.h
  17991. --- linux-3.18.14.orig/include/linux/uprobes.h 2015-05-20 10:04:50.000000000 -0500
  17992. +++ linux-3.18.14-rt/include/linux/uprobes.h 2015-05-31 15:32:48.481635367 -0500
  17993. @@ -27,6 +27,7 @@
  17994. #include <linux/errno.h>
  17995. #include <linux/rbtree.h>
  17996. #include <linux/types.h>
  17997. +#include <linux/wait.h>
  17998. struct vm_area_struct;
  17999. struct mm_struct;
  18000. diff -Nur linux-3.18.14.orig/include/linux/vmstat.h linux-3.18.14-rt/include/linux/vmstat.h
  18001. --- linux-3.18.14.orig/include/linux/vmstat.h 2015-05-20 10:04:50.000000000 -0500
  18002. +++ linux-3.18.14-rt/include/linux/vmstat.h 2015-05-31 15:32:48.481635367 -0500
  18003. @@ -33,7 +33,9 @@
  18004. */
  18005. static inline void __count_vm_event(enum vm_event_item item)
  18006. {
  18007. + preempt_disable_rt();
  18008. raw_cpu_inc(vm_event_states.event[item]);
  18009. + preempt_enable_rt();
  18010. }
  18011. static inline void count_vm_event(enum vm_event_item item)
  18012. @@ -43,7 +45,9 @@
  18013. static inline void __count_vm_events(enum vm_event_item item, long delta)
  18014. {
  18015. + preempt_disable_rt();
  18016. raw_cpu_add(vm_event_states.event[item], delta);
  18017. + preempt_enable_rt();
  18018. }
  18019. static inline void count_vm_events(enum vm_event_item item, long delta)
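
The preempt_disable_rt()/preempt_enable_rt() helpers used above come from the preempt.h changes elsewhere in this patch; roughly, they are no-ops on non-RT kernels and become real preempt_disable()/preempt_enable() on PREEMPT_RT_BASE, so the raw_cpu_* read-modify-write cannot be preempted halfway. A rough sketch of that intent, with demo_ names so it is not mistaken for the authoritative definition:

#ifdef CONFIG_PREEMPT_RT_BASE
# define demo_preempt_disable_rt()	preempt_disable()
# define demo_preempt_enable_rt()	preempt_enable()
#else
# define demo_preempt_disable_rt()	do { } while (0)
# define demo_preempt_enable_rt()	do { } while (0)
#endif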
  18020. diff -Nur linux-3.18.14.orig/include/linux/wait.h linux-3.18.14-rt/include/linux/wait.h
  18021. --- linux-3.18.14.orig/include/linux/wait.h 2015-05-20 10:04:50.000000000 -0500
  18022. +++ linux-3.18.14-rt/include/linux/wait.h 2015-05-31 15:32:48.481635367 -0500
  18023. @@ -8,6 +8,7 @@
  18024. #include <linux/spinlock.h>
  18025. #include <asm/current.h>
  18026. #include <uapi/linux/wait.h>
  18027. +#include <linux/atomic.h>
  18028. typedef struct __wait_queue wait_queue_t;
  18029. typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
  18030. diff -Nur linux-3.18.14.orig/include/linux/wait-simple.h linux-3.18.14-rt/include/linux/wait-simple.h
  18031. --- linux-3.18.14.orig/include/linux/wait-simple.h 1969-12-31 18:00:00.000000000 -0600
  18032. +++ linux-3.18.14-rt/include/linux/wait-simple.h 2015-05-31 15:32:48.481635367 -0500
  18033. @@ -0,0 +1,207 @@
  18034. +#ifndef _LINUX_WAIT_SIMPLE_H
  18035. +#define _LINUX_WAIT_SIMPLE_H
  18036. +
  18037. +#include <linux/spinlock.h>
  18038. +#include <linux/list.h>
  18039. +
  18040. +#include <asm/current.h>
  18041. +
  18042. +struct swaiter {
  18043. + struct task_struct *task;
  18044. + struct list_head node;
  18045. +};
  18046. +
  18047. +#define DEFINE_SWAITER(name) \
  18048. + struct swaiter name = { \
  18049. + .task = current, \
  18050. + .node = LIST_HEAD_INIT((name).node), \
  18051. + }
  18052. +
  18053. +struct swait_head {
  18054. + raw_spinlock_t lock;
  18055. + struct list_head list;
  18056. +};
  18057. +
  18058. +#define SWAIT_HEAD_INITIALIZER(name) { \
  18059. + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
  18060. + .list = LIST_HEAD_INIT((name).list), \
  18061. + }
  18062. +
  18063. +#define DEFINE_SWAIT_HEAD(name) \
  18064. + struct swait_head name = SWAIT_HEAD_INITIALIZER(name)
  18065. +
  18066. +extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key);
  18067. +
  18068. +#define init_swait_head(swh) \
  18069. + do { \
  18070. + static struct lock_class_key __key; \
  18071. + \
  18072. + __init_swait_head((swh), &__key); \
  18073. + } while (0)
  18074. +
  18075. +/*
  18076. + * Waiter functions
  18077. + */
  18078. +extern void swait_prepare_locked(struct swait_head *head, struct swaiter *w);
  18079. +extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state);
  18080. +extern void swait_finish_locked(struct swait_head *head, struct swaiter *w);
  18081. +extern void swait_finish(struct swait_head *head, struct swaiter *w);
  18082. +
  18083. +/* Check whether a head has waiters enqueued */
  18084. +static inline bool swaitqueue_active(struct swait_head *h)
  18085. +{
  18086. + /* Make sure the condition is visible before checking list_empty() */
  18087. + smp_mb();
  18088. + return !list_empty(&h->list);
  18089. +}
  18090. +
  18091. +/*
  18092. + * Wakeup functions
  18093. + */
  18094. +extern unsigned int __swait_wake(struct swait_head *head, unsigned int state, unsigned int num);
  18095. +extern unsigned int __swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num);
  18096. +
  18097. +#define swait_wake(head) __swait_wake(head, TASK_NORMAL, 1)
  18098. +#define swait_wake_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 1)
  18099. +#define swait_wake_all(head) __swait_wake(head, TASK_NORMAL, 0)
  18100. +#define swait_wake_all_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 0)
  18101. +
  18102. +/*
  18103. + * Event API
  18104. + */
  18105. +#define __swait_event(wq, condition) \
  18106. +do { \
  18107. + DEFINE_SWAITER(__wait); \
  18108. + \
  18109. + for (;;) { \
  18110. + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
  18111. + if (condition) \
  18112. + break; \
  18113. + schedule(); \
  18114. + } \
  18115. + swait_finish(&wq, &__wait); \
  18116. +} while (0)
  18117. +
  18118. +/**
  18119. + * swait_event - sleep until a condition gets true
  18120. + * @wq: the waitqueue to wait on
  18121. + * @condition: a C expression for the event to wait for
  18122. + *
  18123. + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
  18124. + * @condition evaluates to true. The @condition is checked each time
  18125. + * the waitqueue @wq is woken up.
  18126. + *
  18127. + * wake_up() has to be called after changing any variable that could
  18128. + * change the result of the wait condition.
  18129. + */
  18130. +#define swait_event(wq, condition) \
  18131. +do { \
  18132. + if (condition) \
  18133. + break; \
  18134. + __swait_event(wq, condition); \
  18135. +} while (0)
  18136. +
  18137. +#define __swait_event_interruptible(wq, condition, ret) \
  18138. +do { \
  18139. + DEFINE_SWAITER(__wait); \
  18140. + \
  18141. + for (;;) { \
  18142. + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \
  18143. + if (condition) \
  18144. + break; \
  18145. + if (signal_pending(current)) { \
  18146. + ret = -ERESTARTSYS; \
  18147. + break; \
  18148. + } \
  18149. + schedule(); \
  18150. + } \
  18151. + swait_finish(&wq, &__wait); \
  18152. +} while (0)
  18153. +
  18154. +#define __swait_event_interruptible_timeout(wq, condition, ret) \
  18155. +do { \
  18156. + DEFINE_SWAITER(__wait); \
  18157. + \
  18158. + for (;;) { \
  18159. + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \
  18160. + if (condition) \
  18161. + break; \
  18162. + if (signal_pending(current)) { \
  18163. + ret = -ERESTARTSYS; \
  18164. + break; \
  18165. + } \
  18166. + ret = schedule_timeout(ret); \
  18167. + if (!ret) \
  18168. + break; \
  18169. + } \
  18170. + swait_finish(&wq, &__wait); \
  18171. +} while (0)
  18172. +
  18173. +/**
  18174. + * swait_event_interruptible - sleep until a condition gets true
  18175. + * @wq: the waitqueue to wait on
  18176. + * @condition: a C expression for the event to wait for
  18177. + *
  18178. + * The process is put to sleep (TASK_INTERRUPTIBLE) until the
  18179. + * @condition evaluates to true. The @condition is checked each time
  18180. + * the waitqueue @wq is woken up.
  18181. + *
  18182. + * wake_up() has to be called after changing any variable that could
  18183. + * change the result of the wait condition.
  18184. + */
  18185. +#define swait_event_interruptible(wq, condition) \
  18186. +({ \
  18187. + int __ret = 0; \
  18188. + if (!(condition)) \
  18189. + __swait_event_interruptible(wq, condition, __ret); \
  18190. + __ret; \
  18191. +})
  18192. +
  18193. +#define swait_event_interruptible_timeout(wq, condition, timeout) \
  18194. +({ \
  18195. + int __ret = timeout; \
  18196. + if (!(condition)) \
  18197. + __swait_event_interruptible_timeout(wq, condition, __ret); \
  18198. + __ret; \
  18199. +})
  18200. +
  18201. +#define __swait_event_timeout(wq, condition, ret) \
  18202. +do { \
  18203. + DEFINE_SWAITER(__wait); \
  18204. + \
  18205. + for (;;) { \
  18206. + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
  18207. + if (condition) \
  18208. + break; \
  18209. + ret = schedule_timeout(ret); \
  18210. + if (!ret) \
  18211. + break; \
  18212. + } \
  18213. + swait_finish(&wq, &__wait); \
  18214. +} while (0)
  18215. +
  18216. +/**
  18217. + * swait_event_timeout - sleep until a condition gets true or a timeout elapses
  18218. + * @wq: the waitqueue to wait on
  18219. + * @condition: a C expression for the event to wait for
  18220. + * @timeout: timeout, in jiffies
  18221. + *
  18222. + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
  18223. + * @condition evaluates to true. The @condition is checked each time
  18224. + * the waitqueue @wq is woken up.
  18225. + *
  18226. + * wake_up() has to be called after changing any variable that could
  18227. + * change the result of the wait condition.
  18228. + *
  18229. + * The function returns 0 if the @timeout elapsed, and the remaining
  18230. + * jiffies if the condition evaluated to true before the timeout elapsed.
  18231. + */
  18232. +#define swait_event_timeout(wq, condition, timeout) \
  18233. +({ \
  18234. + long __ret = timeout; \
  18235. + if (!(condition)) \
  18236. + __swait_event_timeout(wq, condition, __ret); \
  18237. + __ret; \
  18238. +})
  18239. +
  18240. +#endif
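
A minimal usage sketch (not part of the patch; the demo_* names are hypothetical) for the simple waitqueue API above; as documented, the waker must update the condition before calling swait_wake*().

#include <linux/wait-simple.h>
#include <linux/types.h>

static DEFINE_SWAIT_HEAD(demo_swait);
static bool demo_ready;

static void demo_consumer(void)
{
	/* sleeps TASK_UNINTERRUPTIBLE until demo_ready is observed true */
	swait_event(demo_swait, demo_ready);
}

static void demo_producer(void)
{
	demo_ready = true;
	swait_wake(&demo_swait);	/* wake one waiter */
}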
  18241. diff -Nur linux-3.18.14.orig/include/linux/work-simple.h linux-3.18.14-rt/include/linux/work-simple.h
  18242. --- linux-3.18.14.orig/include/linux/work-simple.h 1969-12-31 18:00:00.000000000 -0600
  18243. +++ linux-3.18.14-rt/include/linux/work-simple.h 2015-05-31 15:32:48.481635367 -0500
  18244. @@ -0,0 +1,24 @@
  18245. +#ifndef _LINUX_SWORK_H
  18246. +#define _LINUX_SWORK_H
  18247. +
  18248. +#include <linux/list.h>
  18249. +
  18250. +struct swork_event {
  18251. + struct list_head item;
  18252. + unsigned long flags;
  18253. + void (*func)(struct swork_event *);
  18254. +};
  18255. +
  18256. +static inline void INIT_SWORK(struct swork_event *event,
  18257. + void (*func)(struct swork_event *))
  18258. +{
  18259. + event->flags = 0;
  18260. + event->func = func;
  18261. +}
  18262. +
  18263. +bool swork_queue(struct swork_event *sev);
  18264. +
  18265. +int swork_get(void);
  18266. +void swork_put(void);
  18267. +
  18268. +#endif /* _LINUX_SWORK_H */
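
A minimal usage sketch (not part of the patch; the demo_* names are hypothetical) for the swork API above: swork_get() has to succeed before events are queued, and swork_put() releases the worker thread again.

#include <linux/work-simple.h>
#include <linux/printk.h>

static struct swork_event demo_event;

static void demo_swork_fn(struct swork_event *sev)
{
	pr_info("demo: deferred work ran\n");
}

static int demo_start(void)
{
	int ret = swork_get();		/* bring up the swork thread */

	if (ret)
		return ret;

	INIT_SWORK(&demo_event, demo_swork_fn);
	swork_queue(&demo_event);
	return 0;
}

static void demo_stop(void)
{
	swork_put();			/* drop our reference on the thread */
}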
  18269. diff -Nur linux-3.18.14.orig/include/net/dst.h linux-3.18.14-rt/include/net/dst.h
  18270. --- linux-3.18.14.orig/include/net/dst.h 2015-05-20 10:04:50.000000000 -0500
  18271. +++ linux-3.18.14-rt/include/net/dst.h 2015-05-31 15:32:48.497635366 -0500
  18272. @@ -403,7 +403,7 @@
  18273. static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
  18274. struct sk_buff *skb)
  18275. {
  18276. - const struct hh_cache *hh;
  18277. + struct hh_cache *hh;
  18278. if (dst->pending_confirm) {
  18279. unsigned long now = jiffies;
  18280. diff -Nur linux-3.18.14.orig/include/net/neighbour.h linux-3.18.14-rt/include/net/neighbour.h
  18281. --- linux-3.18.14.orig/include/net/neighbour.h 2015-05-20 10:04:50.000000000 -0500
  18282. +++ linux-3.18.14-rt/include/net/neighbour.h 2015-05-31 15:32:48.521635366 -0500
  18283. @@ -387,7 +387,7 @@
  18284. }
  18285. #endif
  18286. -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
  18287. +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
  18288. {
  18289. unsigned int seq;
  18290. int hh_len;
  18291. @@ -442,7 +442,7 @@
  18292. #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
  18293. -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
  18294. +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
  18295. const struct net_device *dev)
  18296. {
  18297. unsigned int seq;
  18298. diff -Nur linux-3.18.14.orig/include/net/netns/ipv4.h linux-3.18.14-rt/include/net/netns/ipv4.h
  18299. --- linux-3.18.14.orig/include/net/netns/ipv4.h 2015-05-20 10:04:50.000000000 -0500
  18300. +++ linux-3.18.14-rt/include/net/netns/ipv4.h 2015-05-31 15:32:48.521635366 -0500
  18301. @@ -67,6 +67,7 @@
  18302. int sysctl_icmp_echo_ignore_all;
  18303. int sysctl_icmp_echo_ignore_broadcasts;
  18304. + int sysctl_icmp_echo_sysrq;
  18305. int sysctl_icmp_ignore_bogus_error_responses;
  18306. int sysctl_icmp_ratelimit;
  18307. int sysctl_icmp_ratemask;
  18308. diff -Nur linux-3.18.14.orig/include/trace/events/hist.h linux-3.18.14-rt/include/trace/events/hist.h
  18309. --- linux-3.18.14.orig/include/trace/events/hist.h 1969-12-31 18:00:00.000000000 -0600
  18310. +++ linux-3.18.14-rt/include/trace/events/hist.h 2015-05-31 15:32:48.521635366 -0500
  18311. @@ -0,0 +1,72 @@
  18312. +#undef TRACE_SYSTEM
  18313. +#define TRACE_SYSTEM hist
  18314. +
  18315. +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
  18316. +#define _TRACE_HIST_H
  18317. +
  18318. +#include "latency_hist.h"
  18319. +#include <linux/tracepoint.h>
  18320. +
  18321. +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
  18322. +#define trace_preemptirqsoff_hist(a, b)
  18323. +#else
  18324. +TRACE_EVENT(preemptirqsoff_hist,
  18325. +
  18326. + TP_PROTO(int reason, int starthist),
  18327. +
  18328. + TP_ARGS(reason, starthist),
  18329. +
  18330. + TP_STRUCT__entry(
  18331. + __field(int, reason)
  18332. + __field(int, starthist)
  18333. + ),
  18334. +
  18335. + TP_fast_assign(
  18336. + __entry->reason = reason;
  18337. + __entry->starthist = starthist;
  18338. + ),
  18339. +
  18340. + TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
  18341. + __entry->starthist ? "start" : "stop")
  18342. +);
  18343. +#endif
  18344. +
  18345. +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
  18346. +#define trace_hrtimer_interrupt(a, b, c, d)
  18347. +#else
  18348. +TRACE_EVENT(hrtimer_interrupt,
  18349. +
  18350. + TP_PROTO(int cpu, long long offset, struct task_struct *curr,
  18351. + struct task_struct *task),
  18352. +
  18353. + TP_ARGS(cpu, offset, curr, task),
  18354. +
  18355. + TP_STRUCT__entry(
  18356. + __field(int, cpu)
  18357. + __field(long long, offset)
  18358. + __array(char, ccomm, TASK_COMM_LEN)
  18359. + __field(int, cprio)
  18360. + __array(char, tcomm, TASK_COMM_LEN)
  18361. + __field(int, tprio)
  18362. + ),
  18363. +
  18364. + TP_fast_assign(
  18365. + __entry->cpu = cpu;
  18366. + __entry->offset = offset;
  18367. + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
  18368. + __entry->cprio = curr->prio;
  18369. + memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
  18370. + task != NULL ? TASK_COMM_LEN : 7);
  18371. + __entry->tprio = task != NULL ? task->prio : -1;
  18372. + ),
  18373. +
  18374. + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
  18375. + __entry->cpu, __entry->offset, __entry->ccomm,
  18376. + __entry->cprio, __entry->tcomm, __entry->tprio)
  18377. +);
  18378. +#endif
  18379. +
  18380. +#endif /* _TRACE_HIST_H */
  18381. +
  18382. +/* This part must be outside protection */
  18383. +#include <trace/define_trace.h>
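
An illustrative sketch only (the real call sites are presumably added in the tracing hunks of this patch) of how the preemptirqsoff_hist event is meant to be fired, using the reasons from latency_hist.h:

#include <trace/events/hist.h>

static void demo_irqs_off_hook(void)
{
	trace_preemptirqsoff_hist(IRQS_OFF, 1);	/* start a latency measurement */
}

static void demo_irqs_on_hook(void)
{
	trace_preemptirqsoff_hist(IRQS_ON, 0);	/* stop it again */
}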
  18384. diff -Nur linux-3.18.14.orig/include/trace/events/latency_hist.h linux-3.18.14-rt/include/trace/events/latency_hist.h
  18385. --- linux-3.18.14.orig/include/trace/events/latency_hist.h 1969-12-31 18:00:00.000000000 -0600
  18386. +++ linux-3.18.14-rt/include/trace/events/latency_hist.h 2015-05-31 15:32:48.521635366 -0500
  18387. @@ -0,0 +1,29 @@
  18388. +#ifndef _LATENCY_HIST_H
  18389. +#define _LATENCY_HIST_H
  18390. +
  18391. +enum hist_action {
  18392. + IRQS_ON,
  18393. + PREEMPT_ON,
  18394. + TRACE_STOP,
  18395. + IRQS_OFF,
  18396. + PREEMPT_OFF,
  18397. + TRACE_START,
  18398. +};
  18399. +
  18400. +static char *actions[] = {
  18401. + "IRQS_ON",
  18402. + "PREEMPT_ON",
  18403. + "TRACE_STOP",
  18404. + "IRQS_OFF",
  18405. + "PREEMPT_OFF",
  18406. + "TRACE_START",
  18407. +};
  18408. +
  18409. +static inline char *getaction(int action)
  18410. +{
  18411. + if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
  18412. + return actions[action];
  18413. + return "unknown";
  18414. +}
  18415. +
  18416. +#endif /* _LATENCY_HIST_H */
  18417. diff -Nur linux-3.18.14.orig/init/Kconfig linux-3.18.14-rt/init/Kconfig
  18418. --- linux-3.18.14.orig/init/Kconfig 2015-05-20 10:04:50.000000000 -0500
  18419. +++ linux-3.18.14-rt/init/Kconfig 2015-05-31 15:32:48.525635366 -0500
  18420. @@ -635,7 +635,7 @@
  18421. config RCU_FAST_NO_HZ
  18422. bool "Accelerate last non-dyntick-idle CPU's grace periods"
  18423. - depends on NO_HZ_COMMON && SMP
  18424. + depends on NO_HZ_COMMON && SMP && !PREEMPT_RT_FULL
  18425. default n
  18426. help
  18427. This option permits CPUs to enter dynticks-idle state even if
  18428. @@ -662,7 +662,7 @@
  18429. config RCU_BOOST
  18430. bool "Enable RCU priority boosting"
  18431. depends on RT_MUTEXES && PREEMPT_RCU
  18432. - default n
  18433. + default y if PREEMPT_RT_FULL
  18434. help
  18435. This option boosts the priority of preempted RCU readers that
  18436. block the current preemptible RCU grace period for too long.
  18437. @@ -1106,6 +1106,7 @@
  18438. config RT_GROUP_SCHED
  18439. bool "Group scheduling for SCHED_RR/FIFO"
  18440. depends on CGROUP_SCHED
  18441. + depends on !PREEMPT_RT_FULL
  18442. default n
  18443. help
  18444. This feature lets you explicitly allocate real CPU bandwidth
  18445. @@ -1677,6 +1678,7 @@
  18446. config SLAB
  18447. bool "SLAB"
  18448. + depends on !PREEMPT_RT_FULL
  18449. help
  18450. The regular slab allocator that is established and known to work
  18451. well in all environments. It organizes cache hot objects in
  18452. @@ -1695,6 +1697,7 @@
  18453. config SLOB
  18454. depends on EXPERT
  18455. bool "SLOB (Simple Allocator)"
  18456. + depends on !PREEMPT_RT_FULL
  18457. help
  18458. SLOB replaces the stock allocator with a drastically simpler
  18459. allocator. SLOB is generally more space efficient but
  18460. diff -Nur linux-3.18.14.orig/init/main.c linux-3.18.14-rt/init/main.c
  18461. --- linux-3.18.14.orig/init/main.c 2015-05-20 10:04:50.000000000 -0500
  18462. +++ linux-3.18.14-rt/init/main.c 2015-05-31 15:32:48.545635366 -0500
  18463. @@ -533,6 +533,7 @@
  18464. setup_command_line(command_line);
  18465. setup_nr_cpu_ids();
  18466. setup_per_cpu_areas();
  18467. + softirq_early_init();
  18468. smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
  18469. build_all_zonelists(NULL, NULL);
  18470. diff -Nur linux-3.18.14.orig/init/Makefile linux-3.18.14-rt/init/Makefile
  18471. --- linux-3.18.14.orig/init/Makefile 2015-05-20 10:04:50.000000000 -0500
  18472. +++ linux-3.18.14-rt/init/Makefile 2015-05-31 15:32:48.525635366 -0500
  18473. @@ -33,4 +33,4 @@
  18474. include/generated/compile.h: FORCE
  18475. @$($(quiet)chk_compile.h)
  18476. $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
  18477. - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
  18478. + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
  18479. diff -Nur linux-3.18.14.orig/ipc/mqueue.c linux-3.18.14-rt/ipc/mqueue.c
  18480. --- linux-3.18.14.orig/ipc/mqueue.c 2015-05-20 10:04:50.000000000 -0500
  18481. +++ linux-3.18.14-rt/ipc/mqueue.c 2015-05-31 15:32:48.557635366 -0500
  18482. @@ -923,12 +923,17 @@
  18483. struct msg_msg *message,
  18484. struct ext_wait_queue *receiver)
  18485. {
  18486. + /*
  18487. + * Keep them in one critical section for PREEMPT_RT:
  18488. + */
  18489. + preempt_disable_rt();
  18490. receiver->msg = message;
  18491. list_del(&receiver->list);
  18492. receiver->state = STATE_PENDING;
  18493. wake_up_process(receiver->task);
  18494. smp_wmb();
  18495. receiver->state = STATE_READY;
  18496. + preempt_enable_rt();
  18497. }
  18498. /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
  18499. @@ -942,13 +947,18 @@
  18500. wake_up_interruptible(&info->wait_q);
  18501. return;
  18502. }
  18503. - if (msg_insert(sender->msg, info))
  18504. - return;
  18505. - list_del(&sender->list);
  18506. - sender->state = STATE_PENDING;
  18507. - wake_up_process(sender->task);
  18508. - smp_wmb();
  18509. - sender->state = STATE_READY;
  18510. + /*
  18511. + * Keep them in one critical section for PREEMPT_RT:
  18512. + */
  18513. + preempt_disable_rt();
  18514. + if (!msg_insert(sender->msg, info)) {
  18515. + list_del(&sender->list);
  18516. + sender->state = STATE_PENDING;
  18517. + wake_up_process(sender->task);
  18518. + smp_wmb();
  18519. + sender->state = STATE_READY;
  18520. + }
  18521. + preempt_enable_rt();
  18522. }
  18523. SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
  18524. diff -Nur linux-3.18.14.orig/ipc/msg.c linux-3.18.14-rt/ipc/msg.c
  18525. --- linux-3.18.14.orig/ipc/msg.c 2015-05-20 10:04:50.000000000 -0500
  18526. +++ linux-3.18.14-rt/ipc/msg.c 2015-05-31 15:32:48.577635366 -0500
  18527. @@ -188,6 +188,12 @@
  18528. struct msg_receiver *msr, *t;
  18529. list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
  18530. + /*
  18531. + * Make sure that the wakeup doesn't preempt
  18532. + * this CPU prematurely. (on PREEMPT_RT)
  18533. + */
  18534. + preempt_disable_rt();
  18535. +
  18536. msr->r_msg = NULL; /* initialize expunge ordering */
  18537. wake_up_process(msr->r_tsk);
  18538. /*
  18539. @@ -198,6 +204,8 @@
  18540. */
  18541. smp_mb();
  18542. msr->r_msg = ERR_PTR(res);
  18543. +
  18544. + preempt_enable_rt();
  18545. }
  18546. }
  18547. @@ -574,6 +582,11 @@
  18548. if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&
  18549. !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
  18550. msr->r_msgtype, msr->r_mode)) {
  18551. + /*
  18552. + * Make sure that the wakeup doesn't preempt
  18553. + * this CPU prematurely. (on PREEMPT_RT)
  18554. + */
  18555. + preempt_disable_rt();
  18556. list_del(&msr->r_list);
  18557. if (msr->r_maxsize < msg->m_ts) {
  18558. @@ -595,12 +608,13 @@
  18559. */
  18560. smp_mb();
  18561. msr->r_msg = msg;
  18562. + preempt_enable_rt();
  18563. return 1;
  18564. }
  18565. + preempt_enable_rt();
  18566. }
  18567. }
  18568. -
  18569. return 0;
  18570. }
  18571. diff -Nur linux-3.18.14.orig/ipc/sem.c linux-3.18.14-rt/ipc/sem.c
  18572. --- linux-3.18.14.orig/ipc/sem.c 2015-05-20 10:04:50.000000000 -0500
  18573. +++ linux-3.18.14-rt/ipc/sem.c 2015-05-31 15:32:48.577635366 -0500
  18574. @@ -673,6 +673,13 @@
  18575. static void wake_up_sem_queue_prepare(struct list_head *pt,
  18576. struct sem_queue *q, int error)
  18577. {
  18578. +#ifdef CONFIG_PREEMPT_RT_BASE
  18579. + struct task_struct *p = q->sleeper;
  18580. + get_task_struct(p);
  18581. + q->status = error;
  18582. + wake_up_process(p);
  18583. + put_task_struct(p);
  18584. +#else
  18585. if (list_empty(pt)) {
  18586. /*
  18587. * Hold preempt off so that we don't get preempted and have the
  18588. @@ -684,6 +691,7 @@
  18589. q->pid = error;
  18590. list_add_tail(&q->list, pt);
  18591. +#endif
  18592. }
  18593. /**
  18594. @@ -697,6 +705,7 @@
  18595. */
  18596. static void wake_up_sem_queue_do(struct list_head *pt)
  18597. {
  18598. +#ifndef CONFIG_PREEMPT_RT_BASE
  18599. struct sem_queue *q, *t;
  18600. int did_something;
  18601. @@ -709,6 +718,7 @@
  18602. }
  18603. if (did_something)
  18604. preempt_enable();
  18605. +#endif
  18606. }
  18607. static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
  18608. diff -Nur linux-3.18.14.orig/kernel/cgroup.c linux-3.18.14-rt/kernel/cgroup.c
  18609. --- linux-3.18.14.orig/kernel/cgroup.c 2015-05-20 10:04:50.000000000 -0500
  18610. +++ linux-3.18.14-rt/kernel/cgroup.c 2015-05-31 15:32:48.597635365 -0500
  18611. @@ -4355,10 +4355,10 @@
  18612. queue_work(cgroup_destroy_wq, &css->destroy_work);
  18613. }
  18614. -static void css_release_work_fn(struct work_struct *work)
  18615. +static void css_release_work_fn(struct swork_event *sev)
  18616. {
  18617. struct cgroup_subsys_state *css =
  18618. - container_of(work, struct cgroup_subsys_state, destroy_work);
  18619. + container_of(sev, struct cgroup_subsys_state, destroy_swork);
  18620. struct cgroup_subsys *ss = css->ss;
  18621. struct cgroup *cgrp = css->cgroup;
  18622. @@ -4395,8 +4395,8 @@
  18623. struct cgroup_subsys_state *css =
  18624. container_of(ref, struct cgroup_subsys_state, refcnt);
  18625. - INIT_WORK(&css->destroy_work, css_release_work_fn);
  18626. - queue_work(cgroup_destroy_wq, &css->destroy_work);
  18627. + INIT_SWORK(&css->destroy_swork, css_release_work_fn);
  18628. + swork_queue(&css->destroy_swork);
  18629. }
  18630. static void init_and_link_css(struct cgroup_subsys_state *css,
  18631. @@ -4997,6 +4997,7 @@
  18632. */
  18633. cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
  18634. BUG_ON(!cgroup_destroy_wq);
  18635. + BUG_ON(swork_get());
  18636. /*
  18637. * Used to destroy pidlists and separate to serve as flush domain.
  18638. diff -Nur linux-3.18.14.orig/kernel/cpu.c linux-3.18.14-rt/kernel/cpu.c
  18639. --- linux-3.18.14.orig/kernel/cpu.c 2015-05-20 10:04:50.000000000 -0500
  18640. +++ linux-3.18.14-rt/kernel/cpu.c 2015-05-31 15:32:48.601635365 -0500
  18641. @@ -86,6 +86,290 @@
  18642. #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
  18643. #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
  18644. +/**
  18645. + * hotplug_pcp - per cpu hotplug descriptor
  18646. + * @unplug: set when pin_current_cpu() needs to sync tasks
  18647. + * @sync_tsk: the task that waits for tasks to finish pinned sections
  18648. + * @refcount: counter of tasks in pinned sections
  18649. + * @grab_lock: set when the tasks entering pinned sections should wait
  18650. + * @synced: notifier for @sync_tsk to tell cpu_down it's finished
  18651. + * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
  18652. + * @mutex_init: zero if the mutex hasn't been initialized yet.
  18653. + *
  18654. + * Although @unplug and @sync_tsk may point to the same task, the @unplug
  18655. + * is used as a flag and still exists after @sync_tsk has exited and
  18656. + * @sync_tsk set to NULL.
  18657. + */
  18658. +struct hotplug_pcp {
  18659. + struct task_struct *unplug;
  18660. + struct task_struct *sync_tsk;
  18661. + int refcount;
  18662. + int grab_lock;
  18663. + struct completion synced;
  18664. + struct completion unplug_wait;
  18665. +#ifdef CONFIG_PREEMPT_RT_FULL
  18666. + /*
  18667. + * Note, on PREEMPT_RT, the hotplug lock must save the state of
  18668. + * the task, otherwise the mutex will cause the task to fail
  18669. + * to sleep when required. (Because it's called from migrate_disable())
  18670. + *
  18671. + * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
  18672. + * state.
  18673. + */
  18674. + spinlock_t lock;
  18675. +#else
  18676. + struct mutex mutex;
  18677. +#endif
  18678. + int mutex_init;
  18679. +};
  18680. +
  18681. +#ifdef CONFIG_PREEMPT_RT_FULL
  18682. +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
  18683. +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
  18684. +#else
  18685. +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
  18686. +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
  18687. +#endif
  18688. +
  18689. +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
  18690. +
  18691. +/**
  18692. + * pin_current_cpu - Prevent the current cpu from being unplugged
  18693. + *
  18694. + * Lightweight version of get_online_cpus() to prevent cpu from being
  18695. + * unplugged when code runs in a migration disabled region.
  18696. + *
  18697. + * Must be called with preemption disabled (preempt_count = 1)!
  18698. + */
  18699. +void pin_current_cpu(void)
  18700. +{
  18701. + struct hotplug_pcp *hp;
  18702. + int force = 0;
  18703. +
  18704. +retry:
  18705. + hp = &__get_cpu_var(hotplug_pcp);
  18706. +
  18707. + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
  18708. + hp->unplug == current) {
  18709. + hp->refcount++;
  18710. + return;
  18711. + }
  18712. + if (hp->grab_lock) {
  18713. + preempt_enable();
  18714. + hotplug_lock(hp);
  18715. + hotplug_unlock(hp);
  18716. + } else {
  18717. + preempt_enable();
  18718. + /*
  18719. + * Try to push this task off of this CPU.
  18720. + */
  18721. + if (!migrate_me()) {
  18722. + preempt_disable();
  18723. + hp = &__get_cpu_var(hotplug_pcp);
  18724. + if (!hp->grab_lock) {
  18725. + /*
  18726. + * Just let it continue; it's already pinned
  18727. + * or about to sleep.
  18728. + */
  18729. + force = 1;
  18730. + goto retry;
  18731. + }
  18732. + preempt_enable();
  18733. + }
  18734. + }
  18735. + preempt_disable();
  18736. + goto retry;
  18737. +}
  18738. +
  18739. +/**
  18740. + * unpin_current_cpu - Allow unplug of current cpu
  18741. + *
  18742. + * Must be called with preemption or interrupts disabled!
  18743. + */
  18744. +void unpin_current_cpu(void)
  18745. +{
  18746. + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp);
  18747. +
  18748. + WARN_ON(hp->refcount <= 0);
  18749. +
  18750. + /* This is safe. sync_unplug_thread is pinned to this cpu */
  18751. + if (!--hp->refcount && hp->unplug && hp->unplug != current)
  18752. + wake_up_process(hp->unplug);
  18753. +}
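
A minimal sketch (not part of the patch; demo_pinned_section() is hypothetical) of the calling convention documented above; in the patch itself this pairing is presumably driven from migrate_disable()/migrate_enable().

static void demo_pinned_section(void)
{
	preempt_disable();
	pin_current_cpu();	/* from here, this CPU cannot be unplugged */
	preempt_enable();

	/*
	 * Preemptible section: hotplug of this CPU is held off, while
	 * migration is prevented separately by migrate_disable().
	 */

	preempt_disable();
	unpin_current_cpu();	/* let cpu_down() make progress again */
	preempt_enable();
}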
  18754. +
  18755. +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
  18756. +{
  18757. + set_current_state(TASK_UNINTERRUPTIBLE);
  18758. + while (hp->refcount) {
  18759. + schedule_preempt_disabled();
  18760. + set_current_state(TASK_UNINTERRUPTIBLE);
  18761. + }
  18762. +}
  18763. +
  18764. +static int sync_unplug_thread(void *data)
  18765. +{
  18766. + struct hotplug_pcp *hp = data;
  18767. +
  18768. + wait_for_completion(&hp->unplug_wait);
  18769. + preempt_disable();
  18770. + hp->unplug = current;
  18771. + wait_for_pinned_cpus(hp);
  18772. +
  18773. + /*
  18774. + * This thread will synchronize the cpu_down() with threads
  18775. + * that have pinned the CPU. When the pinned CPU count reaches
  18776. + * zero, we inform the cpu_down code to continue to the next step.
  18777. + */
  18778. + set_current_state(TASK_UNINTERRUPTIBLE);
  18779. + preempt_enable();
  18780. + complete(&hp->synced);
  18781. +
  18782. + /*
  18783. + * If all succeeds, the next step will need tasks to wait till
  18784. + * the CPU is offline before continuing. To do this, the grab_lock
  18785. + * is set and tasks going into pin_current_cpu() will block on the
  18786. + * mutex. But we still need to wait for those that are already in
  18787. + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
  18788. + * will kick this thread out.
  18789. + */
  18790. + while (!hp->grab_lock && !kthread_should_stop()) {
  18791. + schedule();
  18792. + set_current_state(TASK_UNINTERRUPTIBLE);
  18793. + }
  18794. +
  18795. + /* Make sure grab_lock is seen before we see a stale completion */
  18796. + smp_mb();
  18797. +
  18798. + /*
  18799. + * Now just before cpu_down() enters stop machine, we need to make
  18800. + * sure all tasks that are in pinned CPU sections are out, and new
  18801. + * tasks will now grab the lock, keeping them from entering pinned
  18802. + * CPU sections.
  18803. + */
  18804. + if (!kthread_should_stop()) {
  18805. + preempt_disable();
  18806. + wait_for_pinned_cpus(hp);
  18807. + preempt_enable();
  18808. + complete(&hp->synced);
  18809. + }
  18810. +
  18811. + set_current_state(TASK_UNINTERRUPTIBLE);
  18812. + while (!kthread_should_stop()) {
  18813. + schedule();
  18814. + set_current_state(TASK_UNINTERRUPTIBLE);
  18815. + }
  18816. + set_current_state(TASK_RUNNING);
  18817. +
  18818. + /*
  18819. + * Force this thread off this CPU as it's going down and
  18820. + * we don't want any more work on this CPU.
  18821. + */
  18822. + current->flags &= ~PF_NO_SETAFFINITY;
  18823. + set_cpus_allowed_ptr(current, cpu_present_mask);
  18824. + migrate_me();
  18825. + return 0;
  18826. +}
  18827. +
  18828. +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
  18829. +{
  18830. + wake_up_process(hp->sync_tsk);
  18831. + wait_for_completion(&hp->synced);
  18832. +}
  18833. +
  18834. +static void __cpu_unplug_wait(unsigned int cpu)
  18835. +{
  18836. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  18837. +
  18838. + complete(&hp->unplug_wait);
  18839. + wait_for_completion(&hp->synced);
  18840. +}
  18841. +
  18842. +/*
  18843. + * Start the sync_unplug_thread on the target cpu and wait for it to
  18844. + * complete.
  18845. + */
  18846. +static int cpu_unplug_begin(unsigned int cpu)
  18847. +{
  18848. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  18849. + int err;
  18850. +
  18851. + /* Protected by cpu_hotplug.lock */
  18852. + if (!hp->mutex_init) {
  18853. +#ifdef CONFIG_PREEMPT_RT_FULL
  18854. + spin_lock_init(&hp->lock);
  18855. +#else
  18856. + mutex_init(&hp->mutex);
  18857. +#endif
  18858. + hp->mutex_init = 1;
  18859. + }
  18860. +
  18861. + /* Inform the scheduler to migrate tasks off this CPU */
  18862. + tell_sched_cpu_down_begin(cpu);
  18863. +
  18864. + init_completion(&hp->synced);
  18865. + init_completion(&hp->unplug_wait);
  18866. +
  18867. + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
  18868. + if (IS_ERR(hp->sync_tsk)) {
  18869. + err = PTR_ERR(hp->sync_tsk);
  18870. + hp->sync_tsk = NULL;
  18871. + return err;
  18872. + }
  18873. + kthread_bind(hp->sync_tsk, cpu);
  18874. +
  18875. + /*
  18876. + * Wait for tasks to get out of the pinned sections,
  18877. + * it's still OK if new tasks enter. Some CPU notifiers will
  18878. + * wait for tasks that are going to enter these sections and
  18879. + * we must not have them block.
  18880. + */
  18881. + wake_up_process(hp->sync_tsk);
  18882. + return 0;
  18883. +}
  18884. +
  18885. +static void cpu_unplug_sync(unsigned int cpu)
  18886. +{
  18887. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  18888. +
  18889. + init_completion(&hp->synced);
  18890. + /* The completion needs to be initialized before setting grab_lock */
  18891. + smp_wmb();
  18892. +
  18893. + /* Grab the mutex before setting grab_lock */
  18894. + hotplug_lock(hp);
  18895. + hp->grab_lock = 1;
  18896. +
  18897. + /*
  18898. + * The CPU notifiers have been completed.
  18899. + * Wait for tasks to get out of pinned CPU sections and have new
  18900. + * tasks block until the CPU is completely down.
  18901. + */
  18902. + __cpu_unplug_sync(hp);
  18903. +
  18904. + /* All done with the sync thread */
  18905. + kthread_stop(hp->sync_tsk);
  18906. + hp->sync_tsk = NULL;
  18907. +}
  18908. +
  18909. +static void cpu_unplug_done(unsigned int cpu)
  18910. +{
  18911. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  18912. +
  18913. + hp->unplug = NULL;
  18914. + /* Let all tasks know cpu unplug is finished before cleaning up */
  18915. + smp_wmb();
  18916. +
  18917. + if (hp->sync_tsk)
  18918. + kthread_stop(hp->sync_tsk);
  18919. +
  18920. + if (hp->grab_lock) {
  18921. + hotplug_unlock(hp);
  18922. + /* protected by cpu_hotplug.lock */
  18923. + hp->grab_lock = 0;
  18924. + }
  18925. + tell_sched_cpu_down_done(cpu);
  18926. +}
  18927. +
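Taken together, these helpers let cpu_down() drain per-CPU critical sections before stop_machine runs. The consumer side is the pinning API added by earlier hunks of this patch; the following is an illustrative sketch only, assuming pin_current_cpu()/unpin_current_cpu() from those hunks.

/*
 * Illustrative only; pin_current_cpu()/unpin_current_cpu() are assumed
 * from the earlier kernel/cpu.c and sched.h hunks of this patch.
 */
static void touch_per_cpu_state(void)
{
	pin_current_cpu();	/* a concurrent cpu_down() now waits for us */
	/* ... work on this CPU's data; the CPU cannot be unplugged here ... */
	unpin_current_cpu();	/* allows a pending unplug to proceed */
}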
  18928. void get_online_cpus(void)
  18929. {
  18930. might_sleep();
  18931. @@ -102,6 +386,7 @@
  18932. {
  18933. if (cpu_hotplug.active_writer == current)
  18934. return true;
  18935. +
  18936. if (!mutex_trylock(&cpu_hotplug.lock))
  18937. return false;
  18938. cpuhp_lock_acquire_tryread();
  18939. @@ -349,13 +634,15 @@
  18940. /* Requires cpu_add_remove_lock to be held */
  18941. static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
  18942. {
  18943. - int err, nr_calls = 0;
  18944. + int mycpu, err, nr_calls = 0;
  18945. void *hcpu = (void *)(long)cpu;
  18946. unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
  18947. struct take_cpu_down_param tcd_param = {
  18948. .mod = mod,
  18949. .hcpu = hcpu,
  18950. };
  18951. + cpumask_var_t cpumask;
  18952. + cpumask_var_t cpumask_org;
  18953. if (num_online_cpus() == 1)
  18954. return -EBUSY;
  18955. @@ -363,7 +650,34 @@
  18956. if (!cpu_online(cpu))
  18957. return -EINVAL;
  18958. + /* Move the downtaker off the unplug cpu */
  18959. + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
  18960. + return -ENOMEM;
  18961. + if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) {
  18962. + free_cpumask_var(cpumask);
  18963. + return -ENOMEM;
  18964. + }
  18965. +
  18966. + cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
  18967. + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
  18968. + set_cpus_allowed_ptr(current, cpumask);
  18969. + free_cpumask_var(cpumask);
  18970. + migrate_disable();
  18971. + mycpu = smp_processor_id();
  18972. + if (mycpu == cpu) {
  18973. + printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
  18974. + migrate_enable();
  18975. + err = -EBUSY;
  18976. + goto restore_cpus;
  18977. + }
  18978. + migrate_enable();
  18979. +
  18980. cpu_hotplug_begin();
  18981. + err = cpu_unplug_begin(cpu);
  18982. + if (err) {
  18983. + printk("cpu_unplug_begin(%d) failed\n", cpu);
  18984. + goto out_cancel;
  18985. + }
  18986. err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
  18987. if (err) {
  18988. @@ -389,8 +703,12 @@
  18989. #endif
  18990. synchronize_rcu();
  18991. + __cpu_unplug_wait(cpu);
  18992. smpboot_park_threads(cpu);
  18993. + /* Notifiers are done. Don't let any more tasks pin this CPU. */
  18994. + cpu_unplug_sync(cpu);
  18995. +
  18996. /*
  18997. * So now all preempt/rcu users must observe !cpu_active().
  18998. */
  18999. @@ -423,9 +741,14 @@
  19000. check_for_tasks(cpu);
  19001. out_release:
  19002. + cpu_unplug_done(cpu);
  19003. +out_cancel:
  19004. cpu_hotplug_done();
  19005. if (!err)
  19006. cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
  19007. +restore_cpus:
  19008. + set_cpus_allowed_ptr(current, cpumask_org);
  19009. + free_cpumask_var(cpumask_org);
  19010. return err;
  19011. }
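Code that merely needs the set of online CPUs to stay stable does not use the machinery above directly; it brackets its work with the existing get_online_cpus()/put_online_cpus() pair, which excludes _cpu_down(). For context:

/* Existing hotplug read-side protection, shown for context only. */
static void for_each_stable_online_cpu(void)
{
	int cpu;

	get_online_cpus();		/* may sleep; blocks out _cpu_down() */
	for_each_online_cpu(cpu) {
		/* per-CPU work that must not race with CPU unplug */
	}
	put_online_cpus();
}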
  19012. diff -Nur linux-3.18.14.orig/kernel/debug/kdb/kdb_io.c linux-3.18.14-rt/kernel/debug/kdb/kdb_io.c
  19013. --- linux-3.18.14.orig/kernel/debug/kdb/kdb_io.c 2015-05-20 10:04:50.000000000 -0500
  19014. +++ linux-3.18.14-rt/kernel/debug/kdb/kdb_io.c 2015-05-31 15:32:48.605635366 -0500
  19015. @@ -554,7 +554,6 @@
  19016. int linecount;
  19017. int colcount;
  19018. int logging, saved_loglevel = 0;
  19019. - int saved_trap_printk;
  19020. int got_printf_lock = 0;
  19021. int retlen = 0;
  19022. int fnd, len;
  19023. @@ -565,8 +564,6 @@
  19024. unsigned long uninitialized_var(flags);
  19025. preempt_disable();
  19026. - saved_trap_printk = kdb_trap_printk;
  19027. - kdb_trap_printk = 0;
  19028. /* Serialize kdb_printf if multiple cpus try to write at once.
  19029. * But if any cpu goes recursive in kdb, just print the output,
  19030. @@ -833,7 +830,6 @@
  19031. } else {
  19032. __release(kdb_printf_lock);
  19033. }
  19034. - kdb_trap_printk = saved_trap_printk;
  19035. preempt_enable();
  19036. return retlen;
  19037. }
  19038. @@ -843,9 +839,11 @@
  19039. va_list ap;
  19040. int r;
  19041. + kdb_trap_printk++;
  19042. va_start(ap, fmt);
  19043. r = vkdb_printf(fmt, ap);
  19044. va_end(ap);
  19045. + kdb_trap_printk--;
  19046. return r;
  19047. }
  19048. diff -Nur linux-3.18.14.orig/kernel/events/core.c linux-3.18.14-rt/kernel/events/core.c
  19049. --- linux-3.18.14.orig/kernel/events/core.c 2015-05-20 10:04:50.000000000 -0500
  19050. +++ linux-3.18.14-rt/kernel/events/core.c 2015-05-31 15:32:48.637635365 -0500
  19051. @@ -6346,6 +6346,7 @@
  19052. hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  19053. hwc->hrtimer.function = perf_swevent_hrtimer;
  19054. + hwc->hrtimer.irqsafe = 1;
  19055. /*
  19056. * Since hrtimers have a fixed rate, we can do a static freq->period
  19057. diff -Nur linux-3.18.14.orig/kernel/exit.c linux-3.18.14-rt/kernel/exit.c
  19058. --- linux-3.18.14.orig/kernel/exit.c 2015-05-20 10:04:50.000000000 -0500
  19059. +++ linux-3.18.14-rt/kernel/exit.c 2015-05-31 15:32:48.649635365 -0500
  19060. @@ -147,7 +147,7 @@
  19061. * Do this under ->siglock, we can race with another thread
  19062. * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
  19063. */
  19064. - flush_sigqueue(&tsk->pending);
  19065. + flush_task_sigqueue(tsk);
  19066. tsk->sighand = NULL;
  19067. spin_unlock(&sighand->siglock);
  19068. diff -Nur linux-3.18.14.orig/kernel/fork.c linux-3.18.14-rt/kernel/fork.c
  19069. --- linux-3.18.14.orig/kernel/fork.c 2015-05-20 10:04:50.000000000 -0500
  19070. +++ linux-3.18.14-rt/kernel/fork.c 2015-05-31 15:32:48.657635365 -0500
  19071. @@ -97,7 +97,7 @@
  19072. DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  19073. -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
  19074. +DEFINE_RWLOCK(tasklist_lock); /* outer */
  19075. #ifdef CONFIG_PROVE_RCU
  19076. int lockdep_tasklist_lock_is_held(void)
  19077. @@ -233,7 +233,9 @@
  19078. if (atomic_dec_and_test(&sig->sigcnt))
  19079. free_signal_struct(sig);
  19080. }
  19081. -
  19082. +#ifdef CONFIG_PREEMPT_RT_BASE
  19083. +static
  19084. +#endif
  19085. void __put_task_struct(struct task_struct *tsk)
  19086. {
  19087. WARN_ON(!tsk->exit_state);
  19088. @@ -249,7 +251,18 @@
  19089. if (!profile_handoff_task(tsk))
  19090. free_task(tsk);
  19091. }
  19092. +#ifndef CONFIG_PREEMPT_RT_BASE
  19093. EXPORT_SYMBOL_GPL(__put_task_struct);
  19094. +#else
  19095. +void __put_task_struct_cb(struct rcu_head *rhp)
  19096. +{
  19097. + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
  19098. +
  19099. + __put_task_struct(tsk);
  19100. +
  19101. +}
  19102. +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
  19103. +#endif
  19104. void __init __weak arch_task_cache_init(void) { }
  19105. @@ -643,6 +656,19 @@
  19106. }
  19107. EXPORT_SYMBOL_GPL(__mmdrop);
  19108. +#ifdef CONFIG_PREEMPT_RT_BASE
  19109. +/*
  19110. + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
  19111. + * want another facility to make this work.
  19112. + */
  19113. +void __mmdrop_delayed(struct rcu_head *rhp)
  19114. +{
  19115. + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
  19116. +
  19117. + __mmdrop(mm);
  19118. +}
  19119. +#endif
  19120. +
  19121. /*
  19122. * Decrement the use count and release all resources for an mm.
  19123. */
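__mmdrop_delayed() is the RCU-callback half of the deferral; the inline caller lives in the include/linux/sched.h hunk of this patch. A hedged sketch of what that caller looks like (assumed, not shown here):

/* Sketch of the caller, assumed from the include/linux/sched.h hunk. */
static inline void mmdrop_delayed(struct mm_struct *mm)
{
	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
		call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}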
  19124. @@ -1157,6 +1183,9 @@
  19125. */
  19126. static void posix_cpu_timers_init(struct task_struct *tsk)
  19127. {
  19128. +#ifdef CONFIG_PREEMPT_RT_BASE
  19129. + tsk->posix_timer_list = NULL;
  19130. +#endif
  19131. tsk->cputime_expires.prof_exp = 0;
  19132. tsk->cputime_expires.virt_exp = 0;
  19133. tsk->cputime_expires.sched_exp = 0;
  19134. @@ -1284,6 +1313,7 @@
  19135. spin_lock_init(&p->alloc_lock);
  19136. init_sigpending(&p->pending);
  19137. + p->sigqueue_cache = NULL;
  19138. p->utime = p->stime = p->gtime = 0;
  19139. p->utimescaled = p->stimescaled = 0;
  19140. @@ -1291,7 +1321,8 @@
  19141. p->prev_cputime.utime = p->prev_cputime.stime = 0;
  19142. #endif
  19143. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  19144. - seqlock_init(&p->vtime_seqlock);
  19145. + raw_spin_lock_init(&p->vtime_lock);
  19146. + seqcount_init(&p->vtime_seq);
  19147. p->vtime_snap = 0;
  19148. p->vtime_snap_whence = VTIME_SLEEPING;
  19149. #endif
  19150. @@ -1342,6 +1373,9 @@
  19151. p->hardirq_context = 0;
  19152. p->softirq_context = 0;
  19153. #endif
  19154. +#ifdef CONFIG_PREEMPT_RT_FULL
  19155. + p->pagefault_disabled = 0;
  19156. +#endif
  19157. #ifdef CONFIG_LOCKDEP
  19158. p->lockdep_depth = 0; /* no locks held yet */
  19159. p->curr_chain_key = 0;
  19160. diff -Nur linux-3.18.14.orig/kernel/futex.c linux-3.18.14-rt/kernel/futex.c
  19161. --- linux-3.18.14.orig/kernel/futex.c 2015-05-20 10:04:50.000000000 -0500
  19162. +++ linux-3.18.14-rt/kernel/futex.c 2015-05-31 15:32:48.665635365 -0500
  19163. @@ -738,7 +738,9 @@
  19164. * task still owns the PI-state:
  19165. */
  19166. if (head->next != next) {
  19167. + raw_spin_unlock_irq(&curr->pi_lock);
  19168. spin_unlock(&hb->lock);
  19169. + raw_spin_lock_irq(&curr->pi_lock);
  19170. continue;
  19171. }
  19172. @@ -1705,6 +1707,16 @@
  19173. requeue_pi_wake_futex(this, &key2, hb2);
  19174. drop_count++;
  19175. continue;
  19176. + } else if (ret == -EAGAIN) {
  19177. + /*
  19178. + * Waiter was woken by timeout or
  19179. + * signal and has set pi_blocked_on to
  19180. + * PI_WAKEUP_INPROGRESS before we
  19181. + * tried to enqueue it on the rtmutex.
  19182. + */
  19183. + this->pi_state = NULL;
  19184. + free_pi_state(pi_state);
  19185. + continue;
  19186. } else if (ret) {
  19187. /* -EDEADLK */
  19188. this->pi_state = NULL;
  19189. @@ -2549,7 +2561,7 @@
  19190. struct hrtimer_sleeper timeout, *to = NULL;
  19191. struct rt_mutex_waiter rt_waiter;
  19192. struct rt_mutex *pi_mutex = NULL;
  19193. - struct futex_hash_bucket *hb;
  19194. + struct futex_hash_bucket *hb, *hb2;
  19195. union futex_key key2 = FUTEX_KEY_INIT;
  19196. struct futex_q q = futex_q_init;
  19197. int res, ret;
  19198. @@ -2574,10 +2586,7 @@
  19199. * The waiter is allocated on our stack, manipulated by the requeue
  19200. * code while we sleep on uaddr.
  19201. */
  19202. - debug_rt_mutex_init_waiter(&rt_waiter);
  19203. - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
  19204. - RB_CLEAR_NODE(&rt_waiter.tree_entry);
  19205. - rt_waiter.task = NULL;
  19206. + rt_mutex_init_waiter(&rt_waiter, false);
  19207. ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
  19208. if (unlikely(ret != 0))
  19209. @@ -2608,20 +2617,55 @@
  19210. /* Queue the futex_q, drop the hb lock, wait for wakeup. */
  19211. futex_wait_queue_me(hb, &q, to);
  19212. - spin_lock(&hb->lock);
  19213. - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
  19214. - spin_unlock(&hb->lock);
  19215. - if (ret)
  19216. - goto out_put_keys;
  19217. + /*
  19218. + * On RT we must avoid races with requeue and trying to block
  19219. + * on two mutexes (hb->lock and uaddr2's rtmutex) by
  19220. + * serializing access to pi_blocked_on with pi_lock.
  19221. + */
  19222. + raw_spin_lock_irq(&current->pi_lock);
  19223. + if (current->pi_blocked_on) {
  19224. + /*
  19225. + * We have been requeued or are in the process of
  19226. + * being requeued.
  19227. + */
  19228. + raw_spin_unlock_irq(&current->pi_lock);
  19229. + } else {
  19230. + /*
  19231. + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
  19232. + * prevents a concurrent requeue from moving us to the
  19233. + * uaddr2 rtmutex. After that we can safely acquire
  19234. + * (and possibly block on) hb->lock.
  19235. + */
  19236. + current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
  19237. + raw_spin_unlock_irq(&current->pi_lock);
  19238. +
  19239. + spin_lock(&hb->lock);
  19240. +
  19241. + /*
  19242. + * Clean up pi_blocked_on. We might leak it otherwise
  19243. + * when we succeeded with the hb->lock in the fast
  19244. + * path.
  19245. + */
  19246. + raw_spin_lock_irq(&current->pi_lock);
  19247. + current->pi_blocked_on = NULL;
  19248. + raw_spin_unlock_irq(&current->pi_lock);
  19249. +
  19250. + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
  19251. + spin_unlock(&hb->lock);
  19252. + if (ret)
  19253. + goto out_put_keys;
  19254. + }
  19255. /*
  19256. - * In order for us to be here, we know our q.key == key2, and since
  19257. - * we took the hb->lock above, we also know that futex_requeue() has
  19258. - * completed and we no longer have to concern ourselves with a wakeup
  19259. - * race with the atomic proxy lock acquisition by the requeue code. The
  19260. - * futex_requeue dropped our key1 reference and incremented our key2
  19261. - * reference count.
  19262. + * In order to be here, we have either been requeued, are in
  19263. + * the process of being requeued, or requeue successfully
  19264. + * acquired uaddr2 on our behalf. If pi_blocked_on was
  19265. + * non-null above, we may be racing with a requeue. Do not
  19266. + * rely on q->lock_ptr to be hb2->lock until after blocking on
  19267. + * hb->lock or hb2->lock. The futex_requeue dropped our key1
  19268. + * reference and incremented our key2 reference count.
  19269. */
  19270. + hb2 = hash_futex(&key2);
  19271. /* Check if the requeue code acquired the second futex for us. */
  19272. if (!q.rt_waiter) {
  19273. @@ -2630,9 +2674,10 @@
  19274. * did a lock-steal - fix up the PI-state in that case.
  19275. */
  19276. if (q.pi_state && (q.pi_state->owner != current)) {
  19277. - spin_lock(q.lock_ptr);
  19278. + spin_lock(&hb2->lock);
  19279. + BUG_ON(&hb2->lock != q.lock_ptr);
  19280. ret = fixup_pi_state_owner(uaddr2, &q, current);
  19281. - spin_unlock(q.lock_ptr);
  19282. + spin_unlock(&hb2->lock);
  19283. }
  19284. } else {
  19285. /*
  19286. @@ -2645,7 +2690,8 @@
  19287. ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
  19288. debug_rt_mutex_free_waiter(&rt_waiter);
  19289. - spin_lock(q.lock_ptr);
  19290. + spin_lock(&hb2->lock);
  19291. + BUG_ON(&hb2->lock != q.lock_ptr);
  19292. /*
  19293. * Fixup the pi_state owner and possibly acquire the lock if we
  19294. * haven't already.
  19295. diff -Nur linux-3.18.14.orig/kernel/irq/handle.c linux-3.18.14-rt/kernel/irq/handle.c
  19296. --- linux-3.18.14.orig/kernel/irq/handle.c 2015-05-20 10:04:50.000000000 -0500
  19297. +++ linux-3.18.14-rt/kernel/irq/handle.c 2015-05-31 15:32:48.677635365 -0500
  19298. @@ -133,6 +133,8 @@
  19299. irqreturn_t
  19300. handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
  19301. {
  19302. + struct pt_regs *regs = get_irq_regs();
  19303. + u64 ip = regs ? instruction_pointer(regs) : 0;
  19304. irqreturn_t retval = IRQ_NONE;
  19305. unsigned int flags = 0, irq = desc->irq_data.irq;
  19306. @@ -173,7 +175,11 @@
  19307. action = action->next;
  19308. } while (action);
  19309. - add_interrupt_randomness(irq, flags);
  19310. +#ifndef CONFIG_PREEMPT_RT_FULL
  19311. + add_interrupt_randomness(irq, flags, ip);
  19312. +#else
  19313. + desc->random_ip = ip;
  19314. +#endif
  19315. if (!noirqdebug)
  19316. note_interrupt(irq, desc, retval);
  19317. diff -Nur linux-3.18.14.orig/kernel/irq/manage.c linux-3.18.14-rt/kernel/irq/manage.c
  19318. --- linux-3.18.14.orig/kernel/irq/manage.c 2015-05-20 10:04:50.000000000 -0500
  19319. +++ linux-3.18.14-rt/kernel/irq/manage.c 2015-05-31 15:32:48.697635365 -0500
  19320. @@ -22,6 +22,7 @@
  19321. #include "internals.h"
  19322. #ifdef CONFIG_IRQ_FORCED_THREADING
  19323. +# ifndef CONFIG_PREEMPT_RT_BASE
  19324. __read_mostly bool force_irqthreads;
  19325. static int __init setup_forced_irqthreads(char *arg)
  19326. @@ -30,6 +31,7 @@
  19327. return 0;
  19328. }
  19329. early_param("threadirqs", setup_forced_irqthreads);
  19330. +# endif
  19331. #endif
  19332. static void __synchronize_hardirq(struct irq_desc *desc)
  19333. @@ -173,6 +175,62 @@
  19334. irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
  19335. #endif
  19336. +#ifdef CONFIG_PREEMPT_RT_FULL
  19337. +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
  19338. +static struct task_struct *set_affinity_helper;
  19339. +static LIST_HEAD(affinity_list);
  19340. +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
  19341. +
  19342. +static int set_affinity_thread(void *unused)
  19343. +{
  19344. + while (1) {
  19345. + struct irq_affinity_notify *notify;
  19346. + int empty;
  19347. +
  19348. + set_current_state(TASK_INTERRUPTIBLE);
  19349. +
  19350. + raw_spin_lock_irq(&affinity_list_lock);
  19351. + empty = list_empty(&affinity_list);
  19352. + raw_spin_unlock_irq(&affinity_list_lock);
  19353. +
  19354. + if (empty)
  19355. + schedule();
  19356. + if (kthread_should_stop())
  19357. + break;
  19358. + set_current_state(TASK_RUNNING);
  19359. +try_next:
  19360. + notify = NULL;
  19361. +
  19362. + raw_spin_lock_irq(&affinity_list_lock);
  19363. + if (!list_empty(&affinity_list)) {
  19364. + notify = list_first_entry(&affinity_list,
  19365. + struct irq_affinity_notify, list);
  19366. + list_del_init(&notify->list);
  19367. + }
  19368. + raw_spin_unlock_irq(&affinity_list_lock);
  19369. +
  19370. + if (!notify)
  19371. + continue;
  19372. + _irq_affinity_notify(notify);
  19373. + goto try_next;
  19374. + }
  19375. + return 0;
  19376. +}
  19377. +
  19378. +static void init_helper_thread(void)
  19379. +{
  19380. + if (set_affinity_helper)
  19381. + return;
  19382. + set_affinity_helper = kthread_run(set_affinity_thread, NULL,
  19383. + "affinity-cb");
  19384. + WARN_ON(IS_ERR(set_affinity_helper));
  19385. +}
  19386. +#else
  19387. +
  19388. +static inline void init_helper_thread(void) { }
  19389. +
  19390. +#endif
  19391. +
  19392. int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
  19393. bool force)
  19394. {
  19395. @@ -211,7 +269,17 @@
  19396. if (desc->affinity_notify) {
  19397. kref_get(&desc->affinity_notify->kref);
  19398. +
  19399. +#ifdef CONFIG_PREEMPT_RT_FULL
  19400. + raw_spin_lock(&affinity_list_lock);
  19401. + if (list_empty(&desc->affinity_notify->list))
  19402. + list_add_tail(&affinity_list,
  19403. + &desc->affinity_notify->list);
  19404. + raw_spin_unlock(&affinity_list_lock);
  19405. + wake_up_process(set_affinity_helper);
  19406. +#else
  19407. schedule_work(&desc->affinity_notify->work);
  19408. +#endif
  19409. }
  19410. irqd_set(data, IRQD_AFFINITY_SET);
  19411. @@ -246,10 +314,8 @@
  19412. }
  19413. EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
  19414. -static void irq_affinity_notify(struct work_struct *work)
  19415. +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
  19416. {
  19417. - struct irq_affinity_notify *notify =
  19418. - container_of(work, struct irq_affinity_notify, work);
  19419. struct irq_desc *desc = irq_to_desc(notify->irq);
  19420. cpumask_var_t cpumask;
  19421. unsigned long flags;
  19422. @@ -271,6 +337,13 @@
  19423. kref_put(&notify->kref, notify->release);
  19424. }
  19425. +static void irq_affinity_notify(struct work_struct *work)
  19426. +{
  19427. + struct irq_affinity_notify *notify =
  19428. + container_of(work, struct irq_affinity_notify, work);
  19429. + _irq_affinity_notify(notify);
  19430. +}
  19431. +
  19432. /**
  19433. * irq_set_affinity_notifier - control notification of IRQ affinity changes
  19434. * @irq: Interrupt for which to enable/disable notification
  19435. @@ -300,6 +373,8 @@
  19436. notify->irq = irq;
  19437. kref_init(&notify->kref);
  19438. INIT_WORK(&notify->work, irq_affinity_notify);
  19439. + INIT_LIST_HEAD(&notify->list);
  19440. + init_helper_thread();
  19441. }
  19442. raw_spin_lock_irqsave(&desc->lock, flags);
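Drivers keep using the unchanged irq_set_affinity_notifier() interface; only the delivery context differs on RT (the helper kthread above instead of a workqueue). An illustrative registration, with made-up driver names:

/* Illustrative driver-side usage; struct and callback names are made up. */
static void mydrv_affinity_notify(struct irq_affinity_notify *notify,
				  const cpumask_t *mask)
{
	/* re-steer per-CPU resources to the new affinity mask */
}

static void mydrv_affinity_release(struct kref *ref)
{
	/* last reference dropped; nothing to free in this sketch */
}

static struct irq_affinity_notify mydrv_notify = {
	.notify  = mydrv_affinity_notify,
	.release = mydrv_affinity_release,
};

/* in probe():  irq_set_affinity_notifier(irq, &mydrv_notify);  */
/* in remove(): irq_set_affinity_notifier(irq, NULL);           */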
  19443. @@ -788,7 +863,15 @@
  19444. local_bh_disable();
  19445. ret = action->thread_fn(action->irq, action->dev_id);
  19446. irq_finalize_oneshot(desc, action);
  19447. - local_bh_enable();
  19448. + /*
  19449. + * Interrupts which have real time requirements can be set up
  19450. + * to avoid softirq processing in the thread handler. This is
  19451. + * safe as these interrupts do not raise soft interrupts.
  19452. + */
  19453. + if (irq_settings_no_softirq_call(desc))
  19454. + _local_bh_enable();
  19455. + else
  19456. + local_bh_enable();
  19457. return ret;
  19458. }
  19459. @@ -871,6 +954,12 @@
  19460. if (action_ret == IRQ_HANDLED)
  19461. atomic_inc(&desc->threads_handled);
  19462. +#ifdef CONFIG_PREEMPT_RT_FULL
  19463. + migrate_disable();
  19464. + add_interrupt_randomness(action->irq, 0,
  19465. + desc->random_ip ^ (unsigned long) action);
  19466. + migrate_enable();
  19467. +#endif
  19468. wake_threads_waitq(desc);
  19469. }
  19470. @@ -1184,6 +1273,9 @@
  19471. irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
  19472. }
  19473. + if (new->flags & IRQF_NO_SOFTIRQ_CALL)
  19474. + irq_settings_set_no_softirq_call(desc);
  19475. +
  19476. /* Set default affinity mask once everything is setup */
  19477. setup_affinity(irq, desc, mask);
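IRQF_NO_SOFTIRQ_CALL is the request-time flag that ends up setting the IRQ_NO_SOFTIRQ_CALL status bit handled here; the flag itself is added by the include/linux/interrupt.h hunk of this patch. A latency-critical driver would pass it roughly like this (driver names are illustrative):

/* Sketch; IRQF_NO_SOFTIRQ_CALL comes from the interrupt.h hunk of this patch. */
static int mydrv_request_irq(void *dev, int irq)
{
	return request_threaded_irq(irq, NULL, mydrv_thread_fn,
				    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
				    "mydrv", dev);
}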
  19478. diff -Nur linux-3.18.14.orig/kernel/irq/settings.h linux-3.18.14-rt/kernel/irq/settings.h
  19479. --- linux-3.18.14.orig/kernel/irq/settings.h 2015-05-20 10:04:50.000000000 -0500
  19480. +++ linux-3.18.14-rt/kernel/irq/settings.h 2015-05-31 15:32:48.697635365 -0500
  19481. @@ -15,6 +15,7 @@
  19482. _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
  19483. _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
  19484. _IRQ_IS_POLLED = IRQ_IS_POLLED,
  19485. + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
  19486. _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
  19487. };
  19488. @@ -28,6 +29,7 @@
  19489. #define IRQ_NESTED_THREAD GOT_YOU_MORON
  19490. #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
  19491. #define IRQ_IS_POLLED GOT_YOU_MORON
  19492. +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
  19493. #undef IRQF_MODIFY_MASK
  19494. #define IRQF_MODIFY_MASK GOT_YOU_MORON
  19495. @@ -38,6 +40,16 @@
  19496. desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
  19497. }
  19498. +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
  19499. +{
  19500. + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
  19501. +}
  19502. +
  19503. +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
  19504. +{
  19505. + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
  19506. +}
  19507. +
  19508. static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
  19509. {
  19510. return desc->status_use_accessors & _IRQ_PER_CPU;
  19511. diff -Nur linux-3.18.14.orig/kernel/irq/spurious.c linux-3.18.14-rt/kernel/irq/spurious.c
  19512. --- linux-3.18.14.orig/kernel/irq/spurious.c 2015-05-20 10:04:50.000000000 -0500
  19513. +++ linux-3.18.14-rt/kernel/irq/spurious.c 2015-05-31 15:32:48.709635364 -0500
  19514. @@ -444,6 +444,10 @@
  19515. static int __init irqfixup_setup(char *str)
  19516. {
  19517. +#ifdef CONFIG_PREEMPT_RT_BASE
  19518. + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
  19519. + return 1;
  19520. +#endif
  19521. irqfixup = 1;
  19522. printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
  19523. printk(KERN_WARNING "This may impact system performance.\n");
  19524. @@ -456,6 +460,10 @@
  19525. static int __init irqpoll_setup(char *str)
  19526. {
  19527. +#ifdef CONFIG_PREEMPT_RT_BASE
  19528. + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
  19529. + return 1;
  19530. +#endif
  19531. irqfixup = 2;
  19532. printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
  19533. "enabled\n");
  19534. diff -Nur linux-3.18.14.orig/kernel/irq_work.c linux-3.18.14-rt/kernel/irq_work.c
  19535. --- linux-3.18.14.orig/kernel/irq_work.c 2015-05-20 10:04:50.000000000 -0500
  19536. +++ linux-3.18.14-rt/kernel/irq_work.c 2015-05-31 15:32:48.713635365 -0500
  19537. @@ -17,6 +17,7 @@
  19538. #include <linux/cpu.h>
  19539. #include <linux/notifier.h>
  19540. #include <linux/smp.h>
  19541. +#include <linux/interrupt.h>
  19542. #include <asm/processor.h>
  19543. @@ -65,6 +66,8 @@
  19544. */
  19545. bool irq_work_queue_on(struct irq_work *work, int cpu)
  19546. {
  19547. + struct llist_head *list;
  19548. +
  19549. /* All work should have been flushed before going offline */
  19550. WARN_ON_ONCE(cpu_is_offline(cpu));
  19551. @@ -75,7 +78,12 @@
  19552. if (!irq_work_claim(work))
  19553. return false;
  19554. - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
  19555. + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
  19556. + list = &per_cpu(lazy_list, cpu);
  19557. + else
  19558. + list = &per_cpu(raised_list, cpu);
  19559. +
  19560. + if (llist_add(&work->llnode, list))
  19561. arch_send_call_function_single_ipi(cpu);
  19562. return true;
  19563. @@ -86,6 +94,9 @@
  19564. /* Enqueue the irq work @work on the current CPU */
  19565. bool irq_work_queue(struct irq_work *work)
  19566. {
  19567. + struct llist_head *list;
  19568. + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
  19569. +
  19570. /* Only queue if not already pending */
  19571. if (!irq_work_claim(work))
  19572. return false;
  19573. @@ -93,13 +104,15 @@
  19574. /* Queue the entry and raise the IPI if needed. */
  19575. preempt_disable();
  19576. - /* If the work is "lazy", handle it from next tick if any */
  19577. - if (work->flags & IRQ_WORK_LAZY) {
  19578. - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
  19579. - tick_nohz_tick_stopped())
  19580. - arch_irq_work_raise();
  19581. - } else {
  19582. - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
  19583. + lazy_work = work->flags & IRQ_WORK_LAZY;
  19584. +
  19585. + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
  19586. + list = this_cpu_ptr(&lazy_list);
  19587. + else
  19588. + list = this_cpu_ptr(&raised_list);
  19589. +
  19590. + if (llist_add(&work->llnode, list)) {
  19591. + if (!lazy_work || tick_nohz_tick_stopped())
  19592. arch_irq_work_raise();
  19593. }
  19594. @@ -116,9 +129,8 @@
  19595. raised = this_cpu_ptr(&raised_list);
  19596. lazy = this_cpu_ptr(&lazy_list);
  19597. - if (llist_empty(raised) || arch_irq_work_has_interrupt())
  19598. - if (llist_empty(lazy))
  19599. - return false;
  19600. + if (llist_empty(raised) && llist_empty(lazy))
  19601. + return false;
  19602. /* All work should have been flushed before going offline */
  19603. WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
  19604. @@ -132,7 +144,7 @@
  19605. struct irq_work *work;
  19606. struct llist_node *llnode;
  19607. - BUG_ON(!irqs_disabled());
  19608. + BUG_ON(!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !irqs_disabled());
  19609. if (llist_empty(list))
  19610. return;
  19611. @@ -169,17 +181,26 @@
  19612. void irq_work_run(void)
  19613. {
  19614. irq_work_run_list(this_cpu_ptr(&raised_list));
  19615. - irq_work_run_list(this_cpu_ptr(&lazy_list));
  19616. + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
  19617. + /*
  19618. + * NOTE: we raise softirq via IPI for safety,
  19619. + * and execute in irq_work_tick() to move the
  19620. + * overhead from hard to soft irq context.
  19621. + */
  19622. + if (!llist_empty(this_cpu_ptr(&lazy_list)))
  19623. + raise_softirq(TIMER_SOFTIRQ);
  19624. + } else
  19625. + irq_work_run_list(this_cpu_ptr(&lazy_list));
  19626. }
  19627. EXPORT_SYMBOL_GPL(irq_work_run);
  19628. void irq_work_tick(void)
  19629. {
  19630. - struct llist_head *raised = &__get_cpu_var(raised_list);
  19631. + struct llist_head *raised = this_cpu_ptr(&raised_list);
  19632. if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
  19633. irq_work_run_list(raised);
  19634. - irq_work_run_list(&__get_cpu_var(lazy_list));
  19635. + irq_work_run_list(this_cpu_ptr(&lazy_list));
  19636. }
  19637. /*
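On RT, anything queued through the plain irq_work API may now be deferred to the timer softirq; work that genuinely must run from hard interrupt context has to say so explicitly. A sketch of flagging such work (IRQ_WORK_HARD_IRQ is introduced by the irq_work.h hunk of this patch; names are illustrative):

/* Sketch; IRQ_WORK_HARD_IRQ comes from the irq_work.h hunk of this patch. */
static void my_hardirq_cb(struct irq_work *work)
{
	/* short, hard-irq safe work */
}

static struct irq_work my_work = {
	.flags = IRQ_WORK_HARD_IRQ,
	.func  = my_hardirq_cb,
};

/* irq_work_queue(&my_work); stays on raised_list even with PREEMPT_RT_FULL */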
  19638. diff -Nur linux-3.18.14.orig/kernel/Kconfig.locks linux-3.18.14-rt/kernel/Kconfig.locks
  19639. --- linux-3.18.14.orig/kernel/Kconfig.locks 2015-05-20 10:04:50.000000000 -0500
  19640. +++ linux-3.18.14-rt/kernel/Kconfig.locks 2015-05-31 15:32:48.585635365 -0500
  19641. @@ -225,11 +225,11 @@
  19642. config MUTEX_SPIN_ON_OWNER
  19643. def_bool y
  19644. - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
  19645. + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
  19646. config RWSEM_SPIN_ON_OWNER
  19647. def_bool y
  19648. - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
  19649. + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
  19650. config ARCH_USE_QUEUE_RWLOCK
  19651. bool
  19652. diff -Nur linux-3.18.14.orig/kernel/Kconfig.preempt linux-3.18.14-rt/kernel/Kconfig.preempt
  19653. --- linux-3.18.14.orig/kernel/Kconfig.preempt 2015-05-20 10:04:50.000000000 -0500
  19654. +++ linux-3.18.14-rt/kernel/Kconfig.preempt 2015-05-31 15:32:48.589635366 -0500
  19655. @@ -1,3 +1,16 @@
  19656. +config PREEMPT
  19657. + bool
  19658. + select PREEMPT_COUNT
  19659. +
  19660. +config PREEMPT_RT_BASE
  19661. + bool
  19662. + select PREEMPT
  19663. +
  19664. +config HAVE_PREEMPT_LAZY
  19665. + bool
  19666. +
  19667. +config PREEMPT_LAZY
  19668. + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
  19669. choice
  19670. prompt "Preemption Model"
  19671. @@ -33,9 +46,9 @@
  19672. Select this if you are building a kernel for a desktop system.
  19673. -config PREEMPT
  19674. +config PREEMPT__LL
  19675. bool "Preemptible Kernel (Low-Latency Desktop)"
  19676. - select PREEMPT_COUNT
  19677. + select PREEMPT
  19678. select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
  19679. help
  19680. This option reduces the latency of the kernel by making
  19681. @@ -52,6 +65,22 @@
  19682. embedded system with latency requirements in the milliseconds
  19683. range.
  19684. +config PREEMPT_RTB
  19685. + bool "Preemptible Kernel (Basic RT)"
  19686. + select PREEMPT_RT_BASE
  19687. + help
  19688. + This option is basically the same as (Low-Latency Desktop) but
  19689. + enables changes which are preliminary for the full preemptible
  19690. + RT kernel.
  19691. +
  19692. +config PREEMPT_RT_FULL
  19693. + bool "Fully Preemptible Kernel (RT)"
  19694. + depends on IRQ_FORCED_THREADING
  19695. + select PREEMPT_RT_BASE
  19696. + select PREEMPT_RCU
  19697. + help
  19698. + All and everything
  19699. +
  19700. endchoice
  19701. config PREEMPT_COUNT
  19702. diff -Nur linux-3.18.14.orig/kernel/ksysfs.c linux-3.18.14-rt/kernel/ksysfs.c
  19703. --- linux-3.18.14.orig/kernel/ksysfs.c 2015-05-20 10:04:50.000000000 -0500
  19704. +++ linux-3.18.14-rt/kernel/ksysfs.c 2015-05-31 15:32:48.733635364 -0500
  19705. @@ -136,6 +136,15 @@
  19706. #endif /* CONFIG_KEXEC */
  19707. +#if defined(CONFIG_PREEMPT_RT_FULL)
  19708. +static ssize_t realtime_show(struct kobject *kobj,
  19709. + struct kobj_attribute *attr, char *buf)
  19710. +{
  19711. + return sprintf(buf, "%d\n", 1);
  19712. +}
  19713. +KERNEL_ATTR_RO(realtime);
  19714. +#endif
  19715. +
  19716. /* whether file capabilities are enabled */
  19717. static ssize_t fscaps_show(struct kobject *kobj,
  19718. struct kobj_attribute *attr, char *buf)
  19719. @@ -203,6 +212,9 @@
  19720. &vmcoreinfo_attr.attr,
  19721. #endif
  19722. &rcu_expedited_attr.attr,
  19723. +#ifdef CONFIG_PREEMPT_RT_FULL
  19724. + &realtime_attr.attr,
  19725. +#endif
  19726. NULL
  19727. };
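Since realtime_attr hangs off kernel_kobj, the attribute should surface as /sys/kernel/realtime and read back "1" on an RT kernel (the path is assumed from the kobject, not stated in the patch). A minimal userspace probe:

/* Userspace sketch; assumes the attribute lands at /sys/kernel/realtime. */
#include <stdio.h>

int kernel_is_preempt_rt(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (f) {
		if (fscanf(f, "%d", &rt) != 1)
			rt = 0;
		fclose(f);
	}
	return rt == 1;
}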
  19728. diff -Nur linux-3.18.14.orig/kernel/locking/lglock.c linux-3.18.14-rt/kernel/locking/lglock.c
  19729. --- linux-3.18.14.orig/kernel/locking/lglock.c 2015-05-20 10:04:50.000000000 -0500
  19730. +++ linux-3.18.14-rt/kernel/locking/lglock.c 2015-05-31 15:32:48.749635364 -0500
  19731. @@ -4,6 +4,15 @@
  19732. #include <linux/cpu.h>
  19733. #include <linux/string.h>
  19734. +#ifndef CONFIG_PREEMPT_RT_FULL
  19735. +# define lg_lock_ptr arch_spinlock_t
  19736. +# define lg_do_lock(l) arch_spin_lock(l)
  19737. +# define lg_do_unlock(l) arch_spin_unlock(l)
  19738. +#else
  19739. +# define lg_lock_ptr struct rt_mutex
  19740. +# define lg_do_lock(l) __rt_spin_lock(l)
  19741. +# define lg_do_unlock(l) __rt_spin_unlock(l)
  19742. +#endif
  19743. /*
  19744. * Note there is no uninit, so lglocks cannot be defined in
  19745. * modules (but it's fine to use them from there)
  19746. @@ -12,51 +21,60 @@
  19747. void lg_lock_init(struct lglock *lg, char *name)
  19748. {
  19749. +#ifdef CONFIG_PREEMPT_RT_FULL
  19750. + int i;
  19751. +
  19752. + for_each_possible_cpu(i) {
  19753. + struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
  19754. +
  19755. + rt_mutex_init(lock);
  19756. + }
  19757. +#endif
  19758. LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
  19759. }
  19760. EXPORT_SYMBOL(lg_lock_init);
  19761. void lg_local_lock(struct lglock *lg)
  19762. {
  19763. - arch_spinlock_t *lock;
  19764. + lg_lock_ptr *lock;
  19765. - preempt_disable();
  19766. + migrate_disable();
  19767. lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  19768. lock = this_cpu_ptr(lg->lock);
  19769. - arch_spin_lock(lock);
  19770. + lg_do_lock(lock);
  19771. }
  19772. EXPORT_SYMBOL(lg_local_lock);
  19773. void lg_local_unlock(struct lglock *lg)
  19774. {
  19775. - arch_spinlock_t *lock;
  19776. + lg_lock_ptr *lock;
  19777. lock_release(&lg->lock_dep_map, 1, _RET_IP_);
  19778. lock = this_cpu_ptr(lg->lock);
  19779. - arch_spin_unlock(lock);
  19780. - preempt_enable();
  19781. + lg_do_unlock(lock);
  19782. + migrate_enable();
  19783. }
  19784. EXPORT_SYMBOL(lg_local_unlock);
  19785. void lg_local_lock_cpu(struct lglock *lg, int cpu)
  19786. {
  19787. - arch_spinlock_t *lock;
  19788. + lg_lock_ptr *lock;
  19789. - preempt_disable();
  19790. + preempt_disable_nort();
  19791. lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  19792. lock = per_cpu_ptr(lg->lock, cpu);
  19793. - arch_spin_lock(lock);
  19794. + lg_do_lock(lock);
  19795. }
  19796. EXPORT_SYMBOL(lg_local_lock_cpu);
  19797. void lg_local_unlock_cpu(struct lglock *lg, int cpu)
  19798. {
  19799. - arch_spinlock_t *lock;
  19800. + lg_lock_ptr *lock;
  19801. lock_release(&lg->lock_dep_map, 1, _RET_IP_);
  19802. lock = per_cpu_ptr(lg->lock, cpu);
  19803. - arch_spin_unlock(lock);
  19804. - preempt_enable();
  19805. + lg_do_unlock(lock);
  19806. + preempt_enable_nort();
  19807. }
  19808. EXPORT_SYMBOL(lg_local_unlock_cpu);
  19809. @@ -64,12 +82,12 @@
  19810. {
  19811. int i;
  19812. - preempt_disable();
  19813. + preempt_disable_nort();
  19814. lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  19815. for_each_possible_cpu(i) {
  19816. - arch_spinlock_t *lock;
  19817. + lg_lock_ptr *lock;
  19818. lock = per_cpu_ptr(lg->lock, i);
  19819. - arch_spin_lock(lock);
  19820. + lg_do_lock(lock);
  19821. }
  19822. }
  19823. EXPORT_SYMBOL(lg_global_lock);
  19824. @@ -80,10 +98,35 @@
  19825. lock_release(&lg->lock_dep_map, 1, _RET_IP_);
  19826. for_each_possible_cpu(i) {
  19827. - arch_spinlock_t *lock;
  19828. + lg_lock_ptr *lock;
  19829. lock = per_cpu_ptr(lg->lock, i);
  19830. - arch_spin_unlock(lock);
  19831. + lg_do_unlock(lock);
  19832. }
  19833. - preempt_enable();
  19834. + preempt_enable_nort();
  19835. }
  19836. EXPORT_SYMBOL(lg_global_unlock);
  19837. +
  19838. +#ifdef CONFIG_PREEMPT_RT_FULL
  19839. +/*
  19840. + * HACK: If you use this, you get to keep the pieces.
  19841. + * Used in queue_stop_cpus_work() when stop machinery
  19842. + * is called from an inactive CPU, so we can't schedule.
  19843. + */
  19844. +# define lg_do_trylock_relax(l) \
  19845. + do { \
  19846. + while (!__rt_spin_trylock(l)) \
  19847. + cpu_relax(); \
  19848. + } while (0)
  19849. +
  19850. +void lg_global_trylock_relax(struct lglock *lg)
  19851. +{
  19852. + int i;
  19853. +
  19854. + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  19855. + for_each_possible_cpu(i) {
  19856. + lg_lock_ptr *lock;
  19857. + lock = per_cpu_ptr(lg->lock, i);
  19858. + lg_do_trylock_relax(lock);
  19859. + }
  19860. +}
  19861. +#endif
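The lglock API is unchanged for callers; under PREEMPT_RT_FULL the per-CPU locks simply become sleepable rt_mutexes taken under migrate_disable() instead of preempt_disable(). Typical usage for reference ('my_lglock' is illustrative):

/* Existing lglock usage, shown for context; names are illustrative. */
DEFINE_STATIC_LGLOCK(my_lglock);

static void update_this_cpu_bucket(void)
{
	lg_local_lock(&my_lglock);	/* migrate_disable() + per-CPU rt_mutex on RT */
	/* ... modify this CPU's part of the data structure ... */
	lg_local_unlock(&my_lglock);
}
/* lg_lock_init(&my_lglock, "my_lglock") must run once during init. */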
  19862. diff -Nur linux-3.18.14.orig/kernel/locking/lockdep.c linux-3.18.14-rt/kernel/locking/lockdep.c
  19863. --- linux-3.18.14.orig/kernel/locking/lockdep.c 2015-05-20 10:04:50.000000000 -0500
  19864. +++ linux-3.18.14-rt/kernel/locking/lockdep.c 2015-05-31 15:32:48.749635364 -0500
  19865. @@ -3542,6 +3542,7 @@
  19866. }
  19867. }
  19868. +#ifndef CONFIG_PREEMPT_RT_FULL
  19869. /*
  19870. * We dont accurately track softirq state in e.g.
  19871. * hardirq contexts (such as on 4KSTACKS), so only
  19872. @@ -3556,6 +3557,7 @@
  19873. DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
  19874. }
  19875. }
  19876. +#endif
  19877. if (!debug_locks)
  19878. print_irqtrace_events(current);
  19879. diff -Nur linux-3.18.14.orig/kernel/locking/Makefile linux-3.18.14-rt/kernel/locking/Makefile
  19880. --- linux-3.18.14.orig/kernel/locking/Makefile 2015-05-20 10:04:50.000000000 -0500
  19881. +++ linux-3.18.14-rt/kernel/locking/Makefile 2015-05-31 15:32:48.737635364 -0500
  19882. @@ -1,5 +1,5 @@
  19883. -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
  19884. +obj-y += semaphore.o mcs_spinlock.o
  19885. ifdef CONFIG_FUNCTION_TRACER
  19886. CFLAGS_REMOVE_lockdep.o = -pg
  19887. @@ -8,7 +8,11 @@
  19888. CFLAGS_REMOVE_rtmutex-debug.o = -pg
  19889. endif
  19890. +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
  19891. +obj-y += mutex.o
  19892. obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
  19893. +obj-y += rwsem.o
  19894. +endif
  19895. obj-$(CONFIG_LOCKDEP) += lockdep.o
  19896. ifeq ($(CONFIG_PROC_FS),y)
  19897. obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
  19898. @@ -21,8 +25,11 @@
  19899. obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
  19900. obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
  19901. obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
  19902. +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
  19903. obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
  19904. obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
  19905. +endif
  19906. obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
  19907. +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
  19908. obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
  19909. obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
  19910. diff -Nur linux-3.18.14.orig/kernel/locking/percpu-rwsem.c linux-3.18.14-rt/kernel/locking/percpu-rwsem.c
  19911. --- linux-3.18.14.orig/kernel/locking/percpu-rwsem.c 2015-05-20 10:04:50.000000000 -0500
  19912. +++ linux-3.18.14-rt/kernel/locking/percpu-rwsem.c 2015-05-31 15:32:48.757635364 -0500
  19913. @@ -84,8 +84,12 @@
  19914. down_read(&brw->rw_sem);
  19915. atomic_inc(&brw->slow_read_ctr);
  19916. +#ifdef CONFIG_PREEMPT_RT_FULL
  19917. + up_read(&brw->rw_sem);
  19918. +#else
  19919. /* avoid up_read()->rwsem_release() */
  19920. __up_read(&brw->rw_sem);
  19921. +#endif
  19922. }
  19923. void percpu_up_read(struct percpu_rw_semaphore *brw)
  19924. diff -Nur linux-3.18.14.orig/kernel/locking/rt.c linux-3.18.14-rt/kernel/locking/rt.c
  19925. --- linux-3.18.14.orig/kernel/locking/rt.c 1969-12-31 18:00:00.000000000 -0600
  19926. +++ linux-3.18.14-rt/kernel/locking/rt.c 2015-05-31 15:32:48.757635364 -0500
  19927. @@ -0,0 +1,456 @@
  19928. +/*
  19929. + * kernel/rt.c
  19930. + *
  19931. + * Real-Time Preemption Support
  19932. + *
  19933. + * started by Ingo Molnar:
  19934. + *
  19935. + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  19936. + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  19937. + *
  19938. + * historic credit for proving that Linux spinlocks can be implemented via
  19939. + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
  19940. + * and others) who prototyped it on 2.4 and did lots of comparative
  19941. + * research and analysis; TimeSys, for proving that you can implement a
  19942. + * fully preemptible kernel via the use of IRQ threading and mutexes;
  19943. + * Bill Huey for persuasively arguing on lkml that the mutex model is the
  19944. + * right one; and to MontaVista, who ported pmutexes to 2.6.
  19945. + *
  19946. + * This code is a from-scratch implementation and is not based on pmutexes,
  19947. + * but the idea of converting spinlocks to mutexes is used here too.
  19948. + *
  19949. + * lock debugging, locking tree, deadlock detection:
  19950. + *
  19951. + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
  19952. + * Released under the General Public License (GPL).
  19953. + *
  19954. + * Includes portions of the generic R/W semaphore implementation from:
  19955. + *
  19956. + * Copyright (c) 2001 David Howells (dhowells@redhat.com).
  19957. + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
  19958. + * - Derived also from comments by Linus
  19959. + *
  19960. + * Pending ownership of locks and ownership stealing:
  19961. + *
  19962. + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
  19963. + *
  19964. + * (also by Steven Rostedt)
  19965. + * - Converted single pi_lock to individual task locks.
  19966. + *
  19967. + * By Esben Nielsen:
  19968. + * Doing priority inheritance with help of the scheduler.
  19969. + *
  19970. + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  19971. + * - major rework based on Esben Nielsens initial patch
  19972. + * - replaced thread_info references by task_struct refs
  19973. + * - removed task->pending_owner dependency
  19974. + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
  19975. + * in the scheduler return path as discussed with Steven Rostedt
  19976. + *
  19977. + * Copyright (C) 2006, Kihon Technologies Inc.
  19978. + * Steven Rostedt <rostedt@goodmis.org>
  19979. + * - debugged and patched Thomas Gleixner's rework.
  19980. + * - added back the cmpxchg to the rework.
  19981. + * - turned atomic require back on for SMP.
  19982. + */
  19983. +
  19984. +#include <linux/spinlock.h>
  19985. +#include <linux/rtmutex.h>
  19986. +#include <linux/sched.h>
  19987. +#include <linux/delay.h>
  19988. +#include <linux/module.h>
  19989. +#include <linux/kallsyms.h>
  19990. +#include <linux/syscalls.h>
  19991. +#include <linux/interrupt.h>
  19992. +#include <linux/plist.h>
  19993. +#include <linux/fs.h>
  19994. +#include <linux/futex.h>
  19995. +#include <linux/hrtimer.h>
  19996. +
  19997. +#include "rtmutex_common.h"
  19998. +
  19999. +/*
  20000. + * struct mutex functions
  20001. + */
  20002. +void __mutex_do_init(struct mutex *mutex, const char *name,
  20003. + struct lock_class_key *key)
  20004. +{
  20005. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20006. + /*
  20007. + * Make sure we are not reinitializing a held lock:
  20008. + */
  20009. + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
  20010. + lockdep_init_map(&mutex->dep_map, name, key, 0);
  20011. +#endif
  20012. + mutex->lock.save_state = 0;
  20013. +}
  20014. +EXPORT_SYMBOL(__mutex_do_init);
  20015. +
  20016. +void __lockfunc _mutex_lock(struct mutex *lock)
  20017. +{
  20018. + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20019. + rt_mutex_lock(&lock->lock);
  20020. +}
  20021. +EXPORT_SYMBOL(_mutex_lock);
  20022. +
  20023. +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
  20024. +{
  20025. + int ret;
  20026. +
  20027. + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20028. + ret = rt_mutex_lock_interruptible(&lock->lock);
  20029. + if (ret)
  20030. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20031. + return ret;
  20032. +}
  20033. +EXPORT_SYMBOL(_mutex_lock_interruptible);
  20034. +
  20035. +int __lockfunc _mutex_lock_killable(struct mutex *lock)
  20036. +{
  20037. + int ret;
  20038. +
  20039. + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20040. + ret = rt_mutex_lock_killable(&lock->lock);
  20041. + if (ret)
  20042. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20043. + return ret;
  20044. +}
  20045. +EXPORT_SYMBOL(_mutex_lock_killable);
  20046. +
  20047. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20048. +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
  20049. +{
  20050. + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
  20051. + rt_mutex_lock(&lock->lock);
  20052. +}
  20053. +EXPORT_SYMBOL(_mutex_lock_nested);
  20054. +
  20055. +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
  20056. +{
  20057. + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
  20058. + rt_mutex_lock(&lock->lock);
  20059. +}
  20060. +EXPORT_SYMBOL(_mutex_lock_nest_lock);
  20061. +
  20062. +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
  20063. +{
  20064. + int ret;
  20065. +
  20066. + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
  20067. + ret = rt_mutex_lock_interruptible(&lock->lock);
  20068. + if (ret)
  20069. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20070. + return ret;
  20071. +}
  20072. +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
  20073. +
  20074. +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
  20075. +{
  20076. + int ret;
  20077. +
  20078. + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
  20079. + ret = rt_mutex_lock_killable(&lock->lock);
  20080. + if (ret)
  20081. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20082. + return ret;
  20083. +}
  20084. +EXPORT_SYMBOL(_mutex_lock_killable_nested);
  20085. +#endif
  20086. +
  20087. +int __lockfunc _mutex_trylock(struct mutex *lock)
  20088. +{
  20089. + int ret = rt_mutex_trylock(&lock->lock);
  20090. +
  20091. + if (ret)
  20092. + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  20093. +
  20094. + return ret;
  20095. +}
  20096. +EXPORT_SYMBOL(_mutex_trylock);
  20097. +
  20098. +void __lockfunc _mutex_unlock(struct mutex *lock)
  20099. +{
  20100. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20101. + rt_mutex_unlock(&lock->lock);
  20102. +}
  20103. +EXPORT_SYMBOL(_mutex_unlock);
  20104. +
  20105. +/*
  20106. + * rwlock_t functions
  20107. + */
  20108. +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
  20109. +{
  20110. + int ret;
  20111. +
  20112. + migrate_disable();
  20113. + ret = rt_mutex_trylock(&rwlock->lock);
  20114. + if (ret)
  20115. + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
  20116. + else
  20117. + migrate_enable();
  20118. +
  20119. + return ret;
  20120. +}
  20121. +EXPORT_SYMBOL(rt_write_trylock);
  20122. +
  20123. +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
  20124. +{
  20125. + int ret;
  20126. +
  20127. + *flags = 0;
  20128. + ret = rt_write_trylock(rwlock);
  20129. + return ret;
  20130. +}
  20131. +EXPORT_SYMBOL(rt_write_trylock_irqsave);
  20132. +
  20133. +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
  20134. +{
  20135. + struct rt_mutex *lock = &rwlock->lock;
  20136. + int ret = 1;
  20137. +
  20138. + /*
  20139. + * recursive read locks succeed when current owns the lock,
  20140. + * but not when read_depth == 0 which means that the lock is
  20141. + * write locked.
  20142. + */
  20143. + if (rt_mutex_owner(lock) != current) {
  20144. + migrate_disable();
  20145. + ret = rt_mutex_trylock(lock);
  20146. + if (ret)
  20147. + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
  20148. + else
  20149. + migrate_enable();
  20150. +
  20151. + } else if (!rwlock->read_depth) {
  20152. + ret = 0;
  20153. + }
  20154. +
  20155. + if (ret)
  20156. + rwlock->read_depth++;
  20157. +
  20158. + return ret;
  20159. +}
  20160. +EXPORT_SYMBOL(rt_read_trylock);
  20161. +
  20162. +void __lockfunc rt_write_lock(rwlock_t *rwlock)
  20163. +{
  20164. + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
  20165. + migrate_disable();
  20166. + __rt_spin_lock(&rwlock->lock);
  20167. +}
  20168. +EXPORT_SYMBOL(rt_write_lock);
  20169. +
  20170. +void __lockfunc rt_read_lock(rwlock_t *rwlock)
  20171. +{
  20172. + struct rt_mutex *lock = &rwlock->lock;
  20173. +
  20174. +
  20175. + /*
  20176. + * recursive read locks succeed when current owns the lock
  20177. + */
  20178. + if (rt_mutex_owner(lock) != current) {
  20179. + migrate_disable();
  20180. + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
  20181. + __rt_spin_lock(lock);
  20182. + }
  20183. + rwlock->read_depth++;
  20184. +}
  20185. +
  20186. +EXPORT_SYMBOL(rt_read_lock);
  20187. +
  20188. +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
  20189. +{
  20190. + /* NOTE: we always pass in '1' for nested, for simplicity */
  20191. + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
  20192. + __rt_spin_unlock(&rwlock->lock);
  20193. + migrate_enable();
  20194. +}
  20195. +EXPORT_SYMBOL(rt_write_unlock);
  20196. +
  20197. +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
  20198. +{
  20199. + /* Release the lock only when read_depth is down to 0 */
  20200. + if (--rwlock->read_depth == 0) {
  20201. + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
  20202. + __rt_spin_unlock(&rwlock->lock);
  20203. + migrate_enable();
  20204. + }
  20205. +}
  20206. +EXPORT_SYMBOL(rt_read_unlock);
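read_depth gives RT readers the owner recursion the comments above describe: the same task may nest read_lock() on an rwlock_t it already holds, and only the outermost read_unlock() releases the underlying rt_mutex. For example:

/* Sketch of the recursion the read_depth counter permits. */
static DEFINE_RWLOCK(my_rwlock);

static void reader_inner(void)
{
	read_lock(&my_rwlock);		/* same owner: only read_depth++ on RT */
	/* ... */
	read_unlock(&my_rwlock);	/* read_depth still > 0: rt_mutex stays held */
}

static void reader_outer(void)
{
	read_lock(&my_rwlock);		/* takes the rt_mutex */
	reader_inner();
	read_unlock(&my_rwlock);	/* read_depth hits 0: rt_mutex released */
}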
  20207. +
  20208. +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
  20209. +{
  20210. + rt_write_lock(rwlock);
  20211. +
  20212. + return 0;
  20213. +}
  20214. +EXPORT_SYMBOL(rt_write_lock_irqsave);
  20215. +
  20216. +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
  20217. +{
  20218. + rt_read_lock(rwlock);
  20219. +
  20220. + return 0;
  20221. +}
  20222. +EXPORT_SYMBOL(rt_read_lock_irqsave);
  20223. +
  20224. +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
  20225. +{
  20226. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20227. + /*
  20228. + * Make sure we are not reinitializing a held lock:
  20229. + */
  20230. + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
  20231. + lockdep_init_map(&rwlock->dep_map, name, key, 0);
  20232. +#endif
  20233. + rwlock->lock.save_state = 1;
  20234. + rwlock->read_depth = 0;
  20235. +}
  20236. +EXPORT_SYMBOL(__rt_rwlock_init);
  20237. +
  20238. +/*
  20239. + * rw_semaphores
  20240. + */
  20241. +
  20242. +void rt_up_write(struct rw_semaphore *rwsem)
  20243. +{
  20244. + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
  20245. + rt_mutex_unlock(&rwsem->lock);
  20246. +}
  20247. +EXPORT_SYMBOL(rt_up_write);
  20248. +
  20249. +void rt_up_read(struct rw_semaphore *rwsem)
  20250. +{
  20251. + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
  20252. + if (--rwsem->read_depth == 0)
  20253. + rt_mutex_unlock(&rwsem->lock);
  20254. +}
  20255. +EXPORT_SYMBOL(rt_up_read);
  20256. +
  20257. +/*
  20258. + * downgrade a write lock into a read lock
  20259. + * - just wake up any readers at the front of the queue
  20260. + */
  20261. +void rt_downgrade_write(struct rw_semaphore *rwsem)
  20262. +{
  20263. + BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
  20264. + rwsem->read_depth = 1;
  20265. +}
  20266. +EXPORT_SYMBOL(rt_downgrade_write);
  20267. +
  20268. +int rt_down_write_trylock(struct rw_semaphore *rwsem)
  20269. +{
  20270. + int ret = rt_mutex_trylock(&rwsem->lock);
  20271. +
  20272. + if (ret)
  20273. + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
  20274. + return ret;
  20275. +}
  20276. +EXPORT_SYMBOL(rt_down_write_trylock);
  20277. +
  20278. +void rt_down_write(struct rw_semaphore *rwsem)
  20279. +{
  20280. + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
  20281. + rt_mutex_lock(&rwsem->lock);
  20282. +}
  20283. +EXPORT_SYMBOL(rt_down_write);
  20284. +
  20285. +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
  20286. +{
  20287. + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
  20288. + rt_mutex_lock(&rwsem->lock);
  20289. +}
  20290. +EXPORT_SYMBOL(rt_down_write_nested);
  20291. +
  20292. +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
  20293. + struct lockdep_map *nest)
  20294. +{
  20295. + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
  20296. + rt_mutex_lock(&rwsem->lock);
  20297. +}
  20298. +EXPORT_SYMBOL(rt_down_write_nested_lock);
  20299. +
  20300. +int rt_down_read_trylock(struct rw_semaphore *rwsem)
  20301. +{
  20302. + struct rt_mutex *lock = &rwsem->lock;
  20303. + int ret = 1;
  20304. +
  20305. + /*
  20306. + * recursive read locks succeed when current owns the rwsem,
  20307. + * but not when read_depth == 0 which means that the rwsem is
  20308. + * write locked.
  20309. + */
  20310. + if (rt_mutex_owner(lock) != current)
  20311. + ret = rt_mutex_trylock(&rwsem->lock);
  20312. + else if (!rwsem->read_depth)
  20313. + ret = 0;
  20314. +
  20315. + if (ret) {
  20316. + rwsem->read_depth++;
  20317. + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
  20318. + }
  20319. + return ret;
  20320. +}
  20321. +EXPORT_SYMBOL(rt_down_read_trylock);
  20322. +
  20323. +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
  20324. +{
  20325. + struct rt_mutex *lock = &rwsem->lock;
  20326. +
  20327. + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
  20328. +
  20329. + if (rt_mutex_owner(lock) != current)
  20330. + rt_mutex_lock(&rwsem->lock);
  20331. + rwsem->read_depth++;
  20332. +}
  20333. +
  20334. +void rt_down_read(struct rw_semaphore *rwsem)
  20335. +{
  20336. + __rt_down_read(rwsem, 0);
  20337. +}
  20338. +EXPORT_SYMBOL(rt_down_read);
  20339. +
  20340. +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
  20341. +{
  20342. + __rt_down_read(rwsem, subclass);
  20343. +}
  20344. +EXPORT_SYMBOL(rt_down_read_nested);
  20345. +
  20346. +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
  20347. + struct lock_class_key *key)
  20348. +{
  20349. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20350. + /*
  20351. + * Make sure we are not reinitializing a held lock:
  20352. + */
  20353. + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
  20354. + lockdep_init_map(&rwsem->dep_map, name, key, 0);
  20355. +#endif
  20356. + rwsem->read_depth = 0;
  20357. + rwsem->lock.save_state = 0;
  20358. +}
  20359. +EXPORT_SYMBOL(__rt_rwsem_init);
  20360. +
  20361. +/**
  20362. + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
  20363. + * @cnt: the atomic which we are to dec
  20364. + * @lock: the mutex to return holding if we dec to 0
  20365. + *
  20366. + * return true and hold lock if we dec to 0, return false otherwise
  20367. + */
  20368. +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
  20369. +{
  20370. + /* dec if we can't possibly hit 0 */
  20371. + if (atomic_add_unless(cnt, -1, 1))
  20372. + return 0;
  20373. + /* we might hit 0, so take the lock */
  20374. + mutex_lock(lock);
  20375. + if (!atomic_dec_and_test(cnt)) {
  20376. + /* when we actually did the dec, we didn't hit 0 */
  20377. + mutex_unlock(lock);
  20378. + return 0;
  20379. + }
  20380. + /* we hit 0, and we hold the lock */
  20381. + return 1;
  20382. +}
  20383. +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
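The usual consumer of this helper is a release path that must tear an object down under a mutex only when the last reference goes away; for example ('struct foo' and the list are made up for the sketch):

/* Illustrative release path; the object layout is made up. */
static DEFINE_MUTEX(foo_list_lock);

static void foo_put(struct foo *obj)
{
	if (!atomic_dec_and_mutex_lock(&obj->refcount, &foo_list_lock))
		return;		/* not the last reference */

	/* last reference: foo_list_lock is held */
	list_del(&obj->node);
	mutex_unlock(&foo_list_lock);
	kfree(obj);
}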
  20384. diff -Nur linux-3.18.14.orig/kernel/locking/rtmutex.c linux-3.18.14-rt/kernel/locking/rtmutex.c
  20385. --- linux-3.18.14.orig/kernel/locking/rtmutex.c 2015-05-20 10:04:50.000000000 -0500
  20386. +++ linux-3.18.14-rt/kernel/locking/rtmutex.c 2015-05-31 15:32:48.769635364 -0500
  20387. @@ -7,6 +7,11 @@
  20388. * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  20389. * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  20390. * Copyright (C) 2006 Esben Nielsen
  20391. + * Adaptive Spinlocks:
  20392. + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
  20393. + * and Peter Morreale,
  20394. + * Adaptive Spinlocks simplification:
  20395. + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
  20396. *
  20397. * See Documentation/locking/rt-mutex-design.txt for details.
  20398. */
  20399. @@ -16,6 +21,7 @@
  20400. #include <linux/sched/rt.h>
  20401. #include <linux/sched/deadline.h>
  20402. #include <linux/timer.h>
  20403. +#include <linux/ww_mutex.h>
  20404. #include "rtmutex_common.h"
  20405. @@ -69,6 +75,12 @@
  20406. clear_rt_mutex_waiters(lock);
  20407. }
  20408. +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
  20409. +{
  20410. + return waiter && waiter != PI_WAKEUP_INPROGRESS &&
  20411. + waiter != PI_REQUEUE_INPROGRESS;
  20412. +}
  20413. +
  20414. /*
  20415. * We can speed up the acquire/release, if the architecture
  20416. * supports cmpxchg and if there's no debugging state to be set up
  20417. @@ -333,6 +345,14 @@
  20418. return debug_rt_mutex_detect_deadlock(waiter, chwalk);
  20419. }
  20420. +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
  20421. +{
  20422. + if (waiter->savestate)
  20423. + wake_up_lock_sleeper(waiter->task);
  20424. + else
  20425. + wake_up_process(waiter->task);
  20426. +}
  20427. +
  20428. /*
  20429. * Max number of times we'll walk the boosting chain:
  20430. */
  20431. @@ -340,7 +360,8 @@
  20432. static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
  20433. {
  20434. - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
  20435. + return rt_mutex_real_waiter(p->pi_blocked_on) ?
  20436. + p->pi_blocked_on->lock : NULL;
  20437. }
  20438. /*
  20439. @@ -477,7 +498,7 @@
  20440. * reached or the state of the chain has changed while we
  20441. * dropped the locks.
  20442. */
  20443. - if (!waiter)
  20444. + if (!rt_mutex_real_waiter(waiter))
  20445. goto out_unlock_pi;
  20446. /*
  20447. @@ -639,13 +660,16 @@
  20448. * follow here. This is the end of the chain we are walking.
  20449. */
  20450. if (!rt_mutex_owner(lock)) {
  20451. + struct rt_mutex_waiter *lock_top_waiter;
  20452. +
  20453. /*
  20454. * If the requeue [7] above changed the top waiter,
  20455. * then we need to wake the new top waiter up to try
  20456. * to get the lock.
  20457. */
  20458. - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
  20459. - wake_up_process(rt_mutex_top_waiter(lock)->task);
  20460. + lock_top_waiter = rt_mutex_top_waiter(lock);
  20461. + if (prerequeue_top_waiter != lock_top_waiter)
  20462. + rt_mutex_wake_waiter(lock_top_waiter);
  20463. raw_spin_unlock(&lock->wait_lock);
  20464. return 0;
  20465. }
  20466. @@ -738,6 +762,25 @@
  20467. return ret;
  20468. }
  20469. +
  20470. +#define STEAL_NORMAL 0
  20471. +#define STEAL_LATERAL 1
  20472. +
  20473. +/*
  20474. + * Note that RT tasks are excluded from lateral-steals to prevent the
  20475. + * introduction of an unbounded latency
  20476. + */
  20477. +static inline int lock_is_stealable(struct task_struct *task,
  20478. + struct task_struct *pendowner, int mode)
  20479. +{
  20480. + if (mode == STEAL_NORMAL || rt_task(task)) {
  20481. + if (task->prio >= pendowner->prio)
  20482. + return 0;
  20483. + } else if (task->prio > pendowner->prio)
  20484. + return 0;
  20485. + return 1;
  20486. +}
  20487. +
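For illustration only (not part of the patch): with STEAL_LATERAL a non-RT task of equal priority may take the lock ahead of the pending owner, while STEAL_NORMAL — and any RT task — still needs strictly higher priority (a lower ->prio value). A standalone sketch of the same decision, with plain ints standing in for the task_struct fields:

#include <stdbool.h>
#include <stdio.h>

#define STEAL_NORMAL	0
#define STEAL_LATERAL	1

/* Lower numeric value == higher priority, as with kernel ->prio. */
static bool stealable(int task_prio, bool task_is_rt, int pendowner_prio, int mode)
{
	if (mode == STEAL_NORMAL || task_is_rt)
		return task_prio < pendowner_prio;	/* strictly higher prio */
	return task_prio <= pendowner_prio;		/* lateral: equal is enough */
}

int main(void)
{
	printf("%d\n", stealable(120, false, 120, STEAL_NORMAL));	/* 0: no steal */
	printf("%d\n", stealable(120, false, 120, STEAL_LATERAL));	/* 1: lateral steal */
	printf("%d\n", stealable(50,  true,  50,  STEAL_LATERAL));	/* 0: RT excluded */
	return 0;
}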
  20488. /*
  20489. * Try to take an rt-mutex
  20490. *
  20491. @@ -748,8 +791,9 @@
  20492. * @waiter: The waiter that is queued to the lock's wait list if the
  20493. * callsite called task_blocked_on_lock(), otherwise NULL
  20494. */
  20495. -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
  20496. - struct rt_mutex_waiter *waiter)
  20497. +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
  20498. + struct task_struct *task,
  20499. + struct rt_mutex_waiter *waiter, int mode)
  20500. {
  20501. unsigned long flags;
  20502. @@ -788,8 +832,10 @@
  20503. * If waiter is not the highest priority waiter of
  20504. * @lock, give up.
  20505. */
  20506. - if (waiter != rt_mutex_top_waiter(lock))
  20507. + if (waiter != rt_mutex_top_waiter(lock)) {
  20508. + /* XXX lock_is_stealable() ? */
  20509. return 0;
  20510. + }
  20511. /*
  20512. * We can acquire the lock. Remove the waiter from the
  20513. @@ -807,14 +853,10 @@
  20514. * not need to be dequeued.
  20515. */
  20516. if (rt_mutex_has_waiters(lock)) {
  20517. - /*
  20518. - * If @task->prio is greater than or equal to
  20519. - * the top waiter priority (kernel view),
  20520. - * @task lost.
  20521. - */
  20522. - if (task->prio >= rt_mutex_top_waiter(lock)->prio)
  20523. - return 0;
  20524. + struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
  20525. + if (task != pown && !lock_is_stealable(task, pown, mode))
  20526. + return 0;
  20527. /*
  20528. * The current top waiter stays enqueued. We
  20529. * don't have to change anything in the lock
  20530. @@ -863,6 +905,369 @@
  20531. return 1;
  20532. }
  20533. +#ifdef CONFIG_PREEMPT_RT_FULL
  20534. +/*
  20535. + * preemptible spin_lock functions:
  20536. + */
  20537. +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
  20538. + void (*slowfn)(struct rt_mutex *lock))
  20539. +{
  20540. + might_sleep();
  20541. +
  20542. + if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
  20543. + rt_mutex_deadlock_account_lock(lock, current);
  20544. + else
  20545. + slowfn(lock);
  20546. +}
  20547. +
  20548. +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
  20549. + void (*slowfn)(struct rt_mutex *lock))
  20550. +{
  20551. + if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
  20552. + rt_mutex_deadlock_account_unlock(current);
  20553. + else
  20554. + slowfn(lock);
  20555. +}
  20556. +#ifdef CONFIG_SMP
  20557. +/*
  20558. + * Note that owner is a speculative pointer and dereferencing relies
  20559. + * on rcu_read_lock() and the check against the lock owner.
  20560. + */
  20561. +static int adaptive_wait(struct rt_mutex *lock,
  20562. + struct task_struct *owner)
  20563. +{
  20564. + int res = 0;
  20565. +
  20566. + rcu_read_lock();
  20567. + for (;;) {
  20568. + if (owner != rt_mutex_owner(lock))
  20569. + break;
  20570. + /*
  20571. + * Ensure that owner->on_cpu is dereferenced _after_
  20572. + * checking the above to be valid.
  20573. + */
  20574. + barrier();
  20575. + if (!owner->on_cpu) {
  20576. + res = 1;
  20577. + break;
  20578. + }
  20579. + cpu_relax();
  20580. + }
  20581. + rcu_read_unlock();
  20582. + return res;
  20583. +}
  20584. +#else
  20585. +static int adaptive_wait(struct rt_mutex *lock,
  20586. + struct task_struct *orig_owner)
  20587. +{
  20588. + return 1;
  20589. +}
  20590. +#endif
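Not part of the patch: adaptive_wait() encodes a spin-then-sleep policy — keep spinning only while the task we saw as owner is still the owner and still on a CPU, otherwise block. A user-space analogue using C11 atomics, purely to show the shape of the loop; owner and owner_running are stand-ins for rt_mutex_owner() and ->on_cpu, and sched_yield() stands in for cpu_relax():

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

struct fake_lock {
	_Atomic(int)  owner;		/* owning thread id, 0 == unowned */
	_Atomic(bool) owner_running;	/* stand-in for owner->on_cpu */
};

/* Return true when the caller should go to sleep instead of spinning. */
static bool adaptive_wait_sketch(struct fake_lock *l, int seen_owner)
{
	for (;;) {
		if (atomic_load(&l->owner) != seen_owner)
			return false;	/* ownership changed: retry the lock */
		if (!atomic_load(&l->owner_running))
			return true;	/* owner was preempted: block */
		sched_yield();		/* spin politely */
	}
}

int main(void)
{
	struct fake_lock l;

	atomic_store(&l.owner, 1);
	atomic_store(&l.owner_running, false);
	/* Owner 1 holds the lock but is off-CPU: the sketch says "sleep". */
	return adaptive_wait_sketch(&l, 1) ? 0 : 1;
}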
  20591. +
  20592. +# define pi_lock(lock) raw_spin_lock_irq(lock)
  20593. +# define pi_unlock(lock) raw_spin_unlock_irq(lock)
  20594. +
  20595. +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
  20596. + struct rt_mutex_waiter *waiter,
  20597. + struct task_struct *task,
  20598. + enum rtmutex_chainwalk chwalk);
  20599. +/*
  20600. + * Slow path lock function spin_lock style: this variant is very
  20601. + * careful not to miss any non-lock wakeups.
  20602. + *
  20603. + * We store the current state under p->pi_lock in p->saved_state and
  20604. + * the try_to_wake_up() code handles this accordingly.
  20605. + */
  20606. +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
  20607. +{
  20608. + struct task_struct *lock_owner, *self = current;
  20609. + struct rt_mutex_waiter waiter, *top_waiter;
  20610. + int ret;
  20611. +
  20612. + rt_mutex_init_waiter(&waiter, true);
  20613. +
  20614. + raw_spin_lock(&lock->wait_lock);
  20615. +
  20616. + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
  20617. + raw_spin_unlock(&lock->wait_lock);
  20618. + return;
  20619. + }
  20620. +
  20621. + BUG_ON(rt_mutex_owner(lock) == self);
  20622. +
  20623. + /*
  20624. + * We save whatever state the task is in and we'll restore it
  20625. + * after acquiring the lock taking real wakeups into account
  20626. + * as well. We are serialized via pi_lock against wakeups. See
  20627. + * try_to_wake_up().
  20628. + */
  20629. + pi_lock(&self->pi_lock);
  20630. + self->saved_state = self->state;
  20631. + __set_current_state(TASK_UNINTERRUPTIBLE);
  20632. + pi_unlock(&self->pi_lock);
  20633. +
  20634. + ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0);
  20635. + BUG_ON(ret);
  20636. +
  20637. + for (;;) {
  20638. + /* Try to acquire the lock again. */
  20639. + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
  20640. + break;
  20641. +
  20642. + top_waiter = rt_mutex_top_waiter(lock);
  20643. + lock_owner = rt_mutex_owner(lock);
  20644. +
  20645. + raw_spin_unlock(&lock->wait_lock);
  20646. +
  20647. + debug_rt_mutex_print_deadlock(&waiter);
  20648. +
  20649. + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
  20650. + schedule_rt_mutex(lock);
  20651. +
  20652. + raw_spin_lock(&lock->wait_lock);
  20653. +
  20654. + pi_lock(&self->pi_lock);
  20655. + __set_current_state(TASK_UNINTERRUPTIBLE);
  20656. + pi_unlock(&self->pi_lock);
  20657. + }
  20658. +
  20659. + /*
  20660. + * Restore the task state to current->saved_state. We set it
  20661. + * to the original state above and the try_to_wake_up() code
  20662. + * has possibly updated it when a real (non-rtmutex) wakeup
  20663. + * happened while we were blocked. Clear saved_state so
  20664. + * try_to_wakeup() does not get confused.
  20665. + */
  20666. + pi_lock(&self->pi_lock);
  20667. + __set_current_state(self->saved_state);
  20668. + self->saved_state = TASK_RUNNING;
  20669. + pi_unlock(&self->pi_lock);
  20670. +
  20671. + /*
  20672. + * try_to_take_rt_mutex() sets the waiter bit
  20673. + * unconditionally. We might have to fix that up:
  20674. + */
  20675. + fixup_rt_mutex_waiters(lock);
  20676. +
  20677. + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
  20678. + BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
  20679. +
  20680. + raw_spin_unlock(&lock->wait_lock);
  20681. +
  20682. + debug_rt_mutex_free_waiter(&waiter);
  20683. +}
  20684. +
  20685. +static void wakeup_next_waiter(struct rt_mutex *lock);
  20686. +/*
  20687. + * Slow path to release a rt_mutex spin_lock style
  20688. + */
  20689. +static void __sched __rt_spin_lock_slowunlock(struct rt_mutex *lock)
  20690. +{
  20691. + debug_rt_mutex_unlock(lock);
  20692. +
  20693. + rt_mutex_deadlock_account_unlock(current);
  20694. +
  20695. + if (!rt_mutex_has_waiters(lock)) {
  20696. + lock->owner = NULL;
  20697. + raw_spin_unlock(&lock->wait_lock);
  20698. + return;
  20699. + }
  20700. +
  20701. + wakeup_next_waiter(lock);
  20702. +
  20703. + raw_spin_unlock(&lock->wait_lock);
  20704. +
  20705. + /* Undo pi boosting when necessary */
  20706. + rt_mutex_adjust_prio(current);
  20707. +}
  20708. +
  20709. +static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
  20710. +{
  20711. + raw_spin_lock(&lock->wait_lock);
  20712. + __rt_spin_lock_slowunlock(lock);
  20713. +}
  20714. +
  20715. +static void noinline __sched rt_spin_lock_slowunlock_hirq(struct rt_mutex *lock)
  20716. +{
  20717. + int ret;
  20718. +
  20719. + do {
  20720. + ret = raw_spin_trylock(&lock->wait_lock);
  20721. + } while (!ret);
  20722. +
  20723. + __rt_spin_lock_slowunlock(lock);
  20724. +}
  20725. +
  20726. +void __lockfunc rt_spin_lock(spinlock_t *lock)
  20727. +{
  20728. + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  20729. + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20730. +}
  20731. +EXPORT_SYMBOL(rt_spin_lock);
  20732. +
  20733. +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
  20734. +{
  20735. + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
  20736. +}
  20737. +EXPORT_SYMBOL(__rt_spin_lock);
  20738. +
  20739. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20740. +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
  20741. +{
  20742. + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  20743. + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
  20744. +}
  20745. +EXPORT_SYMBOL(rt_spin_lock_nested);
  20746. +#endif
  20747. +
  20748. +void __lockfunc rt_spin_unlock(spinlock_t *lock)
  20749. +{
  20750. + /* NOTE: we always pass in '1' for nested, for simplicity */
  20751. + spin_release(&lock->dep_map, 1, _RET_IP_);
  20752. + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
  20753. +}
  20754. +EXPORT_SYMBOL(rt_spin_unlock);
  20755. +
  20756. +void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock)
  20757. +{
  20758. + /* NOTE: we always pass in '1' for nested, for simplicity */
  20759. + spin_release(&lock->dep_map, 1, _RET_IP_);
  20760. + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_hirq);
  20761. +}
  20762. +
  20763. +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
  20764. +{
  20765. + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
  20766. +}
  20767. +EXPORT_SYMBOL(__rt_spin_unlock);
  20768. +
  20769. +/*
  20770. + * Wait for the lock to get unlocked: instead of polling for an unlock
  20771. + * (like raw spinlocks do), we lock and unlock, to force the kernel to
  20772. + * schedule if there's contention:
  20773. + */
  20774. +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
  20775. +{
  20776. + spin_lock(lock);
  20777. + spin_unlock(lock);
  20778. +}
  20779. +EXPORT_SYMBOL(rt_spin_unlock_wait);
  20780. +
  20781. +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
  20782. +{
  20783. + return rt_mutex_trylock(lock);
  20784. +}
  20785. +
  20786. +int __lockfunc rt_spin_trylock(spinlock_t *lock)
  20787. +{
  20788. + int ret = rt_mutex_trylock(&lock->lock);
  20789. +
  20790. + if (ret)
  20791. + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  20792. + return ret;
  20793. +}
  20794. +EXPORT_SYMBOL(rt_spin_trylock);
  20795. +
  20796. +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
  20797. +{
  20798. + int ret;
  20799. +
  20800. + local_bh_disable();
  20801. + ret = rt_mutex_trylock(&lock->lock);
  20802. + if (ret) {
  20803. + migrate_disable();
  20804. + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  20805. + } else
  20806. + local_bh_enable();
  20807. + return ret;
  20808. +}
  20809. +EXPORT_SYMBOL(rt_spin_trylock_bh);
  20810. +
  20811. +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
  20812. +{
  20813. + int ret;
  20814. +
  20815. + *flags = 0;
  20816. + ret = rt_mutex_trylock(&lock->lock);
  20817. + if (ret) {
  20818. + migrate_disable();
  20819. + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  20820. + }
  20821. + return ret;
  20822. +}
  20823. +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
  20824. +
  20825. +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
  20826. +{
  20827. + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
  20828. + if (atomic_add_unless(atomic, -1, 1))
  20829. + return 0;
  20830. + migrate_disable();
  20831. + rt_spin_lock(lock);
  20832. + if (atomic_dec_and_test(atomic))
  20833. + return 1;
  20834. + rt_spin_unlock(lock);
  20835. + migrate_enable();
  20836. + return 0;
  20837. +}
  20838. +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
  20839. +
  20840. + void
  20841. +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
  20842. +{
  20843. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20844. + /*
  20845. + * Make sure we are not reinitializing a held lock:
  20846. + */
  20847. + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
  20848. + lockdep_init_map(&lock->dep_map, name, key, 0);
  20849. +#endif
  20850. +}
  20851. +EXPORT_SYMBOL(__rt_spin_lock_init);
  20852. +
  20853. +#endif /* PREEMPT_RT_FULL */
  20854. +
  20855. +#ifdef CONFIG_PREEMPT_RT_FULL
  20856. + static inline int __sched
  20857. +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
  20858. +{
  20859. + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
  20860. + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
  20861. +
  20862. + if (!hold_ctx)
  20863. + return 0;
  20864. +
  20865. + if (unlikely(ctx == hold_ctx))
  20866. + return -EALREADY;
  20867. +
  20868. + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
  20869. + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
  20870. +#ifdef CONFIG_DEBUG_MUTEXES
  20871. + DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
  20872. + ctx->contending_lock = ww;
  20873. +#endif
  20874. + return -EDEADLK;
  20875. + }
  20876. +
  20877. + return 0;
  20878. +}
  20879. +#else
  20880. + static inline int __sched
  20881. +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
  20882. +{
  20883. + BUG();
  20884. + return 0;
  20885. +}
  20886. +
  20887. +#endif
  20888. +
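For illustration (not part of the patch): __mutex_lock_check_stamp() implements the ww_mutex back-off rule — the younger context, the one with the larger stamp, gives up with -EDEADLK when the lock is already held under an older context. A standalone sketch of just the wrap-safe stamp comparison (the pointer tie-break for equal stamps is omitted), with made-up stamp values:

#include <errno.h>
#include <limits.h>
#include <stdio.h>

/* Stamps grow over time, so a larger stamp means a younger context. */
static int check_stamp_sketch(unsigned long ctx_stamp, unsigned long hold_stamp)
{
	/* Same wrap-safe ordering test as the patch uses. */
	if (ctx_stamp - hold_stamp <= LONG_MAX && ctx_stamp != hold_stamp)
		return -EDEADLK;	/* we are younger: back off */
	return 0;			/* we are older: keep waiting */
}

int main(void)
{
	printf("%d\n", check_stamp_sketch(100, 50));	/* -EDEADLK */
	printf("%d\n", check_stamp_sketch(50, 100));	/* 0 */
	return 0;
}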
  20889. +static inline int
  20890. +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
  20891. + struct rt_mutex_waiter *waiter)
  20892. +{
  20893. + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
  20894. +}
  20895. +
  20896. /*
  20897. * Task blocks on lock.
  20898. *
  20899. @@ -894,6 +1299,23 @@
  20900. return -EDEADLK;
  20901. raw_spin_lock_irqsave(&task->pi_lock, flags);
  20902. +
  20903. + /*
  20904. + * In the case of futex requeue PI, this will be a proxy
  20905. + * lock. The task will wake unaware that it is enqueued on
  20906. + * this lock. Avoid blocking on two locks and corrupting
  20907. + * pi_blocked_on via the PI_WAKEUP_INPROGRESS
  20908. + * flag. futex_wait_requeue_pi() sets this when it wakes up
  20909. + * before requeue (due to a signal or timeout). Do not enqueue
  20910. + * the task if PI_WAKEUP_INPROGRESS is set.
  20911. + */
  20912. + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
  20913. + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  20914. + return -EAGAIN;
  20915. + }
  20916. +
  20917. + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
  20918. +
  20919. __rt_mutex_adjust_prio(task);
  20920. waiter->task = task;
  20921. waiter->lock = lock;
  20922. @@ -917,7 +1339,7 @@
  20923. rt_mutex_enqueue_pi(owner, waiter);
  20924. __rt_mutex_adjust_prio(owner);
  20925. - if (owner->pi_blocked_on)
  20926. + if (rt_mutex_real_waiter(owner->pi_blocked_on))
  20927. chain_walk = 1;
  20928. } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
  20929. chain_walk = 1;
  20930. @@ -994,7 +1416,7 @@
  20931. * long as we hold lock->wait_lock. The waiter task needs to
  20932. * acquire it in order to dequeue the waiter.
  20933. */
  20934. - wake_up_process(waiter->task);
  20935. + rt_mutex_wake_waiter(waiter);
  20936. }
  20937. /*
  20938. @@ -1008,7 +1430,7 @@
  20939. {
  20940. bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
  20941. struct task_struct *owner = rt_mutex_owner(lock);
  20942. - struct rt_mutex *next_lock;
  20943. + struct rt_mutex *next_lock = NULL;
  20944. unsigned long flags;
  20945. raw_spin_lock_irqsave(&current->pi_lock, flags);
  20946. @@ -1033,7 +1455,8 @@
  20947. __rt_mutex_adjust_prio(owner);
  20948. /* Store the lock on which owner is blocked or NULL */
  20949. - next_lock = task_blocked_on_lock(owner);
  20950. + if (rt_mutex_real_waiter(owner->pi_blocked_on))
  20951. + next_lock = task_blocked_on_lock(owner);
  20952. raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
  20953. @@ -1069,17 +1492,17 @@
  20954. raw_spin_lock_irqsave(&task->pi_lock, flags);
  20955. waiter = task->pi_blocked_on;
  20956. - if (!waiter || (waiter->prio == task->prio &&
  20957. + if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
  20958. !dl_prio(task->prio))) {
  20959. raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  20960. return;
  20961. }
  20962. next_lock = waiter->lock;
  20963. - raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  20964. /* gets dropped in rt_mutex_adjust_prio_chain()! */
  20965. get_task_struct(task);
  20966. + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  20967. rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
  20968. next_lock, NULL, task);
  20969. }
  20970. @@ -1097,7 +1520,8 @@
  20971. static int __sched
  20972. __rt_mutex_slowlock(struct rt_mutex *lock, int state,
  20973. struct hrtimer_sleeper *timeout,
  20974. - struct rt_mutex_waiter *waiter)
  20975. + struct rt_mutex_waiter *waiter,
  20976. + struct ww_acquire_ctx *ww_ctx)
  20977. {
  20978. int ret = 0;
  20979. @@ -1120,6 +1544,12 @@
  20980. break;
  20981. }
  20982. + if (ww_ctx && ww_ctx->acquired > 0) {
  20983. + ret = __mutex_lock_check_stamp(lock, ww_ctx);
  20984. + if (ret)
  20985. + break;
  20986. + }
  20987. +
  20988. raw_spin_unlock(&lock->wait_lock);
  20989. debug_rt_mutex_print_deadlock(waiter);
  20990. @@ -1153,25 +1583,102 @@
  20991. }
  20992. }
  20993. +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
  20994. + struct ww_acquire_ctx *ww_ctx)
  20995. +{
  20996. +#ifdef CONFIG_DEBUG_MUTEXES
  20997. + /*
  20998. + * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
  20999. + * but released with a normal mutex_unlock in this call.
  21000. + *
  21001. + * This should never happen, always use ww_mutex_unlock.
  21002. + */
  21003. + DEBUG_LOCKS_WARN_ON(ww->ctx);
  21004. +
  21005. + /*
  21006. + * Not quite done after calling ww_acquire_done() ?
  21007. + */
  21008. + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
  21009. +
  21010. + if (ww_ctx->contending_lock) {
  21011. + /*
  21012. + * After -EDEADLK you tried to
  21013. + * acquire a different ww_mutex? Bad!
  21014. + */
  21015. + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
  21016. +
  21017. + /*
  21018. + * You called ww_mutex_lock after receiving -EDEADLK,
  21019. + * but 'forgot' to unlock everything else first?
  21020. + */
  21021. + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
  21022. + ww_ctx->contending_lock = NULL;
  21023. + }
  21024. +
  21025. + /*
  21026. + * Naughty, using a different class will lead to undefined behavior!
  21027. + */
  21028. + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
  21029. +#endif
  21030. + ww_ctx->acquired++;
  21031. +}
  21032. +
  21033. +#ifdef CONFIG_PREEMPT_RT_FULL
  21034. +static void ww_mutex_account_lock(struct rt_mutex *lock,
  21035. + struct ww_acquire_ctx *ww_ctx)
  21036. +{
  21037. + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
  21038. + struct rt_mutex_waiter *waiter, *n;
  21039. +
  21040. + /*
  21041. + * This branch gets optimized out for the common case,
  21042. + * and is only important for ww_mutex_lock.
  21043. + */
  21044. + ww_mutex_lock_acquired(ww, ww_ctx);
  21045. + ww->ctx = ww_ctx;
  21046. +
  21047. + /*
  21048. + * Give any possible sleeping processes the chance to wake up,
  21049. + * so they can recheck if they have to back off.
  21050. + */
  21051. + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
  21052. + tree_entry) {
  21053. + /* XXX debug rt mutex waiter wakeup */
  21054. +
  21055. + BUG_ON(waiter->lock != lock);
  21056. + rt_mutex_wake_waiter(waiter);
  21057. + }
  21058. +}
  21059. +
  21060. +#else
  21061. +
  21062. +static void ww_mutex_account_lock(struct rt_mutex *lock,
  21063. + struct ww_acquire_ctx *ww_ctx)
  21064. +{
  21065. + BUG();
  21066. +}
  21067. +#endif
  21068. +
  21069. /*
  21070. * Slow path lock function:
  21071. */
  21072. static int __sched
  21073. rt_mutex_slowlock(struct rt_mutex *lock, int state,
  21074. struct hrtimer_sleeper *timeout,
  21075. - enum rtmutex_chainwalk chwalk)
  21076. + enum rtmutex_chainwalk chwalk,
  21077. + struct ww_acquire_ctx *ww_ctx)
  21078. {
  21079. struct rt_mutex_waiter waiter;
  21080. int ret = 0;
  21081. - debug_rt_mutex_init_waiter(&waiter);
  21082. - RB_CLEAR_NODE(&waiter.pi_tree_entry);
  21083. - RB_CLEAR_NODE(&waiter.tree_entry);
  21084. + rt_mutex_init_waiter(&waiter, false);
  21085. raw_spin_lock(&lock->wait_lock);
  21086. /* Try to acquire the lock again: */
  21087. if (try_to_take_rt_mutex(lock, current, NULL)) {
  21088. + if (ww_ctx)
  21089. + ww_mutex_account_lock(lock, ww_ctx);
  21090. raw_spin_unlock(&lock->wait_lock);
  21091. return 0;
  21092. }
  21093. @@ -1188,14 +1695,23 @@
  21094. ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
  21095. if (likely(!ret))
  21096. - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
  21097. + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, ww_ctx);
  21098. + else if (ww_ctx) {
  21099. + /* ww_mutex received EDEADLK, let it become EALREADY */
  21100. + ret = __mutex_lock_check_stamp(lock, ww_ctx);
  21101. + BUG_ON(!ret);
  21102. + }
  21103. set_current_state(TASK_RUNNING);
  21104. if (unlikely(ret)) {
  21105. if (rt_mutex_has_waiters(lock))
  21106. remove_waiter(lock, &waiter);
  21107. - rt_mutex_handle_deadlock(ret, chwalk, &waiter);
  21108. + /* ww_mutex wants to report EDEADLK/EALREADY, so let it */
  21109. + if (!ww_ctx)
  21110. + rt_mutex_handle_deadlock(ret, chwalk, &waiter);
  21111. + } else if (ww_ctx) {
  21112. + ww_mutex_account_lock(lock, ww_ctx);
  21113. }
  21114. /*
  21115. @@ -1234,7 +1750,8 @@
  21116. * The mutex has currently no owner. Lock the wait lock and
  21117. * try to acquire the lock.
  21118. */
  21119. - raw_spin_lock(&lock->wait_lock);
  21120. + if (!raw_spin_trylock(&lock->wait_lock))
  21121. + return 0;
  21122. ret = try_to_take_rt_mutex(lock, current, NULL);
  21123. @@ -1320,31 +1837,36 @@
  21124. */
  21125. static inline int
  21126. rt_mutex_fastlock(struct rt_mutex *lock, int state,
  21127. + struct ww_acquire_ctx *ww_ctx,
  21128. int (*slowfn)(struct rt_mutex *lock, int state,
  21129. struct hrtimer_sleeper *timeout,
  21130. - enum rtmutex_chainwalk chwalk))
  21131. + enum rtmutex_chainwalk chwalk,
  21132. + struct ww_acquire_ctx *ww_ctx))
  21133. {
  21134. if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
  21135. rt_mutex_deadlock_account_lock(lock, current);
  21136. return 0;
  21137. } else
  21138. - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
  21139. + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
  21140. + ww_ctx);
  21141. }
  21142. static inline int
  21143. rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
  21144. struct hrtimer_sleeper *timeout,
  21145. enum rtmutex_chainwalk chwalk,
  21146. + struct ww_acquire_ctx *ww_ctx,
  21147. int (*slowfn)(struct rt_mutex *lock, int state,
  21148. struct hrtimer_sleeper *timeout,
  21149. - enum rtmutex_chainwalk chwalk))
  21150. + enum rtmutex_chainwalk chwalk,
  21151. + struct ww_acquire_ctx *ww_ctx))
  21152. {
  21153. if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
  21154. likely(rt_mutex_cmpxchg(lock, NULL, current))) {
  21155. rt_mutex_deadlock_account_lock(lock, current);
  21156. return 0;
  21157. } else
  21158. - return slowfn(lock, state, timeout, chwalk);
  21159. + return slowfn(lock, state, timeout, chwalk, ww_ctx);
  21160. }
  21161. static inline int
  21162. @@ -1377,7 +1899,7 @@
  21163. {
  21164. might_sleep();
  21165. - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
  21166. + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
  21167. }
  21168. EXPORT_SYMBOL_GPL(rt_mutex_lock);
  21169. @@ -1394,7 +1916,7 @@
  21170. {
  21171. might_sleep();
  21172. - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
  21173. + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
  21174. }
  21175. EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
  21176. @@ -1407,11 +1929,30 @@
  21177. might_sleep();
  21178. return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
  21179. - RT_MUTEX_FULL_CHAINWALK,
  21180. + RT_MUTEX_FULL_CHAINWALK, NULL,
  21181. rt_mutex_slowlock);
  21182. }
  21183. /**
  21184. + * rt_mutex_lock_killable - lock a rt_mutex killable
  21185. + *
  21186. + * @lock: the rt_mutex to be locked
  21188. + *
  21189. + * Returns:
  21190. + * 0 on success
  21191. + * -EINTR when interrupted by a signal
  21192. + * -EDEADLK when the lock would deadlock (when deadlock detection is on)
  21193. + */
  21194. +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
  21195. +{
  21196. + might_sleep();
  21197. +
  21198. + return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
  21199. +}
  21200. +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
  21201. +
  21202. +/**
  21203. * rt_mutex_timed_lock - lock a rt_mutex interruptible
  21204. * the timeout structure is provided
  21205. * by the caller
  21206. @@ -1431,6 +1972,7 @@
  21207. return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
  21208. RT_MUTEX_MIN_CHAINWALK,
  21209. + NULL,
  21210. rt_mutex_slowlock);
  21211. }
  21212. EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  21213. @@ -1489,13 +2031,12 @@
  21214. void __rt_mutex_init(struct rt_mutex *lock, const char *name)
  21215. {
  21216. lock->owner = NULL;
  21217. - raw_spin_lock_init(&lock->wait_lock);
  21218. lock->waiters = RB_ROOT;
  21219. lock->waiters_leftmost = NULL;
  21220. debug_rt_mutex_init(lock, name);
  21221. }
  21222. -EXPORT_SYMBOL_GPL(__rt_mutex_init);
  21223. +EXPORT_SYMBOL(__rt_mutex_init);
  21224. /**
  21225. * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
  21226. @@ -1510,7 +2051,7 @@
  21227. void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
  21228. struct task_struct *proxy_owner)
  21229. {
  21230. - __rt_mutex_init(lock, NULL);
  21231. + rt_mutex_init(lock);
  21232. debug_rt_mutex_proxy_lock(lock, proxy_owner);
  21233. rt_mutex_set_owner(lock, proxy_owner);
  21234. rt_mutex_deadlock_account_lock(lock, proxy_owner);
  21235. @@ -1558,6 +2099,35 @@
  21236. return 1;
  21237. }
  21238. +#ifdef CONFIG_PREEMPT_RT_FULL
  21239. + /*
  21240. + * In PREEMPT_RT there's an added race.
  21241. + * If the task, that we are about to requeue, times out,
  21242. + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
  21243. + * to skip this task. But right after the task sets
  21244. + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
  21245. + * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
  21246. + * This will replace the PI_WAKEUP_INPROGRESS with the actual
  21247. + * lock that it blocks on. We *must not* place this task
  21248. + * on this proxy lock in that case.
  21249. + *
  21250. + * To prevent this race, we first take the task's pi_lock
  21251. + * and check if it has updated its pi_blocked_on. If it has,
  21252. + * we assume that it woke up and we return -EAGAIN.
  21253. + * Otherwise, we set the task's pi_blocked_on to
  21254. + * PI_REQUEUE_INPROGRESS, so that if the task is waking up
  21255. + * it will know that we are in the process of requeuing it.
  21256. + */
  21257. + raw_spin_lock_irq(&task->pi_lock);
  21258. + if (task->pi_blocked_on) {
  21259. + raw_spin_unlock_irq(&task->pi_lock);
  21260. + raw_spin_unlock(&lock->wait_lock);
  21261. + return -EAGAIN;
  21262. + }
  21263. + task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
  21264. + raw_spin_unlock_irq(&task->pi_lock);
  21265. +#endif
  21266. +
  21267. /* We enforce deadlock detection for futexes */
  21268. ret = task_blocks_on_rt_mutex(lock, waiter, task,
  21269. RT_MUTEX_FULL_CHAINWALK);
  21270. @@ -1627,7 +2197,7 @@
  21271. set_current_state(TASK_INTERRUPTIBLE);
  21272. - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
  21273. + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
  21274. set_current_state(TASK_RUNNING);
  21275. @@ -1644,3 +2214,89 @@
  21276. return ret;
  21277. }
  21278. +
  21279. +static inline int
  21280. +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
  21281. +{
  21282. +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
  21283. + unsigned tmp;
  21284. +
  21285. + if (ctx->deadlock_inject_countdown-- == 0) {
  21286. + tmp = ctx->deadlock_inject_interval;
  21287. + if (tmp > UINT_MAX/4)
  21288. + tmp = UINT_MAX;
  21289. + else
  21290. + tmp = tmp*2 + tmp + tmp/2;
  21291. +
  21292. + ctx->deadlock_inject_interval = tmp;
  21293. + ctx->deadlock_inject_countdown = tmp;
  21294. + ctx->contending_lock = lock;
  21295. +
  21296. + ww_mutex_unlock(lock);
  21297. +
  21298. + return -EDEADLK;
  21299. + }
  21300. +#endif
  21301. +
  21302. + return 0;
  21303. +}
  21304. +
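Not part of the patch: the update tmp = tmp*2 + tmp + tmp/2 above grows deadlock_inject_interval by roughly a factor of 3.5 each time the countdown expires, so the injected -EDEADLK returns become progressively rarer. A quick standalone check of the sequence, starting from a hypothetical interval of 1:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned tmp = 1;	/* hypothetical initial deadlock_inject_interval */
	int i;

	for (i = 0; i < 8; i++) {
		if (tmp > UINT_MAX / 4)
			tmp = UINT_MAX;
		else
			tmp = tmp * 2 + tmp + tmp / 2;
		printf("interval after expiry %d: %u\n", i + 1, tmp);
	}
	return 0;	/* prints 3, 10, 35, 122, 427, ... */
}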
  21305. +#ifdef CONFIG_PREEMPT_RT_FULL
  21306. +int __sched
  21307. +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
  21308. +{
  21309. + int ret;
  21310. +
  21311. + might_sleep();
  21312. +
  21313. + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
  21314. + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
  21315. + if (ret)
  21316. + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
  21317. + else if (!ret && ww_ctx->acquired > 1)
  21318. + return ww_mutex_deadlock_injection(lock, ww_ctx);
  21319. +
  21320. + return ret;
  21321. +}
  21322. +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
  21323. +
  21324. +int __sched
  21325. +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
  21326. +{
  21327. + int ret;
  21328. +
  21329. + might_sleep();
  21330. +
  21331. + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
  21332. + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
  21333. + if (ret)
  21334. + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
  21335. + else if (!ret && ww_ctx->acquired > 1)
  21336. + return ww_mutex_deadlock_injection(lock, ww_ctx);
  21337. +
  21338. + return ret;
  21339. +}
  21340. +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
  21341. +
  21342. +void __sched ww_mutex_unlock(struct ww_mutex *lock)
  21343. +{
  21344. + int nest = !!lock->ctx;
  21345. +
  21346. + /*
  21347. + * The unlocking fastpath is the 0->1 transition from 'locked'
  21348. + * into 'unlocked' state:
  21349. + */
  21350. + if (nest) {
  21351. +#ifdef CONFIG_DEBUG_MUTEXES
  21352. + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
  21353. +#endif
  21354. + if (lock->ctx->acquired > 0)
  21355. + lock->ctx->acquired--;
  21356. + lock->ctx = NULL;
  21357. + }
  21358. +
  21359. + mutex_release(&lock->base.dep_map, nest, _RET_IP_);
  21360. + rt_mutex_unlock(&lock->base.lock);
  21361. +}
  21362. +EXPORT_SYMBOL(ww_mutex_unlock);
  21363. +#endif
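For illustration (not part of the patch): callers of the ww_mutex API are expected to react to -EDEADLK by dropping every lock held under the context and then sleeping on the contended lock before reacquiring; that hand-back is what the stamp ordering above relies on for forward progress. A hedged sketch with hypothetical locks lock_a and lock_b, assumed initialized elsewhere with ww_mutex_init(); a complete implementation would loop, since the reacquire can itself return -EDEADLK:

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(my_ww_class);	/* hypothetical lock class */
static struct ww_mutex lock_a, lock_b;	/* assumed ww_mutex_init()ed elsewhere */

static int lock_both(void)
{
	struct ww_acquire_ctx ctx;
	int ret;

	ww_acquire_init(&ctx, &my_ww_class);

	ret = ww_mutex_lock(&lock_a, &ctx);
	if (ret)
		goto out_fini;

	ret = ww_mutex_lock(&lock_b, &ctx);
	if (ret == -EDEADLK) {
		/*
		 * We are the younger context: drop what we hold, sleep on
		 * the contended lock, then take the rest again.
		 */
		ww_mutex_unlock(&lock_a);
		ww_mutex_lock_slow(&lock_b, &ctx);
		ret = ww_mutex_lock(&lock_a, &ctx);
		if (ret) {
			ww_mutex_unlock(&lock_b);
			goto out_fini;
		}
	} else if (ret) {
		ww_mutex_unlock(&lock_a);
		goto out_fini;
	}

	ww_acquire_done(&ctx);
	/* ... work on both objects ... */
	ww_mutex_unlock(&lock_b);
	ww_mutex_unlock(&lock_a);
	ret = 0;
out_fini:
	ww_acquire_fini(&ctx);
	return ret;
}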
  21364. diff -Nur linux-3.18.14.orig/kernel/locking/rtmutex_common.h linux-3.18.14-rt/kernel/locking/rtmutex_common.h
  21365. --- linux-3.18.14.orig/kernel/locking/rtmutex_common.h 2015-05-20 10:04:50.000000000 -0500
  21366. +++ linux-3.18.14-rt/kernel/locking/rtmutex_common.h 2015-05-31 15:32:48.769635364 -0500
  21367. @@ -49,6 +49,7 @@
  21368. struct rb_node pi_tree_entry;
  21369. struct task_struct *task;
  21370. struct rt_mutex *lock;
  21371. + bool savestate;
  21372. #ifdef CONFIG_DEBUG_RT_MUTEXES
  21373. unsigned long ip;
  21374. struct pid *deadlock_task_pid;
  21375. @@ -119,6 +120,9 @@
  21376. /*
  21377. * PI-futex support (proxy locking functions, etc.):
  21378. */
  21379. +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
  21380. +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
  21381. +
  21382. extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
  21383. extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
  21384. struct task_struct *proxy_owner);
  21385. @@ -138,4 +142,14 @@
  21386. # include "rtmutex.h"
  21387. #endif
  21388. +static inline void
  21389. +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
  21390. +{
  21391. + debug_rt_mutex_init_waiter(waiter);
  21392. + waiter->task = NULL;
  21393. + waiter->savestate = savestate;
  21394. + RB_CLEAR_NODE(&waiter->pi_tree_entry);
  21395. + RB_CLEAR_NODE(&waiter->tree_entry);
  21396. +}
  21397. +
  21398. #endif
  21399. diff -Nur linux-3.18.14.orig/kernel/locking/spinlock.c linux-3.18.14-rt/kernel/locking/spinlock.c
  21400. --- linux-3.18.14.orig/kernel/locking/spinlock.c 2015-05-20 10:04:50.000000000 -0500
  21401. +++ linux-3.18.14-rt/kernel/locking/spinlock.c 2015-05-31 15:32:48.769635364 -0500
  21402. @@ -124,8 +124,11 @@
  21403. * __[spin|read|write]_lock_bh()
  21404. */
  21405. BUILD_LOCK_OPS(spin, raw_spinlock);
  21406. +
  21407. +#ifndef CONFIG_PREEMPT_RT_FULL
  21408. BUILD_LOCK_OPS(read, rwlock);
  21409. BUILD_LOCK_OPS(write, rwlock);
  21410. +#endif
  21411. #endif
  21412. @@ -209,6 +212,8 @@
  21413. EXPORT_SYMBOL(_raw_spin_unlock_bh);
  21414. #endif
  21415. +#ifndef CONFIG_PREEMPT_RT_FULL
  21416. +
  21417. #ifndef CONFIG_INLINE_READ_TRYLOCK
  21418. int __lockfunc _raw_read_trylock(rwlock_t *lock)
  21419. {
  21420. @@ -353,6 +358,8 @@
  21421. EXPORT_SYMBOL(_raw_write_unlock_bh);
  21422. #endif
  21423. +#endif /* !PREEMPT_RT_FULL */
  21424. +
  21425. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  21426. void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
  21427. diff -Nur linux-3.18.14.orig/kernel/locking/spinlock_debug.c linux-3.18.14-rt/kernel/locking/spinlock_debug.c
  21428. --- linux-3.18.14.orig/kernel/locking/spinlock_debug.c 2015-05-20 10:04:50.000000000 -0500
  21429. +++ linux-3.18.14-rt/kernel/locking/spinlock_debug.c 2015-05-31 15:32:48.793635364 -0500
  21430. @@ -31,6 +31,7 @@
  21431. EXPORT_SYMBOL(__raw_spin_lock_init);
  21432. +#ifndef CONFIG_PREEMPT_RT_FULL
  21433. void __rwlock_init(rwlock_t *lock, const char *name,
  21434. struct lock_class_key *key)
  21435. {
  21436. @@ -48,6 +49,7 @@
  21437. }
  21438. EXPORT_SYMBOL(__rwlock_init);
  21439. +#endif
  21440. static void spin_dump(raw_spinlock_t *lock, const char *msg)
  21441. {
  21442. @@ -159,6 +161,7 @@
  21443. arch_spin_unlock(&lock->raw_lock);
  21444. }
  21445. +#ifndef CONFIG_PREEMPT_RT_FULL
  21446. static void rwlock_bug(rwlock_t *lock, const char *msg)
  21447. {
  21448. if (!debug_locks_off())
  21449. @@ -300,3 +303,5 @@
  21450. debug_write_unlock(lock);
  21451. arch_write_unlock(&lock->raw_lock);
  21452. }
  21453. +
  21454. +#endif
  21455. diff -Nur linux-3.18.14.orig/kernel/panic.c linux-3.18.14-rt/kernel/panic.c
  21456. --- linux-3.18.14.orig/kernel/panic.c 2015-05-20 10:04:50.000000000 -0500
  21457. +++ linux-3.18.14-rt/kernel/panic.c 2015-05-31 15:32:48.793635364 -0500
  21458. @@ -384,9 +384,11 @@
  21459. static int init_oops_id(void)
  21460. {
  21461. +#ifndef CONFIG_PREEMPT_RT_FULL
  21462. if (!oops_id)
  21463. get_random_bytes(&oops_id, sizeof(oops_id));
  21464. else
  21465. +#endif
  21466. oops_id++;
  21467. return 0;
  21468. diff -Nur linux-3.18.14.orig/kernel/power/hibernate.c linux-3.18.14-rt/kernel/power/hibernate.c
  21469. --- linux-3.18.14.orig/kernel/power/hibernate.c 2015-05-20 10:04:50.000000000 -0500
  21470. +++ linux-3.18.14-rt/kernel/power/hibernate.c 2015-05-31 15:32:48.797635364 -0500
  21471. @@ -287,6 +287,8 @@
  21472. local_irq_disable();
  21473. + system_state = SYSTEM_SUSPEND;
  21474. +
  21475. error = syscore_suspend();
  21476. if (error) {
  21477. printk(KERN_ERR "PM: Some system devices failed to power down, "
  21478. @@ -316,6 +318,7 @@
  21479. syscore_resume();
  21480. Enable_irqs:
  21481. + system_state = SYSTEM_RUNNING;
  21482. local_irq_enable();
  21483. Enable_cpus:
  21484. @@ -439,6 +442,7 @@
  21485. goto Enable_cpus;
  21486. local_irq_disable();
  21487. + system_state = SYSTEM_SUSPEND;
  21488. error = syscore_suspend();
  21489. if (error)
  21490. @@ -472,6 +476,7 @@
  21491. syscore_resume();
  21492. Enable_irqs:
  21493. + system_state = SYSTEM_RUNNING;
  21494. local_irq_enable();
  21495. Enable_cpus:
  21496. @@ -557,6 +562,7 @@
  21497. goto Platform_finish;
  21498. local_irq_disable();
  21499. + system_state = SYSTEM_SUSPEND;
  21500. syscore_suspend();
  21501. if (pm_wakeup_pending()) {
  21502. error = -EAGAIN;
  21503. @@ -569,6 +575,7 @@
  21504. Power_up:
  21505. syscore_resume();
  21506. + system_state = SYSTEM_RUNNING;
  21507. local_irq_enable();
  21508. enable_nonboot_cpus();
  21509. diff -Nur linux-3.18.14.orig/kernel/power/suspend.c linux-3.18.14-rt/kernel/power/suspend.c
  21510. --- linux-3.18.14.orig/kernel/power/suspend.c 2015-05-20 10:04:50.000000000 -0500
  21511. +++ linux-3.18.14-rt/kernel/power/suspend.c 2015-05-31 15:32:48.797635364 -0500
  21512. @@ -318,6 +318,8 @@
  21513. arch_suspend_disable_irqs();
  21514. BUG_ON(!irqs_disabled());
  21515. + system_state = SYSTEM_SUSPEND;
  21516. +
  21517. error = syscore_suspend();
  21518. if (!error) {
  21519. *wakeup = pm_wakeup_pending();
  21520. @@ -332,6 +334,8 @@
  21521. syscore_resume();
  21522. }
  21523. + system_state = SYSTEM_RUNNING;
  21524. +
  21525. arch_suspend_enable_irqs();
  21526. BUG_ON(irqs_disabled());
  21527. diff -Nur linux-3.18.14.orig/kernel/printk/printk.c linux-3.18.14-rt/kernel/printk/printk.c
  21528. --- linux-3.18.14.orig/kernel/printk/printk.c 2015-05-20 10:04:50.000000000 -0500
  21529. +++ linux-3.18.14-rt/kernel/printk/printk.c 2015-05-31 15:32:48.801635363 -0500
  21530. @@ -1165,6 +1165,7 @@
  21531. {
  21532. char *text;
  21533. int len = 0;
  21534. + int attempts = 0;
  21535. text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
  21536. if (!text)
  21537. @@ -1176,7 +1177,14 @@
  21538. u64 seq;
  21539. u32 idx;
  21540. enum log_flags prev;
  21541. -
  21542. + int num_msg;
  21543. +try_again:
  21544. + attempts++;
  21545. + if (attempts > 10) {
  21546. + len = -EBUSY;
  21547. + goto out;
  21548. + }
  21549. + num_msg = 0;
  21550. if (clear_seq < log_first_seq) {
  21551. /* messages are gone, move to first available one */
  21552. clear_seq = log_first_seq;
  21553. @@ -1197,6 +1205,14 @@
  21554. prev = msg->flags;
  21555. idx = log_next(idx);
  21556. seq++;
  21557. + num_msg++;
  21558. + if (num_msg > 5) {
  21559. + num_msg = 0;
  21560. + raw_spin_unlock_irq(&logbuf_lock);
  21561. + raw_spin_lock_irq(&logbuf_lock);
  21562. + if (clear_seq < log_first_seq)
  21563. + goto try_again;
  21564. + }
  21565. }
  21566. /* move first record forward until length fits into the buffer */
  21567. @@ -1210,6 +1226,14 @@
  21568. prev = msg->flags;
  21569. idx = log_next(idx);
  21570. seq++;
  21571. + num_msg++;
  21572. + if (num_msg > 5) {
  21573. + num_msg = 0;
  21574. + raw_spin_unlock_irq(&logbuf_lock);
  21575. + raw_spin_lock_irq(&logbuf_lock);
  21576. + if (clear_seq < log_first_seq)
  21577. + goto try_again;
  21578. + }
  21579. }
  21580. /* last message fitting into this dump */
  21581. @@ -1250,6 +1274,7 @@
  21582. clear_seq = log_next_seq;
  21583. clear_idx = log_next_idx;
  21584. }
  21585. +out:
  21586. raw_spin_unlock_irq(&logbuf_lock);
  21587. kfree(text);
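Not part of the patch: the try_again/num_msg logic above bounds how long logbuf_lock is held by dropping and immediately re-taking it every few records, then restarting the scan if the buffer was overwritten in that window. The same chunked-critical-section shape in a standalone sketch, with a pthread mutex standing in for logbuf_lock and a hypothetical copy_one() callback per record:

#include <pthread.h>
#include <stdio.h>

#define CHUNK 5

static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;	/* stand-in for logbuf_lock */

static void drain_records(int nrecords, void (*copy_one)(int))
{
	int i, in_chunk = 0;

	pthread_mutex_lock(&buf_lock);
	for (i = 0; i < nrecords; i++) {
		copy_one(i);
		if (++in_chunk >= CHUNK) {
			in_chunk = 0;
			/*
			 * Briefly drop and re-take the lock, as the patch does
			 * with raw_spin_unlock_irq()/raw_spin_lock_irq(), so
			 * others are not locked out for the whole scan.  A real
			 * implementation re-validates its cursor here and
			 * restarts if the buffer was reused underneath it.
			 */
			pthread_mutex_unlock(&buf_lock);
			pthread_mutex_lock(&buf_lock);
		}
	}
	pthread_mutex_unlock(&buf_lock);
}

static void copy_one(int idx)
{
	printf("record %d\n", idx);
}

int main(void)
{
	drain_records(12, copy_one);
	return 0;
}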
  21588. @@ -1407,6 +1432,7 @@
  21589. if (!console_drivers)
  21590. return;
  21591. + migrate_disable();
  21592. for_each_console(con) {
  21593. if (exclusive_console && con != exclusive_console)
  21594. continue;
  21595. @@ -1419,6 +1445,7 @@
  21596. continue;
  21597. con->write(con, text, len);
  21598. }
  21599. + migrate_enable();
  21600. }
  21601. /*
  21602. @@ -1479,6 +1506,15 @@
  21603. static int console_trylock_for_printk(void)
  21604. {
  21605. unsigned int cpu = smp_processor_id();
  21606. +#ifdef CONFIG_PREEMPT_RT_FULL
  21607. + int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
  21608. + !irqs_disabled();
  21609. +#else
  21610. + int lock = 1;
  21611. +#endif
  21612. +
  21613. + if (!lock)
  21614. + return 0;
  21615. if (!console_trylock())
  21616. return 0;
  21617. @@ -1613,6 +1649,62 @@
  21618. return textlen;
  21619. }
  21620. +#ifdef CONFIG_EARLY_PRINTK
  21621. +struct console *early_console;
  21622. +
  21623. +void early_vprintk(const char *fmt, va_list ap)
  21624. +{
  21625. + if (early_console) {
  21626. + char buf[512];
  21627. + int n = vscnprintf(buf, sizeof(buf), fmt, ap);
  21628. +
  21629. + early_console->write(early_console, buf, n);
  21630. + }
  21631. +}
  21632. +
  21633. +asmlinkage void early_printk(const char *fmt, ...)
  21634. +{
  21635. + va_list ap;
  21636. +
  21637. + va_start(ap, fmt);
  21638. + early_vprintk(fmt, ap);
  21639. + va_end(ap);
  21640. +}
  21641. +
  21642. +/*
  21643. + * This is independent of any log levels - a global
  21644. + * kill switch that turns off all of printk.
  21645. + *
  21646. + * Used by the NMI watchdog if early-printk is enabled.
  21647. + */
  21648. +static bool __read_mostly printk_killswitch;
  21649. +
  21650. +static int __init force_early_printk_setup(char *str)
  21651. +{
  21652. + printk_killswitch = true;
  21653. + return 0;
  21654. +}
  21655. +early_param("force_early_printk", force_early_printk_setup);
  21656. +
  21657. +void printk_kill(void)
  21658. +{
  21659. + printk_killswitch = true;
  21660. +}
  21661. +
  21662. +static int forced_early_printk(const char *fmt, va_list ap)
  21663. +{
  21664. + if (!printk_killswitch)
  21665. + return 0;
  21666. + early_vprintk(fmt, ap);
  21667. + return 1;
  21668. +}
  21669. +#else
  21670. +static inline int forced_early_printk(const char *fmt, va_list ap)
  21671. +{
  21672. + return 0;
  21673. +}
  21674. +#endif
  21675. +
  21676. asmlinkage int vprintk_emit(int facility, int level,
  21677. const char *dict, size_t dictlen,
  21678. const char *fmt, va_list args)
  21679. @@ -1629,6 +1721,13 @@
  21680. /* cpu currently holding logbuf_lock in this function */
  21681. static volatile unsigned int logbuf_cpu = UINT_MAX;
  21682. + /*
  21683. + * Fall back to early_printk if a debugging subsystem has
  21684. + * killed printk output
  21685. + */
  21686. + if (unlikely(forced_early_printk(fmt, args)))
  21687. + return 1;
  21688. +
  21689. if (level == SCHED_MESSAGE_LOGLEVEL) {
  21690. level = -1;
  21691. in_sched = true;
  21692. @@ -1769,8 +1868,7 @@
  21693. * console_sem which would prevent anyone from printing to
  21694. * console
  21695. */
  21696. - preempt_disable();
  21697. -
  21698. + migrate_disable();
  21699. /*
  21700. * Try to acquire and then immediately release the console
  21701. * semaphore. The release will print out buffers and wake up
  21702. @@ -1778,7 +1876,7 @@
  21703. */
  21704. if (console_trylock_for_printk())
  21705. console_unlock();
  21706. - preempt_enable();
  21707. + migrate_enable();
  21708. lockdep_on();
  21709. }
  21710. @@ -1878,29 +1976,6 @@
  21711. #endif /* CONFIG_PRINTK */
  21712. -#ifdef CONFIG_EARLY_PRINTK
  21713. -struct console *early_console;
  21714. -
  21715. -void early_vprintk(const char *fmt, va_list ap)
  21716. -{
  21717. - if (early_console) {
  21718. - char buf[512];
  21719. - int n = vscnprintf(buf, sizeof(buf), fmt, ap);
  21720. -
  21721. - early_console->write(early_console, buf, n);
  21722. - }
  21723. -}
  21724. -
  21725. -asmlinkage __visible void early_printk(const char *fmt, ...)
  21726. -{
  21727. - va_list ap;
  21728. -
  21729. - va_start(ap, fmt);
  21730. - early_vprintk(fmt, ap);
  21731. - va_end(ap);
  21732. -}
  21733. -#endif
  21734. -
  21735. static int __add_preferred_console(char *name, int idx, char *options,
  21736. char *brl_options)
  21737. {
  21738. @@ -2140,11 +2215,16 @@
  21739. goto out;
  21740. len = cont_print_text(text, size);
  21741. +#ifndef CONFIG_PREEMPT_RT_FULL
  21742. raw_spin_unlock(&logbuf_lock);
  21743. stop_critical_timings();
  21744. call_console_drivers(cont.level, text, len);
  21745. start_critical_timings();
  21746. local_irq_restore(flags);
  21747. +#else
  21748. + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  21749. + call_console_drivers(cont.level, text, len);
  21750. +#endif
  21751. return;
  21752. out:
  21753. raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  21754. @@ -2232,12 +2312,17 @@
  21755. console_idx = log_next(console_idx);
  21756. console_seq++;
  21757. console_prev = msg->flags;
  21758. +#ifdef CONFIG_PREEMPT_RT_FULL
  21759. + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  21760. + call_console_drivers(level, text, len);
  21761. +#else
  21762. raw_spin_unlock(&logbuf_lock);
  21763. stop_critical_timings(); /* don't trace print latency */
  21764. call_console_drivers(level, text, len);
  21765. start_critical_timings();
  21766. local_irq_restore(flags);
  21767. +#endif
  21768. }
  21769. console_locked = 0;
  21770. diff -Nur linux-3.18.14.orig/kernel/ptrace.c linux-3.18.14-rt/kernel/ptrace.c
  21771. --- linux-3.18.14.orig/kernel/ptrace.c 2015-05-20 10:04:50.000000000 -0500
  21772. +++ linux-3.18.14-rt/kernel/ptrace.c 2015-05-31 15:32:48.801635363 -0500
  21773. @@ -129,7 +129,12 @@
  21774. spin_lock_irq(&task->sighand->siglock);
  21775. if (task_is_traced(task) && !__fatal_signal_pending(task)) {
  21776. - task->state = __TASK_TRACED;
  21777. + raw_spin_lock_irq(&task->pi_lock);
  21778. + if (task->state & __TASK_TRACED)
  21779. + task->state = __TASK_TRACED;
  21780. + else
  21781. + task->saved_state = __TASK_TRACED;
  21782. + raw_spin_unlock_irq(&task->pi_lock);
  21783. ret = true;
  21784. }
  21785. spin_unlock_irq(&task->sighand->siglock);
  21786. diff -Nur linux-3.18.14.orig/kernel/rcu/tiny.c linux-3.18.14-rt/kernel/rcu/tiny.c
  21787. --- linux-3.18.14.orig/kernel/rcu/tiny.c 2015-05-20 10:04:50.000000000 -0500
  21788. +++ linux-3.18.14-rt/kernel/rcu/tiny.c 2015-05-31 15:32:48.801635363 -0500
  21789. @@ -370,6 +370,7 @@
  21790. }
  21791. EXPORT_SYMBOL_GPL(call_rcu_sched);
  21792. +#ifndef CONFIG_PREEMPT_RT_FULL
  21793. /*
  21794. * Post an RCU bottom-half callback to be invoked after any subsequent
  21795. * quiescent state.
  21796. @@ -379,6 +380,7 @@
  21797. __call_rcu(head, func, &rcu_bh_ctrlblk);
  21798. }
  21799. EXPORT_SYMBOL_GPL(call_rcu_bh);
  21800. +#endif
  21801. void rcu_init(void)
  21802. {
  21803. diff -Nur linux-3.18.14.orig/kernel/rcu/tree.c linux-3.18.14-rt/kernel/rcu/tree.c
  21804. --- linux-3.18.14.orig/kernel/rcu/tree.c 2015-05-20 10:04:50.000000000 -0500
  21805. +++ linux-3.18.14-rt/kernel/rcu/tree.c 2015-05-31 15:32:48.805635363 -0500
  21806. @@ -56,6 +56,11 @@
  21807. #include <linux/random.h>
  21808. #include <linux/ftrace_event.h>
  21809. #include <linux/suspend.h>
  21810. +#include <linux/delay.h>
  21811. +#include <linux/gfp.h>
  21812. +#include <linux/oom.h>
  21813. +#include <linux/smpboot.h>
  21814. +#include "../time/tick-internal.h"
  21815. #include "tree.h"
  21816. #include "rcu.h"
  21817. @@ -152,8 +157,6 @@
  21818. */
  21819. static int rcu_scheduler_fully_active __read_mostly;
  21820. -#ifdef CONFIG_RCU_BOOST
  21821. -
  21822. /*
  21823. * Control variables for per-CPU and per-rcu_node kthreads. These
  21824. * handle all flavors of RCU.
  21825. @@ -163,8 +166,6 @@
  21826. DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
  21827. DEFINE_PER_CPU(char, rcu_cpu_has_work);
  21828. -#endif /* #ifdef CONFIG_RCU_BOOST */
  21829. -
  21830. static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
  21831. static void invoke_rcu_core(void);
  21832. static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
  21833. @@ -207,6 +208,19 @@
  21834. }
  21835. }
  21836. +#ifdef CONFIG_PREEMPT_RT_FULL
  21837. +static void rcu_preempt_qs(void);
  21838. +
  21839. +void rcu_bh_qs(void)
  21840. +{
  21841. + unsigned long flags;
  21842. +
  21843. + /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
  21844. + local_irq_save(flags);
  21845. + rcu_preempt_qs();
  21846. + local_irq_restore(flags);
  21847. +}
  21848. +#else
  21849. void rcu_bh_qs(void)
  21850. {
  21851. if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
  21852. @@ -216,6 +230,7 @@
  21853. __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
  21854. }
  21855. }
  21856. +#endif
  21857. static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
  21858. @@ -336,6 +351,7 @@
  21859. }
  21860. EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
  21861. +#ifndef CONFIG_PREEMPT_RT_FULL
  21862. /*
  21863. * Return the number of RCU BH batches processed thus far for debug & stats.
  21864. */
  21865. @@ -363,6 +379,13 @@
  21866. }
  21867. EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
  21868. +#else
  21869. +void rcu_force_quiescent_state(void)
  21870. +{
  21871. +}
  21872. +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
  21873. +#endif
  21874. +
  21875. /*
  21876. * Show the state of the grace-period kthreads.
  21877. */
  21878. @@ -1411,7 +1434,7 @@
  21879. !ACCESS_ONCE(rsp->gp_flags) ||
  21880. !rsp->gp_kthread)
  21881. return;
  21882. - wake_up(&rsp->gp_wq);
  21883. + swait_wake(&rsp->gp_wq);
  21884. }
  21885. /*
  21886. @@ -1793,7 +1816,7 @@
  21887. ACCESS_ONCE(rsp->gpnum),
  21888. TPS("reqwait"));
  21889. rsp->gp_state = RCU_GP_WAIT_GPS;
  21890. - wait_event_interruptible(rsp->gp_wq,
  21891. + swait_event_interruptible(rsp->gp_wq,
  21892. ACCESS_ONCE(rsp->gp_flags) &
  21893. RCU_GP_FLAG_INIT);
  21894. /* Locking provides needed memory barrier. */
  21895. @@ -1821,7 +1844,7 @@
  21896. ACCESS_ONCE(rsp->gpnum),
  21897. TPS("fqswait"));
  21898. rsp->gp_state = RCU_GP_WAIT_FQS;
  21899. - ret = wait_event_interruptible_timeout(rsp->gp_wq,
  21900. + ret = swait_event_interruptible_timeout(rsp->gp_wq,
  21901. ((gf = ACCESS_ONCE(rsp->gp_flags)) &
  21902. RCU_GP_FLAG_FQS) ||
  21903. (!ACCESS_ONCE(rnp->qsmask) &&
  21904. @@ -2565,16 +2588,14 @@
  21905. /*
  21906. * Do RCU core processing for the current CPU.
  21907. */
  21908. -static void rcu_process_callbacks(struct softirq_action *unused)
  21909. +static void rcu_process_callbacks(void)
  21910. {
  21911. struct rcu_state *rsp;
  21912. if (cpu_is_offline(smp_processor_id()))
  21913. return;
  21914. - trace_rcu_utilization(TPS("Start RCU core"));
  21915. for_each_rcu_flavor(rsp)
  21916. __rcu_process_callbacks(rsp);
  21917. - trace_rcu_utilization(TPS("End RCU core"));
  21918. }
  21919. /*
  21920. @@ -2588,18 +2609,105 @@
  21921. {
  21922. if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
  21923. return;
  21924. - if (likely(!rsp->boost)) {
  21925. - rcu_do_batch(rsp, rdp);
  21926. + rcu_do_batch(rsp, rdp);
  21927. +}
  21928. +
  21929. +static void rcu_wake_cond(struct task_struct *t, int status)
  21930. +{
  21931. + /*
  21932. + * If the thread is yielding, only wake it when this
  21933. + * is invoked from idle
  21934. + */
  21935. + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
  21936. + wake_up_process(t);
  21937. +}
  21938. +
  21939. +/*
  21940. + * Wake up this CPU's rcuc kthread to do RCU core processing.
  21941. + */
  21942. +static void invoke_rcu_core(void)
  21943. +{
  21944. + unsigned long flags;
  21945. + struct task_struct *t;
  21946. +
  21947. + if (!cpu_online(smp_processor_id()))
  21948. return;
  21949. + local_irq_save(flags);
  21950. + __this_cpu_write(rcu_cpu_has_work, 1);
  21951. + t = __this_cpu_read(rcu_cpu_kthread_task);
  21952. + if (t != NULL && current != t)
  21953. + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
  21954. + local_irq_restore(flags);
  21955. +}
  21956. +
  21957. +static void rcu_cpu_kthread_park(unsigned int cpu)
  21958. +{
  21959. + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
  21960. +}
  21961. +
  21962. +static int rcu_cpu_kthread_should_run(unsigned int cpu)
  21963. +{
  21964. + return __this_cpu_read(rcu_cpu_has_work);
  21965. +}
  21966. +
  21967. +/*
  21968. + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
  21969. + * RCU softirq used in flavors and configurations of RCU that do not
  21970. + * support RCU priority boosting.
  21971. + */
  21972. +static void rcu_cpu_kthread(unsigned int cpu)
  21973. +{
  21974. + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
  21975. + char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
  21976. + int spincnt;
  21977. +
  21978. + for (spincnt = 0; spincnt < 10; spincnt++) {
  21979. + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
  21980. + local_bh_disable();
  21981. + *statusp = RCU_KTHREAD_RUNNING;
  21982. + this_cpu_inc(rcu_cpu_kthread_loops);
  21983. + local_irq_disable();
  21984. + work = *workp;
  21985. + *workp = 0;
  21986. + local_irq_enable();
  21987. + if (work)
  21988. + rcu_process_callbacks();
  21989. + local_bh_enable();
  21990. + if (*workp == 0) {
  21991. + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
  21992. + *statusp = RCU_KTHREAD_WAITING;
  21993. + return;
  21994. + }
  21995. }
  21996. - invoke_rcu_callbacks_kthread();
  21997. + *statusp = RCU_KTHREAD_YIELDING;
  21998. + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
  21999. + schedule_timeout_interruptible(2);
  22000. + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
  22001. + *statusp = RCU_KTHREAD_WAITING;
  22002. }
  22003. -static void invoke_rcu_core(void)
  22004. +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
  22005. + .store = &rcu_cpu_kthread_task,
  22006. + .thread_should_run = rcu_cpu_kthread_should_run,
  22007. + .thread_fn = rcu_cpu_kthread,
  22008. + .thread_comm = "rcuc/%u",
  22009. + .setup = rcu_cpu_kthread_setup,
  22010. + .park = rcu_cpu_kthread_park,
  22011. +};
  22012. +
  22013. +/*
  22014. + * Spawn per-CPU RCU core processing kthreads.
  22015. + */
  22016. +static int __init rcu_spawn_core_kthreads(void)
  22017. {
  22018. - if (cpu_online(smp_processor_id()))
  22019. - raise_softirq(RCU_SOFTIRQ);
  22020. + int cpu;
  22021. +
  22022. + for_each_possible_cpu(cpu)
  22023. + per_cpu(rcu_cpu_has_work, cpu) = 0;
  22024. + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
  22025. + return 0;
  22026. }
  22027. +early_initcall(rcu_spawn_core_kthreads);
  22028. /*
  22029. * Handle any core-RCU processing required by a call_rcu() invocation.
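With the hunk above, RCU core processing no longer runs from RCU_SOFTIRQ: rcu_process_callbacks() is driven by a per-CPU "rcuc/%u" kthread registered through smpboot_register_percpu_thread(), and invoke_rcu_core() just sets rcu_cpu_has_work and wakes that thread. Roughly, the smpboot framework keeps the thread asleep until thread_should_run() returns true and then calls thread_fn(). A userspace sketch of the loop's shape (pthreads; process_callbacks(), the flags and the sleep intervals are stand-ins invented for the illustration, not the kernel API):

/*
 * Sketch of the rcuc loop: run only while there is work, retry a bounded
 * number of times, yield briefly if work keeps arriving.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int work;                 /* analogue of rcu_cpu_has_work */
static atomic_int stop;

static void process_callbacks(void)
{
        printf("processing callbacks\n");
}

static void *rcuc_thread(void *unused)
{
        (void)unused;
        while (!atomic_load(&stop)) {
                if (!atomic_load(&work)) {      /* thread_should_run() == 0 */
                        usleep(1000);           /* smpboot would park/sleep here */
                        continue;
                }
                for (int spincnt = 0; spincnt < 10; spincnt++) {
                        if (atomic_exchange(&work, 0))
                                process_callbacks();
                        if (!atomic_load(&work))
                                break;          /* nothing left: back to waiting */
                }
                if (atomic_load(&work))
                        usleep(2000);           /* analogue of the short yield */
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, rcuc_thread, NULL);
        atomic_store(&work, 1);                 /* analogue of invoke_rcu_core() */
        sleep(1);
        atomic_store(&stop, 1);
        pthread_join(t, NULL);
        return 0;
}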
  22030. @@ -2734,6 +2842,7 @@
  22031. }
  22032. EXPORT_SYMBOL_GPL(call_rcu_sched);
  22033. +#ifndef CONFIG_PREEMPT_RT_FULL
  22034. /*
  22035. * Queue an RCU callback for invocation after a quicker grace period.
  22036. */
  22037. @@ -2742,6 +2851,7 @@
  22038. __call_rcu(head, func, &rcu_bh_state, -1, 0);
  22039. }
  22040. EXPORT_SYMBOL_GPL(call_rcu_bh);
  22041. +#endif
  22042. /*
  22043. * Queue an RCU callback for lazy invocation after a grace period.
  22044. @@ -2833,6 +2943,7 @@
  22045. }
  22046. EXPORT_SYMBOL_GPL(synchronize_sched);
  22047. +#ifndef CONFIG_PREEMPT_RT_FULL
  22048. /**
  22049. * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
  22050. *
  22051. @@ -2859,6 +2970,7 @@
  22052. wait_rcu_gp(call_rcu_bh);
  22053. }
  22054. EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
  22055. +#endif
  22056. /**
  22057. * get_state_synchronize_rcu - Snapshot current RCU state
  22058. @@ -3341,6 +3453,7 @@
  22059. mutex_unlock(&rsp->barrier_mutex);
  22060. }
  22061. +#ifndef CONFIG_PREEMPT_RT_FULL
  22062. /**
  22063. * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
  22064. */
  22065. @@ -3349,6 +3462,7 @@
  22066. _rcu_barrier(&rcu_bh_state);
  22067. }
  22068. EXPORT_SYMBOL_GPL(rcu_barrier_bh);
  22069. +#endif
  22070. /**
  22071. * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
  22072. @@ -3658,7 +3772,7 @@
  22073. }
  22074. rsp->rda = rda;
  22075. - init_waitqueue_head(&rsp->gp_wq);
  22076. + init_swait_head(&rsp->gp_wq);
  22077. rnp = rsp->level[rcu_num_lvls - 1];
  22078. for_each_possible_cpu(i) {
  22079. while (i > rnp->grphi)
  22080. @@ -3755,7 +3869,6 @@
  22081. rcu_init_one(&rcu_bh_state, &rcu_bh_data);
  22082. rcu_init_one(&rcu_sched_state, &rcu_sched_data);
  22083. __rcu_init_preempt();
  22084. - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  22085. /*
  22086. * We don't need protection against CPU-hotplug here because
  22087. diff -Nur linux-3.18.14.orig/kernel/rcu/tree.h linux-3.18.14-rt/kernel/rcu/tree.h
  22088. --- linux-3.18.14.orig/kernel/rcu/tree.h 2015-05-20 10:04:50.000000000 -0500
  22089. +++ linux-3.18.14-rt/kernel/rcu/tree.h 2015-05-31 15:32:48.809635364 -0500
  22090. @@ -28,6 +28,7 @@
  22091. #include <linux/cpumask.h>
  22092. #include <linux/seqlock.h>
  22093. #include <linux/irq_work.h>
  22094. +#include <linux/wait-simple.h>
  22095. /*
  22096. * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
  22097. @@ -172,11 +173,6 @@
  22098. /* queued on this rcu_node structure that */
  22099. /* are blocking the current grace period, */
  22100. /* there can be no such task. */
  22101. - struct completion boost_completion;
  22102. - /* Used to ensure that the rt_mutex used */
  22103. - /* to carry out the boosting is fully */
  22104. - /* released with no future boostee accesses */
  22105. - /* before that rt_mutex is re-initialized. */
  22106. struct rt_mutex boost_mtx;
  22107. /* Used only for the priority-boosting */
  22108. /* side effect, not as a lock. */
  22109. @@ -208,7 +204,7 @@
  22110. /* This can happen due to race conditions. */
  22111. #endif /* #ifdef CONFIG_RCU_BOOST */
  22112. #ifdef CONFIG_RCU_NOCB_CPU
  22113. - wait_queue_head_t nocb_gp_wq[2];
  22114. + struct swait_head nocb_gp_wq[2];
  22115. /* Place for rcu_nocb_kthread() to wait GP. */
  22116. #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  22117. int need_future_gp[2];
  22118. @@ -348,7 +344,7 @@
  22119. atomic_long_t nocb_follower_count_lazy; /* (approximate). */
  22120. int nocb_p_count; /* # CBs being invoked by kthread */
  22121. int nocb_p_count_lazy; /* (approximate). */
  22122. - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
  22123. + struct swait_head nocb_wq; /* For nocb kthreads to sleep on. */
  22124. struct task_struct *nocb_kthread;
  22125. int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
  22126. @@ -439,7 +435,7 @@
  22127. unsigned long gpnum; /* Current gp number. */
  22128. unsigned long completed; /* # of last completed gp. */
  22129. struct task_struct *gp_kthread; /* Task for grace periods. */
  22130. - wait_queue_head_t gp_wq; /* Where GP task waits. */
  22131. + struct swait_head gp_wq; /* Where GP task waits. */
  22132. short gp_flags; /* Commands for GP task. */
  22133. short gp_state; /* GP kthread sleep state. */
  22134. @@ -570,10 +566,9 @@
  22135. static void __init __rcu_init_preempt(void);
  22136. static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
  22137. static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
  22138. -static void invoke_rcu_callbacks_kthread(void);
  22139. static bool rcu_is_callbacks_kthread(void);
  22140. +static void rcu_cpu_kthread_setup(unsigned int cpu);
  22141. #ifdef CONFIG_RCU_BOOST
  22142. -static void rcu_preempt_do_callbacks(void);
  22143. static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
  22144. struct rcu_node *rnp);
  22145. #endif /* #ifdef CONFIG_RCU_BOOST */
  22146. diff -Nur linux-3.18.14.orig/kernel/rcu/tree_plugin.h linux-3.18.14-rt/kernel/rcu/tree_plugin.h
  22147. --- linux-3.18.14.orig/kernel/rcu/tree_plugin.h 2015-05-20 10:04:50.000000000 -0500
  22148. +++ linux-3.18.14-rt/kernel/rcu/tree_plugin.h 2015-05-31 15:32:48.829635363 -0500
  22149. @@ -24,12 +24,6 @@
  22150. * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  22151. */
  22152. -#include <linux/delay.h>
  22153. -#include <linux/gfp.h>
  22154. -#include <linux/oom.h>
  22155. -#include <linux/smpboot.h>
  22156. -#include "../time/tick-internal.h"
  22157. -
  22158. #define RCU_KTHREAD_PRIO 1
  22159. #ifdef CONFIG_RCU_BOOST
  22160. @@ -335,7 +329,7 @@
  22161. }
  22162. /* Hardware IRQ handlers cannot block, complain if they get here. */
  22163. - if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
  22164. + if (WARN_ON_ONCE(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET))) {
  22165. local_irq_restore(flags);
  22166. return;
  22167. }
  22168. @@ -398,10 +392,8 @@
  22169. #ifdef CONFIG_RCU_BOOST
  22170. /* Unboost if we were boosted. */
  22171. - if (drop_boost_mutex) {
  22172. + if (drop_boost_mutex)
  22173. rt_mutex_unlock(&rnp->boost_mtx);
  22174. - complete(&rnp->boost_completion);
  22175. - }
  22176. #endif /* #ifdef CONFIG_RCU_BOOST */
  22177. /*
  22178. @@ -635,15 +627,6 @@
  22179. t->rcu_read_unlock_special.b.need_qs = true;
  22180. }
  22181. -#ifdef CONFIG_RCU_BOOST
  22182. -
  22183. -static void rcu_preempt_do_callbacks(void)
  22184. -{
  22185. - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
  22186. -}
  22187. -
  22188. -#endif /* #ifdef CONFIG_RCU_BOOST */
  22189. -
  22190. /*
  22191. * Queue a preemptible-RCU callback for invocation after a grace period.
  22192. */
  22193. @@ -1072,6 +1055,19 @@
  22194. #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  22195. +/*
  22196. + * If boosting, set rcuc kthreads to realtime priority.
  22197. + */
  22198. +static void rcu_cpu_kthread_setup(unsigned int cpu)
  22199. +{
  22200. +#ifdef CONFIG_RCU_BOOST
  22201. + struct sched_param sp;
  22202. +
  22203. + sp.sched_priority = RCU_KTHREAD_PRIO;
  22204. + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
  22205. +#endif /* #ifdef CONFIG_RCU_BOOST */
  22206. +}
  22207. +
  22208. #ifdef CONFIG_RCU_BOOST
  22209. #include "../locking/rtmutex_common.h"
  22210. @@ -1103,16 +1099,6 @@
  22211. #endif /* #else #ifdef CONFIG_RCU_TRACE */
  22212. -static void rcu_wake_cond(struct task_struct *t, int status)
  22213. -{
  22214. - /*
  22215. - * If the thread is yielding, only wake it when this
  22216. - * is invoked from idle
  22217. - */
  22218. - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
  22219. - wake_up_process(t);
  22220. -}
  22221. -
  22222. /*
  22223. * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  22224. * or ->boost_tasks, advancing the pointer to the next task in the
  22225. @@ -1175,15 +1161,11 @@
  22226. */
  22227. t = container_of(tb, struct task_struct, rcu_node_entry);
  22228. rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
  22229. - init_completion(&rnp->boost_completion);
  22230. raw_spin_unlock_irqrestore(&rnp->lock, flags);
  22231. /* Lock only for side effect: boosts task t's priority. */
  22232. rt_mutex_lock(&rnp->boost_mtx);
  22233. rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
  22234. - /* Wait for boostee to be done w/boost_mtx before reinitializing. */
  22235. - wait_for_completion(&rnp->boost_completion);
  22236. -
  22237. return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
  22238. ACCESS_ONCE(rnp->boost_tasks) != NULL;
  22239. }
  22240. @@ -1261,23 +1243,6 @@
  22241. }
  22242. /*
  22243. - * Wake up the per-CPU kthread to invoke RCU callbacks.
  22244. - */
  22245. -static void invoke_rcu_callbacks_kthread(void)
  22246. -{
  22247. - unsigned long flags;
  22248. -
  22249. - local_irq_save(flags);
  22250. - __this_cpu_write(rcu_cpu_has_work, 1);
  22251. - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
  22252. - current != __this_cpu_read(rcu_cpu_kthread_task)) {
  22253. - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
  22254. - __this_cpu_read(rcu_cpu_kthread_status));
  22255. - }
  22256. - local_irq_restore(flags);
  22257. -}
  22258. -
  22259. -/*
  22260. * Is the current CPU running the RCU-callbacks kthread?
  22261. * Caller must have preemption disabled.
  22262. */
  22263. @@ -1332,67 +1297,6 @@
  22264. return 0;
  22265. }
  22266. -static void rcu_kthread_do_work(void)
  22267. -{
  22268. - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
  22269. - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
  22270. - rcu_preempt_do_callbacks();
  22271. -}
  22272. -
  22273. -static void rcu_cpu_kthread_setup(unsigned int cpu)
  22274. -{
  22275. - struct sched_param sp;
  22276. -
  22277. - sp.sched_priority = RCU_KTHREAD_PRIO;
  22278. - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
  22279. -}
  22280. -
  22281. -static void rcu_cpu_kthread_park(unsigned int cpu)
  22282. -{
  22283. - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
  22284. -}
  22285. -
  22286. -static int rcu_cpu_kthread_should_run(unsigned int cpu)
  22287. -{
  22288. - return __this_cpu_read(rcu_cpu_has_work);
  22289. -}
  22290. -
  22291. -/*
  22292. - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
  22293. - * RCU softirq used in flavors and configurations of RCU that do not
  22294. - * support RCU priority boosting.
  22295. - */
  22296. -static void rcu_cpu_kthread(unsigned int cpu)
  22297. -{
  22298. - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
  22299. - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
  22300. - int spincnt;
  22301. -
  22302. - for (spincnt = 0; spincnt < 10; spincnt++) {
  22303. - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
  22304. - local_bh_disable();
  22305. - *statusp = RCU_KTHREAD_RUNNING;
  22306. - this_cpu_inc(rcu_cpu_kthread_loops);
  22307. - local_irq_disable();
  22308. - work = *workp;
  22309. - *workp = 0;
  22310. - local_irq_enable();
  22311. - if (work)
  22312. - rcu_kthread_do_work();
  22313. - local_bh_enable();
  22314. - if (*workp == 0) {
  22315. - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
  22316. - *statusp = RCU_KTHREAD_WAITING;
  22317. - return;
  22318. - }
  22319. - }
  22320. - *statusp = RCU_KTHREAD_YIELDING;
  22321. - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
  22322. - schedule_timeout_interruptible(2);
  22323. - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
  22324. - *statusp = RCU_KTHREAD_WAITING;
  22325. -}
  22326. -
  22327. /*
  22328. * Set the per-rcu_node kthread's affinity to cover all CPUs that are
  22329. * served by the rcu_node in question. The CPU hotplug lock is still
  22330. @@ -1426,26 +1330,13 @@
  22331. free_cpumask_var(cm);
  22332. }
  22333. -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
  22334. - .store = &rcu_cpu_kthread_task,
  22335. - .thread_should_run = rcu_cpu_kthread_should_run,
  22336. - .thread_fn = rcu_cpu_kthread,
  22337. - .thread_comm = "rcuc/%u",
  22338. - .setup = rcu_cpu_kthread_setup,
  22339. - .park = rcu_cpu_kthread_park,
  22340. -};
  22341. -
  22342. /*
  22343. * Spawn boost kthreads -- called as soon as the scheduler is running.
  22344. */
  22345. static void __init rcu_spawn_boost_kthreads(void)
  22346. {
  22347. struct rcu_node *rnp;
  22348. - int cpu;
  22349. - for_each_possible_cpu(cpu)
  22350. - per_cpu(rcu_cpu_has_work, cpu) = 0;
  22351. - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
  22352. rnp = rcu_get_root(rcu_state_p);
  22353. (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
  22354. if (NUM_RCU_NODES > 1) {
  22355. @@ -1472,11 +1363,6 @@
  22356. raw_spin_unlock_irqrestore(&rnp->lock, flags);
  22357. }
  22358. -static void invoke_rcu_callbacks_kthread(void)
  22359. -{
  22360. - WARN_ON_ONCE(1);
  22361. -}
  22362. -
  22363. static bool rcu_is_callbacks_kthread(void)
  22364. {
  22365. return false;
  22366. @@ -1500,7 +1386,7 @@
  22367. #endif /* #else #ifdef CONFIG_RCU_BOOST */
  22368. -#if !defined(CONFIG_RCU_FAST_NO_HZ)
  22369. +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
  22370. /*
  22371. * Check to see if any future RCU-related work will need to be done
  22372. @@ -1518,7 +1404,9 @@
  22373. return rcu_cpu_has_callbacks(cpu, NULL);
  22374. }
  22375. #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  22376. +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
  22377. +#if !defined(CONFIG_RCU_FAST_NO_HZ)
  22378. /*
  22379. * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
  22380. * after it.
  22381. @@ -1615,6 +1503,8 @@
  22382. return cbs_ready;
  22383. }
  22384. +#ifndef CONFIG_PREEMPT_RT_FULL
  22385. +
  22386. /*
  22387. * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
  22388. * to invoke. If the CPU has callbacks, try to advance them. Tell the
  22389. @@ -1655,7 +1545,7 @@
  22390. return 0;
  22391. }
  22392. #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  22393. -
  22394. +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
  22395. /*
  22396. * Prepare a CPU for idle from an RCU perspective. The first major task
  22397. * is to sense whether nohz mode has been enabled or disabled via sysfs.
  22398. @@ -2001,7 +1891,7 @@
  22399. */
  22400. static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  22401. {
  22402. - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
  22403. + swait_wake_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
  22404. }
  22405. /*
  22406. @@ -2019,8 +1909,8 @@
  22407. static void rcu_init_one_nocb(struct rcu_node *rnp)
  22408. {
  22409. - init_waitqueue_head(&rnp->nocb_gp_wq[0]);
  22410. - init_waitqueue_head(&rnp->nocb_gp_wq[1]);
  22411. + init_swait_head(&rnp->nocb_gp_wq[0]);
  22412. + init_swait_head(&rnp->nocb_gp_wq[1]);
  22413. }
  22414. #ifndef CONFIG_RCU_NOCB_CPU_ALL
  22415. @@ -2045,7 +1935,7 @@
  22416. if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
  22417. /* Prior smp_mb__after_atomic() orders against prior enqueue. */
  22418. ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
  22419. - wake_up(&rdp_leader->nocb_wq);
  22420. + swait_wake(&rdp_leader->nocb_wq);
  22421. }
  22422. }
  22423. @@ -2238,7 +2128,7 @@
  22424. */
  22425. trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
  22426. for (;;) {
  22427. - wait_event_interruptible(
  22428. + swait_event_interruptible(
  22429. rnp->nocb_gp_wq[c & 0x1],
  22430. (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
  22431. if (likely(d))
  22432. @@ -2266,7 +2156,7 @@
  22433. /* Wait for callbacks to appear. */
  22434. if (!rcu_nocb_poll) {
  22435. trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
  22436. - wait_event_interruptible(my_rdp->nocb_wq,
  22437. + swait_event_interruptible(my_rdp->nocb_wq,
  22438. !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
  22439. /* Memory barrier handled by smp_mb() calls below and repoll. */
  22440. } else if (firsttime) {
  22441. @@ -2347,7 +2237,7 @@
  22442. * List was empty, wake up the follower.
  22443. * Memory barriers supplied by atomic_long_add().
  22444. */
  22445. - wake_up(&rdp->nocb_wq);
  22446. + swait_wake(&rdp->nocb_wq);
  22447. }
  22448. }
  22449. @@ -2368,7 +2258,7 @@
  22450. if (!rcu_nocb_poll) {
  22451. trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
  22452. "FollowerSleep");
  22453. - wait_event_interruptible(rdp->nocb_wq,
  22454. + swait_event_interruptible(rdp->nocb_wq,
  22455. ACCESS_ONCE(rdp->nocb_follower_head));
  22456. } else if (firsttime) {
  22457. /* Don't drown trace log with "Poll"! */
  22458. @@ -2539,7 +2429,7 @@
  22459. static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
  22460. {
  22461. rdp->nocb_tail = &rdp->nocb_head;
  22462. - init_waitqueue_head(&rdp->nocb_wq);
  22463. + init_swait_head(&rdp->nocb_wq);
  22464. rdp->nocb_follower_tail = &rdp->nocb_follower_head;
  22465. }
  22466. diff -Nur linux-3.18.14.orig/kernel/rcu/update.c linux-3.18.14-rt/kernel/rcu/update.c
  22467. --- linux-3.18.14.orig/kernel/rcu/update.c 2015-05-20 10:04:50.000000000 -0500
  22468. +++ linux-3.18.14-rt/kernel/rcu/update.c 2015-05-31 15:32:48.829635363 -0500
  22469. @@ -170,6 +170,7 @@
  22470. }
  22471. EXPORT_SYMBOL_GPL(rcu_read_lock_held);
  22472. +#ifndef CONFIG_PREEMPT_RT_FULL
  22473. /**
  22474. * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
  22475. *
  22476. @@ -196,6 +197,7 @@
  22477. return in_softirq() || irqs_disabled();
  22478. }
  22479. EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
  22480. +#endif
  22481. #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  22482. diff -Nur linux-3.18.14.orig/kernel/relay.c linux-3.18.14-rt/kernel/relay.c
  22483. --- linux-3.18.14.orig/kernel/relay.c 2015-05-20 10:04:50.000000000 -0500
  22484. +++ linux-3.18.14-rt/kernel/relay.c 2015-05-31 15:32:48.829635363 -0500
  22485. @@ -339,6 +339,10 @@
  22486. {
  22487. struct rchan_buf *buf = (struct rchan_buf *)data;
  22488. wake_up_interruptible(&buf->read_wait);
  22489. + /*
  22490. + * Stupid polling for now:
  22491. + */
  22492. + mod_timer(&buf->timer, jiffies + 1);
  22493. }
  22494. /**
  22495. @@ -356,6 +360,7 @@
  22496. init_waitqueue_head(&buf->read_wait);
  22497. kref_init(&buf->kref);
  22498. setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
  22499. + mod_timer(&buf->timer, jiffies + 1);
  22500. } else
  22501. del_timer_sync(&buf->timer);
  22502. @@ -739,15 +744,6 @@
  22503. else
  22504. buf->early_bytes += buf->chan->subbuf_size -
  22505. buf->padding[old_subbuf];
  22506. - smp_mb();
  22507. - if (waitqueue_active(&buf->read_wait))
  22508. - /*
  22509. - * Calling wake_up_interruptible() from here
  22510. - * will deadlock if we happen to be logging
  22511. - * from the scheduler (trying to re-grab
  22512. - * rq->lock), so defer it.
  22513. - */
  22514. - mod_timer(&buf->timer, jiffies + 1);
  22515. }
  22516. old = buf->data;
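The relay change above removes the wakeup from the buffer-switch path, which could deadlock when relay logging happens while rq->lock is held, and instead re-arms buf->timer from its own handler so readers are woken by plain periodic polling (as the added "Stupid polling for now" comment admits). A hedged userspace sketch of that self-rearming poll pattern using POSIX timers (wake_readers() and the 100 ms period are invented for the demo; the kernel re-arms by one jiffy):

/* build: cc demo.c -lrt on older glibc */
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static timer_t poll_timer;

static void wake_readers(void)
{
        printf("poll: waking readers\n");
}

static void poll_tick(union sigval sv)
{
        struct itimerspec its = {
                .it_value = { .tv_sec = 0, .tv_nsec = 100000000 }, /* ~100 ms */
        };

        (void)sv;
        wake_readers();
        /* Analogue of mod_timer(&buf->timer, jiffies + 1) in the handler. */
        timer_settime(poll_timer, 0, &its, NULL);
}

int main(void)
{
        struct sigevent sev = {
                .sigev_notify = SIGEV_THREAD,
                .sigev_notify_function = poll_tick,
        };
        struct itimerspec its = {
                .it_value = { .tv_sec = 0, .tv_nsec = 100000000 },
        };

        timer_create(CLOCK_MONOTONIC, &sev, &poll_timer);
        timer_settime(poll_timer, 0, &its, NULL);
        sleep(1);               /* let a few polls run */
        return 0;
}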
  22517. diff -Nur linux-3.18.14.orig/kernel/res_counter.c linux-3.18.14-rt/kernel/res_counter.c
  22518. --- linux-3.18.14.orig/kernel/res_counter.c 2015-05-20 10:04:50.000000000 -0500
  22519. +++ linux-3.18.14-rt/kernel/res_counter.c 2015-05-31 15:32:48.845635363 -0500
  22520. @@ -59,7 +59,7 @@
  22521. r = ret = 0;
  22522. *limit_fail_at = NULL;
  22523. - local_irq_save(flags);
  22524. + local_irq_save_nort(flags);
  22525. for (c = counter; c != NULL; c = c->parent) {
  22526. spin_lock(&c->lock);
  22527. r = res_counter_charge_locked(c, val, force);
  22528. @@ -79,7 +79,7 @@
  22529. spin_unlock(&u->lock);
  22530. }
  22531. }
  22532. - local_irq_restore(flags);
  22533. + local_irq_restore_nort(flags);
  22534. return ret;
  22535. }
  22536. @@ -104,7 +104,7 @@
  22537. struct res_counter *c;
  22538. u64 ret = 0;
  22539. - local_irq_save(flags);
  22540. + local_irq_save_nort(flags);
  22541. for (c = counter; c != top; c = c->parent) {
  22542. u64 r;
  22543. spin_lock(&c->lock);
  22544. @@ -113,7 +113,7 @@
  22545. ret = r;
  22546. spin_unlock(&c->lock);
  22547. }
  22548. - local_irq_restore(flags);
  22549. + local_irq_restore_nort(flags);
  22550. return ret;
  22551. }
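The local_irq_save_nort()/local_irq_restore_nort() pairs above are annotation macros provided elsewhere in this patch: on non-RT kernels they behave like the ordinary IRQ-off primitives, while on PREEMPT_RT_FULL they compile down to (near) no-ops, because the spin_lock() taken right afterwards is a sleeping lock and interrupts must stay enabled. As a toy, self-contained illustration of that conditional no-op idea only (DEMO_PREEMPT_RT_FULL and the printf bodies are invented; these are not the patch's real definitions, which also take a flags argument):

#include <stdio.h>

#ifndef DEMO_PREEMPT_RT_FULL
# define irq_save_nort()        printf("irqs off\n")
# define irq_restore_nort()     printf("irqs on\n")
#else
# define irq_save_nort()        do { } while (0)   /* annotation only on RT */
# define irq_restore_nort()     do { } while (0)
#endif

int main(void)
{
        irq_save_nort();
        printf("charge the counter hierarchy under its spinlocks\n");
        irq_restore_nort();
        return 0;
}

Building with -DDEMO_PREEMPT_RT_FULL makes both annotations disappear, which is the effect the _nort() variants have on an RT kernel.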
  22552. diff -Nur linux-3.18.14.orig/kernel/sched/completion.c linux-3.18.14-rt/kernel/sched/completion.c
  22553. --- linux-3.18.14.orig/kernel/sched/completion.c 2015-05-20 10:04:50.000000000 -0500
  22554. +++ linux-3.18.14-rt/kernel/sched/completion.c 2015-05-31 15:32:48.889635363 -0500
  22555. @@ -30,10 +30,10 @@
  22556. {
  22557. unsigned long flags;
  22558. - spin_lock_irqsave(&x->wait.lock, flags);
  22559. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22560. x->done++;
  22561. - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
  22562. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22563. + __swait_wake_locked(&x->wait, TASK_NORMAL, 1);
  22564. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22565. }
  22566. EXPORT_SYMBOL(complete);
  22567. @@ -50,10 +50,10 @@
  22568. {
  22569. unsigned long flags;
  22570. - spin_lock_irqsave(&x->wait.lock, flags);
  22571. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22572. x->done += UINT_MAX/2;
  22573. - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
  22574. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22575. + __swait_wake_locked(&x->wait, TASK_NORMAL, 0);
  22576. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22577. }
  22578. EXPORT_SYMBOL(complete_all);
  22579. @@ -62,20 +62,20 @@
  22580. long (*action)(long), long timeout, int state)
  22581. {
  22582. if (!x->done) {
  22583. - DECLARE_WAITQUEUE(wait, current);
  22584. + DEFINE_SWAITER(wait);
  22585. - __add_wait_queue_tail_exclusive(&x->wait, &wait);
  22586. + swait_prepare_locked(&x->wait, &wait);
  22587. do {
  22588. if (signal_pending_state(state, current)) {
  22589. timeout = -ERESTARTSYS;
  22590. break;
  22591. }
  22592. __set_current_state(state);
  22593. - spin_unlock_irq(&x->wait.lock);
  22594. + raw_spin_unlock_irq(&x->wait.lock);
  22595. timeout = action(timeout);
  22596. - spin_lock_irq(&x->wait.lock);
  22597. + raw_spin_lock_irq(&x->wait.lock);
  22598. } while (!x->done && timeout);
  22599. - __remove_wait_queue(&x->wait, &wait);
  22600. + swait_finish_locked(&x->wait, &wait);
  22601. if (!x->done)
  22602. return timeout;
  22603. }
  22604. @@ -89,9 +89,9 @@
  22605. {
  22606. might_sleep();
  22607. - spin_lock_irq(&x->wait.lock);
  22608. + raw_spin_lock_irq(&x->wait.lock);
  22609. timeout = do_wait_for_common(x, action, timeout, state);
  22610. - spin_unlock_irq(&x->wait.lock);
  22611. + raw_spin_unlock_irq(&x->wait.lock);
  22612. return timeout;
  22613. }
  22614. @@ -267,12 +267,12 @@
  22615. unsigned long flags;
  22616. int ret = 1;
  22617. - spin_lock_irqsave(&x->wait.lock, flags);
  22618. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22619. if (!x->done)
  22620. ret = 0;
  22621. else
  22622. x->done--;
  22623. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22624. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22625. return ret;
  22626. }
  22627. EXPORT_SYMBOL(try_wait_for_completion);
  22628. @@ -290,10 +290,10 @@
  22629. unsigned long flags;
  22630. int ret = 1;
  22631. - spin_lock_irqsave(&x->wait.lock, flags);
  22632. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22633. if (!x->done)
  22634. ret = 0;
  22635. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22636. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22637. return ret;
  22638. }
  22639. EXPORT_SYMBOL(completion_done);
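The completion rework above swaps the completion's wait_queue_head_t for the patch's raw-locked simple waitqueue; the 'done' counting semantics of complete(), wait_for_completion() and try_wait_for_completion() are unchanged. A self-contained userspace model of those semantics (POSIX threads; this mirrors the behaviour, not the kernel implementation):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t lock;
        pthread_cond_t  wait;
        unsigned int    done;
};

static void init_completion(struct completion *x)
{
        pthread_mutex_init(&x->lock, NULL);
        pthread_cond_init(&x->wait, NULL);
        x->done = 0;
}

static void complete(struct completion *x)
{
        pthread_mutex_lock(&x->lock);
        x->done++;
        pthread_cond_signal(&x->wait);          /* wake one waiter */
        pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion(struct completion *x)
{
        pthread_mutex_lock(&x->lock);
        while (!x->done)
                pthread_cond_wait(&x->wait, &x->lock);
        x->done--;                              /* consume one completion */
        pthread_mutex_unlock(&x->lock);
}

static bool try_wait_for_completion(struct completion *x)
{
        bool ret = true;

        pthread_mutex_lock(&x->lock);
        if (!x->done)
                ret = false;
        else
                x->done--;
        pthread_mutex_unlock(&x->lock);
        return ret;
}

static struct completion c;

static void *worker(void *unused)
{
        (void)unused;
        printf("worker done\n");
        complete(&c);
        return NULL;
}

int main(void)
{
        pthread_t t;

        init_completion(&c);
        pthread_create(&t, NULL, worker, NULL);
        wait_for_completion(&c);
        printf("try again: %d\n", try_wait_for_completion(&c));
        pthread_join(t, NULL);
        return 0;
}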
  22640. diff -Nur linux-3.18.14.orig/kernel/sched/core.c linux-3.18.14-rt/kernel/sched/core.c
  22641. --- linux-3.18.14.orig/kernel/sched/core.c 2015-05-20 10:04:50.000000000 -0500
  22642. +++ linux-3.18.14-rt/kernel/sched/core.c 2015-05-31 15:32:48.893635363 -0500
  22643. @@ -280,7 +280,11 @@
  22644. * Number of tasks to iterate in a single balance run.
  22645. * Limited because this is done with IRQs disabled.
  22646. */
  22647. +#ifndef CONFIG_PREEMPT_RT_FULL
  22648. const_debug unsigned int sysctl_sched_nr_migrate = 32;
  22649. +#else
  22650. +const_debug unsigned int sysctl_sched_nr_migrate = 8;
  22651. +#endif
  22652. /*
  22653. * period over which we average the RT time consumption, measured
  22654. @@ -516,6 +520,7 @@
  22655. hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  22656. rq->hrtick_timer.function = hrtick;
  22657. + rq->hrtick_timer.irqsafe = 1;
  22658. }
  22659. #else /* CONFIG_SCHED_HRTICK */
  22660. static inline void hrtick_clear(struct rq *rq)
  22661. @@ -627,6 +632,38 @@
  22662. trace_sched_wake_idle_without_ipi(cpu);
  22663. }
  22664. +#ifdef CONFIG_PREEMPT_LAZY
  22665. +void resched_curr_lazy(struct rq *rq)
  22666. +{
  22667. + struct task_struct *curr = rq->curr;
  22668. + int cpu;
  22669. +
  22670. + if (!sched_feat(PREEMPT_LAZY)) {
  22671. + resched_curr(rq);
  22672. + return;
  22673. + }
  22674. +
  22675. + lockdep_assert_held(&rq->lock);
  22676. +
  22677. + if (test_tsk_need_resched(curr))
  22678. + return;
  22679. +
  22680. + if (test_tsk_need_resched_lazy(curr))
  22681. + return;
  22682. +
  22683. + set_tsk_need_resched_lazy(curr);
  22684. +
  22685. + cpu = cpu_of(rq);
  22686. + if (cpu == smp_processor_id())
  22687. + return;
  22688. +
  22689. + /* NEED_RESCHED_LAZY must be visible before we test polling */
  22690. + smp_mb();
  22691. + if (!tsk_is_polling(curr))
  22692. + smp_send_reschedule(cpu);
  22693. +}
  22694. +#endif
  22695. +
  22696. void resched_cpu(int cpu)
  22697. {
  22698. struct rq *rq = cpu_rq(cpu);
  22699. @@ -650,12 +687,14 @@
  22700. */
  22701. int get_nohz_timer_target(int pinned)
  22702. {
  22703. - int cpu = smp_processor_id();
  22704. + int cpu;
  22705. int i;
  22706. struct sched_domain *sd;
  22707. + preempt_disable_rt();
  22708. + cpu = smp_processor_id();
  22709. if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
  22710. - return cpu;
  22711. + goto preempt_en_rt;
  22712. rcu_read_lock();
  22713. for_each_domain(cpu, sd) {
  22714. @@ -668,6 +707,8 @@
  22715. }
  22716. unlock:
  22717. rcu_read_unlock();
  22718. +preempt_en_rt:
  22719. + preempt_enable_rt();
  22720. return cpu;
  22721. }
  22722. /*
  22723. @@ -745,14 +786,29 @@
  22724. #endif /* CONFIG_NO_HZ_COMMON */
  22725. #ifdef CONFIG_NO_HZ_FULL
  22726. +
  22727. +static int ksoftirqd_running(void)
  22728. +{
  22729. + struct task_struct *softirqd;
  22730. +
  22731. + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
  22732. + return 0;
  22733. + softirqd = this_cpu_ksoftirqd();
  22734. + if (softirqd && softirqd->on_rq)
  22735. + return 1;
  22736. + return 0;
  22737. +}
  22738. +
  22739. bool sched_can_stop_tick(void)
  22740. {
  22741. /*
  22742. * More than one running task need preemption.
  22743. * nr_running update is assumed to be visible
  22744. * after IPI is sent from wakers.
  22745. + *
  22746. + * NOTE, RT: if ksoftirqd is awake, subtract it.
  22747. */
  22748. - if (this_rq()->nr_running > 1)
  22749. + if (this_rq()->nr_running - ksoftirqd_running() > 1)
  22750. return false;
  22751. return true;
  22752. @@ -1198,6 +1254,18 @@
  22753. static int migration_cpu_stop(void *data);
  22754. +static bool check_task_state(struct task_struct *p, long match_state)
  22755. +{
  22756. + bool match = false;
  22757. +
  22758. + raw_spin_lock_irq(&p->pi_lock);
  22759. + if (p->state == match_state || p->saved_state == match_state)
  22760. + match = true;
  22761. + raw_spin_unlock_irq(&p->pi_lock);
  22762. +
  22763. + return match;
  22764. +}
  22765. +
  22766. /*
  22767. * wait_task_inactive - wait for a thread to unschedule.
  22768. *
  22769. @@ -1242,7 +1310,7 @@
  22770. * is actually now running somewhere else!
  22771. */
  22772. while (task_running(rq, p)) {
  22773. - if (match_state && unlikely(p->state != match_state))
  22774. + if (match_state && !check_task_state(p, match_state))
  22775. return 0;
  22776. cpu_relax();
  22777. }
  22778. @@ -1257,7 +1325,8 @@
  22779. running = task_running(rq, p);
  22780. queued = task_on_rq_queued(p);
  22781. ncsw = 0;
  22782. - if (!match_state || p->state == match_state)
  22783. + if (!match_state || p->state == match_state ||
  22784. + p->saved_state == match_state)
  22785. ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
  22786. task_rq_unlock(rq, p, &flags);
  22787. @@ -1482,10 +1551,6 @@
  22788. {
  22789. activate_task(rq, p, en_flags);
  22790. p->on_rq = TASK_ON_RQ_QUEUED;
  22791. -
  22792. - /* if a worker is waking up, notify workqueue */
  22793. - if (p->flags & PF_WQ_WORKER)
  22794. - wq_worker_waking_up(p, cpu_of(rq));
  22795. }
  22796. /*
  22797. @@ -1699,8 +1764,27 @@
  22798. */
  22799. smp_mb__before_spinlock();
  22800. raw_spin_lock_irqsave(&p->pi_lock, flags);
  22801. - if (!(p->state & state))
  22802. + if (!(p->state & state)) {
  22803. + /*
  22804. + * The task might be running due to a spinlock sleeper
  22805. + * wakeup. Check the saved state and set it to running
  22806. + * if the wakeup condition is true.
  22807. + */
  22808. + if (!(wake_flags & WF_LOCK_SLEEPER)) {
  22809. + if (p->saved_state & state) {
  22810. + p->saved_state = TASK_RUNNING;
  22811. + success = 1;
  22812. + }
  22813. + }
  22814. goto out;
  22815. + }
  22816. +
  22817. + /*
  22818. + * If this is a regular wakeup, then we can unconditionally
  22819. + * clear the saved state of a "lock sleeper".
  22820. + */
  22821. + if (!(wake_flags & WF_LOCK_SLEEPER))
  22822. + p->saved_state = TASK_RUNNING;
  22823. success = 1; /* we're going to change ->state */
  22824. cpu = task_cpu(p);
  22825. @@ -1743,42 +1827,6 @@
  22826. }
  22827. /**
  22828. - * try_to_wake_up_local - try to wake up a local task with rq lock held
  22829. - * @p: the thread to be awakened
  22830. - *
  22831. - * Put @p on the run-queue if it's not already there. The caller must
  22832. - * ensure that this_rq() is locked, @p is bound to this_rq() and not
  22833. - * the current task.
  22834. - */
  22835. -static void try_to_wake_up_local(struct task_struct *p)
  22836. -{
  22837. - struct rq *rq = task_rq(p);
  22838. -
  22839. - if (WARN_ON_ONCE(rq != this_rq()) ||
  22840. - WARN_ON_ONCE(p == current))
  22841. - return;
  22842. -
  22843. - lockdep_assert_held(&rq->lock);
  22844. -
  22845. - if (!raw_spin_trylock(&p->pi_lock)) {
  22846. - raw_spin_unlock(&rq->lock);
  22847. - raw_spin_lock(&p->pi_lock);
  22848. - raw_spin_lock(&rq->lock);
  22849. - }
  22850. -
  22851. - if (!(p->state & TASK_NORMAL))
  22852. - goto out;
  22853. -
  22854. - if (!task_on_rq_queued(p))
  22855. - ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  22856. -
  22857. - ttwu_do_wakeup(rq, p, 0);
  22858. - ttwu_stat(p, smp_processor_id(), 0);
  22859. -out:
  22860. - raw_spin_unlock(&p->pi_lock);
  22861. -}
  22862. -
  22863. -/**
  22864. * wake_up_process - Wake up a specific process
  22865. * @p: The process to be woken up.
  22866. *
  22867. @@ -1792,11 +1840,23 @@
  22868. */
  22869. int wake_up_process(struct task_struct *p)
  22870. {
  22871. - WARN_ON(task_is_stopped_or_traced(p));
  22872. + WARN_ON(__task_is_stopped_or_traced(p));
  22873. return try_to_wake_up(p, TASK_NORMAL, 0);
  22874. }
  22875. EXPORT_SYMBOL(wake_up_process);
  22876. +/**
  22877. + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
  22878. + * @p: The process to be woken up.
  22879. + *
  22880. + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
  22881. + * the nature of the wakeup.
  22882. + */
  22883. +int wake_up_lock_sleeper(struct task_struct *p)
  22884. +{
  22885. + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
  22886. +}
  22887. +
  22888. int wake_up_state(struct task_struct *p, unsigned int state)
  22889. {
  22890. return try_to_wake_up(p, state, 0);
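Together with wake_up_lock_sleeper(), the try_to_wake_up() hunk above lets a task that blocks on an rtmutex-based "sleeping spinlock" keep its original sleep state in ->saved_state: a regular wakeup that races with the lock sleep only promotes saved_state to TASK_RUNNING, and a WF_LOCK_SLEEPER wakeup leaves saved_state alone. A toy userspace model of just that bookkeeping (the state values mirror the kernel's, WF_LOCK_SLEEPER's value is invented here, and the code that actually parks the old state in saved_state lives in the patch's rtmutex slow path, not in this hunk):

#include <stdio.h>

#define TASK_RUNNING            0x0000
#define TASK_INTERRUPTIBLE      0x0001
#define TASK_UNINTERRUPTIBLE    0x0002
#define TASK_ALL                (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

#define WF_LOCK_SLEEPER         0x10    /* invented value for the sketch */

struct task {
        unsigned int state;
        unsigned int saved_state;
};

static int try_to_wake_up(struct task *p, unsigned int state, int wake_flags)
{
        if (!(p->state & state)) {
                /* Racing with a spinlock sleeper: remember the wakeup. */
                if (!(wake_flags & WF_LOCK_SLEEPER) && (p->saved_state & state)) {
                        p->saved_state = TASK_RUNNING;
                        return 1;
                }
                return 0;
        }
        /* A regular wakeup also cancels any remembered sleep state. */
        if (!(wake_flags & WF_LOCK_SLEEPER))
                p->saved_state = TASK_RUNNING;
        p->state = TASK_RUNNING;
        return 1;
}

int main(void)
{
        /* Task slept interruptibly, then blocked on a sleeping "spinlock";
         * the rtmutex slow path parked the old state in saved_state. */
        struct task p = {
                .state       = TASK_UNINTERRUPTIBLE,
                .saved_state = TASK_INTERRUPTIBLE,
        };

        /* Ordinary wakeup arrives while the task waits for the lock... */
        printf("regular wakeup: %d\n", try_to_wake_up(&p, TASK_INTERRUPTIBLE, 0));
        /* ...the lock release later ends only the lock sleep itself. */
        printf("lock wakeup:    %d\n",
               try_to_wake_up(&p, TASK_ALL, WF_LOCK_SLEEPER));
        printf("final state=%u saved_state=%u\n", p.state, p.saved_state);
        return 0;
}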
  22891. @@ -1987,6 +2047,9 @@
  22892. p->on_cpu = 0;
  22893. #endif
  22894. init_task_preempt_count(p);
  22895. +#ifdef CONFIG_HAVE_PREEMPT_LAZY
  22896. + task_thread_info(p)->preempt_lazy_count = 0;
  22897. +#endif
  22898. #ifdef CONFIG_SMP
  22899. plist_node_init(&p->pushable_tasks, MAX_PRIO);
  22900. RB_CLEAR_NODE(&p->pushable_dl_tasks);
  22901. @@ -2270,8 +2333,12 @@
  22902. finish_arch_post_lock_switch();
  22903. fire_sched_in_preempt_notifiers(current);
  22904. + /*
  22905. + * We use mmdrop_delayed() here so we don't have to do the
  22906. + * full __mmdrop() when we are the last user.
  22907. + */
  22908. if (mm)
  22909. - mmdrop(mm);
  22910. + mmdrop_delayed(mm);
  22911. if (unlikely(prev_state == TASK_DEAD)) {
  22912. if (prev->sched_class->task_dead)
  22913. prev->sched_class->task_dead(prev);
  22914. @@ -2696,6 +2763,133 @@
  22915. schedstat_inc(this_rq(), sched_count);
  22916. }
  22917. +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
  22918. +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */
  22919. +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN)
  22920. +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN)
  22921. +
  22922. +static inline void update_migrate_disable(struct task_struct *p)
  22923. +{
  22924. + const struct cpumask *mask;
  22925. +
  22926. + if (likely(!p->migrate_disable))
  22927. + return;
  22928. +
  22929. + /* Did we already update affinity? */
  22930. + if (unlikely(migrate_disabled_updated(p)))
  22931. + return;
  22932. +
  22933. + /*
  22934. + * Since this is always current we can get away with only locking
  22935. + * rq->lock, the ->cpus_allowed value can normally only be changed
  22936. + * while holding both p->pi_lock and rq->lock, but seeing that this
  22937. + * is current, we cannot actually be waking up, so all code that
  22938. + * relies on serialization against p->pi_lock is out of scope.
  22939. + *
  22940. + * Having rq->lock serializes us against things like
  22941. + * set_cpus_allowed_ptr() that can still happen concurrently.
  22942. + */
  22943. + mask = tsk_cpus_allowed(p);
  22944. +
  22945. + if (p->sched_class->set_cpus_allowed)
  22946. + p->sched_class->set_cpus_allowed(p, mask);
  22947. + /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */
  22948. + p->nr_cpus_allowed = 1;
  22949. +
  22950. + /* Let migrate_enable know to fix things back up */
  22951. + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN;
  22952. +}
  22953. +
  22954. +void migrate_disable(void)
  22955. +{
  22956. + struct task_struct *p = current;
  22957. +
  22958. + if (in_atomic()) {
  22959. +#ifdef CONFIG_SCHED_DEBUG
  22960. + p->migrate_disable_atomic++;
  22961. +#endif
  22962. + return;
  22963. + }
  22964. +
  22965. +#ifdef CONFIG_SCHED_DEBUG
  22966. + if (unlikely(p->migrate_disable_atomic)) {
  22967. + tracing_off();
  22968. + WARN_ON_ONCE(1);
  22969. + }
  22970. +#endif
  22971. +
  22972. + if (p->migrate_disable) {
  22973. + p->migrate_disable++;
  22974. + return;
  22975. + }
  22976. +
  22977. + preempt_disable();
  22978. + preempt_lazy_disable();
  22979. + pin_current_cpu();
  22980. + p->migrate_disable = 1;
  22981. + preempt_enable();
  22982. +}
  22983. +EXPORT_SYMBOL(migrate_disable);
  22984. +
  22985. +void migrate_enable(void)
  22986. +{
  22987. + struct task_struct *p = current;
  22988. + const struct cpumask *mask;
  22989. + unsigned long flags;
  22990. + struct rq *rq;
  22991. +
  22992. + if (in_atomic()) {
  22993. +#ifdef CONFIG_SCHED_DEBUG
  22994. + p->migrate_disable_atomic--;
  22995. +#endif
  22996. + return;
  22997. + }
  22998. +
  22999. +#ifdef CONFIG_SCHED_DEBUG
  23000. + if (unlikely(p->migrate_disable_atomic)) {
  23001. + tracing_off();
  23002. + WARN_ON_ONCE(1);
  23003. + }
  23004. +#endif
  23005. + WARN_ON_ONCE(p->migrate_disable <= 0);
  23006. +
  23007. + if (migrate_disable_count(p) > 1) {
  23008. + p->migrate_disable--;
  23009. + return;
  23010. + }
  23011. +
  23012. + preempt_disable();
  23013. + if (unlikely(migrate_disabled_updated(p))) {
  23014. + /*
  23015. + * Undo whatever update_migrate_disable() did, also see there
  23016. + * about locking.
  23017. + */
  23018. + rq = this_rq();
  23019. + raw_spin_lock_irqsave(&rq->lock, flags);
  23020. +
  23021. + /*
  23022. + * Clearing migrate_disable causes tsk_cpus_allowed to
  23023. + * show the tasks original cpu affinity.
  23024. + */
  23025. + p->migrate_disable = 0;
  23026. + mask = tsk_cpus_allowed(p);
  23027. + if (p->sched_class->set_cpus_allowed)
  23028. + p->sched_class->set_cpus_allowed(p, mask);
  23029. + p->nr_cpus_allowed = cpumask_weight(mask);
  23030. + raw_spin_unlock_irqrestore(&rq->lock, flags);
  23031. + } else
  23032. + p->migrate_disable = 0;
  23033. +
  23034. + unpin_current_cpu();
  23035. + preempt_enable();
  23036. + preempt_lazy_enable();
  23037. +}
  23038. +EXPORT_SYMBOL(migrate_enable);
  23039. +#else
  23040. +static inline void update_migrate_disable(struct task_struct *p) { }
  23041. +#define migrate_disabled_updated(p) 0
  23042. +#endif
  23043. +
  23044. /*
  23045. * Pick up the highest-prio task:
  23046. */
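migrate_disable()/migrate_enable() above implement nest-counted pinning: the first disable narrows the task's affinity to the CPU it is running on, nested calls only bump p->migrate_disable, and the last enable restores the original mask. A runnable userspace analogue of that pattern (Linux, pthread affinity calls; unlike the kernel it cannot pin atomically, which is fine for the illustration):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static __thread int migrate_disable_count;      /* per thread, like per task */
static __thread cpu_set_t saved_mask;

static void migrate_disable(void)
{
        cpu_set_t one;

        if (migrate_disable_count++)
                return;                         /* already pinned: just nest */

        pthread_getaffinity_np(pthread_self(), sizeof(saved_mask), &saved_mask);
        CPU_ZERO(&one);
        CPU_SET(sched_getcpu(), &one);          /* pin to the current CPU */
        pthread_setaffinity_np(pthread_self(), sizeof(one), &one);
}

static void migrate_enable(void)
{
        if (--migrate_disable_count)
                return;                         /* still nested */

        /* Last enable: restore the original affinity. */
        pthread_setaffinity_np(pthread_self(), sizeof(saved_mask), &saved_mask);
}

int main(void)
{
        migrate_disable();
        migrate_disable();                      /* nested use is fine */
        printf("pinned to CPU %d\n", sched_getcpu());
        migrate_enable();
        migrate_enable();
        printf("affinity restored\n");
        return 0;
}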
  23047. @@ -2799,6 +2993,8 @@
  23048. smp_mb__before_spinlock();
  23049. raw_spin_lock_irq(&rq->lock);
  23050. + update_migrate_disable(prev);
  23051. +
  23052. switch_count = &prev->nivcsw;
  23053. if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
  23054. if (unlikely(signal_pending_state(prev->state, prev))) {
  23055. @@ -2806,19 +3002,6 @@
  23056. } else {
  23057. deactivate_task(rq, prev, DEQUEUE_SLEEP);
  23058. prev->on_rq = 0;
  23059. -
  23060. - /*
  23061. - * If a worker went to sleep, notify and ask workqueue
  23062. - * whether it wants to wake up a task to maintain
  23063. - * concurrency.
  23064. - */
  23065. - if (prev->flags & PF_WQ_WORKER) {
  23066. - struct task_struct *to_wakeup;
  23067. -
  23068. - to_wakeup = wq_worker_sleeping(prev, cpu);
  23069. - if (to_wakeup)
  23070. - try_to_wake_up_local(to_wakeup);
  23071. - }
  23072. }
  23073. switch_count = &prev->nvcsw;
  23074. }
  23075. @@ -2828,6 +3011,7 @@
  23076. next = pick_next_task(rq, prev);
  23077. clear_tsk_need_resched(prev);
  23078. + clear_tsk_need_resched_lazy(prev);
  23079. clear_preempt_need_resched();
  23080. rq->skip_clock_update = 0;
  23081. @@ -2857,9 +3041,20 @@
  23082. static inline void sched_submit_work(struct task_struct *tsk)
  23083. {
  23084. - if (!tsk->state || tsk_is_pi_blocked(tsk))
  23085. + if (!tsk->state)
  23086. return;
  23087. /*
  23088. + * If a worker went to sleep, notify and ask workqueue whether
  23089. + * it wants to wake up a task to maintain concurrency.
  23090. + */
  23091. + if (tsk->flags & PF_WQ_WORKER)
  23092. + wq_worker_sleeping(tsk);
  23093. +
  23094. +
  23095. + if (tsk_is_pi_blocked(tsk))
  23096. + return;
  23097. +
  23098. + /*
  23099. * If we are going to sleep and we have plugged IO queued,
  23100. * make sure to submit it to avoid deadlocks.
  23101. */
  23102. @@ -2867,12 +3062,19 @@
  23103. blk_schedule_flush_plug(tsk);
  23104. }
  23105. +static inline void sched_update_worker(struct task_struct *tsk)
  23106. +{
  23107. + if (tsk->flags & PF_WQ_WORKER)
  23108. + wq_worker_running(tsk);
  23109. +}
  23110. +
  23111. asmlinkage __visible void __sched schedule(void)
  23112. {
  23113. struct task_struct *tsk = current;
  23114. sched_submit_work(tsk);
  23115. __schedule();
  23116. + sched_update_worker(tsk);
  23117. }
  23118. EXPORT_SYMBOL(schedule);
  23119. @@ -2922,9 +3124,26 @@
  23120. if (likely(!preemptible()))
  23121. return;
  23122. +#ifdef CONFIG_PREEMPT_LAZY
  23123. + /*
  23124. + * Check for lazy preemption
  23125. + */
  23126. + if (current_thread_info()->preempt_lazy_count &&
  23127. + !test_thread_flag(TIF_NEED_RESCHED))
  23128. + return;
  23129. +#endif
  23130. do {
  23131. __preempt_count_add(PREEMPT_ACTIVE);
  23132. + /*
  23133. + * The add/subtract must not be traced by the function
  23134. + * tracer. But we still want to account for the
  23135. + * preempt off latency tracer. Since the _notrace versions
  23136. + * of add/subtract skip the accounting for latency tracer
  23137. + * we must force it manually.
  23138. + */
  23139. + start_critical_timings();
  23140. __schedule();
  23141. + stop_critical_timings();
  23142. __preempt_count_sub(PREEMPT_ACTIVE);
  23143. /*
  23144. @@ -4236,9 +4455,16 @@
  23145. static void __cond_resched(void)
  23146. {
  23147. - __preempt_count_add(PREEMPT_ACTIVE);
  23148. - __schedule();
  23149. - __preempt_count_sub(PREEMPT_ACTIVE);
  23150. + do {
  23151. + __preempt_count_add(PREEMPT_ACTIVE);
  23152. + __schedule();
  23153. + __preempt_count_sub(PREEMPT_ACTIVE);
  23154. + /*
  23155. + * Check again in case we missed a preemption
  23156. + * opportunity between schedule and now.
  23157. + */
  23158. + barrier();
  23159. + } while (need_resched());
  23160. }
  23161. int __sched _cond_resched(void)
  23162. @@ -4279,6 +4505,7 @@
  23163. }
  23164. EXPORT_SYMBOL(__cond_resched_lock);
  23165. +#ifndef CONFIG_PREEMPT_RT_FULL
  23166. int __sched __cond_resched_softirq(void)
  23167. {
  23168. BUG_ON(!in_softirq());
  23169. @@ -4292,6 +4519,7 @@
  23170. return 0;
  23171. }
  23172. EXPORT_SYMBOL(__cond_resched_softirq);
  23173. +#endif
  23174. /**
  23175. * yield - yield the current processor to other threads.
  23176. @@ -4653,7 +4881,9 @@
  23177. /* Set the preempt count _outside_ the spinlocks! */
  23178. init_idle_preempt_count(idle, cpu);
  23179. -
  23180. +#ifdef CONFIG_HAVE_PREEMPT_LAZY
  23181. + task_thread_info(idle)->preempt_lazy_count = 0;
  23182. +#endif
  23183. /*
  23184. * The idle tasks have their own, simple scheduling class:
  23185. */
  23186. @@ -4695,11 +4925,91 @@
  23187. void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  23188. {
  23189. - if (p->sched_class && p->sched_class->set_cpus_allowed)
  23190. - p->sched_class->set_cpus_allowed(p, new_mask);
  23191. + if (!migrate_disabled_updated(p)) {
  23192. + if (p->sched_class && p->sched_class->set_cpus_allowed)
  23193. + p->sched_class->set_cpus_allowed(p, new_mask);
  23194. + p->nr_cpus_allowed = cpumask_weight(new_mask);
  23195. + }
  23196. cpumask_copy(&p->cpus_allowed, new_mask);
  23197. - p->nr_cpus_allowed = cpumask_weight(new_mask);
  23198. +}
  23199. +
  23200. +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
  23201. +static DEFINE_MUTEX(sched_down_mutex);
  23202. +static cpumask_t sched_down_cpumask;
  23203. +
  23204. +void tell_sched_cpu_down_begin(int cpu)
  23205. +{
  23206. + mutex_lock(&sched_down_mutex);
  23207. + cpumask_set_cpu(cpu, &sched_down_cpumask);
  23208. + mutex_unlock(&sched_down_mutex);
  23209. +}
  23210. +
  23211. +void tell_sched_cpu_down_done(int cpu)
  23212. +{
  23213. + mutex_lock(&sched_down_mutex);
  23214. + cpumask_clear_cpu(cpu, &sched_down_cpumask);
  23215. + mutex_unlock(&sched_down_mutex);
  23216. +}
  23217. +
  23218. +/**
  23219. + * migrate_me - try to move the current task off this cpu
  23220. + *
  23221. + * Used by the pin_current_cpu() code to try to get tasks
  23222. + * to move off the current CPU as it is going down.
  23223. + * It will only move the task if the task isn't pinned to
  23224. + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
  23225. + * and the task has to be in a RUNNING state. Otherwise the
  23226. + * movement of the task will wake it up (change its state
  23227. + * to running) when the task did not expect it.
  23228. + *
  23229. + * Returns 1 if it succeeded in moving the current task
  23230. + * 0 otherwise.
  23231. + */
  23232. +int migrate_me(void)
  23233. +{
  23234. + struct task_struct *p = current;
  23235. + struct migration_arg arg;
  23236. + struct cpumask *cpumask;
  23237. + struct cpumask *mask;
  23238. + unsigned long flags;
  23239. + unsigned int dest_cpu;
  23240. + struct rq *rq;
  23241. +
  23242. + /*
  23243. + * We can not migrate tasks bounded to a CPU or tasks not
  23244. + * running. The movement of the task will wake it up.
  23245. + */
  23246. + if (p->flags & PF_NO_SETAFFINITY || p->state)
  23247. + return 0;
  23248. +
  23249. + mutex_lock(&sched_down_mutex);
  23250. + rq = task_rq_lock(p, &flags);
  23251. +
  23252. + cpumask = &__get_cpu_var(sched_cpumasks);
  23253. + mask = &p->cpus_allowed;
  23254. +
  23255. + cpumask_andnot(cpumask, mask, &sched_down_cpumask);
  23256. +
  23257. + if (!cpumask_weight(cpumask)) {
  23258. + /* It's only on this CPU? */
  23259. + task_rq_unlock(rq, p, &flags);
  23260. + mutex_unlock(&sched_down_mutex);
  23261. + return 0;
  23262. + }
  23263. +
  23264. + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
  23265. +
  23266. + arg.task = p;
  23267. + arg.dest_cpu = dest_cpu;
  23268. +
  23269. + task_rq_unlock(rq, p, &flags);
  23270. +
  23271. + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
  23272. + tlb_migrate_finish(p->mm);
  23273. + mutex_unlock(&sched_down_mutex);
  23274. +
  23275. + return 1;
  23276. }
  23277. /*
  23278. @@ -4745,7 +5055,7 @@
  23279. do_set_cpus_allowed(p, new_mask);
  23280. /* Can the task run on the task's current CPU? If so, we're done */
  23281. - if (cpumask_test_cpu(task_cpu(p), new_mask))
  23282. + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
  23283. goto out;
  23284. dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
  23285. @@ -4885,6 +5195,8 @@
  23286. #ifdef CONFIG_HOTPLUG_CPU
  23287. +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
  23288. +
  23289. /*
  23290. * Ensures that the idle task is using init_mm right before its cpu goes
  23291. * offline.
  23292. @@ -4899,7 +5211,11 @@
  23293. switch_mm(mm, &init_mm, current);
  23294. finish_arch_post_lock_switch();
  23295. }
  23296. - mmdrop(mm);
  23297. + /*
  23298. + * Defer the cleanup to an alive cpu. On RT we can neither
  23299. + * call mmdrop() nor mmdrop_delayed() from here.
  23300. + */
  23301. + per_cpu(idle_last_mm, smp_processor_id()) = mm;
  23302. }
  23303. /*
  23304. @@ -5242,6 +5558,10 @@
  23305. case CPU_DEAD:
  23306. calc_load_migrate(rq);
  23307. + if (per_cpu(idle_last_mm, cpu)) {
  23308. + mmdrop(per_cpu(idle_last_mm, cpu));
  23309. + per_cpu(idle_last_mm, cpu) = NULL;
  23310. + }
  23311. break;
  23312. #endif
  23313. }
  23314. @@ -7183,7 +7503,8 @@
  23315. #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  23316. static inline int preempt_count_equals(int preempt_offset)
  23317. {
  23318. - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
  23319. + int nested = (preempt_count() & ~PREEMPT_ACTIVE) +
  23320. + sched_rcu_preempt_depth();
  23321. return (nested == preempt_offset);
  23322. }
  23323. diff -Nur linux-3.18.14.orig/kernel/sched/cputime.c linux-3.18.14-rt/kernel/sched/cputime.c
  23324. --- linux-3.18.14.orig/kernel/sched/cputime.c 2015-05-20 10:04:50.000000000 -0500
  23325. +++ linux-3.18.14-rt/kernel/sched/cputime.c 2015-05-31 15:32:48.893635363 -0500
  23326. @@ -675,37 +675,45 @@
  23327. void vtime_account_system(struct task_struct *tsk)
  23328. {
  23329. - write_seqlock(&tsk->vtime_seqlock);
  23330. + raw_spin_lock(&tsk->vtime_lock);
  23331. + write_seqcount_begin(&tsk->vtime_seq);
  23332. __vtime_account_system(tsk);
  23333. - write_sequnlock(&tsk->vtime_seqlock);
  23334. + write_seqcount_end(&tsk->vtime_seq);
  23335. + raw_spin_unlock(&tsk->vtime_lock);
  23336. }
  23337. void vtime_gen_account_irq_exit(struct task_struct *tsk)
  23338. {
  23339. - write_seqlock(&tsk->vtime_seqlock);
  23340. + raw_spin_lock(&tsk->vtime_lock);
  23341. + write_seqcount_begin(&tsk->vtime_seq);
  23342. __vtime_account_system(tsk);
  23343. if (context_tracking_in_user())
  23344. tsk->vtime_snap_whence = VTIME_USER;
  23345. - write_sequnlock(&tsk->vtime_seqlock);
  23346. + write_seqcount_end(&tsk->vtime_seq);
  23347. + raw_spin_unlock(&tsk->vtime_lock);
  23348. }
  23349. void vtime_account_user(struct task_struct *tsk)
  23350. {
  23351. cputime_t delta_cpu;
  23352. - write_seqlock(&tsk->vtime_seqlock);
  23353. + raw_spin_lock(&tsk->vtime_lock);
  23354. + write_seqcount_begin(&tsk->vtime_seq);
  23355. delta_cpu = get_vtime_delta(tsk);
  23356. tsk->vtime_snap_whence = VTIME_SYS;
  23357. account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
  23358. - write_sequnlock(&tsk->vtime_seqlock);
  23359. + write_seqcount_end(&tsk->vtime_seq);
  23360. + raw_spin_unlock(&tsk->vtime_lock);
  23361. }
  23362. void vtime_user_enter(struct task_struct *tsk)
  23363. {
  23364. - write_seqlock(&tsk->vtime_seqlock);
  23365. + raw_spin_lock(&tsk->vtime_lock);
  23366. + write_seqcount_begin(&tsk->vtime_seq);
  23367. __vtime_account_system(tsk);
  23368. tsk->vtime_snap_whence = VTIME_USER;
  23369. - write_sequnlock(&tsk->vtime_seqlock);
  23370. + write_seqcount_end(&tsk->vtime_seq);
  23371. + raw_spin_unlock(&tsk->vtime_lock);
  23372. }
  23373. void vtime_guest_enter(struct task_struct *tsk)
  23374. @@ -717,19 +725,23 @@
  23375. * synchronization against the reader (task_gtime())
  23376. * that can thus safely catch up with a tickless delta.
  23377. */
  23378. - write_seqlock(&tsk->vtime_seqlock);
  23379. + raw_spin_lock(&tsk->vtime_lock);
  23380. + write_seqcount_begin(&tsk->vtime_seq);
  23381. __vtime_account_system(tsk);
  23382. current->flags |= PF_VCPU;
  23383. - write_sequnlock(&tsk->vtime_seqlock);
  23384. + write_seqcount_end(&tsk->vtime_seq);
  23385. + raw_spin_unlock(&tsk->vtime_lock);
  23386. }
  23387. EXPORT_SYMBOL_GPL(vtime_guest_enter);
  23388. void vtime_guest_exit(struct task_struct *tsk)
  23389. {
  23390. - write_seqlock(&tsk->vtime_seqlock);
  23391. + raw_spin_lock(&tsk->vtime_lock);
  23392. + write_seqcount_begin(&tsk->vtime_seq);
  23393. __vtime_account_system(tsk);
  23394. current->flags &= ~PF_VCPU;
  23395. - write_sequnlock(&tsk->vtime_seqlock);
  23396. + write_seqcount_end(&tsk->vtime_seq);
  23397. + raw_spin_unlock(&tsk->vtime_lock);
  23398. }
  23399. EXPORT_SYMBOL_GPL(vtime_guest_exit);
  23400. @@ -742,24 +754,30 @@
  23401. void arch_vtime_task_switch(struct task_struct *prev)
  23402. {
  23403. - write_seqlock(&prev->vtime_seqlock);
  23404. + raw_spin_lock(&prev->vtime_lock);
  23405. + write_seqcount_begin(&prev->vtime_seq);
  23406. prev->vtime_snap_whence = VTIME_SLEEPING;
  23407. - write_sequnlock(&prev->vtime_seqlock);
  23408. + write_seqcount_end(&prev->vtime_seq);
  23409. + raw_spin_unlock(&prev->vtime_lock);
  23410. - write_seqlock(&current->vtime_seqlock);
  23411. + raw_spin_lock(&current->vtime_lock);
  23412. + write_seqcount_begin(&current->vtime_seq);
  23413. current->vtime_snap_whence = VTIME_SYS;
  23414. current->vtime_snap = sched_clock_cpu(smp_processor_id());
  23415. - write_sequnlock(&current->vtime_seqlock);
  23416. + write_seqcount_end(&current->vtime_seq);
  23417. + raw_spin_unlock(&current->vtime_lock);
  23418. }
  23419. void vtime_init_idle(struct task_struct *t, int cpu)
  23420. {
  23421. unsigned long flags;
  23422. - write_seqlock_irqsave(&t->vtime_seqlock, flags);
  23423. + raw_spin_lock_irqsave(&t->vtime_lock, flags);
  23424. + write_seqcount_begin(&t->vtime_seq);
  23425. t->vtime_snap_whence = VTIME_SYS;
  23426. t->vtime_snap = sched_clock_cpu(cpu);
  23427. - write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
  23428. + write_seqcount_end(&t->vtime_seq);
  23429. + raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
  23430. }
  23431. cputime_t task_gtime(struct task_struct *t)
  23432. @@ -768,13 +786,13 @@
  23433. cputime_t gtime;
  23434. do {
  23435. - seq = read_seqbegin(&t->vtime_seqlock);
  23436. + seq = read_seqcount_begin(&t->vtime_seq);
  23437. gtime = t->gtime;
  23438. if (t->flags & PF_VCPU)
  23439. gtime += vtime_delta(t);
  23440. - } while (read_seqretry(&t->vtime_seqlock, seq));
  23441. + } while (read_seqcount_retry(&t->vtime_seq, seq));
  23442. return gtime;
  23443. }
  23444. @@ -797,7 +815,7 @@
  23445. *udelta = 0;
  23446. *sdelta = 0;
  23447. - seq = read_seqbegin(&t->vtime_seqlock);
  23448. + seq = read_seqcount_begin(&t->vtime_seq);
  23449. if (u_dst)
  23450. *u_dst = *u_src;
  23451. @@ -821,7 +839,7 @@
  23452. if (t->vtime_snap_whence == VTIME_SYS)
  23453. *sdelta = delta;
  23454. }
  23455. - } while (read_seqretry(&t->vtime_seqlock, seq));
  23456. + } while (read_seqcount_retry(&t->vtime_seq, seq));
  23457. }
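The cputime hunks above replace the task's vtime seqlock_t with a raw spinlock for writers plus a separate seqcount, so the write side never takes a sleeping lock on RT and readers keep the retry loop visible in task_gtime(). A self-contained userspace sketch of that writer-lock + sequence-counter split (C11 atomics and a pthread mutex stand in for the kernel primitives; the field names are reused only for illustration):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t vtime_lock = PTHREAD_MUTEX_INITIALIZER; /* writer side */
static atomic_uint     vtime_seq;                              /* reader side */
static _Atomic unsigned long long vtime_snap, gtime;

static void vtime_write(unsigned long long snap, unsigned long long g)
{
        pthread_mutex_lock(&vtime_lock);        /* raw_spin_lock(&vtime_lock) */
        vtime_seq++;                            /* begin: sequence goes odd */
        vtime_snap = snap;
        gtime = g;
        vtime_seq++;                            /* end: sequence goes even */
        pthread_mutex_unlock(&vtime_lock);
}

static unsigned long long vtime_read(void)
{
        unsigned int seq;
        unsigned long long g;

        for (;;) {
                seq = vtime_seq;
                if (seq & 1)
                        continue;               /* writer in progress: retry */
                g = gtime + vtime_snap;         /* stand-in for the real math */
                if (vtime_seq == seq)
                        break;                  /* no writer raced us: done */
        }
        return g;
}

int main(void)
{
        vtime_write(5, 100);
        printf("gtime = %llu\n", vtime_read());
        return 0;
}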
  23458. diff -Nur linux-3.18.14.orig/kernel/sched/deadline.c linux-3.18.14-rt/kernel/sched/deadline.c
  23459. --- linux-3.18.14.orig/kernel/sched/deadline.c 2015-05-20 10:04:50.000000000 -0500
  23460. +++ linux-3.18.14-rt/kernel/sched/deadline.c 2015-05-31 15:32:48.893635363 -0500
  23461. @@ -570,6 +570,7 @@
  23462. hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  23463. timer->function = dl_task_timer;
  23464. + timer->irqsafe = 1;
  23465. }
  23466. static
  23467. diff -Nur linux-3.18.14.orig/kernel/sched/debug.c linux-3.18.14-rt/kernel/sched/debug.c
  23468. --- linux-3.18.14.orig/kernel/sched/debug.c 2015-05-20 10:04:50.000000000 -0500
  23469. +++ linux-3.18.14-rt/kernel/sched/debug.c 2015-05-31 15:32:48.897635363 -0500
  23470. @@ -256,6 +256,9 @@
  23471. P(rt_throttled);
  23472. PN(rt_time);
  23473. PN(rt_runtime);
  23474. +#ifdef CONFIG_SMP
  23475. + P(rt_nr_migratory);
  23476. +#endif
  23477. #undef PN
  23478. #undef P
  23479. @@ -634,6 +637,10 @@
  23480. #endif
  23481. P(policy);
  23482. P(prio);
  23483. +#ifdef CONFIG_PREEMPT_RT_FULL
  23484. + P(migrate_disable);
  23485. +#endif
  23486. + P(nr_cpus_allowed);
  23487. #undef PN
  23488. #undef __PN
  23489. #undef P
  23490. diff -Nur linux-3.18.14.orig/kernel/sched/fair.c linux-3.18.14-rt/kernel/sched/fair.c
  23491. --- linux-3.18.14.orig/kernel/sched/fair.c 2015-05-20 10:04:50.000000000 -0500
  23492. +++ linux-3.18.14-rt/kernel/sched/fair.c 2015-05-31 15:32:48.897635363 -0500
  23493. @@ -2951,7 +2951,7 @@
  23494. ideal_runtime = sched_slice(cfs_rq, curr);
  23495. delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
  23496. if (delta_exec > ideal_runtime) {
  23497. - resched_curr(rq_of(cfs_rq));
  23498. + resched_curr_lazy(rq_of(cfs_rq));
  23499. /*
  23500. * The current task ran long enough, ensure it doesn't get
  23501. * re-elected due to buddy favours.
  23502. @@ -2975,7 +2975,7 @@
  23503. return;
  23504. if (delta > ideal_runtime)
  23505. - resched_curr(rq_of(cfs_rq));
  23506. + resched_curr_lazy(rq_of(cfs_rq));
  23507. }
  23508. static void
  23509. @@ -3115,7 +3115,7 @@
  23510. * validating it and just reschedule.
  23511. */
  23512. if (queued) {
  23513. - resched_curr(rq_of(cfs_rq));
  23514. + resched_curr_lazy(rq_of(cfs_rq));
  23515. return;
  23516. }
  23517. /*
  23518. @@ -3306,7 +3306,7 @@
  23519. * hierarchy can be throttled
  23520. */
  23521. if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
  23522. - resched_curr(rq_of(cfs_rq));
  23523. + resched_curr_lazy(rq_of(cfs_rq));
  23524. }
  23525. static __always_inline
  23526. @@ -3925,7 +3925,7 @@
  23527. if (delta < 0) {
  23528. if (rq->curr == p)
  23529. - resched_curr(rq);
  23530. + resched_curr_lazy(rq);
  23531. return;
  23532. }
  23533. hrtick_start(rq, delta);
  23534. @@ -4792,7 +4792,7 @@
  23535. return;
  23536. preempt:
  23537. - resched_curr(rq);
  23538. + resched_curr_lazy(rq);
  23539. /*
  23540. * Only set the backward buddy when the current task is still
  23541. * on the rq. This can happen when a wakeup gets interleaved
  23542. @@ -7576,7 +7576,7 @@
  23543. * 'current' within the tree based on its new key value.
  23544. */
  23545. swap(curr->vruntime, se->vruntime);
  23546. - resched_curr(rq);
  23547. + resched_curr_lazy(rq);
  23548. }
  23549. se->vruntime -= cfs_rq->min_vruntime;
  23550. @@ -7601,7 +7601,7 @@
  23551. */
  23552. if (rq->curr == p) {
  23553. if (p->prio > oldprio)
  23554. - resched_curr(rq);
  23555. + resched_curr_lazy(rq);
  23556. } else
  23557. check_preempt_curr(rq, p, 0);
  23558. }
  23559. diff -Nur linux-3.18.14.orig/kernel/sched/features.h linux-3.18.14-rt/kernel/sched/features.h
  23560. --- linux-3.18.14.orig/kernel/sched/features.h 2015-05-20 10:04:50.000000000 -0500
  23561. +++ linux-3.18.14-rt/kernel/sched/features.h 2015-05-31 15:32:48.897635363 -0500
  23562. @@ -50,12 +50,18 @@
  23563. */
  23564. SCHED_FEAT(NONTASK_CAPACITY, true)
  23565. +#ifdef CONFIG_PREEMPT_RT_FULL
  23566. +SCHED_FEAT(TTWU_QUEUE, false)
  23567. +# ifdef CONFIG_PREEMPT_LAZY
  23568. +SCHED_FEAT(PREEMPT_LAZY, true)
  23569. +# endif
  23570. +#else
  23571. /*
  23572. * Queue remote wakeups on the target CPU and process them
  23573. * using the scheduler IPI. Reduces rq->lock contention/bounces.
  23574. */
  23575. SCHED_FEAT(TTWU_QUEUE, true)
  23576. -
  23577. +#endif
  23578. SCHED_FEAT(FORCE_SD_OVERLAP, false)
  23579. SCHED_FEAT(RT_RUNTIME_SHARE, true)
  23580. SCHED_FEAT(LB_MIN, false)
  23581. diff -Nur linux-3.18.14.orig/kernel/sched/Makefile linux-3.18.14-rt/kernel/sched/Makefile
  23582. --- linux-3.18.14.orig/kernel/sched/Makefile 2015-05-20 10:04:50.000000000 -0500
  23583. +++ linux-3.18.14-rt/kernel/sched/Makefile 2015-05-31 15:32:48.861635363 -0500
  23584. @@ -13,7 +13,7 @@
  23585. obj-y += core.o proc.o clock.o cputime.o
  23586. obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
  23587. -obj-y += wait.o completion.o idle.o
  23588. +obj-y += wait.o wait-simple.o work-simple.o completion.o idle.o
  23589. obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
  23590. obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
  23591. obj-$(CONFIG_SCHEDSTATS) += stats.o
  23592. diff -Nur linux-3.18.14.orig/kernel/sched/rt.c linux-3.18.14-rt/kernel/sched/rt.c
  23593. --- linux-3.18.14.orig/kernel/sched/rt.c 2015-05-20 10:04:50.000000000 -0500
  23594. +++ linux-3.18.14-rt/kernel/sched/rt.c 2015-05-31 15:32:48.897635363 -0500
  23595. @@ -43,6 +43,7 @@
  23596. hrtimer_init(&rt_b->rt_period_timer,
  23597. CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  23598. + rt_b->rt_period_timer.irqsafe = 1;
  23599. rt_b->rt_period_timer.function = sched_rt_period_timer;
  23600. }
  23601. diff -Nur linux-3.18.14.orig/kernel/sched/sched.h linux-3.18.14-rt/kernel/sched/sched.h
  23602. --- linux-3.18.14.orig/kernel/sched/sched.h 2015-05-20 10:04:50.000000000 -0500
  23603. +++ linux-3.18.14-rt/kernel/sched/sched.h 2015-05-31 15:32:48.897635363 -0500
  23604. @@ -1018,6 +1018,7 @@
  23605. #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
  23606. #define WF_FORK 0x02 /* child wakeup after fork */
  23607. #define WF_MIGRATED 0x4 /* internal use, task got migrated */
  23608. +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
  23609. /*
  23610. * To aid in avoiding the subversion of "niceness" due to uneven distribution
  23611. @@ -1210,6 +1211,15 @@
  23612. extern void resched_curr(struct rq *rq);
  23613. extern void resched_cpu(int cpu);
  23614. +#ifdef CONFIG_PREEMPT_LAZY
  23615. +extern void resched_curr_lazy(struct rq *rq);
  23616. +#else
  23617. +static inline void resched_curr_lazy(struct rq *rq)
  23618. +{
  23619. + resched_curr(rq);
  23620. +}
  23621. +#endif
  23622. +
  23623. extern struct rt_bandwidth def_rt_bandwidth;
  23624. extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
  23625. diff -Nur linux-3.18.14.orig/kernel/sched/wait-simple.c linux-3.18.14-rt/kernel/sched/wait-simple.c
  23626. --- linux-3.18.14.orig/kernel/sched/wait-simple.c 1969-12-31 18:00:00.000000000 -0600
  23627. +++ linux-3.18.14-rt/kernel/sched/wait-simple.c 2015-05-31 15:32:48.897635363 -0500
  23628. @@ -0,0 +1,115 @@
  23629. +/*
  23630. + * Simple waitqueues without fancy flags and callbacks
  23631. + *
  23632. + * (C) 2011 Thomas Gleixner <tglx@linutronix.de>
  23633. + *
  23634. + * Based on kernel/wait.c
  23635. + *
  23636. + * For licencing details see kernel-base/COPYING
  23637. + */
  23638. +#include <linux/init.h>
  23639. +#include <linux/export.h>
  23640. +#include <linux/sched.h>
  23641. +#include <linux/wait-simple.h>
  23642. +
  23643. +/* Adds w to head->list. Must be called with head->lock locked. */
  23644. +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w)
  23645. +{
  23646. + list_add(&w->node, &head->list);
  23647. + /* We can't let the condition leak before the setting of head */
  23648. + smp_mb();
  23649. +}
  23650. +
  23651. +/* Removes w from head->list. Must be called with head->lock locked. */
  23652. +static inline void __swait_dequeue(struct swaiter *w)
  23653. +{
  23654. + list_del_init(&w->node);
  23655. +}
  23656. +
  23657. +void __init_swait_head(struct swait_head *head, struct lock_class_key *key)
  23658. +{
  23659. + raw_spin_lock_init(&head->lock);
  23660. + lockdep_set_class(&head->lock, key);
  23661. + INIT_LIST_HEAD(&head->list);
  23662. +}
  23663. +EXPORT_SYMBOL(__init_swait_head);
  23664. +
  23665. +void swait_prepare_locked(struct swait_head *head, struct swaiter *w)
  23666. +{
  23667. + w->task = current;
  23668. + if (list_empty(&w->node))
  23669. + __swait_enqueue(head, w);
  23670. +}
  23671. +
  23672. +void swait_prepare(struct swait_head *head, struct swaiter *w, int state)
  23673. +{
  23674. + unsigned long flags;
  23675. +
  23676. + raw_spin_lock_irqsave(&head->lock, flags);
  23677. + swait_prepare_locked(head, w);
  23678. + __set_current_state(state);
  23679. + raw_spin_unlock_irqrestore(&head->lock, flags);
  23680. +}
  23681. +EXPORT_SYMBOL(swait_prepare);
  23682. +
  23683. +void swait_finish_locked(struct swait_head *head, struct swaiter *w)
  23684. +{
  23685. + __set_current_state(TASK_RUNNING);
  23686. + if (w->task)
  23687. + __swait_dequeue(w);
  23688. +}
  23689. +
  23690. +void swait_finish(struct swait_head *head, struct swaiter *w)
  23691. +{
  23692. + unsigned long flags;
  23693. +
  23694. + __set_current_state(TASK_RUNNING);
  23695. + if (w->task) {
  23696. + raw_spin_lock_irqsave(&head->lock, flags);
  23697. + __swait_dequeue(w);
  23698. + raw_spin_unlock_irqrestore(&head->lock, flags);
  23699. + }
  23700. +}
  23701. +EXPORT_SYMBOL(swait_finish);
  23702. +
  23703. +unsigned int
  23704. +__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num)
  23705. +{
  23706. + struct swaiter *curr, *next;
  23707. + int woken = 0;
  23708. +
  23709. + list_for_each_entry_safe(curr, next, &head->list, node) {
  23710. + if (wake_up_state(curr->task, state)) {
  23711. + __swait_dequeue(curr);
  23712. + /*
  23713. + * The waiting task can free the waiter as
  23714. + * soon as curr->task = NULL is written,
  23715. + * without taking any locks. A memory barrier
  23716. + * is required here to prevent the following
  23717. + * store to curr->task from getting ahead of
  23718. + * the dequeue operation.
  23719. + */
  23720. + smp_wmb();
  23721. + curr->task = NULL;
  23722. + if (++woken == num)
  23723. + break;
  23724. + }
  23725. + }
  23726. + return woken;
  23727. +}
  23728. +
  23729. +unsigned int
  23730. +__swait_wake(struct swait_head *head, unsigned int state, unsigned int num)
  23731. +{
  23732. + unsigned long flags;
  23733. + int woken;
  23734. +
  23735. + if (!swaitqueue_active(head))
  23736. + return 0;
  23737. +
  23738. + raw_spin_lock_irqsave(&head->lock, flags);
  23739. + woken = __swait_wake_locked(head, state, num);
  23740. + raw_spin_unlock_irqrestore(&head->lock, flags);
  23741. + return woken;
  23742. +}
  23743. +EXPORT_SYMBOL(__swait_wake);
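A brief usage illustration (the names my_wq, my_cond, my_thread, my_kick and my_setup are made up): callers can use the locked prepare/finish primitives above directly, or the swait_event_interruptible() and swait_wake() wrappers from linux/wait-simple.h that work-simple.c below relies on.

#include <linux/kthread.h>
#include <linux/wait-simple.h>

static struct swait_head my_wq;
static bool my_cond;

static int my_thread(void *unused)
{
	while (!kthread_should_stop()) {
		/* Sleep until woken with my_cond set (or asked to stop). */
		swait_event_interruptible(my_wq, my_cond || kthread_should_stop());
		my_cond = false;
		/* ... do the actual work ... */
	}
	return 0;
}

/* Wake the waiter; the raw spinlock inside makes this safe on RT. */
static void my_kick(void)
{
	my_cond = true;
	swait_wake(&my_wq);
}

static void my_setup(void)
{
	init_swait_head(&my_wq);
	kthread_run(my_thread, NULL, "my_thread");
}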
  23744. diff -Nur linux-3.18.14.orig/kernel/sched/work-simple.c linux-3.18.14-rt/kernel/sched/work-simple.c
  23745. --- linux-3.18.14.orig/kernel/sched/work-simple.c 1969-12-31 18:00:00.000000000 -0600
  23746. +++ linux-3.18.14-rt/kernel/sched/work-simple.c 2015-05-31 15:32:48.901635363 -0500
  23747. @@ -0,0 +1,172 @@
  23748. +/*
  23749. + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
  23750. + *
23751. + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks
23752. + * from irq context. The callbacks are executed in kthread context.
  23753. + */
  23754. +
  23755. +#include <linux/wait-simple.h>
  23756. +#include <linux/work-simple.h>
  23757. +#include <linux/kthread.h>
  23758. +#include <linux/slab.h>
  23759. +#include <linux/spinlock.h>
  23760. +
  23761. +#define SWORK_EVENT_PENDING (1 << 0)
  23762. +
  23763. +static DEFINE_MUTEX(worker_mutex);
  23764. +static struct sworker *glob_worker;
  23765. +
  23766. +struct sworker {
  23767. + struct list_head events;
  23768. + struct swait_head wq;
  23769. +
  23770. + raw_spinlock_t lock;
  23771. +
  23772. + struct task_struct *task;
  23773. + int refs;
  23774. +};
  23775. +
  23776. +static bool swork_readable(struct sworker *worker)
  23777. +{
  23778. + bool r;
  23779. +
  23780. + if (kthread_should_stop())
  23781. + return true;
  23782. +
  23783. + raw_spin_lock_irq(&worker->lock);
  23784. + r = !list_empty(&worker->events);
  23785. + raw_spin_unlock_irq(&worker->lock);
  23786. +
  23787. + return r;
  23788. +}
  23789. +
  23790. +static int swork_kthread(void *arg)
  23791. +{
  23792. + struct sworker *worker = arg;
  23793. +
  23794. + for (;;) {
  23795. + swait_event_interruptible(worker->wq,
  23796. + swork_readable(worker));
  23797. + if (kthread_should_stop())
  23798. + break;
  23799. +
  23800. + raw_spin_lock_irq(&worker->lock);
  23801. + while (!list_empty(&worker->events)) {
  23802. + struct swork_event *sev;
  23803. +
  23804. + sev = list_first_entry(&worker->events,
  23805. + struct swork_event, item);
  23806. + list_del(&sev->item);
  23807. + raw_spin_unlock_irq(&worker->lock);
  23808. +
  23809. + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
  23810. + &sev->flags));
  23811. + sev->func(sev);
  23812. + raw_spin_lock_irq(&worker->lock);
  23813. + }
  23814. + raw_spin_unlock_irq(&worker->lock);
  23815. + }
  23816. + return 0;
  23817. +}
  23818. +
  23819. +static struct sworker *swork_create(void)
  23820. +{
  23821. + struct sworker *worker;
  23822. +
  23823. + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
  23824. + if (!worker)
  23825. + return ERR_PTR(-ENOMEM);
  23826. +
  23827. + INIT_LIST_HEAD(&worker->events);
  23828. + raw_spin_lock_init(&worker->lock);
  23829. + init_swait_head(&worker->wq);
  23830. +
  23831. + worker->task = kthread_run(swork_kthread, worker, "kswork");
  23832. + if (IS_ERR(worker->task)) {
  23833. + kfree(worker);
  23834. + return ERR_PTR(-ENOMEM);
  23835. + }
  23836. +
  23837. + return worker;
  23838. +}
  23839. +
  23840. +static void swork_destroy(struct sworker *worker)
  23841. +{
  23842. + kthread_stop(worker->task);
  23843. +
  23844. + WARN_ON(!list_empty(&worker->events));
  23845. + kfree(worker);
  23846. +}
  23847. +
  23848. +/**
  23849. + * swork_queue - queue swork
  23850. + *
23851. + * Returns %false if @sev was already on a queue, %true otherwise.
  23852. + *
  23853. + * The work is queued and processed on a random CPU
  23854. + */
  23855. +bool swork_queue(struct swork_event *sev)
  23856. +{
  23857. + unsigned long flags;
  23858. +
  23859. + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
  23860. + return false;
  23861. +
  23862. + raw_spin_lock_irqsave(&glob_worker->lock, flags);
  23863. + list_add_tail(&sev->item, &glob_worker->events);
  23864. + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
  23865. +
  23866. + swait_wake(&glob_worker->wq);
  23867. + return true;
  23868. +}
  23869. +EXPORT_SYMBOL_GPL(swork_queue);
  23870. +
  23871. +/**
  23872. + * swork_get - get an instance of the sworker
  23873. + *
23874. + * Returns a negative error code if the initialization of the worker did not
  23875. + * work, %0 otherwise.
  23876. + *
  23877. + */
  23878. +int swork_get(void)
  23879. +{
  23880. + struct sworker *worker;
  23881. +
  23882. + mutex_lock(&worker_mutex);
  23883. + if (!glob_worker) {
  23884. + worker = swork_create();
  23885. + if (IS_ERR(worker)) {
  23886. + mutex_unlock(&worker_mutex);
  23887. + return -ENOMEM;
  23888. + }
  23889. +
  23890. + glob_worker = worker;
  23891. + }
  23892. +
  23893. + glob_worker->refs++;
  23894. + mutex_unlock(&worker_mutex);
  23895. +
  23896. + return 0;
  23897. +}
  23898. +EXPORT_SYMBOL_GPL(swork_get);
  23899. +
  23900. +/**
  23901. + * swork_put - puts an instance of the sworker
  23902. + *
  23903. + * Will destroy the sworker thread. This function must not be called until all
  23904. + * queued events have been completed.
  23905. + */
  23906. +void swork_put(void)
  23907. +{
  23908. + mutex_lock(&worker_mutex);
  23909. +
  23910. + glob_worker->refs--;
  23911. + if (glob_worker->refs > 0)
  23912. + goto out;
  23913. +
  23914. + swork_destroy(glob_worker);
  23915. + glob_worker = NULL;
  23916. +out:
  23917. + mutex_unlock(&worker_mutex);
  23918. +}
  23919. +EXPORT_SYMBOL_GPL(swork_put);
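For context, a hedged sketch of how a driver might consume this API. Only swork_get(), swork_queue(), swork_put() and the swork_event fields touched above (func, item, flags) come from the patch; the driver names and the irq handler are illustrative, and if linux/work-simple.h provides an initializer helper it should be preferred over the open-coded setup below.

#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/work-simple.h>

static struct swork_event my_sev;

/* Runs in the kswork kthread: it may sleep and take sleeping locks. */
static void my_swork_cb(struct swork_event *sev)
{
	/* ... deferred, preemptible work ... */
}

static int my_driver_init(void)
{
	int ret;

	ret = swork_get();		/* create or reference the global worker */
	if (ret)
		return ret;

	INIT_LIST_HEAD(&my_sev.item);
	my_sev.flags = 0;
	my_sev.func = my_swork_cb;
	return 0;
}

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	swork_queue(&my_sev);		/* safe from hard-irq context on RT */
	return IRQ_HANDLED;
}

static void my_driver_exit(void)
{
	swork_put();			/* last reference destroys the kswork thread */
}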
  23920. diff -Nur linux-3.18.14.orig/kernel/signal.c linux-3.18.14-rt/kernel/signal.c
  23921. --- linux-3.18.14.orig/kernel/signal.c 2015-05-20 10:04:50.000000000 -0500
  23922. +++ linux-3.18.14-rt/kernel/signal.c 2015-05-31 15:32:48.921635363 -0500
  23923. @@ -14,6 +14,7 @@
  23924. #include <linux/export.h>
  23925. #include <linux/init.h>
  23926. #include <linux/sched.h>
  23927. +#include <linux/sched/rt.h>
  23928. #include <linux/fs.h>
  23929. #include <linux/tty.h>
  23930. #include <linux/binfmts.h>
  23931. @@ -352,13 +353,45 @@
  23932. return false;
  23933. }
  23934. +#ifdef __HAVE_ARCH_CMPXCHG
  23935. +static inline struct sigqueue *get_task_cache(struct task_struct *t)
  23936. +{
  23937. + struct sigqueue *q = t->sigqueue_cache;
  23938. +
  23939. + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
  23940. + return NULL;
  23941. + return q;
  23942. +}
  23943. +
  23944. +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
  23945. +{
  23946. + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
  23947. + return 0;
  23948. + return 1;
  23949. +}
  23950. +
  23951. +#else
  23952. +
  23953. +static inline struct sigqueue *get_task_cache(struct task_struct *t)
  23954. +{
  23955. + return NULL;
  23956. +}
  23957. +
  23958. +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
  23959. +{
  23960. + return 1;
  23961. +}
  23962. +
  23963. +#endif
  23964. +
  23965. /*
  23966. * allocate a new signal queue record
  23967. * - this may be called without locks if and only if t == current, otherwise an
  23968. * appropriate lock must be held to stop the target task from exiting
  23969. */
  23970. static struct sigqueue *
  23971. -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
  23972. +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
  23973. + int override_rlimit, int fromslab)
  23974. {
  23975. struct sigqueue *q = NULL;
  23976. struct user_struct *user;
  23977. @@ -375,7 +408,10 @@
  23978. if (override_rlimit ||
  23979. atomic_read(&user->sigpending) <=
  23980. task_rlimit(t, RLIMIT_SIGPENDING)) {
  23981. - q = kmem_cache_alloc(sigqueue_cachep, flags);
  23982. + if (!fromslab)
  23983. + q = get_task_cache(t);
  23984. + if (!q)
  23985. + q = kmem_cache_alloc(sigqueue_cachep, flags);
  23986. } else {
  23987. print_dropped_signal(sig);
  23988. }
  23989. @@ -392,6 +428,13 @@
  23990. return q;
  23991. }
  23992. +static struct sigqueue *
  23993. +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
  23994. + int override_rlimit)
  23995. +{
  23996. + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
  23997. +}
  23998. +
  23999. static void __sigqueue_free(struct sigqueue *q)
  24000. {
  24001. if (q->flags & SIGQUEUE_PREALLOC)
  24002. @@ -401,6 +444,21 @@
  24003. kmem_cache_free(sigqueue_cachep, q);
  24004. }
  24005. +static void sigqueue_free_current(struct sigqueue *q)
  24006. +{
  24007. + struct user_struct *up;
  24008. +
  24009. + if (q->flags & SIGQUEUE_PREALLOC)
  24010. + return;
  24011. +
  24012. + up = q->user;
  24013. + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
  24014. + atomic_dec(&up->sigpending);
  24015. + free_uid(up);
  24016. + } else
  24017. + __sigqueue_free(q);
  24018. +}
  24019. +
  24020. void flush_sigqueue(struct sigpending *queue)
  24021. {
  24022. struct sigqueue *q;
  24023. @@ -414,6 +472,21 @@
  24024. }
  24025. /*
  24026. + * Called from __exit_signal. Flush tsk->pending and
  24027. + * tsk->sigqueue_cache
  24028. + */
  24029. +void flush_task_sigqueue(struct task_struct *tsk)
  24030. +{
  24031. + struct sigqueue *q;
  24032. +
  24033. + flush_sigqueue(&tsk->pending);
  24034. +
  24035. + q = get_task_cache(tsk);
  24036. + if (q)
  24037. + kmem_cache_free(sigqueue_cachep, q);
  24038. +}
  24039. +
  24040. +/*
  24041. * Flush all pending signals for a task.
  24042. */
  24043. void __flush_signals(struct task_struct *t)
  24044. @@ -565,7 +638,7 @@
  24045. still_pending:
  24046. list_del_init(&first->list);
  24047. copy_siginfo(info, &first->info);
  24048. - __sigqueue_free(first);
  24049. + sigqueue_free_current(first);
  24050. } else {
  24051. /*
  24052. * Ok, it wasn't in the queue. This must be
  24053. @@ -611,6 +684,8 @@
  24054. {
  24055. int signr;
  24056. + WARN_ON_ONCE(tsk != current);
  24057. +
  24058. /* We only dequeue private signals from ourselves, we don't let
  24059. * signalfd steal them
  24060. */
  24061. @@ -1207,8 +1282,8 @@
  24062. * We don't want to have recursive SIGSEGV's etc, for example,
  24063. * that is why we also clear SIGNAL_UNKILLABLE.
  24064. */
  24065. -int
  24066. -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  24067. +static int
  24068. +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  24069. {
  24070. unsigned long int flags;
  24071. int ret, blocked, ignored;
  24072. @@ -1233,6 +1308,39 @@
  24073. return ret;
  24074. }
  24075. +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  24076. +{
  24077. +/*
  24078. + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
24079. + * since it cannot enable preemption, and the signal code's spin_locks
  24080. + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
  24081. + * send the signal on exit of the trap.
  24082. + */
  24083. +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
  24084. + if (in_atomic()) {
  24085. + if (WARN_ON_ONCE(t != current))
  24086. + return 0;
  24087. + if (WARN_ON_ONCE(t->forced_info.si_signo))
  24088. + return 0;
  24089. +
  24090. + if (is_si_special(info)) {
  24091. + WARN_ON_ONCE(info != SEND_SIG_PRIV);
  24092. + t->forced_info.si_signo = sig;
  24093. + t->forced_info.si_errno = 0;
  24094. + t->forced_info.si_code = SI_KERNEL;
  24095. + t->forced_info.si_pid = 0;
  24096. + t->forced_info.si_uid = 0;
  24097. + } else {
  24098. + t->forced_info = *info;
  24099. + }
  24100. +
  24101. + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
  24102. + return 0;
  24103. + }
  24104. +#endif
  24105. + return do_force_sig_info(sig, info, t);
  24106. +}
  24107. +
  24108. /*
  24109. * Nuke all other threads in the group.
  24110. */
  24111. @@ -1267,12 +1375,12 @@
  24112. * Disable interrupts early to avoid deadlocks.
  24113. * See rcu_read_unlock() comment header for details.
  24114. */
  24115. - local_irq_save(*flags);
  24116. + local_irq_save_nort(*flags);
  24117. rcu_read_lock();
  24118. sighand = rcu_dereference(tsk->sighand);
  24119. if (unlikely(sighand == NULL)) {
  24120. rcu_read_unlock();
  24121. - local_irq_restore(*flags);
  24122. + local_irq_restore_nort(*flags);
  24123. break;
  24124. }
  24125. @@ -1283,7 +1391,7 @@
  24126. }
  24127. spin_unlock(&sighand->siglock);
  24128. rcu_read_unlock();
  24129. - local_irq_restore(*flags);
  24130. + local_irq_restore_nort(*flags);
  24131. }
  24132. return sighand;
  24133. @@ -1528,7 +1636,8 @@
  24134. */
  24135. struct sigqueue *sigqueue_alloc(void)
  24136. {
  24137. - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
24138. + /* Preallocated sigqueue objects always come from the slabcache! */
  24139. + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
  24140. if (q)
  24141. q->flags |= SIGQUEUE_PREALLOC;
  24142. @@ -1889,15 +1998,7 @@
  24143. if (gstop_done && ptrace_reparented(current))
  24144. do_notify_parent_cldstop(current, false, why);
  24145. - /*
  24146. - * Don't want to allow preemption here, because
  24147. - * sys_ptrace() needs this task to be inactive.
  24148. - *
  24149. - * XXX: implement read_unlock_no_resched().
  24150. - */
  24151. - preempt_disable();
  24152. read_unlock(&tasklist_lock);
  24153. - preempt_enable_no_resched();
  24154. freezable_schedule();
  24155. } else {
  24156. /*
  24157. diff -Nur linux-3.18.14.orig/kernel/softirq.c linux-3.18.14-rt/kernel/softirq.c
  24158. --- linux-3.18.14.orig/kernel/softirq.c 2015-05-20 10:04:50.000000000 -0500
  24159. +++ linux-3.18.14-rt/kernel/softirq.c 2015-05-31 15:32:48.921635363 -0500
  24160. @@ -21,10 +21,12 @@
  24161. #include <linux/freezer.h>
  24162. #include <linux/kthread.h>
  24163. #include <linux/rcupdate.h>
  24164. +#include <linux/delay.h>
  24165. #include <linux/ftrace.h>
  24166. #include <linux/smp.h>
  24167. #include <linux/smpboot.h>
  24168. #include <linux/tick.h>
  24169. +#include <linux/locallock.h>
  24170. #include <linux/irq.h>
  24171. #define CREATE_TRACE_POINTS
  24172. @@ -62,6 +64,98 @@
  24173. "TASKLET", "SCHED", "HRTIMER", "RCU"
  24174. };
  24175. +#ifdef CONFIG_NO_HZ_COMMON
  24176. +# ifdef CONFIG_PREEMPT_RT_FULL
  24177. +
  24178. +struct softirq_runner {
  24179. + struct task_struct *runner[NR_SOFTIRQS];
  24180. +};
  24181. +
  24182. +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
  24183. +
  24184. +static inline void softirq_set_runner(unsigned int sirq)
  24185. +{
  24186. + struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
  24187. +
  24188. + sr->runner[sirq] = current;
  24189. +}
  24190. +
  24191. +static inline void softirq_clr_runner(unsigned int sirq)
  24192. +{
  24193. + struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
  24194. +
  24195. + sr->runner[sirq] = NULL;
  24196. +}
  24197. +
  24198. +/*
  24199. + * On preempt-rt a softirq running context might be blocked on a
  24200. + * lock. There might be no other runnable task on this CPU because the
  24201. + * lock owner runs on some other CPU. So we have to go into idle with
24202. + * the pending bit set. Therefore we need to check this, otherwise we
24203. + * warn about false positives which confuse users and defeat the
  24204. + * whole purpose of this test.
  24205. + *
  24206. + * This code is called with interrupts disabled.
  24207. + */
  24208. +void softirq_check_pending_idle(void)
  24209. +{
  24210. + static int rate_limit;
  24211. + struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
  24212. + u32 warnpending;
  24213. + int i;
  24214. +
  24215. + if (rate_limit >= 10)
  24216. + return;
  24217. +
  24218. + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
  24219. + for (i = 0; i < NR_SOFTIRQS; i++) {
  24220. + struct task_struct *tsk = sr->runner[i];
  24221. +
  24222. + /*
  24223. + * The wakeup code in rtmutex.c wakes up the task
  24224. + * _before_ it sets pi_blocked_on to NULL under
  24225. + * tsk->pi_lock. So we need to check for both: state
  24226. + * and pi_blocked_on.
  24227. + */
  24228. + if (tsk) {
  24229. + raw_spin_lock(&tsk->pi_lock);
  24230. + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
  24231. + /* Clear all bits pending in that task */
  24232. + warnpending &= ~(tsk->softirqs_raised);
  24233. + warnpending &= ~(1 << i);
  24234. + }
  24235. + raw_spin_unlock(&tsk->pi_lock);
  24236. + }
  24237. + }
  24238. +
  24239. + if (warnpending) {
  24240. + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
  24241. + warnpending);
  24242. + rate_limit++;
  24243. + }
  24244. +}
  24245. +# else
  24246. +/*
  24247. + * On !PREEMPT_RT we just printk rate limited:
  24248. + */
  24249. +void softirq_check_pending_idle(void)
  24250. +{
  24251. + static int rate_limit;
  24252. +
  24253. + if (rate_limit < 10 &&
  24254. + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
  24255. + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
  24256. + local_softirq_pending());
  24257. + rate_limit++;
  24258. + }
  24259. +}
  24260. +# endif
  24261. +
  24262. +#else /* !CONFIG_NO_HZ_COMMON */
  24263. +static inline void softirq_set_runner(unsigned int sirq) { }
  24264. +static inline void softirq_clr_runner(unsigned int sirq) { }
  24265. +#endif
  24266. +
  24267. /*
  24268. * we cannot loop indefinitely here to avoid userspace starvation,
  24269. * but we also don't want to introduce a worst case 1/HZ latency
  24270. @@ -77,6 +171,70 @@
  24271. wake_up_process(tsk);
  24272. }
  24273. +static void handle_softirq(unsigned int vec_nr)
  24274. +{
  24275. + struct softirq_action *h = softirq_vec + vec_nr;
  24276. + int prev_count;
  24277. +
  24278. + prev_count = preempt_count();
  24279. +
  24280. + kstat_incr_softirqs_this_cpu(vec_nr);
  24281. +
  24282. + trace_softirq_entry(vec_nr);
  24283. + h->action(h);
  24284. + trace_softirq_exit(vec_nr);
  24285. + if (unlikely(prev_count != preempt_count())) {
  24286. + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
  24287. + vec_nr, softirq_to_name[vec_nr], h->action,
  24288. + prev_count, preempt_count());
  24289. + preempt_count_set(prev_count);
  24290. + }
  24291. +}
  24292. +
  24293. +#ifndef CONFIG_PREEMPT_RT_FULL
  24294. +static inline int ksoftirqd_softirq_pending(void)
  24295. +{
  24296. + return local_softirq_pending();
  24297. +}
  24298. +
  24299. +static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs)
  24300. +{
  24301. + struct softirq_action *h = softirq_vec;
  24302. + int softirq_bit;
  24303. +
  24304. + local_irq_enable();
  24305. +
  24306. + h = softirq_vec;
  24307. +
  24308. + while ((softirq_bit = ffs(pending))) {
  24309. + unsigned int vec_nr;
  24310. +
  24311. + h += softirq_bit - 1;
  24312. + vec_nr = h - softirq_vec;
  24313. + handle_softirq(vec_nr);
  24314. +
  24315. + h++;
  24316. + pending >>= softirq_bit;
  24317. + }
  24318. +
  24319. + if (need_rcu_bh_qs)
  24320. + rcu_bh_qs();
  24321. + local_irq_disable();
  24322. +}
  24323. +
  24324. +static void run_ksoftirqd(unsigned int cpu)
  24325. +{
  24326. + local_irq_disable();
  24327. + if (ksoftirqd_softirq_pending()) {
  24328. + __do_softirq();
  24329. + rcu_note_context_switch(cpu);
  24330. + local_irq_enable();
  24331. + cond_resched();
  24332. + return;
  24333. + }
  24334. + local_irq_enable();
  24335. +}
  24336. +
  24337. /*
  24338. * preempt_count and SOFTIRQ_OFFSET usage:
  24339. * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
  24340. @@ -228,10 +386,8 @@
  24341. unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
  24342. unsigned long old_flags = current->flags;
  24343. int max_restart = MAX_SOFTIRQ_RESTART;
  24344. - struct softirq_action *h;
  24345. bool in_hardirq;
  24346. __u32 pending;
  24347. - int softirq_bit;
  24348. /*
  24349. * Mask out PF_MEMALLOC s current task context is borrowed for the
  24350. @@ -250,36 +406,7 @@
  24351. /* Reset the pending bitmask before enabling irqs */
  24352. set_softirq_pending(0);
  24353. - local_irq_enable();
  24354. -
  24355. - h = softirq_vec;
  24356. -
  24357. - while ((softirq_bit = ffs(pending))) {
  24358. - unsigned int vec_nr;
  24359. - int prev_count;
  24360. -
  24361. - h += softirq_bit - 1;
  24362. -
  24363. - vec_nr = h - softirq_vec;
  24364. - prev_count = preempt_count();
  24365. -
  24366. - kstat_incr_softirqs_this_cpu(vec_nr);
  24367. -
  24368. - trace_softirq_entry(vec_nr);
  24369. - h->action(h);
  24370. - trace_softirq_exit(vec_nr);
  24371. - if (unlikely(prev_count != preempt_count())) {
  24372. - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
  24373. - vec_nr, softirq_to_name[vec_nr], h->action,
  24374. - prev_count, preempt_count());
  24375. - preempt_count_set(prev_count);
  24376. - }
  24377. - h++;
  24378. - pending >>= softirq_bit;
  24379. - }
  24380. -
  24381. - rcu_bh_qs();
  24382. - local_irq_disable();
  24383. + handle_pending_softirqs(pending, 1);
  24384. pending = local_softirq_pending();
  24385. if (pending) {
  24386. @@ -316,6 +443,285 @@
  24387. }
  24388. /*
  24389. + * This function must run with irqs disabled!
  24390. + */
  24391. +void raise_softirq_irqoff(unsigned int nr)
  24392. +{
  24393. + __raise_softirq_irqoff(nr);
  24394. +
  24395. + /*
  24396. + * If we're in an interrupt or softirq, we're done
  24397. + * (this also catches softirq-disabled code). We will
  24398. + * actually run the softirq once we return from
  24399. + * the irq or softirq.
  24400. + *
  24401. + * Otherwise we wake up ksoftirqd to make sure we
  24402. + * schedule the softirq soon.
  24403. + */
  24404. + if (!in_interrupt())
  24405. + wakeup_softirqd();
  24406. +}
  24407. +
  24408. +void __raise_softirq_irqoff(unsigned int nr)
  24409. +{
  24410. + trace_softirq_raise(nr);
  24411. + or_softirq_pending(1UL << nr);
  24412. +}
  24413. +
  24414. +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
  24415. +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
  24416. +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
  24417. +static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { }
  24418. +
  24419. +#else /* !PREEMPT_RT_FULL */
  24420. +
  24421. +/*
  24422. + * On RT we serialize softirq execution with a cpu local lock per softirq
  24423. + */
  24424. +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
  24425. +
  24426. +void __init softirq_early_init(void)
  24427. +{
  24428. + int i;
  24429. +
  24430. + for (i = 0; i < NR_SOFTIRQS; i++)
  24431. + local_irq_lock_init(local_softirq_locks[i]);
  24432. +}
  24433. +
  24434. +static void lock_softirq(int which)
  24435. +{
  24436. + local_lock(local_softirq_locks[which]);
  24437. +}
  24438. +
  24439. +static void unlock_softirq(int which)
  24440. +{
  24441. + local_unlock(local_softirq_locks[which]);
  24442. +}
  24443. +
  24444. +static void do_single_softirq(int which, int need_rcu_bh_qs)
  24445. +{
  24446. + unsigned long old_flags = current->flags;
  24447. +
  24448. + current->flags &= ~PF_MEMALLOC;
  24449. + vtime_account_irq_enter(current);
  24450. + current->flags |= PF_IN_SOFTIRQ;
  24451. + lockdep_softirq_enter();
  24452. + local_irq_enable();
  24453. + handle_softirq(which);
  24454. + local_irq_disable();
  24455. + lockdep_softirq_exit();
  24456. + current->flags &= ~PF_IN_SOFTIRQ;
  24457. + vtime_account_irq_enter(current);
  24458. + tsk_restore_flags(current, old_flags, PF_MEMALLOC);
  24459. +}
  24460. +
  24461. +/*
  24462. + * Called with interrupts disabled. Process softirqs which were raised
  24463. + * in current context (or on behalf of ksoftirqd).
  24464. + */
  24465. +static void do_current_softirqs(int need_rcu_bh_qs)
  24466. +{
  24467. + while (current->softirqs_raised) {
  24468. + int i = __ffs(current->softirqs_raised);
  24469. + unsigned int pending, mask = (1U << i);
  24470. +
  24471. + current->softirqs_raised &= ~mask;
  24472. + local_irq_enable();
  24473. +
  24474. + /*
  24475. + * If the lock is contended, we boost the owner to
  24476. + * process the softirq or leave the critical section
  24477. + * now.
  24478. + */
  24479. + lock_softirq(i);
  24480. + local_irq_disable();
  24481. + softirq_set_runner(i);
  24482. + /*
  24483. + * Check with the local_softirq_pending() bits,
  24484. + * whether we need to process this still or if someone
  24485. + * else took care of it.
  24486. + */
  24487. + pending = local_softirq_pending();
  24488. + if (pending & mask) {
  24489. + set_softirq_pending(pending & ~mask);
  24490. + do_single_softirq(i, need_rcu_bh_qs);
  24491. + }
  24492. + softirq_clr_runner(i);
  24493. + unlock_softirq(i);
  24494. + WARN_ON(current->softirq_nestcnt != 1);
  24495. + }
  24496. +}
  24497. +
  24498. +static void __local_bh_disable(void)
  24499. +{
  24500. + if (++current->softirq_nestcnt == 1)
  24501. + migrate_disable();
  24502. +}
  24503. +
  24504. +void local_bh_disable(void)
  24505. +{
  24506. + __local_bh_disable();
  24507. +}
  24508. +EXPORT_SYMBOL(local_bh_disable);
  24509. +
  24510. +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
  24511. +{
  24512. + __local_bh_disable();
  24513. + if (cnt & PREEMPT_CHECK_OFFSET)
  24514. + preempt_disable();
  24515. +}
  24516. +
  24517. +static void __local_bh_enable(void)
  24518. +{
  24519. + if (WARN_ON(current->softirq_nestcnt == 0))
  24520. + return;
  24521. +
  24522. + local_irq_disable();
  24523. + if (current->softirq_nestcnt == 1 && current->softirqs_raised)
  24524. + do_current_softirqs(1);
  24525. + local_irq_enable();
  24526. +
  24527. + if (--current->softirq_nestcnt == 0)
  24528. + migrate_enable();
  24529. +}
  24530. +
  24531. +void local_bh_enable(void)
  24532. +{
  24533. + __local_bh_enable();
  24534. +}
  24535. +EXPORT_SYMBOL(local_bh_enable);
  24536. +
  24537. +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
  24538. +{
  24539. + __local_bh_enable();
  24540. + if (cnt & PREEMPT_CHECK_OFFSET)
  24541. + preempt_enable();
  24542. +}
  24543. +
  24544. +void local_bh_enable_ip(unsigned long ip)
  24545. +{
  24546. + local_bh_enable();
  24547. +}
  24548. +EXPORT_SYMBOL(local_bh_enable_ip);
  24549. +
  24550. +void _local_bh_enable(void)
  24551. +{
  24552. + if (WARN_ON(current->softirq_nestcnt == 0))
  24553. + return;
  24554. + if (--current->softirq_nestcnt == 0)
  24555. + migrate_enable();
  24556. +}
  24557. +EXPORT_SYMBOL(_local_bh_enable);
  24558. +
  24559. +int in_serving_softirq(void)
  24560. +{
  24561. + return current->flags & PF_IN_SOFTIRQ;
  24562. +}
  24563. +EXPORT_SYMBOL(in_serving_softirq);
  24564. +
  24565. +/* Called with preemption disabled */
  24566. +static void run_ksoftirqd(unsigned int cpu)
  24567. +{
  24568. + local_irq_disable();
  24569. + current->softirq_nestcnt++;
  24570. +
  24571. + do_current_softirqs(1);
  24572. + current->softirq_nestcnt--;
  24573. + rcu_note_context_switch(cpu);
  24574. + local_irq_enable();
  24575. +}
  24576. +
  24577. +/*
  24578. + * Called from netif_rx_ni(). Preemption enabled, but migration
  24579. + * disabled. So the cpu can't go away under us.
  24580. + */
  24581. +void thread_do_softirq(void)
  24582. +{
  24583. + if (!in_serving_softirq() && current->softirqs_raised) {
  24584. + current->softirq_nestcnt++;
  24585. + do_current_softirqs(0);
  24586. + current->softirq_nestcnt--;
  24587. + }
  24588. +}
  24589. +
  24590. +static void do_raise_softirq_irqoff(unsigned int nr)
  24591. +{
  24592. + trace_softirq_raise(nr);
  24593. + or_softirq_pending(1UL << nr);
  24594. +
  24595. + /*
  24596. + * If we are not in a hard interrupt and inside a bh disabled
  24597. + * region, we simply raise the flag on current. local_bh_enable()
  24598. + * will make sure that the softirq is executed. Otherwise we
  24599. + * delegate it to ksoftirqd.
  24600. + */
  24601. + if (!in_irq() && current->softirq_nestcnt)
  24602. + current->softirqs_raised |= (1U << nr);
  24603. + else if (__this_cpu_read(ksoftirqd))
  24604. + __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr);
  24605. +}
  24606. +
  24607. +void __raise_softirq_irqoff(unsigned int nr)
  24608. +{
  24609. + do_raise_softirq_irqoff(nr);
  24610. + if (!in_irq() && !current->softirq_nestcnt)
  24611. + wakeup_softirqd();
  24612. +}
  24613. +
  24614. +/*
  24615. + * This function must run with irqs disabled!
  24616. + */
  24617. +void raise_softirq_irqoff(unsigned int nr)
  24618. +{
  24619. + do_raise_softirq_irqoff(nr);
  24620. +
  24621. + /*
24622. + * If we're in a hard interrupt we let the irq return code deal
  24623. + * with the wakeup of ksoftirqd.
  24624. + */
  24625. + if (in_irq())
  24626. + return;
  24627. + /*
  24628. + * If we are in thread context but outside of a bh disabled
  24629. + * region, we need to wake ksoftirqd as well.
  24630. + *
  24631. + * CHECKME: Some of the places which do that could be wrapped
  24632. + * into local_bh_disable/enable pairs. Though it's unclear
  24633. + * whether this is worth the effort. To find those places just
  24634. + * raise a WARN() if the condition is met.
  24635. + */
  24636. + if (!current->softirq_nestcnt)
  24637. + wakeup_softirqd();
  24638. +}
  24639. +
  24640. +static inline int ksoftirqd_softirq_pending(void)
  24641. +{
  24642. + return current->softirqs_raised;
  24643. +}
  24644. +
  24645. +static inline void local_bh_disable_nort(void) { }
  24646. +static inline void _local_bh_enable_nort(void) { }
  24647. +
  24648. +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
  24649. +{
  24650. + struct sched_param param = { .sched_priority = 1 };
  24651. +
  24652. + sched_setscheduler(current, SCHED_FIFO, &param);
  24653. + /* Take over all pending softirqs when starting */
  24654. + local_irq_disable();
  24655. + current->softirqs_raised = local_softirq_pending();
  24656. + local_irq_enable();
  24657. +}
  24658. +
  24659. +static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online)
  24660. +{
  24661. + struct sched_param param = { .sched_priority = 0 };
  24662. +
  24663. + sched_setscheduler(current, SCHED_NORMAL, &param);
  24664. +}
  24665. +
  24666. +#endif /* PREEMPT_RT_FULL */
  24667. +/*
  24668. * Enter an interrupt context.
  24669. */
  24670. void irq_enter(void)
  24671. @@ -326,9 +732,9 @@
  24672. * Prevent raise_softirq from needlessly waking up ksoftirqd
  24673. * here, as softirq will be serviced on return from interrupt.
  24674. */
  24675. - local_bh_disable();
  24676. + local_bh_disable_nort();
  24677. tick_irq_enter();
  24678. - _local_bh_enable();
  24679. + _local_bh_enable_nort();
  24680. }
  24681. __irq_enter();
  24682. @@ -336,6 +742,7 @@
  24683. static inline void invoke_softirq(void)
  24684. {
  24685. +#ifndef CONFIG_PREEMPT_RT_FULL
  24686. if (!force_irqthreads) {
  24687. #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
  24688. /*
  24689. @@ -355,6 +762,15 @@
  24690. } else {
  24691. wakeup_softirqd();
  24692. }
  24693. +#else /* PREEMPT_RT_FULL */
  24694. + unsigned long flags;
  24695. +
  24696. + local_irq_save(flags);
  24697. + if (__this_cpu_read(ksoftirqd) &&
  24698. + __this_cpu_read(ksoftirqd)->softirqs_raised)
  24699. + wakeup_softirqd();
  24700. + local_irq_restore(flags);
  24701. +#endif
  24702. }
  24703. static inline void tick_irq_exit(void)
  24704. @@ -391,26 +807,6 @@
  24705. trace_hardirq_exit(); /* must be last! */
  24706. }
  24707. -/*
  24708. - * This function must run with irqs disabled!
  24709. - */
  24710. -inline void raise_softirq_irqoff(unsigned int nr)
  24711. -{
  24712. - __raise_softirq_irqoff(nr);
  24713. -
  24714. - /*
  24715. - * If we're in an interrupt or softirq, we're done
  24716. - * (this also catches softirq-disabled code). We will
  24717. - * actually run the softirq once we return from
  24718. - * the irq or softirq.
  24719. - *
  24720. - * Otherwise we wake up ksoftirqd to make sure we
  24721. - * schedule the softirq soon.
  24722. - */
  24723. - if (!in_interrupt())
  24724. - wakeup_softirqd();
  24725. -}
  24726. -
  24727. void raise_softirq(unsigned int nr)
  24728. {
  24729. unsigned long flags;
  24730. @@ -420,12 +816,6 @@
  24731. local_irq_restore(flags);
  24732. }
  24733. -void __raise_softirq_irqoff(unsigned int nr)
  24734. -{
  24735. - trace_softirq_raise(nr);
  24736. - or_softirq_pending(1UL << nr);
  24737. -}
  24738. -
  24739. void open_softirq(int nr, void (*action)(struct softirq_action *))
  24740. {
  24741. softirq_vec[nr].action = action;
  24742. @@ -442,15 +832,45 @@
  24743. static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
  24744. static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24745. +static inline void
  24746. +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
  24747. +{
  24748. + if (tasklet_trylock(t)) {
  24749. +again:
  24750. + /* We may have been preempted before tasklet_trylock
  24751. + * and __tasklet_action may have already run.
  24752. + * So double check the sched bit while the takslet
  24753. + * is locked before adding it to the list.
  24754. + */
  24755. + if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
  24756. + t->next = NULL;
  24757. + *head->tail = t;
  24758. + head->tail = &(t->next);
  24759. + raise_softirq_irqoff(nr);
  24760. + tasklet_unlock(t);
  24761. + } else {
24762. + /* This is subtle. If we hit the corner case above,
24763. + * it is possible that we get preempted right here,
  24764. + * and another task has successfully called
  24765. + * tasklet_schedule(), then this function, and
  24766. + * failed on the trylock. Thus we must be sure
  24767. + * before releasing the tasklet lock, that the
  24768. + * SCHED_BIT is clear. Otherwise the tasklet
  24769. + * may get its SCHED_BIT set, but not added to the
24770. + * list.
  24771. + */
  24772. + if (!tasklet_tryunlock(t))
  24773. + goto again;
  24774. + }
  24775. + }
  24776. +}
  24777. +
  24778. void __tasklet_schedule(struct tasklet_struct *t)
  24779. {
  24780. unsigned long flags;
  24781. local_irq_save(flags);
  24782. - t->next = NULL;
  24783. - *__this_cpu_read(tasklet_vec.tail) = t;
  24784. - __this_cpu_write(tasklet_vec.tail, &(t->next));
  24785. - raise_softirq_irqoff(TASKLET_SOFTIRQ);
  24786. + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ);
  24787. local_irq_restore(flags);
  24788. }
  24789. EXPORT_SYMBOL(__tasklet_schedule);
  24790. @@ -460,10 +880,7 @@
  24791. unsigned long flags;
  24792. local_irq_save(flags);
  24793. - t->next = NULL;
  24794. - *__this_cpu_read(tasklet_hi_vec.tail) = t;
  24795. - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
  24796. - raise_softirq_irqoff(HI_SOFTIRQ);
  24797. + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ);
  24798. local_irq_restore(flags);
  24799. }
  24800. EXPORT_SYMBOL(__tasklet_hi_schedule);
  24801. @@ -472,48 +889,116 @@
  24802. {
  24803. BUG_ON(!irqs_disabled());
  24804. - t->next = __this_cpu_read(tasklet_hi_vec.head);
  24805. - __this_cpu_write(tasklet_hi_vec.head, t);
  24806. - __raise_softirq_irqoff(HI_SOFTIRQ);
  24807. + __tasklet_hi_schedule(t);
  24808. }
  24809. EXPORT_SYMBOL(__tasklet_hi_schedule_first);
  24810. -static void tasklet_action(struct softirq_action *a)
  24811. +void tasklet_enable(struct tasklet_struct *t)
  24812. {
  24813. - struct tasklet_struct *list;
  24814. + if (!atomic_dec_and_test(&t->count))
  24815. + return;
  24816. + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
  24817. + tasklet_schedule(t);
  24818. +}
  24819. +EXPORT_SYMBOL(tasklet_enable);
  24820. - local_irq_disable();
  24821. - list = __this_cpu_read(tasklet_vec.head);
  24822. - __this_cpu_write(tasklet_vec.head, NULL);
  24823. - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
  24824. - local_irq_enable();
  24825. +void tasklet_hi_enable(struct tasklet_struct *t)
  24826. +{
  24827. + if (!atomic_dec_and_test(&t->count))
  24828. + return;
  24829. + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
  24830. + tasklet_hi_schedule(t);
  24831. +}
  24832. +EXPORT_SYMBOL(tasklet_hi_enable);
  24833. +
  24834. +static void __tasklet_action(struct softirq_action *a,
  24835. + struct tasklet_struct *list)
  24836. +{
  24837. + int loops = 1000000;
  24838. while (list) {
  24839. struct tasklet_struct *t = list;
  24840. list = list->next;
  24841. - if (tasklet_trylock(t)) {
  24842. - if (!atomic_read(&t->count)) {
  24843. - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
  24844. - &t->state))
  24845. - BUG();
  24846. - t->func(t->data);
  24847. - tasklet_unlock(t);
  24848. - continue;
  24849. - }
  24850. - tasklet_unlock(t);
  24851. + /*
  24852. + * Should always succeed - after a tasklist got on the
  24853. + * list (after getting the SCHED bit set from 0 to 1),
  24854. + * nothing but the tasklet softirq it got queued to can
  24855. + * lock it:
  24856. + */
  24857. + if (!tasklet_trylock(t)) {
  24858. + WARN_ON(1);
  24859. + continue;
  24860. }
  24861. - local_irq_disable();
  24862. t->next = NULL;
  24863. - *__this_cpu_read(tasklet_vec.tail) = t;
  24864. - __this_cpu_write(tasklet_vec.tail, &(t->next));
  24865. - __raise_softirq_irqoff(TASKLET_SOFTIRQ);
  24866. - local_irq_enable();
  24867. +
  24868. + /*
  24869. + * If we cannot handle the tasklet because it's disabled,
  24870. + * mark it as pending. tasklet_enable() will later
  24871. + * re-schedule the tasklet.
  24872. + */
  24873. + if (unlikely(atomic_read(&t->count))) {
  24874. +out_disabled:
  24875. + /* implicit unlock: */
  24876. + wmb();
  24877. + t->state = TASKLET_STATEF_PENDING;
  24878. + continue;
  24879. + }
  24880. +
  24881. + /*
  24882. + * After this point on the tasklet might be rescheduled
  24883. + * on another CPU, but it can only be added to another
  24884. + * CPU's tasklet list if we unlock the tasklet (which we
24885. + * don't do yet).
  24886. + */
  24887. + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
  24888. + WARN_ON(1);
  24889. +
  24890. +again:
  24891. + t->func(t->data);
  24892. +
  24893. + /*
  24894. + * Try to unlock the tasklet. We must use cmpxchg, because
  24895. + * another CPU might have scheduled or disabled the tasklet.
  24896. + * We only allow the STATE_RUN -> 0 transition here.
  24897. + */
  24898. + while (!tasklet_tryunlock(t)) {
  24899. + /*
  24900. + * If it got disabled meanwhile, bail out:
  24901. + */
  24902. + if (atomic_read(&t->count))
  24903. + goto out_disabled;
  24904. + /*
  24905. + * If it got scheduled meanwhile, re-execute
  24906. + * the tasklet function:
  24907. + */
  24908. + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
  24909. + goto again;
  24910. + if (!--loops) {
  24911. + printk("hm, tasklet state: %08lx\n", t->state);
  24912. + WARN_ON(1);
  24913. + tasklet_unlock(t);
  24914. + break;
  24915. + }
  24916. + }
  24917. }
  24918. }
  24919. +static void tasklet_action(struct softirq_action *a)
  24920. +{
  24921. + struct tasklet_struct *list;
  24922. +
  24923. + local_irq_disable();
  24924. + list = __get_cpu_var(tasklet_vec).head;
  24925. + __get_cpu_var(tasklet_vec).head = NULL;
  24926. + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
  24927. + local_irq_enable();
  24928. +
  24929. + __tasklet_action(a, list);
  24930. +}
  24931. +
  24932. static void tasklet_hi_action(struct softirq_action *a)
  24933. {
  24934. struct tasklet_struct *list;
  24935. @@ -524,30 +1009,7 @@
  24936. __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
  24937. local_irq_enable();
  24938. - while (list) {
  24939. - struct tasklet_struct *t = list;
  24940. -
  24941. - list = list->next;
  24942. -
  24943. - if (tasklet_trylock(t)) {
  24944. - if (!atomic_read(&t->count)) {
  24945. - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
  24946. - &t->state))
  24947. - BUG();
  24948. - t->func(t->data);
  24949. - tasklet_unlock(t);
  24950. - continue;
  24951. - }
  24952. - tasklet_unlock(t);
  24953. - }
  24954. -
  24955. - local_irq_disable();
  24956. - t->next = NULL;
  24957. - *__this_cpu_read(tasklet_hi_vec.tail) = t;
  24958. - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
  24959. - __raise_softirq_irqoff(HI_SOFTIRQ);
  24960. - local_irq_enable();
  24961. - }
  24962. + __tasklet_action(a, list);
  24963. }
  24964. void tasklet_init(struct tasklet_struct *t,
  24965. @@ -568,7 +1030,7 @@
  24966. while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
  24967. do {
  24968. - yield();
  24969. + msleep(1);
  24970. } while (test_bit(TASKLET_STATE_SCHED, &t->state));
  24971. }
  24972. tasklet_unlock_wait(t);
  24973. @@ -642,26 +1104,26 @@
  24974. open_softirq(HI_SOFTIRQ, tasklet_hi_action);
  24975. }
  24976. -static int ksoftirqd_should_run(unsigned int cpu)
  24977. -{
  24978. - return local_softirq_pending();
  24979. -}
  24980. -
  24981. -static void run_ksoftirqd(unsigned int cpu)
  24982. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  24983. +void tasklet_unlock_wait(struct tasklet_struct *t)
  24984. {
  24985. - local_irq_disable();
  24986. - if (local_softirq_pending()) {
  24987. + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
  24988. /*
  24989. - * We can safely run softirq on inline stack, as we are not deep
  24990. - * in the task stack here.
  24991. + * Hack for now to avoid this busy-loop:
  24992. */
  24993. - __do_softirq();
  24994. - rcu_note_context_switch(cpu);
  24995. - local_irq_enable();
  24996. - cond_resched();
  24997. - return;
  24998. +#ifdef CONFIG_PREEMPT_RT_FULL
  24999. + msleep(1);
  25000. +#else
  25001. + barrier();
  25002. +#endif
  25003. }
  25004. - local_irq_enable();
  25005. +}
  25006. +EXPORT_SYMBOL(tasklet_unlock_wait);
  25007. +#endif
  25008. +
  25009. +static int ksoftirqd_should_run(unsigned int cpu)
  25010. +{
  25011. + return ksoftirqd_softirq_pending();
  25012. }
  25013. #ifdef CONFIG_HOTPLUG_CPU
  25014. @@ -743,6 +1205,8 @@
  25015. static struct smp_hotplug_thread softirq_threads = {
  25016. .store = &ksoftirqd,
  25017. + .setup = ksoftirqd_set_sched_params,
  25018. + .cleanup = ksoftirqd_clr_sched_params,
  25019. .thread_should_run = ksoftirqd_should_run,
  25020. .thread_fn = run_ksoftirqd,
  25021. .thread_comm = "ksoftirqd/%u",
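To summarize the PREEMPT_RT_FULL behaviour added above with a small, made-up example: a softirq raised from thread context inside a bh-disabled section is recorded in current->softirqs_raised and executed by the outermost local_bh_enable() in that task, under the per-softirq local lock; only when raised outside such a section (or from hard-irq context) does ksoftirqd get involved.

#include <linux/interrupt.h>

static void my_raise_example(void)
{
	local_bh_disable();

	/*
	 * do_raise_softirq_irqoff() sees softirq_nestcnt > 0 and sets the
	 * bit in current->softirqs_raised instead of waking ksoftirqd.
	 */
	raise_softirq(TASKLET_SOFTIRQ);

	/*
	 * The outermost enable runs do_current_softirqs(), so the softirq
	 * executes right here, in this task's context.
	 */
	local_bh_enable();
}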
  25022. diff -Nur linux-3.18.14.orig/kernel/stop_machine.c linux-3.18.14-rt/kernel/stop_machine.c
  25023. --- linux-3.18.14.orig/kernel/stop_machine.c 2015-05-20 10:04:50.000000000 -0500
  25024. +++ linux-3.18.14-rt/kernel/stop_machine.c 2015-05-31 15:32:48.925635362 -0500
  25025. @@ -30,12 +30,12 @@
  25026. atomic_t nr_todo; /* nr left to execute */
  25027. bool executed; /* actually executed? */
  25028. int ret; /* collected return value */
  25029. - struct completion completion; /* fired if nr_todo reaches 0 */
  25030. + struct task_struct *waiter; /* woken when nr_todo reaches 0 */
  25031. };
  25032. /* the actual stopper, one per every possible cpu, enabled on online cpus */
  25033. struct cpu_stopper {
  25034. - spinlock_t lock;
  25035. + raw_spinlock_t lock;
  25036. bool enabled; /* is this stopper enabled? */
  25037. struct list_head works; /* list of pending works */
  25038. };
  25039. @@ -56,7 +56,7 @@
  25040. {
  25041. memset(done, 0, sizeof(*done));
  25042. atomic_set(&done->nr_todo, nr_todo);
  25043. - init_completion(&done->completion);
  25044. + done->waiter = current;
  25045. }
  25046. /* signal completion unless @done is NULL */
  25047. @@ -65,8 +65,10 @@
  25048. if (done) {
  25049. if (executed)
  25050. done->executed = true;
  25051. - if (atomic_dec_and_test(&done->nr_todo))
  25052. - complete(&done->completion);
  25053. + if (atomic_dec_and_test(&done->nr_todo)) {
  25054. + wake_up_process(done->waiter);
  25055. + done->waiter = NULL;
  25056. + }
  25057. }
  25058. }
  25059. @@ -78,7 +80,7 @@
  25060. unsigned long flags;
  25061. - spin_lock_irqsave(&stopper->lock, flags);
  25062. + raw_spin_lock_irqsave(&stopper->lock, flags);
  25063. if (stopper->enabled) {
  25064. list_add_tail(&work->list, &stopper->works);
  25065. @@ -86,7 +88,23 @@
  25066. } else
  25067. cpu_stop_signal_done(work->done, false);
  25068. - spin_unlock_irqrestore(&stopper->lock, flags);
  25069. + raw_spin_unlock_irqrestore(&stopper->lock, flags);
  25070. +}
  25071. +
  25072. +static void wait_for_stop_done(struct cpu_stop_done *done)
  25073. +{
  25074. + set_current_state(TASK_UNINTERRUPTIBLE);
  25075. + while (atomic_read(&done->nr_todo)) {
  25076. + schedule();
  25077. + set_current_state(TASK_UNINTERRUPTIBLE);
  25078. + }
  25079. + /*
  25080. + * We need to wait until cpu_stop_signal_done() has cleared
  25081. + * done->waiter.
  25082. + */
  25083. + while (done->waiter)
  25084. + cpu_relax();
  25085. + set_current_state(TASK_RUNNING);
  25086. }
  25087. /**
  25088. @@ -120,7 +138,7 @@
  25089. cpu_stop_init_done(&done, 1);
  25090. cpu_stop_queue_work(cpu, &work);
  25091. - wait_for_completion(&done.completion);
  25092. + wait_for_stop_done(&done);
  25093. return done.executed ? done.ret : -ENOENT;
  25094. }
  25095. @@ -248,7 +266,7 @@
  25096. struct irq_cpu_stop_queue_work_info call_args;
  25097. struct multi_stop_data msdata;
  25098. - preempt_disable();
  25099. + preempt_disable_nort();
  25100. msdata = (struct multi_stop_data){
  25101. .fn = fn,
  25102. .data = arg,
  25103. @@ -281,7 +299,7 @@
  25104. * This relies on the stopper workqueues to be FIFO.
  25105. */
  25106. if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
  25107. - preempt_enable();
  25108. + preempt_enable_nort();
  25109. return -ENOENT;
  25110. }
  25111. @@ -295,9 +313,9 @@
  25112. &irq_cpu_stop_queue_work,
  25113. &call_args, 1);
  25114. lg_local_unlock(&stop_cpus_lock);
  25115. - preempt_enable();
  25116. + preempt_enable_nort();
  25117. - wait_for_completion(&done.completion);
  25118. + wait_for_stop_done(&done);
  25119. return done.executed ? done.ret : -ENOENT;
  25120. }
  25121. @@ -329,7 +347,7 @@
  25122. static void queue_stop_cpus_work(const struct cpumask *cpumask,
  25123. cpu_stop_fn_t fn, void *arg,
  25124. - struct cpu_stop_done *done)
  25125. + struct cpu_stop_done *done, bool inactive)
  25126. {
  25127. struct cpu_stop_work *work;
  25128. unsigned int cpu;
  25129. @@ -343,11 +361,13 @@
  25130. }
  25131. /*
  25132. - * Disable preemption while queueing to avoid getting
  25133. - * preempted by a stopper which might wait for other stoppers
  25134. - * to enter @fn which can lead to deadlock.
  25135. + * Make sure that all work is queued on all cpus before
  25136. + * any of the cpus can execute it.
  25137. */
  25138. - lg_global_lock(&stop_cpus_lock);
  25139. + if (!inactive)
  25140. + lg_global_lock(&stop_cpus_lock);
  25141. + else
  25142. + lg_global_trylock_relax(&stop_cpus_lock);
  25143. for_each_cpu(cpu, cpumask)
  25144. cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
  25145. lg_global_unlock(&stop_cpus_lock);
  25146. @@ -359,8 +379,8 @@
  25147. struct cpu_stop_done done;
  25148. cpu_stop_init_done(&done, cpumask_weight(cpumask));
  25149. - queue_stop_cpus_work(cpumask, fn, arg, &done);
  25150. - wait_for_completion(&done.completion);
  25151. + queue_stop_cpus_work(cpumask, fn, arg, &done, false);
  25152. + wait_for_stop_done(&done);
  25153. return done.executed ? done.ret : -ENOENT;
  25154. }
  25155. @@ -439,9 +459,9 @@
  25156. unsigned long flags;
  25157. int run;
  25158. - spin_lock_irqsave(&stopper->lock, flags);
  25159. + raw_spin_lock_irqsave(&stopper->lock, flags);
  25160. run = !list_empty(&stopper->works);
  25161. - spin_unlock_irqrestore(&stopper->lock, flags);
  25162. + raw_spin_unlock_irqrestore(&stopper->lock, flags);
  25163. return run;
  25164. }
  25165. @@ -453,13 +473,13 @@
  25166. repeat:
  25167. work = NULL;
  25168. - spin_lock_irq(&stopper->lock);
  25169. + raw_spin_lock_irq(&stopper->lock);
  25170. if (!list_empty(&stopper->works)) {
  25171. work = list_first_entry(&stopper->works,
  25172. struct cpu_stop_work, list);
  25173. list_del_init(&work->list);
  25174. }
  25175. - spin_unlock_irq(&stopper->lock);
  25176. + raw_spin_unlock_irq(&stopper->lock);
  25177. if (work) {
  25178. cpu_stop_fn_t fn = work->fn;
  25179. @@ -467,6 +487,16 @@
  25180. struct cpu_stop_done *done = work->done;
  25181. char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
  25182. + /*
  25183. + * Wait until the stopper has finished scheduling on all
  25184. + * cpus
  25185. + */
  25186. + lg_global_lock(&stop_cpus_lock);
  25187. + /*
  25188. + * Let other cpu threads continue as well
  25189. + */
  25190. + lg_global_unlock(&stop_cpus_lock);
  25191. +
  25192. /* cpu stop callbacks are not allowed to sleep */
  25193. preempt_disable();
  25194. @@ -481,7 +511,13 @@
  25195. kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
  25196. ksym_buf), arg);
  25197. + /*
  25198. + * Make sure that the wakeup and setting done->waiter
  25199. + * to NULL is atomic.
  25200. + */
  25201. + local_irq_disable();
  25202. cpu_stop_signal_done(done, true);
  25203. + local_irq_enable();
  25204. goto repeat;
  25205. }
  25206. }
  25207. @@ -500,20 +536,20 @@
  25208. unsigned long flags;
  25209. /* drain remaining works */
  25210. - spin_lock_irqsave(&stopper->lock, flags);
  25211. + raw_spin_lock_irqsave(&stopper->lock, flags);
  25212. list_for_each_entry(work, &stopper->works, list)
  25213. cpu_stop_signal_done(work->done, false);
  25214. stopper->enabled = false;
  25215. - spin_unlock_irqrestore(&stopper->lock, flags);
  25216. + raw_spin_unlock_irqrestore(&stopper->lock, flags);
  25217. }
  25218. static void cpu_stop_unpark(unsigned int cpu)
  25219. {
  25220. struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  25221. - spin_lock_irq(&stopper->lock);
  25222. + raw_spin_lock_irq(&stopper->lock);
  25223. stopper->enabled = true;
  25224. - spin_unlock_irq(&stopper->lock);
  25225. + raw_spin_unlock_irq(&stopper->lock);
  25226. }
  25227. static struct smp_hotplug_thread cpu_stop_threads = {
  25228. @@ -535,10 +571,12 @@
  25229. for_each_possible_cpu(cpu) {
  25230. struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  25231. - spin_lock_init(&stopper->lock);
  25232. + raw_spin_lock_init(&stopper->lock);
  25233. INIT_LIST_HEAD(&stopper->works);
  25234. }
  25235. + lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
  25236. +
  25237. BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
  25238. stop_machine_initialized = true;
  25239. return 0;
  25240. @@ -634,11 +672,11 @@
  25241. set_state(&msdata, MULTI_STOP_PREPARE);
  25242. cpu_stop_init_done(&done, num_active_cpus());
  25243. queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
  25244. - &done);
  25245. + &done, true);
  25246. ret = multi_cpu_stop(&msdata);
  25247. /* Busy wait for completion. */
  25248. - while (!completion_done(&done.completion))
  25249. + while (atomic_read(&done.nr_todo))
  25250. cpu_relax();
  25251. mutex_unlock(&stop_cpus_mutex);
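The stop_machine changes above replace the completion in cpu_stop_done with an atomic to-do counter plus a saved waiter task: each stopper decrements the counter, the last one wakes the waiter, and the waiter then spins briefly until the waker has also cleared the waiter pointer. Below is a minimal sketch of that handshake using hypothetical names (my_done, my_signal_done, my_wait_done) rather than the patch's own structures.

#include <linux/atomic.h>
#include <linux/sched.h>

struct my_done {
	atomic_t nr_todo;		/* outstanding stopper invocations */
	struct task_struct *waiter;	/* task sleeping in my_wait_done() */
};

static void my_init_done(struct my_done *d, unsigned int todo)
{
	atomic_set(&d->nr_todo, todo);
	d->waiter = current;
}

/* Caller must have interrupts disabled so that the wakeup and the
 * clearing of ->waiter appear atomic to the waiting task. */
static void my_signal_done(struct my_done *d)
{
	if (atomic_dec_and_test(&d->nr_todo)) {
		wake_up_process(d->waiter);
		d->waiter = NULL;
	}
}

static void my_wait_done(struct my_done *d)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	while (atomic_read(&d->nr_todo)) {
		schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	}
	/* Wait until the last signaller has dropped its task reference. */
	while (d->waiter)
		cpu_relax();
	set_current_state(TASK_RUNNING);
}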
  25252. diff -Nur linux-3.18.14.orig/kernel/time/hrtimer.c linux-3.18.14-rt/kernel/time/hrtimer.c
  25253. --- linux-3.18.14.orig/kernel/time/hrtimer.c 2015-05-20 10:04:50.000000000 -0500
  25254. +++ linux-3.18.14-rt/kernel/time/hrtimer.c 2015-05-31 15:32:48.925635362 -0500
  25255. @@ -48,11 +48,13 @@
  25256. #include <linux/sched/rt.h>
  25257. #include <linux/sched/deadline.h>
  25258. #include <linux/timer.h>
  25259. +#include <linux/kthread.h>
  25260. #include <linux/freezer.h>
  25261. #include <asm/uaccess.h>
  25262. #include <trace/events/timer.h>
  25263. +#include <trace/events/hist.h>
  25264. #include "timekeeping.h"
  25265. @@ -568,8 +570,7 @@
  25266. * When the callback is running, we do not reprogram the clock event
  25267. * device. The timer callback is either running on a different CPU or
  25268. * the callback is executed in the hrtimer_interrupt context. The
  25269. - * reprogramming is handled either by the softirq, which called the
  25270. - * callback or at the end of the hrtimer_interrupt.
  25271. + * reprogramming is handled at the end of the hrtimer_interrupt.
  25272. */
  25273. if (hrtimer_callback_running(timer))
  25274. return 0;
  25275. @@ -604,6 +605,9 @@
  25276. return res;
  25277. }
  25278. +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now);
  25279. +static int hrtimer_rt_defer(struct hrtimer *timer);
  25280. +
  25281. /*
  25282. * Initialize the high resolution related parts of cpu_base
  25283. */
  25284. @@ -613,6 +617,21 @@
  25285. base->hres_active = 0;
  25286. }
  25287. +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
  25288. + struct hrtimer_clock_base *base,
  25289. + int wakeup)
  25290. +{
  25291. + if (!hrtimer_reprogram(timer, base))
  25292. + return 0;
  25293. + if (!wakeup)
  25294. + return -ETIME;
  25295. +#ifdef CONFIG_PREEMPT_RT_BASE
  25296. + if (!hrtimer_rt_defer(timer))
  25297. + return -ETIME;
  25298. +#endif
  25299. + return 1;
  25300. +}
  25301. +
  25302. static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
  25303. {
  25304. ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
  25305. @@ -678,6 +697,44 @@
  25306. static DECLARE_WORK(hrtimer_work, clock_was_set_work);
  25307. +#ifdef CONFIG_PREEMPT_RT_FULL
  25308. +/*
  25309. + * RT cannot call schedule_work from real interrupt context.
  25310. + * Need to make a thread to do the real work.
  25311. + */
  25312. +static struct task_struct *clock_set_delay_thread;
  25313. +static bool do_clock_set_delay;
  25314. +
  25315. +static int run_clock_set_delay(void *ignore)
  25316. +{
  25317. + while (!kthread_should_stop()) {
  25318. + set_current_state(TASK_INTERRUPTIBLE);
  25319. + if (do_clock_set_delay) {
  25320. + do_clock_set_delay = false;
  25321. + schedule_work(&hrtimer_work);
  25322. + }
  25323. + schedule();
  25324. + }
  25325. + __set_current_state(TASK_RUNNING);
  25326. + return 0;
  25327. +}
  25328. +
  25329. +void clock_was_set_delayed(void)
  25330. +{
  25331. + do_clock_set_delay = true;
  25332. + /* Make visible before waking up process */
  25333. + smp_wmb();
  25334. + wake_up_process(clock_set_delay_thread);
  25335. +}
  25336. +
  25337. +static __init int create_clock_set_delay_thread(void)
  25338. +{
  25339. + clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
  25340. + BUG_ON(!clock_set_delay_thread);
  25341. + return 0;
  25342. +}
  25343. +early_initcall(create_clock_set_delay_thread);
  25344. +#else /* PREEMPT_RT_FULL */
  25345. /*
  25346. * Called from timekeeping and resume code to reprogramm the hrtimer
  25347. * interrupt device on all cpus.
  25348. @@ -686,6 +743,7 @@
  25349. {
  25350. schedule_work(&hrtimer_work);
  25351. }
  25352. +#endif
  25353. #else
  25354. @@ -694,6 +752,13 @@
  25355. static inline int hrtimer_switch_to_hres(void) { return 0; }
  25356. static inline void
  25357. hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
  25358. +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
  25359. + struct hrtimer_clock_base *base,
  25360. + int wakeup)
  25361. +{
  25362. + return 0;
  25363. +}
  25364. +
  25365. static inline int hrtimer_reprogram(struct hrtimer *timer,
  25366. struct hrtimer_clock_base *base)
  25367. {
  25368. @@ -701,7 +766,6 @@
  25369. }
  25370. static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
  25371. static inline void retrigger_next_event(void *arg) { }
  25372. -
  25373. #endif /* CONFIG_HIGH_RES_TIMERS */
  25374. /*
  25375. @@ -819,6 +883,32 @@
  25376. }
  25377. EXPORT_SYMBOL_GPL(hrtimer_forward);
  25378. +#ifdef CONFIG_PREEMPT_RT_BASE
  25379. +# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
  25380. +
  25381. +/**
  25382. + * hrtimer_wait_for_timer - Wait for a running timer
  25383. + *
  25384. + * @timer: timer to wait for
  25385. + *
  25386. + * The function waits on the waitqueue of the timer base in case the
  25387. + * timer's callback function is currently executing. The
  25388. + * waitqueue is woken up after the timer callback function has
  25389. + * finished execution.
  25390. + */
  25391. +void hrtimer_wait_for_timer(const struct hrtimer *timer)
  25392. +{
  25393. + struct hrtimer_clock_base *base = timer->base;
  25394. +
  25395. + if (base && base->cpu_base && !timer->irqsafe)
  25396. + wait_event(base->cpu_base->wait,
  25397. + !(timer->state & HRTIMER_STATE_CALLBACK));
  25398. +}
  25399. +
  25400. +#else
  25401. +# define wake_up_timer_waiters(b) do { } while (0)
  25402. +#endif
  25403. +
  25404. /*
  25405. * enqueue_hrtimer - internal function to (re)start a timer
  25406. *
  25407. @@ -862,6 +952,11 @@
  25408. if (!(timer->state & HRTIMER_STATE_ENQUEUED))
  25409. goto out;
  25410. + if (unlikely(!list_empty(&timer->cb_entry))) {
  25411. + list_del_init(&timer->cb_entry);
  25412. + goto out;
  25413. + }
  25414. +
  25415. next_timer = timerqueue_getnext(&base->active);
  25416. timerqueue_del(&base->active, &timer->node);
  25417. if (&timer->node == next_timer) {
  25418. @@ -949,7 +1044,16 @@
  25419. new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
  25420. timer_stats_hrtimer_set_start_info(timer);
  25421. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  25422. + {
  25423. + ktime_t now = new_base->get_time();
  25424. + if (ktime_to_ns(tim) < ktime_to_ns(now))
  25425. + timer->praecox = now;
  25426. + else
  25427. + timer->praecox = ktime_set(0, 0);
  25428. + }
  25429. +#endif
  25430. leftmost = enqueue_hrtimer(timer, new_base);
  25431. if (!leftmost) {
  25432. @@ -963,15 +1067,26 @@
  25433. * on dynticks target.
  25434. */
  25435. wake_up_nohz_cpu(new_base->cpu_base->cpu);
  25436. - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
  25437. - hrtimer_reprogram(timer, new_base)) {
  25438. + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases)) {
  25439. +
  25440. + ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup);
  25441. + if (ret < 0) {
  25442. + /*
  25443. + * In case we failed to reprogram the timer (mostly
  25444. + * because our current timer has already elapsed),
  25445. + * remove it again and report a failure. This avoids
  25446. + * stale base->first entries.
  25447. + */
  25448. + debug_deactivate(timer);
  25449. + __remove_hrtimer(timer, new_base,
  25450. + timer->state & HRTIMER_STATE_CALLBACK, 0);
  25451. + } else if (ret > 0) {
  25452. /*
  25453. * Only allow reprogramming if the new base is on this CPU.
  25454. * (it might still be on another CPU if the timer was pending)
  25455. *
  25456. * XXX send_remote_softirq() ?
  25457. */
  25458. - if (wakeup) {
  25459. /*
  25460. * We need to drop cpu_base->lock to avoid a
  25461. * lock ordering issue vs. rq->lock.
  25462. @@ -979,9 +1094,7 @@
  25463. raw_spin_unlock(&new_base->cpu_base->lock);
  25464. raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25465. local_irq_restore(flags);
  25466. - return ret;
  25467. - } else {
  25468. - __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25469. + return 0;
  25470. }
  25471. }
  25472. @@ -1072,7 +1185,7 @@
  25473. if (ret >= 0)
  25474. return ret;
  25475. - cpu_relax();
  25476. + hrtimer_wait_for_timer(timer);
  25477. }
  25478. }
  25479. EXPORT_SYMBOL_GPL(hrtimer_cancel);
  25480. @@ -1151,6 +1264,7 @@
  25481. base = hrtimer_clockid_to_base(clock_id);
  25482. timer->base = &cpu_base->clock_base[base];
  25483. + INIT_LIST_HEAD(&timer->cb_entry);
  25484. timerqueue_init(&timer->node);
  25485. #ifdef CONFIG_TIMER_STATS
  25486. @@ -1234,6 +1348,126 @@
  25487. timer->state &= ~HRTIMER_STATE_CALLBACK;
  25488. }
  25489. +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
  25490. +
  25491. +#ifdef CONFIG_PREEMPT_RT_BASE
  25492. +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
  25493. + struct hrtimer_clock_base *base)
  25494. +{
  25495. + /*
  25496. + * Note, we clear the callback flag before we requeue the
  25497. + * timer otherwise we trigger the callback_running() check
  25498. + * in hrtimer_reprogram().
  25499. + */
  25500. + timer->state &= ~HRTIMER_STATE_CALLBACK;
  25501. +
  25502. + if (restart != HRTIMER_NORESTART) {
  25503. + BUG_ON(hrtimer_active(timer));
  25504. + /*
  25505. + * Enqueue the timer, if it's the leftmost timer then
  25506. + * we need to reprogram it.
  25507. + */
  25508. + if (!enqueue_hrtimer(timer, base))
  25509. + return;
  25510. +
  25511. +#ifndef CONFIG_HIGH_RES_TIMERS
  25512. + }
  25513. +#else
  25514. + if (base->cpu_base->hres_active &&
  25515. + hrtimer_reprogram(timer, base))
  25516. + goto requeue;
  25517. +
  25518. + } else if (hrtimer_active(timer)) {
  25519. + /*
  25520. + * If the timer was rearmed on another CPU, reprogram
  25521. + * the event device.
  25522. + */
  25523. + if (&timer->node == base->active.next &&
  25524. + base->cpu_base->hres_active &&
  25525. + hrtimer_reprogram(timer, base))
  25526. + goto requeue;
  25527. + }
  25528. + return;
  25529. +
  25530. +requeue:
  25531. + /*
  25532. + * Timer is expired. Thus move it from tree to pending list
  25533. + * again.
  25534. + */
  25535. + __remove_hrtimer(timer, base, timer->state, 0);
  25536. + list_add_tail(&timer->cb_entry, &base->expired);
  25537. +#endif
  25538. +}
  25539. +
  25540. +/*
  25541. + * The changes in mainline which removed the callback modes from
  25542. + * hrtimer are not yet working with -rt. The non wakeup_process()
  25543. + * based callbacks which involve sleeping locks need to be treated
  25544. + * separately.
  25545. + */
  25546. +static void hrtimer_rt_run_pending(void)
  25547. +{
  25548. + enum hrtimer_restart (*fn)(struct hrtimer *);
  25549. + struct hrtimer_cpu_base *cpu_base;
  25550. + struct hrtimer_clock_base *base;
  25551. + struct hrtimer *timer;
  25552. + int index, restart;
  25553. +
  25554. + local_irq_disable();
  25555. + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
  25556. +
  25557. + raw_spin_lock(&cpu_base->lock);
  25558. +
  25559. + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
  25560. + base = &cpu_base->clock_base[index];
  25561. +
  25562. + while (!list_empty(&base->expired)) {
  25563. + timer = list_first_entry(&base->expired,
  25564. + struct hrtimer, cb_entry);
  25565. +
  25566. + /*
  25567. + * Same as the above __run_hrtimer function
  25568. + * just we run with interrupts enabled.
  25569. + */
  25570. + debug_hrtimer_deactivate(timer);
  25571. + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
  25572. + timer_stats_account_hrtimer(timer);
  25573. + fn = timer->function;
  25574. +
  25575. + raw_spin_unlock_irq(&cpu_base->lock);
  25576. + restart = fn(timer);
  25577. + raw_spin_lock_irq(&cpu_base->lock);
  25578. +
  25579. + hrtimer_rt_reprogram(restart, timer, base);
  25580. + }
  25581. + }
  25582. +
  25583. + raw_spin_unlock_irq(&cpu_base->lock);
  25584. +
  25585. + wake_up_timer_waiters(cpu_base);
  25586. +}
  25587. +
  25588. +static int hrtimer_rt_defer(struct hrtimer *timer)
  25589. +{
  25590. + if (timer->irqsafe)
  25591. + return 0;
  25592. +
  25593. + __remove_hrtimer(timer, timer->base, timer->state, 0);
  25594. + list_add_tail(&timer->cb_entry, &timer->base->expired);
  25595. + return 1;
  25596. +}
  25597. +
  25598. +#else
  25599. +
  25600. +static inline void hrtimer_rt_run_pending(void)
  25601. +{
  25602. + hrtimer_peek_ahead_timers();
  25603. +}
  25604. +
  25605. +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
  25606. +
  25607. +#endif
  25608. +
  25609. #ifdef CONFIG_HIGH_RES_TIMERS
  25610. /*
  25611. @@ -1244,7 +1478,7 @@
  25612. {
  25613. struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
  25614. ktime_t expires_next, now, entry_time, delta;
  25615. - int i, retries = 0;
  25616. + int i, retries = 0, raise = 0;
  25617. BUG_ON(!cpu_base->hres_active);
  25618. cpu_base->nr_events++;
  25619. @@ -1279,6 +1513,15 @@
  25620. timer = container_of(node, struct hrtimer, node);
  25621. + trace_hrtimer_interrupt(raw_smp_processor_id(),
  25622. + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
  25623. + timer->praecox : hrtimer_get_expires(timer),
  25624. + basenow)),
  25625. + current,
  25626. + timer->function == hrtimer_wakeup ?
  25627. + container_of(timer, struct hrtimer_sleeper,
  25628. + timer)->task : NULL);
  25629. +
  25630. /*
  25631. * The immediate goal for using the softexpires is
  25632. * minimizing wakeups, not running timers at the
  25633. @@ -1304,7 +1547,10 @@
  25634. break;
  25635. }
  25636. - __run_hrtimer(timer, &basenow);
  25637. + if (!hrtimer_rt_defer(timer))
  25638. + __run_hrtimer(timer, &basenow);
  25639. + else
  25640. + raise = 1;
  25641. }
  25642. }
  25643. @@ -1319,7 +1565,7 @@
  25644. if (expires_next.tv64 == KTIME_MAX ||
  25645. !tick_program_event(expires_next, 0)) {
  25646. cpu_base->hang_detected = 0;
  25647. - return;
  25648. + goto out;
  25649. }
  25650. /*
  25651. @@ -1363,6 +1609,9 @@
  25652. tick_program_event(expires_next, 1);
  25653. printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
  25654. ktime_to_ns(delta));
  25655. +out:
  25656. + if (raise)
  25657. + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25658. }
  25659. /*
  25660. @@ -1398,18 +1647,18 @@
  25661. __hrtimer_peek_ahead_timers();
  25662. local_irq_restore(flags);
  25663. }
  25664. -
  25665. -static void run_hrtimer_softirq(struct softirq_action *h)
  25666. -{
  25667. - hrtimer_peek_ahead_timers();
  25668. -}
  25669. -
  25670. #else /* CONFIG_HIGH_RES_TIMERS */
  25671. static inline void __hrtimer_peek_ahead_timers(void) { }
  25672. #endif /* !CONFIG_HIGH_RES_TIMERS */
  25673. +
  25674. +static void run_hrtimer_softirq(struct softirq_action *h)
  25675. +{
  25676. + hrtimer_rt_run_pending();
  25677. +}
  25678. +
  25679. /*
  25680. * Called from timer softirq every jiffy, expire hrtimers:
  25681. *
  25682. @@ -1442,7 +1691,7 @@
  25683. struct timerqueue_node *node;
  25684. struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
  25685. struct hrtimer_clock_base *base;
  25686. - int index, gettime = 1;
  25687. + int index, gettime = 1, raise = 0;
  25688. if (hrtimer_hres_active())
  25689. return;
  25690. @@ -1467,10 +1716,16 @@
  25691. hrtimer_get_expires_tv64(timer))
  25692. break;
  25693. - __run_hrtimer(timer, &base->softirq_time);
  25694. + if (!hrtimer_rt_defer(timer))
  25695. + __run_hrtimer(timer, &base->softirq_time);
  25696. + else
  25697. + raise = 1;
  25698. }
  25699. raw_spin_unlock(&cpu_base->lock);
  25700. }
  25701. +
  25702. + if (raise)
  25703. + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25704. }
  25705. /*
  25706. @@ -1492,16 +1747,18 @@
  25707. void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
  25708. {
  25709. sl->timer.function = hrtimer_wakeup;
  25710. + sl->timer.irqsafe = 1;
  25711. sl->task = task;
  25712. }
  25713. EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
  25714. -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
  25715. +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
  25716. + unsigned long state)
  25717. {
  25718. hrtimer_init_sleeper(t, current);
  25719. do {
  25720. - set_current_state(TASK_INTERRUPTIBLE);
  25721. + set_current_state(state);
  25722. hrtimer_start_expires(&t->timer, mode);
  25723. if (!hrtimer_active(&t->timer))
  25724. t->task = NULL;
  25725. @@ -1545,7 +1802,8 @@
  25726. HRTIMER_MODE_ABS);
  25727. hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
  25728. - if (do_nanosleep(&t, HRTIMER_MODE_ABS))
  25729. + /* cpu_chill() does not care about restart state. */
  25730. + if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
  25731. goto out;
  25732. rmtp = restart->nanosleep.rmtp;
  25733. @@ -1562,8 +1820,10 @@
  25734. return ret;
  25735. }
  25736. -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
  25737. - const enum hrtimer_mode mode, const clockid_t clockid)
  25738. +static long
  25739. +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
  25740. + const enum hrtimer_mode mode, const clockid_t clockid,
  25741. + unsigned long state)
  25742. {
  25743. struct restart_block *restart;
  25744. struct hrtimer_sleeper t;
  25745. @@ -1576,7 +1836,7 @@
  25746. hrtimer_init_on_stack(&t.timer, clockid, mode);
  25747. hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
  25748. - if (do_nanosleep(&t, mode))
  25749. + if (do_nanosleep(&t, mode, state))
  25750. goto out;
  25751. /* Absolute timers do not update the rmtp value and restart: */
  25752. @@ -1603,6 +1863,12 @@
  25753. return ret;
  25754. }
  25755. +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
  25756. + const enum hrtimer_mode mode, const clockid_t clockid)
  25757. +{
  25758. + return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
  25759. +}
  25760. +
  25761. SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
  25762. struct timespec __user *, rmtp)
  25763. {
  25764. @@ -1617,6 +1883,26 @@
  25765. return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
  25766. }
  25767. +#ifdef CONFIG_PREEMPT_RT_FULL
  25768. +/*
  25769. + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
  25770. + */
  25771. +void cpu_chill(void)
  25772. +{
  25773. + struct timespec tu = {
  25774. + .tv_nsec = NSEC_PER_MSEC,
  25775. + };
  25776. + unsigned int freeze_flag = current->flags & PF_NOFREEZE;
  25777. +
  25778. + current->flags |= PF_NOFREEZE;
  25779. + __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
  25780. + TASK_UNINTERRUPTIBLE);
  25781. + if (!freeze_flag)
  25782. + current->flags &= ~PF_NOFREEZE;
  25783. +}
  25784. +EXPORT_SYMBOL(cpu_chill);
  25785. +#endif
  25786. +
  25787. /*
  25788. * Functions related to boot-time initialization:
  25789. */
  25790. @@ -1628,10 +1914,14 @@
  25791. for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
  25792. cpu_base->clock_base[i].cpu_base = cpu_base;
  25793. timerqueue_init_head(&cpu_base->clock_base[i].active);
  25794. + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
  25795. }
  25796. cpu_base->cpu = cpu;
  25797. hrtimer_init_hres(cpu_base);
  25798. +#ifdef CONFIG_PREEMPT_RT_BASE
  25799. + init_waitqueue_head(&cpu_base->wait);
  25800. +#endif
  25801. }
  25802. #ifdef CONFIG_HOTPLUG_CPU
  25803. @@ -1744,9 +2034,7 @@
  25804. hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
  25805. (void *)(long)smp_processor_id());
  25806. register_cpu_notifier(&hrtimers_nb);
  25807. -#ifdef CONFIG_HIGH_RES_TIMERS
  25808. open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
  25809. -#endif
  25810. }
  25811. /**
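Among the hrtimer changes above, cpu_chill() gives PREEMPT_RT_FULL retry loops a way to sleep for a millisecond instead of spinning, so a preempted lock holder on the same CPU can actually run. The sketch below shows the intended usage in an illustrative trylock loop; the lock is made up, and it assumes cpu_chill() is declared in <linux/delay.h> (not shown in this excerpt) with cpu_relax() as the non-rt fallback.

#include <linux/spinlock.h>
#include <linux/delay.h>	/* assumed home of cpu_chill() on -rt */

static DEFINE_SPINLOCK(demo_lock);	/* hypothetical lock */

static void demo_poll_until_locked(void)
{
	while (!spin_trylock(&demo_lock)) {
#ifdef CONFIG_PREEMPT_RT_FULL
		cpu_chill();	/* sleep 1 ms, let the holder run */
#else
		cpu_relax();	/* plain busy-wait is fine without -rt */
#endif
	}
	/* ... critical section ... */
	spin_unlock(&demo_lock);
}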
  25812. diff -Nur linux-3.18.14.orig/kernel/time/itimer.c linux-3.18.14-rt/kernel/time/itimer.c
  25813. --- linux-3.18.14.orig/kernel/time/itimer.c 2015-05-20 10:04:50.000000000 -0500
  25814. +++ linux-3.18.14-rt/kernel/time/itimer.c 2015-05-31 15:32:48.957635362 -0500
  25815. @@ -213,6 +213,7 @@
  25816. /* We are sharing ->siglock with it_real_fn() */
  25817. if (hrtimer_try_to_cancel(timer) < 0) {
  25818. spin_unlock_irq(&tsk->sighand->siglock);
  25819. + hrtimer_wait_for_timer(&tsk->signal->real_timer);
  25820. goto again;
  25821. }
  25822. expires = timeval_to_ktime(value->it_value);
  25823. diff -Nur linux-3.18.14.orig/kernel/time/jiffies.c linux-3.18.14-rt/kernel/time/jiffies.c
  25824. --- linux-3.18.14.orig/kernel/time/jiffies.c 2015-05-20 10:04:50.000000000 -0500
  25825. +++ linux-3.18.14-rt/kernel/time/jiffies.c 2015-05-31 15:32:48.957635362 -0500
  25826. @@ -73,7 +73,8 @@
  25827. .shift = JIFFIES_SHIFT,
  25828. };
  25829. -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
  25830. +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
  25831. +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
  25832. #if (BITS_PER_LONG < 64)
  25833. u64 get_jiffies_64(void)
  25834. @@ -82,9 +83,9 @@
  25835. u64 ret;
  25836. do {
  25837. - seq = read_seqbegin(&jiffies_lock);
  25838. + seq = read_seqcount_begin(&jiffies_seq);
  25839. ret = jiffies_64;
  25840. - } while (read_seqretry(&jiffies_lock, seq));
  25841. + } while (read_seqcount_retry(&jiffies_seq, seq));
  25842. return ret;
  25843. }
  25844. EXPORT_SYMBOL(get_jiffies_64);
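The jiffies_lock conversion above splits one seqlock into a raw spinlock (writer-vs-writer exclusion that must not sleep, even on -rt) and a bare seqcount (reader retry). The two sides then follow the pattern sketched here; the demo_* names are illustrative, only the locking discipline mirrors the patch.

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(demo_lock);
static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static u64 demo_value;

/* Writer: the raw lock serializes writers, the seqcount flags readers. */
static void demo_update(u64 v)
{
	raw_spin_lock(&demo_lock);
	write_seqcount_begin(&demo_seq);
	demo_value = v;
	write_seqcount_end(&demo_seq);
	raw_spin_unlock(&demo_lock);
}

/* Reader: lock-free, retries if an update raced with the read. */
static u64 demo_read(void)
{
	unsigned int seq;
	u64 v;

	do {
		seq = read_seqcount_begin(&demo_seq);
		v = demo_value;
	} while (read_seqcount_retry(&demo_seq, seq));

	return v;
}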
  25845. diff -Nur linux-3.18.14.orig/kernel/time/ntp.c linux-3.18.14-rt/kernel/time/ntp.c
  25846. --- linux-3.18.14.orig/kernel/time/ntp.c 2015-05-20 10:04:50.000000000 -0500
  25847. +++ linux-3.18.14-rt/kernel/time/ntp.c 2015-05-31 15:32:48.957635362 -0500
  25848. @@ -10,6 +10,7 @@
  25849. #include <linux/workqueue.h>
  25850. #include <linux/hrtimer.h>
  25851. #include <linux/jiffies.h>
  25852. +#include <linux/kthread.h>
  25853. #include <linux/math64.h>
  25854. #include <linux/timex.h>
  25855. #include <linux/time.h>
  25856. @@ -519,10 +520,52 @@
  25857. &sync_cmos_work, timespec_to_jiffies(&next));
  25858. }
  25859. +#ifdef CONFIG_PREEMPT_RT_FULL
  25860. +/*
  25861. + * RT cannot call schedule_delayed_work from real interrupt context.
  25862. + * Need to make a thread to do the real work.
  25863. + */
  25864. +static struct task_struct *cmos_delay_thread;
  25865. +static bool do_cmos_delay;
  25866. +
  25867. +static int run_cmos_delay(void *ignore)
  25868. +{
  25869. + while (!kthread_should_stop()) {
  25870. + set_current_state(TASK_INTERRUPTIBLE);
  25871. + if (do_cmos_delay) {
  25872. + do_cmos_delay = false;
  25873. + queue_delayed_work(system_power_efficient_wq,
  25874. + &sync_cmos_work, 0);
  25875. + }
  25876. + schedule();
  25877. + }
  25878. + __set_current_state(TASK_RUNNING);
  25879. + return 0;
  25880. +}
  25881. +
  25882. +void ntp_notify_cmos_timer(void)
  25883. +{
  25884. + do_cmos_delay = true;
  25885. + /* Make visible before waking up process */
  25886. + smp_wmb();
  25887. + wake_up_process(cmos_delay_thread);
  25888. +}
  25889. +
  25890. +static __init int create_cmos_delay_thread(void)
  25891. +{
  25892. + cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
  25893. + BUG_ON(!cmos_delay_thread);
  25894. + return 0;
  25895. +}
  25896. +early_initcall(create_cmos_delay_thread);
  25897. +
  25898. +#else
  25899. +
  25900. void ntp_notify_cmos_timer(void)
  25901. {
  25902. queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
  25903. }
  25904. +#endif /* CONFIG_PREEMPT_RT_FULL */
  25905. #else
  25906. void ntp_notify_cmos_timer(void) { }
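The ntp hunk above and the earlier clock_was_set_delayed() hunk use the same idiom: on PREEMPT_RT_FULL a hard-interrupt path may not queue (delayed) work directly, so it only raises a flag and wakes a dedicated kthread, which queues the work from process context. A skeleton of that idiom with placeholder names; only the structure is implied, nothing beyond the standard kthread/workqueue API.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/workqueue.h>

static void demo_work_fn(struct work_struct *work) { /* the real work */ }
static DECLARE_WORK(demo_work, demo_work_fn);

static struct task_struct *demo_thread;
static bool demo_pending;

static int demo_thread_fn(void *ignore)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (demo_pending) {
			demo_pending = false;
			schedule_work(&demo_work);  /* now in process context */
		}
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

/* Safe to call from hard interrupt context, even on -rt. */
void demo_kick(void)
{
	demo_pending = true;
	smp_wmb();			/* flag visible before the wakeup */
	wake_up_process(demo_thread);
}

static __init int demo_thread_init(void)
{
	demo_thread = kthread_run(demo_thread_fn, NULL, "kdemodelayd");
	return PTR_ERR_OR_ZERO(demo_thread);
}
early_initcall(demo_thread_init);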
  25907. diff -Nur linux-3.18.14.orig/kernel/time/posix-cpu-timers.c linux-3.18.14-rt/kernel/time/posix-cpu-timers.c
  25908. --- linux-3.18.14.orig/kernel/time/posix-cpu-timers.c 2015-05-20 10:04:50.000000000 -0500
  25909. +++ linux-3.18.14-rt/kernel/time/posix-cpu-timers.c 2015-05-31 15:32:48.961635362 -0500
  25910. @@ -3,6 +3,7 @@
  25911. */
  25912. #include <linux/sched.h>
  25913. +#include <linux/sched/rt.h>
  25914. #include <linux/posix-timers.h>
  25915. #include <linux/errno.h>
  25916. #include <linux/math64.h>
  25917. @@ -626,7 +627,7 @@
  25918. /*
  25919. * Disarm any old timer after extracting its expiry time.
  25920. */
  25921. - WARN_ON_ONCE(!irqs_disabled());
  25922. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  25923. ret = 0;
  25924. old_incr = timer->it.cpu.incr;
  25925. @@ -1047,7 +1048,7 @@
  25926. /*
  25927. * Now re-arm for the new expiry time.
  25928. */
  25929. - WARN_ON_ONCE(!irqs_disabled());
  25930. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  25931. arm_timer(timer);
  25932. unlock_task_sighand(p, &flags);
  25933. @@ -1113,10 +1114,11 @@
  25934. sig = tsk->signal;
  25935. if (sig->cputimer.running) {
  25936. struct task_cputime group_sample;
  25937. + unsigned long flags;
  25938. - raw_spin_lock(&sig->cputimer.lock);
  25939. + raw_spin_lock_irqsave(&sig->cputimer.lock, flags);
  25940. group_sample = sig->cputimer.cputime;
  25941. - raw_spin_unlock(&sig->cputimer.lock);
  25942. + raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags);
  25943. if (task_cputime_expired(&group_sample, &sig->cputime_expires))
  25944. return 1;
  25945. @@ -1130,13 +1132,13 @@
  25946. * already updated our counts. We need to check if any timers fire now.
  25947. * Interrupts are disabled.
  25948. */
  25949. -void run_posix_cpu_timers(struct task_struct *tsk)
  25950. +static void __run_posix_cpu_timers(struct task_struct *tsk)
  25951. {
  25952. LIST_HEAD(firing);
  25953. struct k_itimer *timer, *next;
  25954. unsigned long flags;
  25955. - WARN_ON_ONCE(!irqs_disabled());
  25956. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  25957. /*
  25958. * The fast path checks that there are no expired thread or thread
  25959. @@ -1194,6 +1196,190 @@
  25960. }
  25961. }
  25962. +#ifdef CONFIG_PREEMPT_RT_BASE
  25963. +#include <linux/kthread.h>
  25964. +#include <linux/cpu.h>
  25965. +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
  25966. +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
  25967. +
  25968. +static int posix_cpu_timers_thread(void *data)
  25969. +{
  25970. + int cpu = (long)data;
  25971. +
  25972. + BUG_ON(per_cpu(posix_timer_task,cpu) != current);
  25973. +
  25974. + while (!kthread_should_stop()) {
  25975. + struct task_struct *tsk = NULL;
  25976. + struct task_struct *next = NULL;
  25977. +
  25978. + if (cpu_is_offline(cpu))
  25979. + goto wait_to_die;
  25980. +
  25981. + /* grab task list */
  25982. + raw_local_irq_disable();
  25983. + tsk = per_cpu(posix_timer_tasklist, cpu);
  25984. + per_cpu(posix_timer_tasklist, cpu) = NULL;
  25985. + raw_local_irq_enable();
  25986. +
  25987. + /* it's possible the list is empty, just return */
  25988. + if (!tsk) {
  25989. + set_current_state(TASK_INTERRUPTIBLE);
  25990. + schedule();
  25991. + __set_current_state(TASK_RUNNING);
  25992. + continue;
  25993. + }
  25994. +
  25995. + /* Process task list */
  25996. + while (1) {
  25997. + /* save next */
  25998. + next = tsk->posix_timer_list;
  25999. +
  26000. + /* run the task timers, clear its ptr and
  26001. + * unreference it
  26002. + */
  26003. + __run_posix_cpu_timers(tsk);
  26004. + tsk->posix_timer_list = NULL;
  26005. + put_task_struct(tsk);
  26006. +
  26007. + /* check if this is the last on the list */
  26008. + if (next == tsk)
  26009. + break;
  26010. + tsk = next;
  26011. + }
  26012. + }
  26013. + return 0;
  26014. +
  26015. +wait_to_die:
  26016. + /* Wait for kthread_stop */
  26017. + set_current_state(TASK_INTERRUPTIBLE);
  26018. + while (!kthread_should_stop()) {
  26019. + schedule();
  26020. + set_current_state(TASK_INTERRUPTIBLE);
  26021. + }
  26022. + __set_current_state(TASK_RUNNING);
  26023. + return 0;
  26024. +}
  26025. +
  26026. +static inline int __fastpath_timer_check(struct task_struct *tsk)
  26027. +{
  26028. + /* tsk == current, ensure it is safe to use ->signal/sighand */
  26029. + if (unlikely(tsk->exit_state))
  26030. + return 0;
  26031. +
  26032. + if (!task_cputime_zero(&tsk->cputime_expires))
  26033. + return 1;
  26034. +
  26035. + if (!task_cputime_zero(&tsk->signal->cputime_expires))
  26036. + return 1;
  26037. +
  26038. + return 0;
  26039. +}
  26040. +
  26041. +void run_posix_cpu_timers(struct task_struct *tsk)
  26042. +{
  26043. + unsigned long cpu = smp_processor_id();
  26044. + struct task_struct *tasklist;
  26045. +
  26046. + BUG_ON(!irqs_disabled());
  26047. + if(!per_cpu(posix_timer_task, cpu))
  26048. + return;
  26049. + /* get per-cpu references */
  26050. + tasklist = per_cpu(posix_timer_tasklist, cpu);
  26051. +
  26052. + /* check to see if we're already queued */
  26053. + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
  26054. + get_task_struct(tsk);
  26055. + if (tasklist) {
  26056. + tsk->posix_timer_list = tasklist;
  26057. + } else {
  26058. + /*
  26059. + * The list is terminated by a self-pointing
  26060. + * task_struct
  26061. + */
  26062. + tsk->posix_timer_list = tsk;
  26063. + }
  26064. + per_cpu(posix_timer_tasklist, cpu) = tsk;
  26065. +
  26066. + wake_up_process(per_cpu(posix_timer_task, cpu));
  26067. + }
  26068. +}
  26069. +
  26070. +/*
  26071. + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
  26072. + * Here we can start up the necessary migration thread for the new CPU.
  26073. + */
  26074. +static int posix_cpu_thread_call(struct notifier_block *nfb,
  26075. + unsigned long action, void *hcpu)
  26076. +{
  26077. + int cpu = (long)hcpu;
  26078. + struct task_struct *p;
  26079. + struct sched_param param;
  26080. +
  26081. + switch (action) {
  26082. + case CPU_UP_PREPARE:
  26083. + p = kthread_create(posix_cpu_timers_thread, hcpu,
  26084. + "posixcputmr/%d",cpu);
  26085. + if (IS_ERR(p))
  26086. + return NOTIFY_BAD;
  26087. + p->flags |= PF_NOFREEZE;
  26088. + kthread_bind(p, cpu);
  26089. + /* Must be high prio to avoid getting starved */
  26090. + param.sched_priority = MAX_RT_PRIO-1;
  26091. + sched_setscheduler(p, SCHED_FIFO, &param);
  26092. + per_cpu(posix_timer_task,cpu) = p;
  26093. + break;
  26094. + case CPU_ONLINE:
  26095. + /* Strictly unnecessary, as first user will wake it. */
  26096. + wake_up_process(per_cpu(posix_timer_task,cpu));
  26097. + break;
  26098. +#ifdef CONFIG_HOTPLUG_CPU
  26099. + case CPU_UP_CANCELED:
  26100. + /* Unbind it from offline cpu so it can run. Fall thru. */
  26101. + kthread_bind(per_cpu(posix_timer_task, cpu),
  26102. + cpumask_any(cpu_online_mask));
  26103. + kthread_stop(per_cpu(posix_timer_task,cpu));
  26104. + per_cpu(posix_timer_task,cpu) = NULL;
  26105. + break;
  26106. + case CPU_DEAD:
  26107. + kthread_stop(per_cpu(posix_timer_task,cpu));
  26108. + per_cpu(posix_timer_task,cpu) = NULL;
  26109. + break;
  26110. +#endif
  26111. + }
  26112. + return NOTIFY_OK;
  26113. +}
  26114. +
  26115. +/* Register at highest priority so that task migration (migrate_all_tasks)
  26116. + * happens before everything else.
  26117. + */
  26118. +static struct notifier_block posix_cpu_thread_notifier = {
  26119. + .notifier_call = posix_cpu_thread_call,
  26120. + .priority = 10
  26121. +};
  26122. +
  26123. +static int __init posix_cpu_thread_init(void)
  26124. +{
  26125. + void *hcpu = (void *)(long)smp_processor_id();
  26126. + /* Start one for boot CPU. */
  26127. + unsigned long cpu;
  26128. +
  26129. + /* init the per-cpu posix_timer_tasklets */
  26130. + for_each_possible_cpu(cpu)
  26131. + per_cpu(posix_timer_tasklist, cpu) = NULL;
  26132. +
  26133. + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
  26134. + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
  26135. + register_cpu_notifier(&posix_cpu_thread_notifier);
  26136. + return 0;
  26137. +}
  26138. +early_initcall(posix_cpu_thread_init);
  26139. +#else /* CONFIG_PREEMPT_RT_BASE */
  26140. +void run_posix_cpu_timers(struct task_struct *tsk)
  26141. +{
  26142. + __run_posix_cpu_timers(tsk);
  26143. +}
  26144. +#endif /* CONFIG_PREEMPT_RT_BASE */
  26145. +
  26146. /*
  26147. * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
  26148. * The tsk->sighand->siglock must be held by the caller.
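The run_posix_cpu_timers() rework above chains tasks through a per-CPU singly linked list in which the last element points to itself: a NULL pointer means "not queued", a self-pointer marks the end of the list. The following stand-alone C program walks through the same push/drain logic with a stand-in structure (demo_task is hypothetical; only the list discipline matches the patch).

#include <stdio.h>

/* Stand-in for task_struct: next == NULL means "not queued",
 * next == self marks the end of the list. */
struct demo_task {
	const char *name;
	struct demo_task *next;
};

static struct demo_task *list_head;	/* the per-CPU list head in the patch */

static void demo_enqueue(struct demo_task *t)
{
	if (t->next)				/* already queued */
		return;
	t->next = list_head ? list_head : t;	/* self-terminate if first */
	list_head = t;
}

static void demo_drain(void)
{
	struct demo_task *t = list_head;

	list_head = NULL;
	while (t) {
		struct demo_task *next = t->next;

		printf("expiring timers for %s\n", t->name);
		t->next = NULL;		/* no longer queued */
		if (next == t)		/* self-pointer: last element */
			break;
		t = next;
	}
}

int main(void)
{
	struct demo_task a = { "task-a", NULL };
	struct demo_task b = { "task-b", NULL };

	demo_enqueue(&a);
	demo_enqueue(&b);
	demo_drain();
	return 0;
}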
  26149. diff -Nur linux-3.18.14.orig/kernel/time/posix-timers.c linux-3.18.14-rt/kernel/time/posix-timers.c
  26150. --- linux-3.18.14.orig/kernel/time/posix-timers.c 2015-05-20 10:04:50.000000000 -0500
  26151. +++ linux-3.18.14-rt/kernel/time/posix-timers.c 2015-05-31 15:32:48.961635362 -0500
  26152. @@ -499,6 +499,7 @@
  26153. static struct pid *good_sigevent(sigevent_t * event)
  26154. {
  26155. struct task_struct *rtn = current->group_leader;
  26156. + int sig = event->sigev_signo;
  26157. if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
  26158. (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
  26159. @@ -507,7 +508,8 @@
  26160. return NULL;
  26161. if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
  26162. - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
  26163. + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
  26164. + sig_kernel_coredump(sig)))
  26165. return NULL;
  26166. return task_pid(rtn);
  26167. @@ -819,6 +821,20 @@
  26168. return overrun;
  26169. }
  26170. +/*
  26171. + * Protected by RCU!
  26172. + */
  26173. +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
  26174. +{
  26175. +#ifdef CONFIG_PREEMPT_RT_FULL
  26176. + if (kc->timer_set == common_timer_set)
  26177. + hrtimer_wait_for_timer(&timr->it.real.timer);
  26178. + else
  26179. + /* FIXME: Whacky hack for posix-cpu-timers */
  26180. + schedule_timeout(1);
  26181. +#endif
  26182. +}
  26183. +
  26184. /* Set a POSIX.1b interval timer. */
  26185. /* timr->it_lock is taken. */
  26186. static int
  26187. @@ -896,6 +912,7 @@
  26188. if (!timr)
  26189. return -EINVAL;
  26190. + rcu_read_lock();
  26191. kc = clockid_to_kclock(timr->it_clock);
  26192. if (WARN_ON_ONCE(!kc || !kc->timer_set))
  26193. error = -EINVAL;
  26194. @@ -904,9 +921,12 @@
  26195. unlock_timer(timr, flag);
  26196. if (error == TIMER_RETRY) {
  26197. + timer_wait_for_callback(kc, timr);
  26198. rtn = NULL; // We already got the old time...
  26199. + rcu_read_unlock();
  26200. goto retry;
  26201. }
  26202. + rcu_read_unlock();
  26203. if (old_setting && !error &&
  26204. copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
  26205. @@ -944,10 +964,15 @@
  26206. if (!timer)
  26207. return -EINVAL;
  26208. + rcu_read_lock();
  26209. if (timer_delete_hook(timer) == TIMER_RETRY) {
  26210. unlock_timer(timer, flags);
  26211. + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
  26212. + timer);
  26213. + rcu_read_unlock();
  26214. goto retry_delete;
  26215. }
  26216. + rcu_read_unlock();
  26217. spin_lock(&current->sighand->siglock);
  26218. list_del(&timer->list);
  26219. @@ -973,8 +998,18 @@
  26220. retry_delete:
  26221. spin_lock_irqsave(&timer->it_lock, flags);
  26222. + /* On RT we can race with a deletion */
  26223. + if (!timer->it_signal) {
  26224. + unlock_timer(timer, flags);
  26225. + return;
  26226. + }
  26227. +
  26228. if (timer_delete_hook(timer) == TIMER_RETRY) {
  26229. + rcu_read_lock();
  26230. unlock_timer(timer, flags);
  26231. + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
  26232. + timer);
  26233. + rcu_read_unlock();
  26234. goto retry_delete;
  26235. }
  26236. list_del(&timer->list);
  26237. diff -Nur linux-3.18.14.orig/kernel/time/tick-common.c linux-3.18.14-rt/kernel/time/tick-common.c
  26238. --- linux-3.18.14.orig/kernel/time/tick-common.c 2015-05-20 10:04:50.000000000 -0500
  26239. +++ linux-3.18.14-rt/kernel/time/tick-common.c 2015-05-31 15:32:48.961635362 -0500
  26240. @@ -78,13 +78,15 @@
  26241. static void tick_periodic(int cpu)
  26242. {
  26243. if (tick_do_timer_cpu == cpu) {
  26244. - write_seqlock(&jiffies_lock);
  26245. + raw_spin_lock(&jiffies_lock);
  26246. + write_seqcount_begin(&jiffies_seq);
  26247. /* Keep track of the next tick event */
  26248. tick_next_period = ktime_add(tick_next_period, tick_period);
  26249. do_timer(1);
  26250. - write_sequnlock(&jiffies_lock);
  26251. + write_seqcount_end(&jiffies_seq);
  26252. + raw_spin_unlock(&jiffies_lock);
  26253. update_wall_time();
  26254. }
  26255. @@ -146,9 +148,9 @@
  26256. ktime_t next;
  26257. do {
  26258. - seq = read_seqbegin(&jiffies_lock);
  26259. + seq = read_seqcount_begin(&jiffies_seq);
  26260. next = tick_next_period;
  26261. - } while (read_seqretry(&jiffies_lock, seq));
  26262. + } while (read_seqcount_retry(&jiffies_seq, seq));
  26263. clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
  26264. diff -Nur linux-3.18.14.orig/kernel/time/tick-internal.h linux-3.18.14-rt/kernel/time/tick-internal.h
  26265. --- linux-3.18.14.orig/kernel/time/tick-internal.h 2015-05-20 10:04:50.000000000 -0500
  26266. +++ linux-3.18.14-rt/kernel/time/tick-internal.h 2015-05-31 15:32:48.961635362 -0500
  26267. @@ -6,7 +6,8 @@
  26268. #include "timekeeping.h"
  26269. -extern seqlock_t jiffies_lock;
  26270. +extern raw_spinlock_t jiffies_lock;
  26271. +extern seqcount_t jiffies_seq;
  26272. #define CS_NAME_LEN 32
  26273. diff -Nur linux-3.18.14.orig/kernel/time/tick-sched.c linux-3.18.14-rt/kernel/time/tick-sched.c
  26274. --- linux-3.18.14.orig/kernel/time/tick-sched.c 2015-05-20 10:04:50.000000000 -0500
  26275. +++ linux-3.18.14-rt/kernel/time/tick-sched.c 2015-05-31 15:32:48.961635362 -0500
  26276. @@ -62,7 +62,8 @@
  26277. return;
  26278. /* Reevalute with jiffies_lock held */
  26279. - write_seqlock(&jiffies_lock);
  26280. + raw_spin_lock(&jiffies_lock);
  26281. + write_seqcount_begin(&jiffies_seq);
  26282. delta = ktime_sub(now, last_jiffies_update);
  26283. if (delta.tv64 >= tick_period.tv64) {
  26284. @@ -85,10 +86,12 @@
  26285. /* Keep the tick_next_period variable up to date */
  26286. tick_next_period = ktime_add(last_jiffies_update, tick_period);
  26287. } else {
  26288. - write_sequnlock(&jiffies_lock);
  26289. + write_seqcount_end(&jiffies_seq);
  26290. + raw_spin_unlock(&jiffies_lock);
  26291. return;
  26292. }
  26293. - write_sequnlock(&jiffies_lock);
  26294. + write_seqcount_end(&jiffies_seq);
  26295. + raw_spin_unlock(&jiffies_lock);
  26296. update_wall_time();
  26297. }
  26298. @@ -99,12 +102,14 @@
  26299. {
  26300. ktime_t period;
  26301. - write_seqlock(&jiffies_lock);
  26302. + raw_spin_lock(&jiffies_lock);
  26303. + write_seqcount_begin(&jiffies_seq);
  26304. /* Did we start the jiffies update yet ? */
  26305. if (last_jiffies_update.tv64 == 0)
  26306. last_jiffies_update = tick_next_period;
  26307. period = last_jiffies_update;
  26308. - write_sequnlock(&jiffies_lock);
  26309. + write_seqcount_end(&jiffies_seq);
  26310. + raw_spin_unlock(&jiffies_lock);
  26311. return period;
  26312. }
  26313. @@ -176,6 +181,11 @@
  26314. return false;
  26315. }
  26316. + if (!arch_irq_work_has_interrupt()) {
  26317. + trace_tick_stop(0, "missing irq work interrupt\n");
  26318. + return false;
  26319. + }
  26320. +
  26321. /* sched_clock_tick() needs us? */
  26322. #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  26323. /*
  26324. @@ -217,11 +227,17 @@
  26325. static void nohz_full_kick_work_func(struct irq_work *work)
  26326. {
  26327. + unsigned long flags;
  26328. +
  26329. + /* ksoftirqd processes sirqs with interrupts enabled */
  26330. + local_irq_save(flags);
  26331. __tick_nohz_full_check();
  26332. + local_irq_restore(flags);
  26333. }
  26334. static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
  26335. .func = nohz_full_kick_work_func,
  26336. + .flags = IRQ_WORK_HARD_IRQ,
  26337. };
  26338. /*
  26339. @@ -580,10 +596,10 @@
  26340. /* Read jiffies and the time when jiffies were updated last */
  26341. do {
  26342. - seq = read_seqbegin(&jiffies_lock);
  26343. + seq = read_seqcount_begin(&jiffies_seq);
  26344. last_update = last_jiffies_update;
  26345. last_jiffies = jiffies;
  26346. - } while (read_seqretry(&jiffies_lock, seq));
  26347. + } while (read_seqcount_retry(&jiffies_seq, seq));
  26348. if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
  26349. arch_needs_cpu() || irq_work_needs_cpu()) {
  26350. @@ -761,14 +777,7 @@
  26351. return false;
  26352. if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
  26353. - static int ratelimit;
  26354. -
  26355. - if (ratelimit < 10 &&
  26356. - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
  26357. - pr_warn("NOHZ: local_softirq_pending %02x\n",
  26358. - (unsigned int) local_softirq_pending());
  26359. - ratelimit++;
  26360. - }
  26361. + softirq_check_pending_idle();
  26362. return false;
  26363. }
  26364. @@ -1156,6 +1165,7 @@
  26365. * Emulate tick processing via per-CPU hrtimers:
  26366. */
  26367. hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
  26368. + ts->sched_timer.irqsafe = 1;
  26369. ts->sched_timer.function = tick_sched_timer;
  26370. /* Get the next period (per cpu) */
  26371. diff -Nur linux-3.18.14.orig/kernel/time/timekeeping.c linux-3.18.14-rt/kernel/time/timekeeping.c
  26372. --- linux-3.18.14.orig/kernel/time/timekeeping.c 2015-05-20 10:04:50.000000000 -0500
  26373. +++ linux-3.18.14-rt/kernel/time/timekeeping.c 2015-05-31 15:32:48.969635362 -0500
  26374. @@ -1814,8 +1814,10 @@
  26375. */
  26376. void xtime_update(unsigned long ticks)
  26377. {
  26378. - write_seqlock(&jiffies_lock);
  26379. + raw_spin_lock(&jiffies_lock);
  26380. + write_seqcount_begin(&jiffies_seq);
  26381. do_timer(ticks);
  26382. - write_sequnlock(&jiffies_lock);
  26383. + write_seqcount_end(&jiffies_seq);
  26384. + raw_spin_unlock(&jiffies_lock);
  26385. update_wall_time();
  26386. }
  26387. diff -Nur linux-3.18.14.orig/kernel/time/timer.c linux-3.18.14-rt/kernel/time/timer.c
  26388. --- linux-3.18.14.orig/kernel/time/timer.c 2015-05-20 10:04:50.000000000 -0500
  26389. +++ linux-3.18.14-rt/kernel/time/timer.c 2015-05-31 15:32:48.973635362 -0500
  26390. @@ -78,6 +78,9 @@
  26391. struct tvec_base {
  26392. spinlock_t lock;
  26393. struct timer_list *running_timer;
  26394. +#ifdef CONFIG_PREEMPT_RT_FULL
  26395. + wait_queue_head_t wait_for_running_timer;
  26396. +#endif
  26397. unsigned long timer_jiffies;
  26398. unsigned long next_timer;
  26399. unsigned long active_timers;
  26400. @@ -758,6 +761,36 @@
  26401. }
  26402. }
  26403. +#ifndef CONFIG_PREEMPT_RT_FULL
  26404. +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
  26405. + struct tvec_base *old,
  26406. + struct tvec_base *new)
  26407. +{
  26408. + /* See the comment in lock_timer_base() */
  26409. + timer_set_base(timer, NULL);
  26410. + spin_unlock(&old->lock);
  26411. + spin_lock(&new->lock);
  26412. + timer_set_base(timer, new);
  26413. + return new;
  26414. +}
  26415. +#else
  26416. +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
  26417. + struct tvec_base *old,
  26418. + struct tvec_base *new)
  26419. +{
  26420. + /*
  26421. + * We cannot do the above because we might be preempted and
  26422. + * then the preempter would see NULL and loop forever.
  26423. + */
  26424. + if (spin_trylock(&new->lock)) {
  26425. + timer_set_base(timer, new);
  26426. + spin_unlock(&old->lock);
  26427. + return new;
  26428. + }
  26429. + return old;
  26430. +}
  26431. +#endif
  26432. +
  26433. static inline int
  26434. __mod_timer(struct timer_list *timer, unsigned long expires,
  26435. bool pending_only, int pinned)
  26436. @@ -788,14 +821,8 @@
  26437. * handler yet has not finished. This also guarantees that
  26438. * the timer is serialized wrt itself.
  26439. */
  26440. - if (likely(base->running_timer != timer)) {
  26441. - /* See the comment in lock_timer_base() */
  26442. - timer_set_base(timer, NULL);
  26443. - spin_unlock(&base->lock);
  26444. - base = new_base;
  26445. - spin_lock(&base->lock);
  26446. - timer_set_base(timer, base);
  26447. - }
  26448. + if (likely(base->running_timer != timer))
  26449. + base = switch_timer_base(timer, base, new_base);
  26450. }
  26451. timer->expires = expires;
  26452. @@ -969,6 +996,29 @@
  26453. }
  26454. EXPORT_SYMBOL_GPL(add_timer_on);
  26455. +#ifdef CONFIG_PREEMPT_RT_FULL
  26456. +/*
  26457. + * Wait for a running timer
  26458. + */
  26459. +static void wait_for_running_timer(struct timer_list *timer)
  26460. +{
  26461. + struct tvec_base *base = timer->base;
  26462. +
  26463. + if (base->running_timer == timer)
  26464. + wait_event(base->wait_for_running_timer,
  26465. + base->running_timer != timer);
  26466. +}
  26467. +
  26468. +# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer)
  26469. +#else
  26470. +static inline void wait_for_running_timer(struct timer_list *timer)
  26471. +{
  26472. + cpu_relax();
  26473. +}
  26474. +
  26475. +# define wakeup_timer_waiters(b) do { } while (0)
  26476. +#endif
  26477. +
  26478. /**
  26479. * del_timer - deactivate a timer.
  26480. * @timer: the timer to be deactivated
  26481. @@ -1026,7 +1076,7 @@
  26482. }
  26483. EXPORT_SYMBOL(try_to_del_timer_sync);
  26484. -#ifdef CONFIG_SMP
  26485. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  26486. /**
  26487. * del_timer_sync - deactivate a timer and wait for the handler to finish.
  26488. * @timer: the timer to be deactivated
  26489. @@ -1086,7 +1136,7 @@
  26490. int ret = try_to_del_timer_sync(timer);
  26491. if (ret >= 0)
  26492. return ret;
  26493. - cpu_relax();
  26494. + wait_for_running_timer(timer);
  26495. }
  26496. }
  26497. EXPORT_SYMBOL(del_timer_sync);
  26498. @@ -1207,15 +1257,17 @@
  26499. if (irqsafe) {
  26500. spin_unlock(&base->lock);
  26501. call_timer_fn(timer, fn, data);
  26502. + base->running_timer = NULL;
  26503. spin_lock(&base->lock);
  26504. } else {
  26505. spin_unlock_irq(&base->lock);
  26506. call_timer_fn(timer, fn, data);
  26507. + base->running_timer = NULL;
  26508. spin_lock_irq(&base->lock);
  26509. }
  26510. }
  26511. }
  26512. - base->running_timer = NULL;
  26513. + wakeup_timer_waiters(base);
  26514. spin_unlock_irq(&base->lock);
  26515. }
  26516. @@ -1355,17 +1407,31 @@
  26517. if (cpu_is_offline(smp_processor_id()))
  26518. return expires;
  26519. +#ifdef CONFIG_PREEMPT_RT_FULL
  26520. + /*
  26521. + * On PREEMPT_RT we cannot sleep here. If the trylock does not
  26522. + * succeed then we return the worst-case 'expires in 1 tick'
  26523. + * value. We use the rt functions here directly to avoid a
  26524. + * migrate_disable() call.
  26525. + */
  26526. + if (!spin_do_trylock(&base->lock))
  26527. + return now + 1;
  26528. +#else
  26529. spin_lock(&base->lock);
  26530. +#endif
  26531. if (base->active_timers) {
  26532. if (time_before_eq(base->next_timer, base->timer_jiffies))
  26533. base->next_timer = __next_timer_interrupt(base);
  26534. expires = base->next_timer;
  26535. }
  26536. +#ifdef CONFIG_PREEMPT_RT_FULL
  26537. + rt_spin_unlock_after_trylock_in_irq(&base->lock);
  26538. +#else
  26539. spin_unlock(&base->lock);
  26540. +#endif
  26541. if (time_before_eq(expires, now))
  26542. return now;
  26543. -
  26544. return cmp_next_hrtimer_event(now, expires);
  26545. }
  26546. #endif
  26547. @@ -1381,13 +1447,13 @@
  26548. /* Note: this timer irq context must be accounted for as well. */
  26549. account_process_tick(p, user_tick);
  26550. + scheduler_tick();
  26551. run_local_timers();
  26552. rcu_check_callbacks(cpu, user_tick);
  26553. -#ifdef CONFIG_IRQ_WORK
  26554. - if (in_irq())
  26555. - irq_work_tick();
  26556. +
  26557. +#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
  26558. + irq_work_tick();
  26559. #endif
  26560. - scheduler_tick();
  26561. run_posix_cpu_timers(p);
  26562. }
  26563. @@ -1400,6 +1466,10 @@
  26564. hrtimer_run_pending();
  26565. +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
  26566. + irq_work_tick();
  26567. +#endif
  26568. +
  26569. if (time_after_eq(jiffies, base->timer_jiffies))
  26570. __run_timers(base);
  26571. }
  26572. @@ -1574,6 +1644,9 @@
  26573. base = per_cpu(tvec_bases, cpu);
  26574. }
  26575. +#ifdef CONFIG_PREEMPT_RT_FULL
  26576. + init_waitqueue_head(&base->wait_for_running_timer);
  26577. +#endif
  26578. for (j = 0; j < TVN_SIZE; j++) {
  26579. INIT_LIST_HEAD(base->tv5.vec + j);
  26580. @@ -1613,7 +1686,7 @@
  26581. BUG_ON(cpu_online(cpu));
  26582. old_base = per_cpu(tvec_bases, cpu);
  26583. - new_base = get_cpu_var(tvec_bases);
  26584. + new_base = get_local_var(tvec_bases);
  26585. /*
  26586. * The caller is globally serialized and nobody else
  26587. * takes two locks at once, deadlock is not possible.
  26588. @@ -1634,7 +1707,7 @@
  26589. spin_unlock(&old_base->lock);
  26590. spin_unlock_irq(&new_base->lock);
  26591. - put_cpu_var(tvec_bases);
  26592. + put_local_var(tvec_bases);
  26593. }
  26594. #endif /* CONFIG_HOTPLUG_CPU */
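The timer.c changes above replace the cpu_relax() spin in del_timer_sync() with a sleep on a per-base waitqueue, because on PREEMPT_RT_FULL the handler runs in a preemptible thread that the spinner could starve. Reduced to its essentials, the wait/wake pair looks like the sketch below; demo_base and the callbacks are hypothetical, and the waitqueue is assumed to have been set up with init_waitqueue_head().

#include <linux/spinlock.h>
#include <linux/wait.h>

struct demo_base {
	spinlock_t lock;
	void *running;			/* item whose handler is executing */
	wait_queue_head_t running_done;
};

/* Execution side: run the handler, then release any waiters. */
static void demo_run(struct demo_base *b, void *item, void (*fn)(void *))
{
	spin_lock_irq(&b->lock);
	b->running = item;
	spin_unlock_irq(&b->lock);

	fn(item);

	b->running = NULL;
	wake_up(&b->running_done);
}

/* Deletion side: sleep instead of busy-waiting for the handler. */
static void demo_wait_not_running(struct demo_base *b, void *item)
{
	wait_event(b->running_done, b->running != item);
}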
  26595. diff -Nur linux-3.18.14.orig/kernel/trace/Kconfig linux-3.18.14-rt/kernel/trace/Kconfig
  26596. --- linux-3.18.14.orig/kernel/trace/Kconfig 2015-05-20 10:04:50.000000000 -0500
  26597. +++ linux-3.18.14-rt/kernel/trace/Kconfig 2015-05-31 15:32:48.973635362 -0500
  26598. @@ -187,6 +187,24 @@
  26599. enabled. This option and the preempt-off timing option can be
  26600. used together or separately.)
  26601. +config INTERRUPT_OFF_HIST
  26602. + bool "Interrupts-off Latency Histogram"
  26603. + depends on IRQSOFF_TRACER
  26604. + help
  26605. + This option generates continuously updated histograms (one per cpu)
  26606. + of the duration of time periods with interrupts disabled. The
  26607. + histograms are disabled by default. To enable them, write a non-zero
  26608. + number to
  26609. +
  26610. + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
  26611. +
  26612. + If PREEMPT_OFF_HIST is also selected, additional histograms (one
  26613. + per cpu) are generated that accumulate the duration of time periods
  26614. + when both interrupts and preemption are disabled. The histogram data
  26615. + will be located in the debug file system at
  26616. +
  26617. + /sys/kernel/debug/tracing/latency_hist/irqsoff
  26618. +
  26619. config PREEMPT_TRACER
  26620. bool "Preemption-off Latency Tracer"
  26621. default n
  26622. @@ -211,6 +229,24 @@
  26623. enabled. This option and the irqs-off timing option can be
  26624. used together or separately.)
  26625. +config PREEMPT_OFF_HIST
  26626. + bool "Preemption-off Latency Histogram"
  26627. + depends on PREEMPT_TRACER
  26628. + help
  26629. + This option generates continuously updated histograms (one per cpu)
  26630. + of the duration of time periods with preemption disabled. The
  26631. + histograms are disabled by default. To enable them, write a non-zero
  26632. + number to
  26633. +
  26634. + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
  26635. +
  26636. + If INTERRUPT_OFF_HIST is also selected, additional histograms (one
  26637. + per cpu) are generated that accumulate the duration of time periods
  26638. + when both interrupts and preemption are disabled. The histogram data
  26639. + will be located in the debug file system at
  26640. +
  26641. + /sys/kernel/debug/tracing/latency_hist/preemptoff
  26642. +
  26643. config SCHED_TRACER
  26644. bool "Scheduling Latency Tracer"
  26645. select GENERIC_TRACER
  26646. @@ -221,6 +257,74 @@
  26647. This tracer tracks the latency of the highest priority task
  26648. to be scheduled in, starting from the point it has woken up.
  26649. +config WAKEUP_LATENCY_HIST
  26650. + bool "Scheduling Latency Histogram"
  26651. + depends on SCHED_TRACER
  26652. + help
  26653. + This option generates continuously updated histograms (one per cpu)
  26654. + of the scheduling latency of the highest priority task.
  26655. + The histograms are disabled by default. To enable them, write a
  26656. + non-zero number to
  26657. +
  26658. + /sys/kernel/debug/tracing/latency_hist/enable/wakeup
  26659. +
  26660. + Two different algorithms are used, one to determine the latency of
  26661. + processes that exclusively use the highest priority of the system and
  26662. + another one to determine the latency of processes that share the
  26663. + highest system priority with other processes. The former is used to
  26664. + improve hardware and system software, the latter to optimize the
  26665. + priority design of a given system. The histogram data will be
  26666. + located in the debug file system at
  26667. +
  26668. + /sys/kernel/debug/tracing/latency_hist/wakeup
  26669. +
  26670. + and
  26671. +
  26672. + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
  26673. +
  26674. + If both Scheduling Latency Histogram and Missed Timer Offsets
  26675. + Histogram are selected, additional histogram data will be collected
  26676. + that contain, in addition to the wakeup latency, the timer latency, in
  26677. + case the wakeup was triggered by an expired timer. These histograms
  26678. + are available in the
  26679. +
  26680. + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
  26681. +
  26682. + directory. They reflect the apparent interrupt and scheduling latency
26683. + and are best suited to determining the worst-case latency of a given
  26684. + system. To enable these histograms, write a non-zero number to
  26685. +
  26686. + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
  26687. +
  26688. +config MISSED_TIMER_OFFSETS_HIST
  26689. + depends on HIGH_RES_TIMERS
  26690. + select GENERIC_TRACER
  26691. + bool "Missed Timer Offsets Histogram"
  26692. + help
  26693. + Generate a histogram of missed timer offsets in microseconds. The
  26694. + histograms are disabled by default. To enable them, write a non-zero
  26695. + number to
  26696. +
  26697. + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
  26698. +
  26699. + The histogram data will be located in the debug file system at
  26700. +
  26701. + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
  26702. +
  26703. + If both Scheduling Latency Histogram and Missed Timer Offsets
  26704. + Histogram are selected, additional histogram data will be collected
  26705. + that contain, in addition to the wakeup latency, the timer latency, in
  26706. + case the wakeup was triggered by an expired timer. These histograms
  26707. + are available in the
  26708. +
  26709. + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
  26710. +
  26711. + directory. They reflect the apparent interrupt and scheduling latency
26712. + and are best suited to determining the worst-case latency of a given
  26713. + system. To enable these histograms, write a non-zero number to
  26714. +
  26715. + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
  26716. +
  26717. config ENABLE_DEFAULT_TRACERS
  26718. bool "Trace process context switches and events"
  26719. depends on !GENERIC_TRACER
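The four help texts above all describe the same debugfs workflow: write a non-zero value to a file under latency_hist/enable/ to start collecting, then read the per-CPU histogram files. The following minimal user-space sketch illustrates that workflow for the wakeup histograms; it is not part of the patch, the paths are taken from the help texts and from latency_hist_init() further down, and the usual debugfs mount point is assumed.

/* User-space sketch (not part of the patch): enable the wakeup
 * histograms and print the histogram for CPU0. Paths follow the
 * Kconfig help texts above; adjust the debugfs mount point if needed. */
#include <stdio.h>

#define HIST "/sys/kernel/debug/tracing/latency_hist"

int main(void)
{
	FILE *f = fopen(HIST "/enable/wakeup", "w");
	char line[256];

	if (!f) {
		perror("enable/wakeup");
		return 1;
	}
	fputs("1\n", f);	/* any non-zero value enables logging */
	fclose(f);

	f = fopen(HIST "/wakeup/CPU0", "r");
	if (!f) {
		perror("wakeup/CPU0");
		return 1;
	}
	/* output: "#..." summary header, then one "usecs samples" row per bin */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}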
  26720. diff -Nur linux-3.18.14.orig/kernel/trace/latency_hist.c linux-3.18.14-rt/kernel/trace/latency_hist.c
  26721. --- linux-3.18.14.orig/kernel/trace/latency_hist.c 1969-12-31 18:00:00.000000000 -0600
  26722. +++ linux-3.18.14-rt/kernel/trace/latency_hist.c 2015-05-31 15:32:48.989635362 -0500
  26723. @@ -0,0 +1,1178 @@
  26724. +/*
  26725. + * kernel/trace/latency_hist.c
  26726. + *
26727. + * Add support for histograms of preemption-off latency,
26728. + * interrupt-off latency and wakeup latency; it depends on
26729. + * Real-Time Preemption Support.
  26730. + *
  26731. + * Copyright (C) 2005 MontaVista Software, Inc.
  26732. + * Yi Yang <yyang@ch.mvista.com>
  26733. + *
  26734. + * Converted to work with the new latency tracer.
  26735. + * Copyright (C) 2008 Red Hat, Inc.
  26736. + * Steven Rostedt <srostedt@redhat.com>
  26737. + *
  26738. + */
  26739. +#include <linux/module.h>
  26740. +#include <linux/debugfs.h>
  26741. +#include <linux/seq_file.h>
  26742. +#include <linux/percpu.h>
  26743. +#include <linux/kallsyms.h>
  26744. +#include <linux/uaccess.h>
  26745. +#include <linux/sched.h>
  26746. +#include <linux/sched/rt.h>
  26747. +#include <linux/slab.h>
  26748. +#include <linux/atomic.h>
  26749. +#include <asm/div64.h>
  26750. +
  26751. +#include "trace.h"
  26752. +#include <trace/events/sched.h>
  26753. +
  26754. +#define NSECS_PER_USECS 1000L
  26755. +
  26756. +#define CREATE_TRACE_POINTS
  26757. +#include <trace/events/hist.h>
  26758. +
  26759. +enum {
  26760. + IRQSOFF_LATENCY = 0,
  26761. + PREEMPTOFF_LATENCY,
  26762. + PREEMPTIRQSOFF_LATENCY,
  26763. + WAKEUP_LATENCY,
  26764. + WAKEUP_LATENCY_SHAREDPRIO,
  26765. + MISSED_TIMER_OFFSETS,
  26766. + TIMERANDWAKEUP_LATENCY,
  26767. + MAX_LATENCY_TYPE,
  26768. +};
  26769. +
  26770. +#define MAX_ENTRY_NUM 10240
  26771. +
  26772. +struct hist_data {
26773. + atomic_t hist_mode; /* 0 don't log, 1 log */
  26774. + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
  26775. + long min_lat;
  26776. + long max_lat;
  26777. + unsigned long long below_hist_bound_samples;
  26778. + unsigned long long above_hist_bound_samples;
  26779. + long long accumulate_lat;
  26780. + unsigned long long total_samples;
  26781. + unsigned long long hist_array[MAX_ENTRY_NUM];
  26782. +};
  26783. +
  26784. +struct enable_data {
  26785. + int latency_type;
  26786. + int enabled;
  26787. +};
  26788. +
  26789. +static char *latency_hist_dir_root = "latency_hist";
  26790. +
  26791. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  26792. +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
  26793. +static char *irqsoff_hist_dir = "irqsoff";
  26794. +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
  26795. +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
  26796. +#endif
  26797. +
  26798. +#ifdef CONFIG_PREEMPT_OFF_HIST
  26799. +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
  26800. +static char *preemptoff_hist_dir = "preemptoff";
  26801. +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
  26802. +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
  26803. +#endif
  26804. +
  26805. +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
  26806. +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
  26807. +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
  26808. +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
  26809. +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
  26810. +#endif
  26811. +
  26812. +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
  26813. +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
  26814. +static struct enable_data preemptirqsoff_enabled_data = {
  26815. + .latency_type = PREEMPTIRQSOFF_LATENCY,
  26816. + .enabled = 0,
  26817. +};
  26818. +#endif
  26819. +
  26820. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  26821. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  26822. +struct maxlatproc_data {
  26823. + char comm[FIELD_SIZEOF(struct task_struct, comm)];
  26824. + char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
  26825. + int pid;
  26826. + int current_pid;
  26827. + int prio;
  26828. + int current_prio;
  26829. + long latency;
  26830. + long timeroffset;
  26831. + cycle_t timestamp;
  26832. +};
  26833. +#endif
  26834. +
  26835. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  26836. +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
  26837. +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
  26838. +static char *wakeup_latency_hist_dir = "wakeup";
  26839. +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
  26840. +static notrace void probe_wakeup_latency_hist_start(void *v,
  26841. + struct task_struct *p, int success);
  26842. +static notrace void probe_wakeup_latency_hist_stop(void *v,
  26843. + struct task_struct *prev, struct task_struct *next);
  26844. +static notrace void probe_sched_migrate_task(void *,
  26845. + struct task_struct *task, int cpu);
  26846. +static struct enable_data wakeup_latency_enabled_data = {
  26847. + .latency_type = WAKEUP_LATENCY,
  26848. + .enabled = 0,
  26849. +};
  26850. +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
  26851. +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
  26852. +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
  26853. +static DEFINE_PER_CPU(int, wakeup_sharedprio);
  26854. +static unsigned long wakeup_pid;
  26855. +#endif
  26856. +
  26857. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  26858. +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
  26859. +static char *missed_timer_offsets_dir = "missed_timer_offsets";
  26860. +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
  26861. + long long offset, struct task_struct *curr, struct task_struct *task);
  26862. +static struct enable_data missed_timer_offsets_enabled_data = {
  26863. + .latency_type = MISSED_TIMER_OFFSETS,
  26864. + .enabled = 0,
  26865. +};
  26866. +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
  26867. +static unsigned long missed_timer_offsets_pid;
  26868. +#endif
  26869. +
  26870. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  26871. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  26872. +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
  26873. +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
  26874. +static struct enable_data timerandwakeup_enabled_data = {
  26875. + .latency_type = TIMERANDWAKEUP_LATENCY,
  26876. + .enabled = 0,
  26877. +};
  26878. +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
  26879. +#endif
  26880. +
  26881. +void notrace latency_hist(int latency_type, int cpu, long latency,
  26882. + long timeroffset, cycle_t stop,
  26883. + struct task_struct *p)
  26884. +{
  26885. + struct hist_data *my_hist;
  26886. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  26887. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  26888. + struct maxlatproc_data *mp = NULL;
  26889. +#endif
  26890. +
  26891. + if (!cpu_possible(cpu) || latency_type < 0 ||
  26892. + latency_type >= MAX_LATENCY_TYPE)
  26893. + return;
  26894. +
  26895. + switch (latency_type) {
  26896. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  26897. + case IRQSOFF_LATENCY:
  26898. + my_hist = &per_cpu(irqsoff_hist, cpu);
  26899. + break;
  26900. +#endif
  26901. +#ifdef CONFIG_PREEMPT_OFF_HIST
  26902. + case PREEMPTOFF_LATENCY:
  26903. + my_hist = &per_cpu(preemptoff_hist, cpu);
  26904. + break;
  26905. +#endif
  26906. +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
  26907. + case PREEMPTIRQSOFF_LATENCY:
  26908. + my_hist = &per_cpu(preemptirqsoff_hist, cpu);
  26909. + break;
  26910. +#endif
  26911. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  26912. + case WAKEUP_LATENCY:
  26913. + my_hist = &per_cpu(wakeup_latency_hist, cpu);
  26914. + mp = &per_cpu(wakeup_maxlatproc, cpu);
  26915. + break;
  26916. + case WAKEUP_LATENCY_SHAREDPRIO:
  26917. + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
  26918. + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
  26919. + break;
  26920. +#endif
  26921. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  26922. + case MISSED_TIMER_OFFSETS:
  26923. + my_hist = &per_cpu(missed_timer_offsets, cpu);
  26924. + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
  26925. + break;
  26926. +#endif
  26927. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  26928. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  26929. + case TIMERANDWAKEUP_LATENCY:
  26930. + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
  26931. + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
  26932. + break;
  26933. +#endif
  26934. +
  26935. + default:
  26936. + return;
  26937. + }
  26938. +
  26939. + latency += my_hist->offset;
  26940. +
  26941. + if (atomic_read(&my_hist->hist_mode) == 0)
  26942. + return;
  26943. +
  26944. + if (latency < 0 || latency >= MAX_ENTRY_NUM) {
  26945. + if (latency < 0)
  26946. + my_hist->below_hist_bound_samples++;
  26947. + else
  26948. + my_hist->above_hist_bound_samples++;
  26949. + } else
  26950. + my_hist->hist_array[latency]++;
  26951. +
  26952. + if (unlikely(latency > my_hist->max_lat ||
  26953. + my_hist->min_lat == LONG_MAX)) {
  26954. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  26955. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  26956. + if (latency_type == WAKEUP_LATENCY ||
  26957. + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
  26958. + latency_type == MISSED_TIMER_OFFSETS ||
  26959. + latency_type == TIMERANDWAKEUP_LATENCY) {
  26960. + strncpy(mp->comm, p->comm, sizeof(mp->comm));
  26961. + strncpy(mp->current_comm, current->comm,
  26962. + sizeof(mp->current_comm));
  26963. + mp->pid = task_pid_nr(p);
  26964. + mp->current_pid = task_pid_nr(current);
  26965. + mp->prio = p->prio;
  26966. + mp->current_prio = current->prio;
  26967. + mp->latency = latency;
  26968. + mp->timeroffset = timeroffset;
  26969. + mp->timestamp = stop;
  26970. + }
  26971. +#endif
  26972. + my_hist->max_lat = latency;
  26973. + }
  26974. + if (unlikely(latency < my_hist->min_lat))
  26975. + my_hist->min_lat = latency;
  26976. + my_hist->total_samples++;
  26977. + my_hist->accumulate_lat += latency;
  26978. +}
  26979. +
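As an aside (not part of the patch): latency_hist() above reduces each sample to a plain array index in microseconds; samples falling outside [0, MAX_ENTRY_NUM) are only counted in the below/above counters, not stored. The per-histogram offset is 0 here, and the struct comment earlier notes that MAX_ENTRY_NUM/2 would give a bipolar scale. A stand-alone model of just that bucketing step, for illustration only:

/* Illustrative model of the bucketing in latency_hist() above.
 * With offset 0 the scale is 0..10239 us; an offset of
 * MAX_ENTRY_NUM/2 would give a bipolar -5120..5119 us scale. */
#define MODEL_MAX_ENTRY_NUM 10240

struct hist_model {
	long offset;
	unsigned long long below, above;
	unsigned long long bucket[MODEL_MAX_ENTRY_NUM];
};

static void hist_model_add(struct hist_model *h, long latency_us)
{
	long idx = latency_us + h->offset;

	if (idx < 0)
		h->below++;		/* below the histogram range */
	else if (idx >= MODEL_MAX_ENTRY_NUM)
		h->above++;		/* above the histogram range */
	else
		h->bucket[idx]++;	/* one more sample in this usec bin */
}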
  26980. +static void *l_start(struct seq_file *m, loff_t *pos)
  26981. +{
  26982. + loff_t *index_ptr = NULL;
  26983. + loff_t index = *pos;
  26984. + struct hist_data *my_hist = m->private;
  26985. +
  26986. + if (index == 0) {
  26987. + char minstr[32], avgstr[32], maxstr[32];
  26988. +
  26989. + atomic_dec(&my_hist->hist_mode);
  26990. +
  26991. + if (likely(my_hist->total_samples)) {
  26992. + long avg = (long) div64_s64(my_hist->accumulate_lat,
  26993. + my_hist->total_samples);
  26994. + snprintf(minstr, sizeof(minstr), "%ld",
  26995. + my_hist->min_lat - my_hist->offset);
  26996. + snprintf(avgstr, sizeof(avgstr), "%ld",
  26997. + avg - my_hist->offset);
  26998. + snprintf(maxstr, sizeof(maxstr), "%ld",
  26999. + my_hist->max_lat - my_hist->offset);
  27000. + } else {
  27001. + strcpy(minstr, "<undef>");
  27002. + strcpy(avgstr, minstr);
  27003. + strcpy(maxstr, minstr);
  27004. + }
  27005. +
  27006. + seq_printf(m, "#Minimum latency: %s microseconds\n"
  27007. + "#Average latency: %s microseconds\n"
  27008. + "#Maximum latency: %s microseconds\n"
  27009. + "#Total samples: %llu\n"
  27010. + "#There are %llu samples lower than %ld"
  27011. + " microseconds.\n"
  27012. + "#There are %llu samples greater or equal"
  27013. + " than %ld microseconds.\n"
  27014. + "#usecs\t%16s\n",
  27015. + minstr, avgstr, maxstr,
  27016. + my_hist->total_samples,
  27017. + my_hist->below_hist_bound_samples,
  27018. + -my_hist->offset,
  27019. + my_hist->above_hist_bound_samples,
  27020. + MAX_ENTRY_NUM - my_hist->offset,
  27021. + "samples");
  27022. + }
  27023. + if (index < MAX_ENTRY_NUM) {
  27024. + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
  27025. + if (index_ptr)
  27026. + *index_ptr = index;
  27027. + }
  27028. +
  27029. + return index_ptr;
  27030. +}
  27031. +
  27032. +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
  27033. +{
  27034. + loff_t *index_ptr = p;
  27035. + struct hist_data *my_hist = m->private;
  27036. +
  27037. + if (++*pos >= MAX_ENTRY_NUM) {
  27038. + atomic_inc(&my_hist->hist_mode);
  27039. + return NULL;
  27040. + }
  27041. + *index_ptr = *pos;
  27042. + return index_ptr;
  27043. +}
  27044. +
  27045. +static void l_stop(struct seq_file *m, void *p)
  27046. +{
  27047. + kfree(p);
  27048. +}
  27049. +
  27050. +static int l_show(struct seq_file *m, void *p)
  27051. +{
  27052. + int index = *(loff_t *) p;
  27053. + struct hist_data *my_hist = m->private;
  27054. +
  27055. + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
  27056. + my_hist->hist_array[index]);
  27057. + return 0;
  27058. +}
  27059. +
  27060. +static const struct seq_operations latency_hist_seq_op = {
  27061. + .start = l_start,
  27062. + .next = l_next,
  27063. + .stop = l_stop,
  27064. + .show = l_show
  27065. +};
  27066. +
  27067. +static int latency_hist_open(struct inode *inode, struct file *file)
  27068. +{
  27069. + int ret;
  27070. +
  27071. + ret = seq_open(file, &latency_hist_seq_op);
  27072. + if (!ret) {
  27073. + struct seq_file *seq = file->private_data;
  27074. + seq->private = inode->i_private;
  27075. + }
  27076. + return ret;
  27077. +}
  27078. +
  27079. +static const struct file_operations latency_hist_fops = {
  27080. + .open = latency_hist_open,
  27081. + .read = seq_read,
  27082. + .llseek = seq_lseek,
  27083. + .release = seq_release,
  27084. +};
  27085. +
  27086. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27087. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27088. +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
  27089. +{
  27090. + mp->comm[0] = mp->current_comm[0] = '\0';
  27091. + mp->prio = mp->current_prio = mp->pid = mp->current_pid =
  27092. + mp->latency = mp->timeroffset = -1;
  27093. + mp->timestamp = 0;
  27094. +}
  27095. +#endif
  27096. +
  27097. +static void hist_reset(struct hist_data *hist)
  27098. +{
  27099. + atomic_dec(&hist->hist_mode);
  27100. +
  27101. + memset(hist->hist_array, 0, sizeof(hist->hist_array));
  27102. + hist->below_hist_bound_samples = 0ULL;
  27103. + hist->above_hist_bound_samples = 0ULL;
  27104. + hist->min_lat = LONG_MAX;
  27105. + hist->max_lat = LONG_MIN;
  27106. + hist->total_samples = 0ULL;
  27107. + hist->accumulate_lat = 0LL;
  27108. +
  27109. + atomic_inc(&hist->hist_mode);
  27110. +}
  27111. +
  27112. +static ssize_t
  27113. +latency_hist_reset(struct file *file, const char __user *a,
  27114. + size_t size, loff_t *off)
  27115. +{
  27116. + int cpu;
  27117. + struct hist_data *hist = NULL;
  27118. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27119. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27120. + struct maxlatproc_data *mp = NULL;
  27121. +#endif
  27122. + off_t latency_type = (off_t) file->private_data;
  27123. +
  27124. + for_each_online_cpu(cpu) {
  27125. +
  27126. + switch (latency_type) {
  27127. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27128. + case PREEMPTOFF_LATENCY:
  27129. + hist = &per_cpu(preemptoff_hist, cpu);
  27130. + break;
  27131. +#endif
  27132. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27133. + case IRQSOFF_LATENCY:
  27134. + hist = &per_cpu(irqsoff_hist, cpu);
  27135. + break;
  27136. +#endif
  27137. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27138. + case PREEMPTIRQSOFF_LATENCY:
  27139. + hist = &per_cpu(preemptirqsoff_hist, cpu);
  27140. + break;
  27141. +#endif
  27142. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27143. + case WAKEUP_LATENCY:
  27144. + hist = &per_cpu(wakeup_latency_hist, cpu);
  27145. + mp = &per_cpu(wakeup_maxlatproc, cpu);
  27146. + break;
  27147. + case WAKEUP_LATENCY_SHAREDPRIO:
  27148. + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
  27149. + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
  27150. + break;
  27151. +#endif
  27152. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27153. + case MISSED_TIMER_OFFSETS:
  27154. + hist = &per_cpu(missed_timer_offsets, cpu);
  27155. + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
  27156. + break;
  27157. +#endif
  27158. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27159. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27160. + case TIMERANDWAKEUP_LATENCY:
  27161. + hist = &per_cpu(timerandwakeup_latency_hist, cpu);
  27162. + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
  27163. + break;
  27164. +#endif
  27165. + }
  27166. +
  27167. + hist_reset(hist);
  27168. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27169. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27170. + if (latency_type == WAKEUP_LATENCY ||
  27171. + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
  27172. + latency_type == MISSED_TIMER_OFFSETS ||
  27173. + latency_type == TIMERANDWAKEUP_LATENCY)
  27174. + clear_maxlatprocdata(mp);
  27175. +#endif
  27176. + }
  27177. +
  27178. + return size;
  27179. +}
  27180. +
  27181. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27182. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27183. +static ssize_t
  27184. +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
  27185. +{
  27186. + char buf[64];
  27187. + int r;
  27188. + unsigned long *this_pid = file->private_data;
  27189. +
  27190. + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
  27191. + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  27192. +}
  27193. +
  27194. +static ssize_t do_pid(struct file *file, const char __user *ubuf,
  27195. + size_t cnt, loff_t *ppos)
  27196. +{
  27197. + char buf[64];
  27198. + unsigned long pid;
  27199. + unsigned long *this_pid = file->private_data;
  27200. +
  27201. + if (cnt >= sizeof(buf))
  27202. + return -EINVAL;
  27203. +
  27204. + if (copy_from_user(&buf, ubuf, cnt))
  27205. + return -EFAULT;
  27206. +
  27207. + buf[cnt] = '\0';
  27208. +
  27209. + if (kstrtoul(buf, 10, &pid))
  27210. + return -EINVAL;
  27211. +
  27212. + *this_pid = pid;
  27213. +
  27214. + return cnt;
  27215. +}
  27216. +#endif
  27217. +
  27218. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27219. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27220. +static ssize_t
  27221. +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
  27222. +{
  27223. + int r;
  27224. + struct maxlatproc_data *mp = file->private_data;
  27225. + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
  27226. + unsigned long long t;
  27227. + unsigned long usecs, secs;
  27228. + char *buf;
  27229. +
  27230. + if (mp->pid == -1 || mp->current_pid == -1) {
  27231. + buf = "(none)\n";
  27232. + return simple_read_from_buffer(ubuf, cnt, ppos, buf,
  27233. + strlen(buf));
  27234. + }
  27235. +
  27236. + buf = kmalloc(strmaxlen, GFP_KERNEL);
  27237. + if (buf == NULL)
  27238. + return -ENOMEM;
  27239. +
  27240. + t = ns2usecs(mp->timestamp);
  27241. + usecs = do_div(t, USEC_PER_SEC);
  27242. + secs = (unsigned long) t;
  27243. + r = snprintf(buf, strmaxlen,
  27244. + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
  27245. + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
  27246. + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
  27247. + secs, usecs);
  27248. + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  27249. + kfree(buf);
  27250. + return r;
  27251. +}
  27252. +#endif
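Each max_latency-CPUx file written by show_maxlatproc() above holds either "(none)" or a single line in the format "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu": the woken task's PID, its priority (printed as MAX_RT_PRIO-1 - prio), the latency and timer offset in microseconds, its comm, then the PID, priority and comm of the task that was running, and a seconds.microseconds timestamp. A user-space parser sketch follows; it is not part of the patch and assumes comm fields without whitespace.

/* User-space sketch (not part of the patch): parse one line of a
 * max_latency-CPUx file written by show_maxlatproc() above.
 * Returns 1 on success; files read "(none)" until a maximum is recorded. */
#include <stdio.h>

struct maxlat {
	int pid, prio, curr_pid, curr_prio;
	long latency_us, timeroffset_us;
	char comm[17], curr_comm[17];	/* TASK_COMM_LEN is 16 */
	unsigned long secs, usecs;
};

static int parse_maxlat(const char *line, struct maxlat *m)
{
	/* Mirrors the format string used in show_maxlatproc():
	 * "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n" */
	return sscanf(line, "%d %d %ld (%ld) %16s <- %d %d %16s %lu.%lu",
		      &m->pid, &m->prio, &m->latency_us, &m->timeroffset_us,
		      m->comm, &m->curr_pid, &m->curr_prio, m->curr_comm,
		      &m->secs, &m->usecs) == 10;
}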
  27253. +
  27254. +static ssize_t
  27255. +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
  27256. +{
  27257. + char buf[64];
  27258. + struct enable_data *ed = file->private_data;
  27259. + int r;
  27260. +
  27261. + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
  27262. + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  27263. +}
  27264. +
  27265. +static ssize_t
  27266. +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
  27267. +{
  27268. + char buf[64];
27269. + unsigned long enable;
  27270. + struct enable_data *ed = file->private_data;
  27271. +
  27272. + if (cnt >= sizeof(buf))
  27273. + return -EINVAL;
  27274. +
  27275. + if (copy_from_user(&buf, ubuf, cnt))
  27276. + return -EFAULT;
  27277. +
  27278. + buf[cnt] = 0;
  27279. +
  27280. + if (kstrtoul(buf, 10, &enable))
  27281. + return -EINVAL;
  27282. +
  27283. + if ((enable && ed->enabled) || (!enable && !ed->enabled))
  27284. + return cnt;
  27285. +
  27286. + if (enable) {
  27287. + int ret;
  27288. +
  27289. + switch (ed->latency_type) {
  27290. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27291. + case PREEMPTIRQSOFF_LATENCY:
  27292. + ret = register_trace_preemptirqsoff_hist(
  27293. + probe_preemptirqsoff_hist, NULL);
  27294. + if (ret) {
  27295. + pr_info("wakeup trace: Couldn't assign "
  27296. + "probe_preemptirqsoff_hist "
  27297. + "to trace_preemptirqsoff_hist\n");
  27298. + return ret;
  27299. + }
  27300. + break;
  27301. +#endif
  27302. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27303. + case WAKEUP_LATENCY:
  27304. + ret = register_trace_sched_wakeup(
  27305. + probe_wakeup_latency_hist_start, NULL);
  27306. + if (ret) {
  27307. + pr_info("wakeup trace: Couldn't assign "
  27308. + "probe_wakeup_latency_hist_start "
  27309. + "to trace_sched_wakeup\n");
  27310. + return ret;
  27311. + }
  27312. + ret = register_trace_sched_wakeup_new(
  27313. + probe_wakeup_latency_hist_start, NULL);
  27314. + if (ret) {
  27315. + pr_info("wakeup trace: Couldn't assign "
  27316. + "probe_wakeup_latency_hist_start "
  27317. + "to trace_sched_wakeup_new\n");
  27318. + unregister_trace_sched_wakeup(
  27319. + probe_wakeup_latency_hist_start, NULL);
  27320. + return ret;
  27321. + }
  27322. + ret = register_trace_sched_switch(
  27323. + probe_wakeup_latency_hist_stop, NULL);
  27324. + if (ret) {
  27325. + pr_info("wakeup trace: Couldn't assign "
  27326. + "probe_wakeup_latency_hist_stop "
  27327. + "to trace_sched_switch\n");
  27328. + unregister_trace_sched_wakeup(
  27329. + probe_wakeup_latency_hist_start, NULL);
  27330. + unregister_trace_sched_wakeup_new(
  27331. + probe_wakeup_latency_hist_start, NULL);
  27332. + return ret;
  27333. + }
  27334. + ret = register_trace_sched_migrate_task(
  27335. + probe_sched_migrate_task, NULL);
  27336. + if (ret) {
  27337. + pr_info("wakeup trace: Couldn't assign "
  27338. + "probe_sched_migrate_task "
  27339. + "to trace_sched_migrate_task\n");
  27340. + unregister_trace_sched_wakeup(
  27341. + probe_wakeup_latency_hist_start, NULL);
  27342. + unregister_trace_sched_wakeup_new(
  27343. + probe_wakeup_latency_hist_start, NULL);
  27344. + unregister_trace_sched_switch(
  27345. + probe_wakeup_latency_hist_stop, NULL);
  27346. + return ret;
  27347. + }
  27348. + break;
  27349. +#endif
  27350. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27351. + case MISSED_TIMER_OFFSETS:
  27352. + ret = register_trace_hrtimer_interrupt(
  27353. + probe_hrtimer_interrupt, NULL);
  27354. + if (ret) {
  27355. + pr_info("wakeup trace: Couldn't assign "
  27356. + "probe_hrtimer_interrupt "
  27357. + "to trace_hrtimer_interrupt\n");
  27358. + return ret;
  27359. + }
  27360. + break;
  27361. +#endif
  27362. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27363. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27364. + case TIMERANDWAKEUP_LATENCY:
  27365. + if (!wakeup_latency_enabled_data.enabled ||
  27366. + !missed_timer_offsets_enabled_data.enabled)
  27367. + return -EINVAL;
  27368. + break;
  27369. +#endif
  27370. + default:
  27371. + break;
  27372. + }
  27373. + } else {
  27374. + switch (ed->latency_type) {
  27375. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27376. + case PREEMPTIRQSOFF_LATENCY:
  27377. + {
  27378. + int cpu;
  27379. +
  27380. + unregister_trace_preemptirqsoff_hist(
  27381. + probe_preemptirqsoff_hist, NULL);
  27382. + for_each_online_cpu(cpu) {
  27383. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27384. + per_cpu(hist_irqsoff_counting,
  27385. + cpu) = 0;
  27386. +#endif
  27387. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27388. + per_cpu(hist_preemptoff_counting,
  27389. + cpu) = 0;
  27390. +#endif
  27391. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27392. + per_cpu(hist_preemptirqsoff_counting,
  27393. + cpu) = 0;
  27394. +#endif
  27395. + }
  27396. + }
  27397. + break;
  27398. +#endif
  27399. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27400. + case WAKEUP_LATENCY:
  27401. + {
  27402. + int cpu;
  27403. +
  27404. + unregister_trace_sched_wakeup(
  27405. + probe_wakeup_latency_hist_start, NULL);
  27406. + unregister_trace_sched_wakeup_new(
  27407. + probe_wakeup_latency_hist_start, NULL);
  27408. + unregister_trace_sched_switch(
  27409. + probe_wakeup_latency_hist_stop, NULL);
  27410. + unregister_trace_sched_migrate_task(
  27411. + probe_sched_migrate_task, NULL);
  27412. +
  27413. + for_each_online_cpu(cpu) {
  27414. + per_cpu(wakeup_task, cpu) = NULL;
  27415. + per_cpu(wakeup_sharedprio, cpu) = 0;
  27416. + }
  27417. + }
  27418. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27419. + timerandwakeup_enabled_data.enabled = 0;
  27420. +#endif
  27421. + break;
  27422. +#endif
  27423. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27424. + case MISSED_TIMER_OFFSETS:
  27425. + unregister_trace_hrtimer_interrupt(
  27426. + probe_hrtimer_interrupt, NULL);
  27427. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27428. + timerandwakeup_enabled_data.enabled = 0;
  27429. +#endif
  27430. + break;
  27431. +#endif
  27432. + default:
  27433. + break;
  27434. + }
  27435. + }
  27436. + ed->enabled = enable;
  27437. + return cnt;
  27438. +}
  27439. +
  27440. +static const struct file_operations latency_hist_reset_fops = {
  27441. + .open = tracing_open_generic,
  27442. + .write = latency_hist_reset,
  27443. +};
  27444. +
  27445. +static const struct file_operations enable_fops = {
  27446. + .open = tracing_open_generic,
  27447. + .read = show_enable,
  27448. + .write = do_enable,
  27449. +};
  27450. +
  27451. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27452. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27453. +static const struct file_operations pid_fops = {
  27454. + .open = tracing_open_generic,
  27455. + .read = show_pid,
  27456. + .write = do_pid,
  27457. +};
  27458. +
  27459. +static const struct file_operations maxlatproc_fops = {
  27460. + .open = tracing_open_generic,
  27461. + .read = show_maxlatproc,
  27462. +};
  27463. +#endif
  27464. +
  27465. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27466. +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
  27467. + int starthist)
  27468. +{
  27469. + int cpu = raw_smp_processor_id();
  27470. + int time_set = 0;
  27471. +
  27472. + if (starthist) {
  27473. + cycle_t uninitialized_var(start);
  27474. +
  27475. + if (!preempt_count() && !irqs_disabled())
  27476. + return;
  27477. +
  27478. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27479. + if ((reason == IRQS_OFF || reason == TRACE_START) &&
  27480. + !per_cpu(hist_irqsoff_counting, cpu)) {
  27481. + per_cpu(hist_irqsoff_counting, cpu) = 1;
  27482. + start = ftrace_now(cpu);
  27483. + time_set++;
  27484. + per_cpu(hist_irqsoff_start, cpu) = start;
  27485. + }
  27486. +#endif
  27487. +
  27488. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27489. + if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
  27490. + !per_cpu(hist_preemptoff_counting, cpu)) {
  27491. + per_cpu(hist_preemptoff_counting, cpu) = 1;
  27492. + if (!(time_set++))
  27493. + start = ftrace_now(cpu);
  27494. + per_cpu(hist_preemptoff_start, cpu) = start;
  27495. + }
  27496. +#endif
  27497. +
  27498. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27499. + if (per_cpu(hist_irqsoff_counting, cpu) &&
  27500. + per_cpu(hist_preemptoff_counting, cpu) &&
  27501. + !per_cpu(hist_preemptirqsoff_counting, cpu)) {
  27502. + per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
  27503. + if (!time_set)
  27504. + start = ftrace_now(cpu);
  27505. + per_cpu(hist_preemptirqsoff_start, cpu) = start;
  27506. + }
  27507. +#endif
  27508. + } else {
  27509. + cycle_t uninitialized_var(stop);
  27510. +
  27511. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27512. + if ((reason == IRQS_ON || reason == TRACE_STOP) &&
  27513. + per_cpu(hist_irqsoff_counting, cpu)) {
  27514. + cycle_t start = per_cpu(hist_irqsoff_start, cpu);
  27515. +
  27516. + stop = ftrace_now(cpu);
  27517. + time_set++;
  27518. + if (start) {
  27519. + long latency = ((long) (stop - start)) /
  27520. + NSECS_PER_USECS;
  27521. +
  27522. + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
  27523. + stop, NULL);
  27524. + }
  27525. + per_cpu(hist_irqsoff_counting, cpu) = 0;
  27526. + }
  27527. +#endif
  27528. +
  27529. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27530. + if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
  27531. + per_cpu(hist_preemptoff_counting, cpu)) {
  27532. + cycle_t start = per_cpu(hist_preemptoff_start, cpu);
  27533. +
  27534. + if (!(time_set++))
  27535. + stop = ftrace_now(cpu);
  27536. + if (start) {
  27537. + long latency = ((long) (stop - start)) /
  27538. + NSECS_PER_USECS;
  27539. +
  27540. + latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
  27541. + 0, stop, NULL);
  27542. + }
  27543. + per_cpu(hist_preemptoff_counting, cpu) = 0;
  27544. + }
  27545. +#endif
  27546. +
  27547. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27548. + if ((!per_cpu(hist_irqsoff_counting, cpu) ||
  27549. + !per_cpu(hist_preemptoff_counting, cpu)) &&
  27550. + per_cpu(hist_preemptirqsoff_counting, cpu)) {
  27551. + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
  27552. +
  27553. + if (!time_set)
  27554. + stop = ftrace_now(cpu);
  27555. + if (start) {
  27556. + long latency = ((long) (stop - start)) /
  27557. + NSECS_PER_USECS;
  27558. +
  27559. + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
  27560. + latency, 0, stop, NULL);
  27561. + }
  27562. + per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
  27563. + }
  27564. +#endif
  27565. + }
  27566. +}
  27567. +#endif
  27568. +
  27569. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27570. +static DEFINE_RAW_SPINLOCK(wakeup_lock);
  27571. +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
  27572. + int cpu)
  27573. +{
  27574. + int old_cpu = task_cpu(task);
  27575. +
  27576. + if (cpu != old_cpu) {
  27577. + unsigned long flags;
  27578. + struct task_struct *cpu_wakeup_task;
  27579. +
  27580. + raw_spin_lock_irqsave(&wakeup_lock, flags);
  27581. +
  27582. + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
  27583. + if (task == cpu_wakeup_task) {
  27584. + put_task_struct(cpu_wakeup_task);
  27585. + per_cpu(wakeup_task, old_cpu) = NULL;
  27586. + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
  27587. + get_task_struct(cpu_wakeup_task);
  27588. + }
  27589. +
  27590. + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
  27591. + }
  27592. +}
  27593. +
  27594. +static notrace void probe_wakeup_latency_hist_start(void *v,
  27595. + struct task_struct *p, int success)
  27596. +{
  27597. + unsigned long flags;
  27598. + struct task_struct *curr = current;
  27599. + int cpu = task_cpu(p);
  27600. + struct task_struct *cpu_wakeup_task;
  27601. +
  27602. + raw_spin_lock_irqsave(&wakeup_lock, flags);
  27603. +
  27604. + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
  27605. +
  27606. + if (wakeup_pid) {
  27607. + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
  27608. + p->prio == curr->prio)
  27609. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27610. + if (likely(wakeup_pid != task_pid_nr(p)))
  27611. + goto out;
  27612. + } else {
  27613. + if (likely(!rt_task(p)) ||
  27614. + (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
  27615. + p->prio > curr->prio)
  27616. + goto out;
  27617. + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
  27618. + p->prio == curr->prio)
  27619. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27620. + }
  27621. +
  27622. + if (cpu_wakeup_task)
  27623. + put_task_struct(cpu_wakeup_task);
  27624. + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
  27625. + get_task_struct(cpu_wakeup_task);
  27626. + cpu_wakeup_task->preempt_timestamp_hist =
  27627. + ftrace_now(raw_smp_processor_id());
  27628. +out:
  27629. + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
  27630. +}
  27631. +
  27632. +static notrace void probe_wakeup_latency_hist_stop(void *v,
  27633. + struct task_struct *prev, struct task_struct *next)
  27634. +{
  27635. + unsigned long flags;
  27636. + int cpu = task_cpu(next);
  27637. + long latency;
  27638. + cycle_t stop;
  27639. + struct task_struct *cpu_wakeup_task;
  27640. +
  27641. + raw_spin_lock_irqsave(&wakeup_lock, flags);
  27642. +
  27643. + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
  27644. +
  27645. + if (cpu_wakeup_task == NULL)
  27646. + goto out;
  27647. +
  27648. + /* Already running? */
  27649. + if (unlikely(current == cpu_wakeup_task))
  27650. + goto out_reset;
  27651. +
  27652. + if (next != cpu_wakeup_task) {
  27653. + if (next->prio < cpu_wakeup_task->prio)
  27654. + goto out_reset;
  27655. +
  27656. + if (next->prio == cpu_wakeup_task->prio)
  27657. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27658. +
  27659. + goto out;
  27660. + }
  27661. +
  27662. + if (current->prio == cpu_wakeup_task->prio)
  27663. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27664. +
  27665. + /*
  27666. + * The task we are waiting for is about to be switched to.
  27667. + * Calculate latency and store it in histogram.
  27668. + */
  27669. + stop = ftrace_now(raw_smp_processor_id());
  27670. +
  27671. + latency = ((long) (stop - next->preempt_timestamp_hist)) /
  27672. + NSECS_PER_USECS;
  27673. +
  27674. + if (per_cpu(wakeup_sharedprio, cpu)) {
  27675. + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
  27676. + next);
  27677. + per_cpu(wakeup_sharedprio, cpu) = 0;
  27678. + } else {
  27679. + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
  27680. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27681. + if (timerandwakeup_enabled_data.enabled) {
  27682. + latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
  27683. + next->timer_offset + latency, next->timer_offset,
  27684. + stop, next);
  27685. + }
  27686. +#endif
  27687. + }
  27688. +
  27689. +out_reset:
  27690. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27691. + next->timer_offset = 0;
  27692. +#endif
  27693. + put_task_struct(cpu_wakeup_task);
  27694. + per_cpu(wakeup_task, cpu) = NULL;
  27695. +out:
  27696. + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
  27697. +}
  27698. +#endif
  27699. +
  27700. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27701. +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
  27702. + long long latency_ns, struct task_struct *curr,
  27703. + struct task_struct *task)
  27704. +{
  27705. + if (latency_ns <= 0 && task != NULL && rt_task(task) &&
  27706. + (task->prio < curr->prio ||
  27707. + (task->prio == curr->prio &&
  27708. + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
  27709. + long latency;
  27710. + cycle_t now;
  27711. +
  27712. + if (missed_timer_offsets_pid) {
  27713. + if (likely(missed_timer_offsets_pid !=
  27714. + task_pid_nr(task)))
  27715. + return;
  27716. + }
  27717. +
  27718. + now = ftrace_now(cpu);
  27719. + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
  27720. + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
  27721. + task);
  27722. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27723. + task->timer_offset = latency;
  27724. +#endif
  27725. + }
  27726. +}
  27727. +#endif
  27728. +
  27729. +static __init int latency_hist_init(void)
  27730. +{
  27731. + struct dentry *latency_hist_root = NULL;
  27732. + struct dentry *dentry;
  27733. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27734. + struct dentry *dentry_sharedprio;
  27735. +#endif
  27736. + struct dentry *entry;
  27737. + struct dentry *enable_root;
  27738. + int i = 0;
  27739. + struct hist_data *my_hist;
  27740. + char name[64];
  27741. + char *cpufmt = "CPU%d";
  27742. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27743. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27744. + char *cpufmt_maxlatproc = "max_latency-CPU%d";
  27745. + struct maxlatproc_data *mp = NULL;
  27746. +#endif
  27747. +
  27748. + dentry = tracing_init_dentry();
  27749. + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
  27750. + enable_root = debugfs_create_dir("enable", latency_hist_root);
  27751. +
  27752. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27753. + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
  27754. + for_each_possible_cpu(i) {
  27755. + sprintf(name, cpufmt, i);
  27756. + entry = debugfs_create_file(name, 0444, dentry,
  27757. + &per_cpu(irqsoff_hist, i), &latency_hist_fops);
  27758. + my_hist = &per_cpu(irqsoff_hist, i);
  27759. + atomic_set(&my_hist->hist_mode, 1);
  27760. + my_hist->min_lat = LONG_MAX;
  27761. + }
  27762. + entry = debugfs_create_file("reset", 0644, dentry,
  27763. + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
  27764. +#endif
  27765. +
  27766. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27767. + dentry = debugfs_create_dir(preemptoff_hist_dir,
  27768. + latency_hist_root);
  27769. + for_each_possible_cpu(i) {
  27770. + sprintf(name, cpufmt, i);
  27771. + entry = debugfs_create_file(name, 0444, dentry,
  27772. + &per_cpu(preemptoff_hist, i), &latency_hist_fops);
  27773. + my_hist = &per_cpu(preemptoff_hist, i);
  27774. + atomic_set(&my_hist->hist_mode, 1);
  27775. + my_hist->min_lat = LONG_MAX;
  27776. + }
  27777. + entry = debugfs_create_file("reset", 0644, dentry,
  27778. + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
  27779. +#endif
  27780. +
  27781. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27782. + dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
  27783. + latency_hist_root);
  27784. + for_each_possible_cpu(i) {
  27785. + sprintf(name, cpufmt, i);
  27786. + entry = debugfs_create_file(name, 0444, dentry,
  27787. + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
  27788. + my_hist = &per_cpu(preemptirqsoff_hist, i);
  27789. + atomic_set(&my_hist->hist_mode, 1);
  27790. + my_hist->min_lat = LONG_MAX;
  27791. + }
  27792. + entry = debugfs_create_file("reset", 0644, dentry,
  27793. + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
  27794. +#endif
  27795. +
  27796. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27797. + entry = debugfs_create_file("preemptirqsoff", 0644,
  27798. + enable_root, (void *)&preemptirqsoff_enabled_data,
  27799. + &enable_fops);
  27800. +#endif
  27801. +
  27802. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27803. + dentry = debugfs_create_dir(wakeup_latency_hist_dir,
  27804. + latency_hist_root);
  27805. + dentry_sharedprio = debugfs_create_dir(
  27806. + wakeup_latency_hist_dir_sharedprio, dentry);
  27807. + for_each_possible_cpu(i) {
  27808. + sprintf(name, cpufmt, i);
  27809. +
  27810. + entry = debugfs_create_file(name, 0444, dentry,
  27811. + &per_cpu(wakeup_latency_hist, i),
  27812. + &latency_hist_fops);
  27813. + my_hist = &per_cpu(wakeup_latency_hist, i);
  27814. + atomic_set(&my_hist->hist_mode, 1);
  27815. + my_hist->min_lat = LONG_MAX;
  27816. +
  27817. + entry = debugfs_create_file(name, 0444, dentry_sharedprio,
  27818. + &per_cpu(wakeup_latency_hist_sharedprio, i),
  27819. + &latency_hist_fops);
  27820. + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
  27821. + atomic_set(&my_hist->hist_mode, 1);
  27822. + my_hist->min_lat = LONG_MAX;
  27823. +
  27824. + sprintf(name, cpufmt_maxlatproc, i);
  27825. +
  27826. + mp = &per_cpu(wakeup_maxlatproc, i);
  27827. + entry = debugfs_create_file(name, 0444, dentry, mp,
  27828. + &maxlatproc_fops);
  27829. + clear_maxlatprocdata(mp);
  27830. +
  27831. + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
  27832. + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
  27833. + &maxlatproc_fops);
  27834. + clear_maxlatprocdata(mp);
  27835. + }
  27836. + entry = debugfs_create_file("pid", 0644, dentry,
  27837. + (void *)&wakeup_pid, &pid_fops);
  27838. + entry = debugfs_create_file("reset", 0644, dentry,
  27839. + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
  27840. + entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
  27841. + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
  27842. + entry = debugfs_create_file("wakeup", 0644,
  27843. + enable_root, (void *)&wakeup_latency_enabled_data,
  27844. + &enable_fops);
  27845. +#endif
  27846. +
  27847. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27848. + dentry = debugfs_create_dir(missed_timer_offsets_dir,
  27849. + latency_hist_root);
  27850. + for_each_possible_cpu(i) {
  27851. + sprintf(name, cpufmt, i);
  27852. + entry = debugfs_create_file(name, 0444, dentry,
  27853. + &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
  27854. + my_hist = &per_cpu(missed_timer_offsets, i);
  27855. + atomic_set(&my_hist->hist_mode, 1);
  27856. + my_hist->min_lat = LONG_MAX;
  27857. +
  27858. + sprintf(name, cpufmt_maxlatproc, i);
  27859. + mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
  27860. + entry = debugfs_create_file(name, 0444, dentry, mp,
  27861. + &maxlatproc_fops);
  27862. + clear_maxlatprocdata(mp);
  27863. + }
  27864. + entry = debugfs_create_file("pid", 0644, dentry,
  27865. + (void *)&missed_timer_offsets_pid, &pid_fops);
  27866. + entry = debugfs_create_file("reset", 0644, dentry,
  27867. + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
  27868. + entry = debugfs_create_file("missed_timer_offsets", 0644,
  27869. + enable_root, (void *)&missed_timer_offsets_enabled_data,
  27870. + &enable_fops);
  27871. +#endif
  27872. +
  27873. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27874. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27875. + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
  27876. + latency_hist_root);
  27877. + for_each_possible_cpu(i) {
  27878. + sprintf(name, cpufmt, i);
  27879. + entry = debugfs_create_file(name, 0444, dentry,
  27880. + &per_cpu(timerandwakeup_latency_hist, i),
  27881. + &latency_hist_fops);
  27882. + my_hist = &per_cpu(timerandwakeup_latency_hist, i);
  27883. + atomic_set(&my_hist->hist_mode, 1);
  27884. + my_hist->min_lat = LONG_MAX;
  27885. +
  27886. + sprintf(name, cpufmt_maxlatproc, i);
  27887. + mp = &per_cpu(timerandwakeup_maxlatproc, i);
  27888. + entry = debugfs_create_file(name, 0444, dentry, mp,
  27889. + &maxlatproc_fops);
  27890. + clear_maxlatprocdata(mp);
  27891. + }
  27892. + entry = debugfs_create_file("reset", 0644, dentry,
  27893. + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
  27894. + entry = debugfs_create_file("timerandwakeup", 0644,
  27895. + enable_root, (void *)&timerandwakeup_enabled_data,
  27896. + &enable_fops);
  27897. +#endif
  27898. + return 0;
  27899. +}
  27900. +
  27901. +device_initcall(latency_hist_init);
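latency_hist_init() above creates, for every configured histogram type, read-only per-CPU files plus a writable "reset" file in the corresponding directory. The sketch below is not part of the patch; it clears all histograms by writing to each reset file that exists, using the directory names registered above and the standard debugfs mount point.

/* User-space sketch (not part of the patch): clear every histogram by
 * writing to the per-directory "reset" files created in
 * latency_hist_init() above. Directories whose config options are not
 * enabled simply do not exist and are skipped. */
#include <stdio.h>

static const char *dirs[] = {
	"irqsoff", "preemptoff", "preemptirqsoff",
	"wakeup", "wakeup/sharedprio",
	"missed_timer_offsets", "timerandwakeup",
};

int main(void)
{
	char path[128];
	unsigned int i;

	for (i = 0; i < sizeof(dirs) / sizeof(dirs[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/debug/tracing/latency_hist/%s/reset",
			 dirs[i]);
		f = fopen(path, "w");
		if (!f)
			continue;	/* histogram type not configured in */
		fputs("1\n", f);	/* any write triggers latency_hist_reset() */
		fclose(f);
	}
	return 0;
}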
  27902. diff -Nur linux-3.18.14.orig/kernel/trace/Makefile linux-3.18.14-rt/kernel/trace/Makefile
  27903. --- linux-3.18.14.orig/kernel/trace/Makefile 2015-05-20 10:04:50.000000000 -0500
  27904. +++ linux-3.18.14-rt/kernel/trace/Makefile 2015-05-31 15:32:48.989635362 -0500
  27905. @@ -36,6 +36,10 @@
  27906. obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
  27907. obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
  27908. obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
  27909. +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
  27910. +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
  27911. +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
  27912. +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
  27913. obj-$(CONFIG_NOP_TRACER) += trace_nop.o
  27914. obj-$(CONFIG_STACK_TRACER) += trace_stack.o
  27915. obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
  27916. diff -Nur linux-3.18.14.orig/kernel/trace/trace.c linux-3.18.14-rt/kernel/trace/trace.c
  27917. --- linux-3.18.14.orig/kernel/trace/trace.c 2015-05-20 10:04:50.000000000 -0500
  27918. +++ linux-3.18.14-rt/kernel/trace/trace.c 2015-05-31 15:32:49.021635361 -0500
  27919. @@ -1579,6 +1579,7 @@
  27920. struct task_struct *tsk = current;
  27921. entry->preempt_count = pc & 0xff;
  27922. + entry->preempt_lazy_count = preempt_lazy_count();
  27923. entry->pid = (tsk) ? tsk->pid : 0;
  27924. entry->flags =
  27925. #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
  27926. @@ -1588,8 +1589,11 @@
  27927. #endif
  27928. ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
  27929. ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
  27930. - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
  27931. + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
  27932. + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
  27933. (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
  27934. +
  27935. + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
  27936. }
  27937. EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
  27938. @@ -2509,14 +2513,17 @@
  27939. static void print_lat_help_header(struct seq_file *m)
  27940. {
  27941. - seq_puts(m, "# _------=> CPU# \n");
  27942. - seq_puts(m, "# / _-----=> irqs-off \n");
  27943. - seq_puts(m, "# | / _----=> need-resched \n");
  27944. - seq_puts(m, "# || / _---=> hardirq/softirq \n");
  27945. - seq_puts(m, "# ||| / _--=> preempt-depth \n");
  27946. - seq_puts(m, "# |||| / delay \n");
  27947. - seq_puts(m, "# cmd pid ||||| time | caller \n");
  27948. - seq_puts(m, "# \\ / ||||| \\ | / \n");
  27949. + seq_puts(m, "# _--------=> CPU# \n");
  27950. + seq_puts(m, "# / _-------=> irqs-off \n");
  27951. + seq_puts(m, "# | / _------=> need-resched \n");
  27952. + seq_puts(m, "# || / _-----=> need-resched_lazy \n");
  27953. + seq_puts(m, "# ||| / _----=> hardirq/softirq \n");
  27954. + seq_puts(m, "# |||| / _---=> preempt-depth \n");
  27955. + seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n");
  27956. + seq_puts(m, "# |||||| / _-=> migrate-disable \n");
  27957. + seq_puts(m, "# ||||||| / delay \n");
  27958. + seq_puts(m, "# cmd pid |||||||| time | caller \n");
  27959. + seq_puts(m, "# \\ / |||||||| \\ | / \n");
  27960. }
  27961. static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
  27962. @@ -2540,13 +2547,16 @@
  27963. static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
  27964. {
  27965. print_event_info(buf, m);
  27966. - seq_puts(m, "# _-----=> irqs-off\n");
  27967. - seq_puts(m, "# / _----=> need-resched\n");
  27968. - seq_puts(m, "# | / _---=> hardirq/softirq\n");
  27969. - seq_puts(m, "# || / _--=> preempt-depth\n");
  27970. - seq_puts(m, "# ||| / delay\n");
  27971. - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
  27972. - seq_puts(m, "# | | | |||| | |\n");
  27973. + seq_puts(m, "# _-------=> irqs-off \n");
  27974. + seq_puts(m, "# / _------=> need-resched \n");
  27975. + seq_puts(m, "# |/ _-----=> need-resched_lazy \n");
  27976. + seq_puts(m, "# ||/ _----=> hardirq/softirq \n");
  27977. + seq_puts(m, "# |||/ _---=> preempt-depth \n");
  27978. + seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n");
  27979. + seq_puts(m, "# ||||| / _-=> migrate-disable \n");
  27980. + seq_puts(m, "# |||||| / delay\n");
  27981. + seq_puts(m, "# TASK-PID CPU# |||||| TIMESTAMP FUNCTION\n");
  27982. + seq_puts(m, "# | | | |||||| | |\n");
  27983. }
  27984. void
  27985. diff -Nur linux-3.18.14.orig/kernel/trace/trace_events.c linux-3.18.14-rt/kernel/trace/trace_events.c
  27986. --- linux-3.18.14.orig/kernel/trace/trace_events.c 2015-05-20 10:04:50.000000000 -0500
  27987. +++ linux-3.18.14-rt/kernel/trace/trace_events.c 2015-05-31 15:32:49.025635362 -0500
  27988. @@ -162,6 +162,8 @@
  27989. __common_field(unsigned char, flags);
  27990. __common_field(unsigned char, preempt_count);
  27991. __common_field(int, pid);
  27992. + __common_field(unsigned short, migrate_disable);
  27993. + __common_field(unsigned short, padding);
  27994. return ret;
  27995. }
  27996. diff -Nur linux-3.18.14.orig/kernel/trace/trace.h linux-3.18.14-rt/kernel/trace/trace.h
  27997. --- linux-3.18.14.orig/kernel/trace/trace.h 2015-05-20 10:04:50.000000000 -0500
  27998. +++ linux-3.18.14-rt/kernel/trace/trace.h 2015-05-31 15:32:49.021635361 -0500
  27999. @@ -119,6 +119,7 @@
  28000. * NEED_RESCHED - reschedule is requested
  28001. * HARDIRQ - inside an interrupt handler
  28002. * SOFTIRQ - inside a softirq handler
  28003. + * NEED_RESCHED_LAZY - lazy reschedule is requested
  28004. */
  28005. enum trace_flag_type {
  28006. TRACE_FLAG_IRQS_OFF = 0x01,
  28007. @@ -127,6 +128,7 @@
  28008. TRACE_FLAG_HARDIRQ = 0x08,
  28009. TRACE_FLAG_SOFTIRQ = 0x10,
  28010. TRACE_FLAG_PREEMPT_RESCHED = 0x20,
  28011. + TRACE_FLAG_NEED_RESCHED_LAZY = 0x40,
  28012. };
  28013. #define TRACE_BUF_SIZE 1024
  28014. diff -Nur linux-3.18.14.orig/kernel/trace/trace_irqsoff.c linux-3.18.14-rt/kernel/trace/trace_irqsoff.c
  28015. --- linux-3.18.14.orig/kernel/trace/trace_irqsoff.c 2015-05-20 10:04:50.000000000 -0500
  28016. +++ linux-3.18.14-rt/kernel/trace/trace_irqsoff.c 2015-05-31 15:32:49.025635362 -0500
  28017. @@ -17,6 +17,7 @@
  28018. #include <linux/fs.h>
  28019. #include "trace.h"
  28020. +#include <trace/events/hist.h>
  28021. static struct trace_array *irqsoff_trace __read_mostly;
  28022. static int tracer_enabled __read_mostly;
  28023. @@ -435,11 +436,13 @@
  28024. {
  28025. if (preempt_trace() || irq_trace())
  28026. start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28027. + trace_preemptirqsoff_hist(TRACE_START, 1);
  28028. }
  28029. EXPORT_SYMBOL_GPL(start_critical_timings);
  28030. void stop_critical_timings(void)
  28031. {
  28032. + trace_preemptirqsoff_hist(TRACE_STOP, 0);
  28033. if (preempt_trace() || irq_trace())
  28034. stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28035. }
  28036. @@ -449,6 +452,7 @@
  28037. #ifdef CONFIG_PROVE_LOCKING
  28038. void time_hardirqs_on(unsigned long a0, unsigned long a1)
  28039. {
  28040. + trace_preemptirqsoff_hist(IRQS_ON, 0);
  28041. if (!preempt_trace() && irq_trace())
  28042. stop_critical_timing(a0, a1);
  28043. }
  28044. @@ -457,6 +461,7 @@
  28045. {
  28046. if (!preempt_trace() && irq_trace())
  28047. start_critical_timing(a0, a1);
  28048. + trace_preemptirqsoff_hist(IRQS_OFF, 1);
  28049. }
  28050. #else /* !CONFIG_PROVE_LOCKING */
  28051. @@ -482,6 +487,7 @@
  28052. */
  28053. void trace_hardirqs_on(void)
  28054. {
  28055. + trace_preemptirqsoff_hist(IRQS_ON, 0);
  28056. if (!preempt_trace() && irq_trace())
  28057. stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28058. }
  28059. @@ -491,11 +497,13 @@
  28060. {
  28061. if (!preempt_trace() && irq_trace())
  28062. start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28063. + trace_preemptirqsoff_hist(IRQS_OFF, 1);
  28064. }
  28065. EXPORT_SYMBOL(trace_hardirqs_off);
  28066. __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
  28067. {
  28068. + trace_preemptirqsoff_hist(IRQS_ON, 0);
  28069. if (!preempt_trace() && irq_trace())
  28070. stop_critical_timing(CALLER_ADDR0, caller_addr);
  28071. }
  28072. @@ -505,6 +513,7 @@
  28073. {
  28074. if (!preempt_trace() && irq_trace())
  28075. start_critical_timing(CALLER_ADDR0, caller_addr);
  28076. + trace_preemptirqsoff_hist(IRQS_OFF, 1);
  28077. }
  28078. EXPORT_SYMBOL(trace_hardirqs_off_caller);
  28079. @@ -514,12 +523,14 @@
  28080. #ifdef CONFIG_PREEMPT_TRACER
  28081. void trace_preempt_on(unsigned long a0, unsigned long a1)
  28082. {
  28083. + trace_preemptirqsoff_hist(PREEMPT_ON, 0);
  28084. if (preempt_trace() && !irq_trace())
  28085. stop_critical_timing(a0, a1);
  28086. }
  28087. void trace_preempt_off(unsigned long a0, unsigned long a1)
  28088. {
28089. + trace_preemptirqsoff_hist(PREEMPT_OFF, 1);
  28090. if (preempt_trace() && !irq_trace())
  28091. start_critical_timing(a0, a1);
  28092. }
  28093. diff -Nur linux-3.18.14.orig/kernel/trace/trace_output.c linux-3.18.14-rt/kernel/trace/trace_output.c
  28094. --- linux-3.18.14.orig/kernel/trace/trace_output.c 2015-05-20 10:04:50.000000000 -0500
  28095. +++ linux-3.18.14-rt/kernel/trace/trace_output.c 2015-05-31 15:32:49.025635362 -0500
  28096. @@ -410,6 +410,7 @@
  28097. {
  28098. char hardsoft_irq;
  28099. char need_resched;
  28100. + char need_resched_lazy;
  28101. char irqs_off;
  28102. int hardirq;
  28103. int softirq;
  28104. @@ -438,6 +439,8 @@
  28105. need_resched = '.';
  28106. break;
  28107. }
  28108. + need_resched_lazy =
  28109. + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
  28110. hardsoft_irq =
  28111. (hardirq && softirq) ? 'H' :
  28112. @@ -445,8 +448,9 @@
  28113. softirq ? 's' :
  28114. '.';
  28115. - if (!trace_seq_printf(s, "%c%c%c",
  28116. - irqs_off, need_resched, hardsoft_irq))
  28117. + if (!trace_seq_printf(s, "%c%c%c%c",
  28118. + irqs_off, need_resched, need_resched_lazy,
  28119. + hardsoft_irq))
  28120. return 0;
  28121. if (entry->preempt_count)
  28122. @@ -454,6 +458,16 @@
  28123. else
  28124. ret = trace_seq_putc(s, '.');
  28125. + if (entry->preempt_lazy_count)
  28126. + ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count);
  28127. + else
  28128. + ret = trace_seq_putc(s, '.');
  28129. +
  28130. + if (entry->migrate_disable)
  28131. + ret = trace_seq_printf(s, "%x", entry->migrate_disable);
  28132. + else
  28133. + ret = trace_seq_putc(s, '.');
  28134. +
  28135. return ret;
  28136. }
  28137. diff -Nur linux-3.18.14.orig/kernel/user.c linux-3.18.14-rt/kernel/user.c
  28138. --- linux-3.18.14.orig/kernel/user.c 2015-05-20 10:04:50.000000000 -0500
  28139. +++ linux-3.18.14-rt/kernel/user.c 2015-05-31 15:32:49.045635362 -0500
  28140. @@ -158,11 +158,11 @@
  28141. if (!up)
  28142. return;
  28143. - local_irq_save(flags);
  28144. + local_irq_save_nort(flags);
  28145. if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
  28146. free_user(up, flags);
  28147. else
  28148. - local_irq_restore(flags);
  28149. + local_irq_restore_nort(flags);
  28150. }
  28151. struct user_struct *alloc_uid(kuid_t uid)
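
The local_irq_save_nort()/local_irq_restore_nort() calls introduced in free_uid() above are RT-tree helpers; the mapping sketched below is assumed from their use elsewhere in this patch set, not quoted from it. On non-RT kernels they still disable interrupts, while on PREEMPT_RT_FULL they merely save or discard the flags word, because uidhash_lock becomes a sleeping lock there and must not be taken with interrupts hard-disabled.

#ifdef CONFIG_PREEMPT_RT_FULL
/* interrupts stay enabled; the lock acquired next is sleepable */
# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
#else
/* unchanged behaviour for non-RT configurations */
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
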
  28152. diff -Nur linux-3.18.14.orig/kernel/watchdog.c linux-3.18.14-rt/kernel/watchdog.c
  28153. --- linux-3.18.14.orig/kernel/watchdog.c 2015-05-20 10:04:50.000000000 -0500
  28154. +++ linux-3.18.14-rt/kernel/watchdog.c 2015-05-31 15:32:49.065635361 -0500
  28155. @@ -248,6 +248,8 @@
  28156. #ifdef CONFIG_HARDLOCKUP_DETECTOR
  28157. +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
  28158. +
  28159. static struct perf_event_attr wd_hw_attr = {
  28160. .type = PERF_TYPE_HARDWARE,
  28161. .config = PERF_COUNT_HW_CPU_CYCLES,
  28162. @@ -281,13 +283,21 @@
  28163. /* only print hardlockups once */
  28164. if (__this_cpu_read(hard_watchdog_warn) == true)
  28165. return;
  28166. + /*
  28167. + * If early-printk is enabled then make sure we do not
  28168. + * lock up in printk() and kill console logging:
  28169. + */
  28170. + printk_kill();
  28171. - if (hardlockup_panic)
  28172. + if (hardlockup_panic) {
  28173. panic("Watchdog detected hard LOCKUP on cpu %d",
  28174. this_cpu);
  28175. - else
  28176. + } else {
  28177. + raw_spin_lock(&watchdog_output_lock);
  28178. WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
  28179. this_cpu);
  28180. + raw_spin_unlock(&watchdog_output_lock);
  28181. + }
  28182. __this_cpu_write(hard_watchdog_warn, true);
  28183. return;
  28184. @@ -430,6 +440,7 @@
  28185. /* kick off the timer for the hardlockup detector */
  28186. hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  28187. hrtimer->function = watchdog_timer_fn;
  28188. + hrtimer->irqsafe = 1;
  28189. /* Enable the perf event */
  28190. watchdog_nmi_enable(cpu);
  28191. diff -Nur linux-3.18.14.orig/kernel/workqueue.c linux-3.18.14-rt/kernel/workqueue.c
  28192. --- linux-3.18.14.orig/kernel/workqueue.c 2015-05-20 10:04:50.000000000 -0500
  28193. +++ linux-3.18.14-rt/kernel/workqueue.c 2015-05-31 15:32:49.069635361 -0500
  28194. @@ -48,6 +48,8 @@
  28195. #include <linux/nodemask.h>
  28196. #include <linux/moduleparam.h>
  28197. #include <linux/uaccess.h>
  28198. +#include <linux/locallock.h>
  28199. +#include <linux/delay.h>
  28200. #include "workqueue_internal.h"
  28201. @@ -121,15 +123,20 @@
  28202. * cpu or grabbing pool->lock is enough for read access. If
  28203. * POOL_DISASSOCIATED is set, it's identical to L.
  28204. *
  28205. + * On RT we need the extra protection via rt_lock_idle_list() for
  28206. + * the list manipulations against read access from
  28207. + * wq_worker_sleeping(). All other places are nicely serialized via
  28208. + * pool->lock.
  28209. + *
  28210. * A: pool->attach_mutex protected.
  28211. *
  28212. * PL: wq_pool_mutex protected.
  28213. *
  28214. - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
  28215. + * PR: wq_pool_mutex protected for writes. RCU protected for reads.
  28216. *
  28217. * WQ: wq->mutex protected.
  28218. *
  28219. - * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
  28220. + * WR: wq->mutex protected for writes. RCU protected for reads.
  28221. *
  28222. * MD: wq_mayday_lock protected.
  28223. */
  28224. @@ -177,7 +184,7 @@
  28225. atomic_t nr_running ____cacheline_aligned_in_smp;
  28226. /*
  28227. - * Destruction of pool is sched-RCU protected to allow dereferences
  28228. + * Destruction of pool is RCU protected to allow dereferences
  28229. * from get_work_pool().
  28230. */
  28231. struct rcu_head rcu;
  28232. @@ -206,7 +213,7 @@
  28233. /*
  28234. * Release of unbound pwq is punted to system_wq. See put_pwq()
  28235. * and pwq_unbound_release_workfn() for details. pool_workqueue
  28236. - * itself is also sched-RCU protected so that the first pwq can be
  28237. + * itself is also RCU protected so that the first pwq can be
  28238. * determined without grabbing wq->mutex.
  28239. */
  28240. struct work_struct unbound_release_work;
  28241. @@ -321,6 +328,8 @@
  28242. struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
  28243. EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
  28244. +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
  28245. +
  28246. static int worker_thread(void *__worker);
  28247. static void copy_workqueue_attrs(struct workqueue_attrs *to,
  28248. const struct workqueue_attrs *from);
  28249. @@ -329,14 +338,14 @@
  28250. #include <trace/events/workqueue.h>
  28251. #define assert_rcu_or_pool_mutex() \
  28252. - rcu_lockdep_assert(rcu_read_lock_sched_held() || \
  28253. + rcu_lockdep_assert(rcu_read_lock_held() || \
  28254. lockdep_is_held(&wq_pool_mutex), \
  28255. - "sched RCU or wq_pool_mutex should be held")
  28256. + "RCU or wq_pool_mutex should be held")
  28257. #define assert_rcu_or_wq_mutex(wq) \
  28258. - rcu_lockdep_assert(rcu_read_lock_sched_held() || \
  28259. + rcu_lockdep_assert(rcu_read_lock_held() || \
  28260. lockdep_is_held(&wq->mutex), \
  28261. - "sched RCU or wq->mutex should be held")
  28262. + "RCU or wq->mutex should be held")
  28263. #define for_each_cpu_worker_pool(pool, cpu) \
  28264. for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
  28265. @@ -348,7 +357,7 @@
  28266. * @pool: iteration cursor
  28267. * @pi: integer used for iteration
  28268. *
  28269. - * This must be called either with wq_pool_mutex held or sched RCU read
  28270. + * This must be called either with wq_pool_mutex held or RCU read
  28271. * locked. If the pool needs to be used beyond the locking in effect, the
  28272. * caller is responsible for guaranteeing that the pool stays online.
  28273. *
  28274. @@ -380,7 +389,7 @@
  28275. * @pwq: iteration cursor
  28276. * @wq: the target workqueue
  28277. *
  28278. - * This must be called either with wq->mutex held or sched RCU read locked.
  28279. + * This must be called either with wq->mutex held or RCU read locked.
  28280. * If the pwq needs to be used beyond the locking in effect, the caller is
  28281. * responsible for guaranteeing that the pwq stays online.
  28282. *
  28283. @@ -392,6 +401,31 @@
  28284. if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
  28285. else
  28286. +#ifdef CONFIG_PREEMPT_RT_BASE
  28287. +static inline void rt_lock_idle_list(struct worker_pool *pool)
  28288. +{
  28289. + preempt_disable();
  28290. +}
  28291. +static inline void rt_unlock_idle_list(struct worker_pool *pool)
  28292. +{
  28293. + preempt_enable();
  28294. +}
  28295. +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
  28296. +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
  28297. +#else
  28298. +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
  28299. +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
  28300. +static inline void sched_lock_idle_list(struct worker_pool *pool)
  28301. +{
  28302. + spin_lock_irq(&pool->lock);
  28303. +}
  28304. +static inline void sched_unlock_idle_list(struct worker_pool *pool)
  28305. +{
  28306. + spin_unlock_irq(&pool->lock);
  28307. +}
  28308. +#endif
  28309. +
  28310. +
  28311. #ifdef CONFIG_DEBUG_OBJECTS_WORK
  28312. static struct debug_obj_descr work_debug_descr;
  28313. @@ -542,7 +576,7 @@
  28314. * @wq: the target workqueue
  28315. * @node: the node ID
  28316. *
  28317. - * This must be called either with pwq_lock held or sched RCU read locked.
  28318. + * This must be called either with pwq_lock held or RCU read locked.
  28319. * If the pwq needs to be used beyond the locking in effect, the caller is
  28320. * responsible for guaranteeing that the pwq stays online.
  28321. *
  28322. @@ -646,8 +680,8 @@
  28323. * @work: the work item of interest
  28324. *
  28325. * Pools are created and destroyed under wq_pool_mutex, and allows read
  28326. - * access under sched-RCU read lock. As such, this function should be
  28327. - * called under wq_pool_mutex or with preemption disabled.
  28328. + * access under RCU read lock. As such, this function should be
  28329. + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
  28330. *
  28331. * All fields of the returned pool are accessible as long as the above
  28332. * mentioned locking is in effect. If the returned pool needs to be used
  28333. @@ -784,51 +818,44 @@
  28334. */
  28335. static void wake_up_worker(struct worker_pool *pool)
  28336. {
  28337. - struct worker *worker = first_idle_worker(pool);
  28338. + struct worker *worker;
  28339. +
  28340. + rt_lock_idle_list(pool);
  28341. +
  28342. + worker = first_idle_worker(pool);
  28343. if (likely(worker))
  28344. wake_up_process(worker->task);
  28345. +
  28346. + rt_unlock_idle_list(pool);
  28347. }
  28348. /**
  28349. - * wq_worker_waking_up - a worker is waking up
  28350. - * @task: task waking up
  28351. - * @cpu: CPU @task is waking up to
  28352. - *
  28353. - * This function is called during try_to_wake_up() when a worker is
  28354. - * being awoken.
  28355. + * wq_worker_running - a worker is running again
  28356. + * @task: task returning from sleep
  28357. *
  28358. - * CONTEXT:
  28359. - * spin_lock_irq(rq->lock)
  28360. + * This function is called when a worker returns from schedule()
  28361. */
  28362. -void wq_worker_waking_up(struct task_struct *task, int cpu)
  28363. +void wq_worker_running(struct task_struct *task)
  28364. {
  28365. struct worker *worker = kthread_data(task);
  28366. - if (!(worker->flags & WORKER_NOT_RUNNING)) {
  28367. - WARN_ON_ONCE(worker->pool->cpu != cpu);
  28368. + if (!worker->sleeping)
  28369. + return;
  28370. + if (!(worker->flags & WORKER_NOT_RUNNING))
  28371. atomic_inc(&worker->pool->nr_running);
  28372. - }
  28373. + worker->sleeping = 0;
  28374. }
  28375. /**
  28376. * wq_worker_sleeping - a worker is going to sleep
  28377. * @task: task going to sleep
  28378. - * @cpu: CPU in question, must be the current CPU number
  28379. - *
  28380. - * This function is called during schedule() when a busy worker is
  28381. - * going to sleep. Worker on the same cpu can be woken up by
  28382. - * returning pointer to its task.
  28383. - *
  28384. - * CONTEXT:
  28385. - * spin_lock_irq(rq->lock)
  28386. - *
  28387. - * Return:
  28388. - * Worker task on @cpu to wake up, %NULL if none.
  28389. + * This function is called from schedule() when a busy worker is
  28390. + * going to sleep.
  28391. */
  28392. -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
  28393. +void wq_worker_sleeping(struct task_struct *task)
  28394. {
  28395. - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
  28396. + struct worker *worker = kthread_data(task);
  28397. struct worker_pool *pool;
  28398. /*
  28399. @@ -837,29 +864,26 @@
  28400. * checking NOT_RUNNING.
  28401. */
  28402. if (worker->flags & WORKER_NOT_RUNNING)
  28403. - return NULL;
  28404. + return;
  28405. pool = worker->pool;
  28406. - /* this can only happen on the local cpu */
  28407. - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
  28408. - return NULL;
  28409. + if (WARN_ON_ONCE(worker->sleeping))
  28410. + return;
  28411. +
  28412. + worker->sleeping = 1;
  28413. /*
  28414. * The counterpart of the following dec_and_test, implied mb,
  28415. * worklist not empty test sequence is in insert_work().
  28416. * Please read comment there.
  28417. - *
  28418. - * NOT_RUNNING is clear. This means that we're bound to and
  28419. - * running on the local cpu w/ rq lock held and preemption
  28420. - * disabled, which in turn means that none else could be
  28421. - * manipulating idle_list, so dereferencing idle_list without pool
  28422. - * lock is safe.
  28423. */
  28424. if (atomic_dec_and_test(&pool->nr_running) &&
  28425. - !list_empty(&pool->worklist))
  28426. - to_wakeup = first_idle_worker(pool);
  28427. - return to_wakeup ? to_wakeup->task : NULL;
  28428. + !list_empty(&pool->worklist)) {
  28429. + sched_lock_idle_list(pool);
  28430. + wake_up_worker(pool);
  28431. + sched_unlock_idle_list(pool);
  28432. + }
  28433. }
  28434. /**
  28435. @@ -1053,12 +1077,12 @@
  28436. {
  28437. if (pwq) {
  28438. /*
  28439. - * As both pwqs and pools are sched-RCU protected, the
  28440. + * As both pwqs and pools are RCU protected, the
  28441. * following lock operations are safe.
  28442. */
  28443. - spin_lock_irq(&pwq->pool->lock);
  28444. + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
  28445. put_pwq(pwq);
  28446. - spin_unlock_irq(&pwq->pool->lock);
  28447. + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
  28448. }
  28449. }
  28450. @@ -1160,7 +1184,7 @@
  28451. struct worker_pool *pool;
  28452. struct pool_workqueue *pwq;
  28453. - local_irq_save(*flags);
  28454. + local_lock_irqsave(pendingb_lock, *flags);
  28455. /* try to steal the timer if it exists */
  28456. if (is_dwork) {
  28457. @@ -1179,6 +1203,7 @@
  28458. if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
  28459. return 0;
  28460. + rcu_read_lock();
  28461. /*
  28462. * The queueing is in progress, or it is already queued. Try to
  28463. * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
  28464. @@ -1217,14 +1242,16 @@
  28465. set_work_pool_and_keep_pending(work, pool->id);
  28466. spin_unlock(&pool->lock);
  28467. + rcu_read_unlock();
  28468. return 1;
  28469. }
  28470. spin_unlock(&pool->lock);
  28471. fail:
  28472. - local_irq_restore(*flags);
  28473. + rcu_read_unlock();
  28474. + local_unlock_irqrestore(pendingb_lock, *flags);
  28475. if (work_is_canceling(work))
  28476. return -ENOENT;
  28477. - cpu_relax();
  28478. + cpu_chill();
  28479. return -EAGAIN;
  28480. }
  28481. @@ -1293,7 +1320,7 @@
  28482. * queued or lose PENDING. Grabbing PENDING and queueing should
  28483. * happen with IRQ disabled.
  28484. */
  28485. - WARN_ON_ONCE(!irqs_disabled());
  28486. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  28487. debug_work_activate(work);
  28488. @@ -1301,6 +1328,8 @@
  28489. if (unlikely(wq->flags & __WQ_DRAINING) &&
  28490. WARN_ON_ONCE(!is_chained_work(wq)))
  28491. return;
  28492. +
  28493. + rcu_read_lock();
  28494. retry:
  28495. if (req_cpu == WORK_CPU_UNBOUND)
  28496. cpu = raw_smp_processor_id();
  28497. @@ -1357,10 +1386,8 @@
  28498. /* pwq determined, queue */
  28499. trace_workqueue_queue_work(req_cpu, pwq, work);
  28500. - if (WARN_ON(!list_empty(&work->entry))) {
  28501. - spin_unlock(&pwq->pool->lock);
  28502. - return;
  28503. - }
  28504. + if (WARN_ON(!list_empty(&work->entry)))
  28505. + goto out;
  28506. pwq->nr_in_flight[pwq->work_color]++;
  28507. work_flags = work_color_to_flags(pwq->work_color);
  28508. @@ -1376,7 +1403,9 @@
  28509. insert_work(pwq, work, worklist, work_flags);
  28510. +out:
  28511. spin_unlock(&pwq->pool->lock);
  28512. + rcu_read_unlock();
  28513. }
  28514. /**
  28515. @@ -1396,14 +1425,14 @@
  28516. bool ret = false;
  28517. unsigned long flags;
  28518. - local_irq_save(flags);
  28519. + local_lock_irqsave(pendingb_lock,flags);
  28520. if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
  28521. __queue_work(cpu, wq, work);
  28522. ret = true;
  28523. }
  28524. - local_irq_restore(flags);
  28525. + local_unlock_irqrestore(pendingb_lock, flags);
  28526. return ret;
  28527. }
  28528. EXPORT_SYMBOL(queue_work_on);
  28529. @@ -1470,14 +1499,14 @@
  28530. unsigned long flags;
  28531. /* read the comment in __queue_work() */
  28532. - local_irq_save(flags);
  28533. + local_lock_irqsave(pendingb_lock, flags);
  28534. if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
  28535. __queue_delayed_work(cpu, wq, dwork, delay);
  28536. ret = true;
  28537. }
  28538. - local_irq_restore(flags);
  28539. + local_unlock_irqrestore(pendingb_lock, flags);
  28540. return ret;
  28541. }
  28542. EXPORT_SYMBOL(queue_delayed_work_on);
  28543. @@ -1512,7 +1541,7 @@
  28544. if (likely(ret >= 0)) {
  28545. __queue_delayed_work(cpu, wq, dwork, delay);
  28546. - local_irq_restore(flags);
  28547. + local_unlock_irqrestore(pendingb_lock, flags);
  28548. }
  28549. /* -ENOENT from try_to_grab_pending() becomes %true */
  28550. @@ -1545,7 +1574,9 @@
  28551. worker->last_active = jiffies;
  28552. /* idle_list is LIFO */
  28553. + rt_lock_idle_list(pool);
  28554. list_add(&worker->entry, &pool->idle_list);
  28555. + rt_unlock_idle_list(pool);
  28556. if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
  28557. mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
  28558. @@ -1578,7 +1609,9 @@
  28559. return;
  28560. worker_clr_flags(worker, WORKER_IDLE);
  28561. pool->nr_idle--;
  28562. + rt_lock_idle_list(pool);
  28563. list_del_init(&worker->entry);
  28564. + rt_unlock_idle_list(pool);
  28565. }
  28566. static struct worker *alloc_worker(int node)
  28567. @@ -1746,7 +1779,9 @@
  28568. pool->nr_workers--;
  28569. pool->nr_idle--;
  28570. + rt_lock_idle_list(pool);
  28571. list_del_init(&worker->entry);
  28572. + rt_unlock_idle_list(pool);
  28573. worker->flags |= WORKER_DIE;
  28574. wake_up_process(worker->task);
  28575. }
  28576. @@ -2641,14 +2676,14 @@
  28577. might_sleep();
  28578. - local_irq_disable();
  28579. + rcu_read_lock();
  28580. pool = get_work_pool(work);
  28581. if (!pool) {
  28582. - local_irq_enable();
  28583. + rcu_read_unlock();
  28584. return false;
  28585. }
  28586. - spin_lock(&pool->lock);
  28587. + spin_lock_irq(&pool->lock);
  28588. /* see the comment in try_to_grab_pending() with the same code */
  28589. pwq = get_work_pwq(work);
  28590. if (pwq) {
  28591. @@ -2675,10 +2710,11 @@
  28592. else
  28593. lock_map_acquire_read(&pwq->wq->lockdep_map);
  28594. lock_map_release(&pwq->wq->lockdep_map);
  28595. -
  28596. + rcu_read_unlock();
  28597. return true;
  28598. already_gone:
  28599. spin_unlock_irq(&pool->lock);
  28600. + rcu_read_unlock();
  28601. return false;
  28602. }
  28603. @@ -2765,7 +2801,7 @@
  28604. /* tell other tasks trying to grab @work to back off */
  28605. mark_work_canceling(work);
  28606. - local_irq_restore(flags);
  28607. + local_unlock_irqrestore(pendingb_lock, flags);
  28608. flush_work(work);
  28609. clear_work_data(work);
  28610. @@ -2820,10 +2856,10 @@
  28611. */
  28612. bool flush_delayed_work(struct delayed_work *dwork)
  28613. {
  28614. - local_irq_disable();
  28615. + local_lock_irq(pendingb_lock);
  28616. if (del_timer_sync(&dwork->timer))
  28617. __queue_work(dwork->cpu, dwork->wq, &dwork->work);
  28618. - local_irq_enable();
  28619. + local_unlock_irq(pendingb_lock);
  28620. return flush_work(&dwork->work);
  28621. }
  28622. EXPORT_SYMBOL(flush_delayed_work);
  28623. @@ -2858,7 +2894,7 @@
  28624. set_work_pool_and_clear_pending(&dwork->work,
  28625. get_work_pool_id(&dwork->work));
  28626. - local_irq_restore(flags);
  28627. + local_unlock_irqrestore(pendingb_lock, flags);
  28628. return ret;
  28629. }
  28630. EXPORT_SYMBOL(cancel_delayed_work);
  28631. @@ -3044,7 +3080,8 @@
  28632. const char *delim = "";
  28633. int node, written = 0;
  28634. - rcu_read_lock_sched();
  28635. + get_online_cpus();
  28636. + rcu_read_lock();
  28637. for_each_node(node) {
  28638. written += scnprintf(buf + written, PAGE_SIZE - written,
  28639. "%s%d:%d", delim, node,
  28640. @@ -3052,7 +3089,8 @@
  28641. delim = " ";
  28642. }
  28643. written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
  28644. - rcu_read_unlock_sched();
  28645. + rcu_read_unlock();
  28646. + put_online_cpus();
  28647. return written;
  28648. }
  28649. @@ -3420,7 +3458,7 @@
  28650. * put_unbound_pool - put a worker_pool
  28651. * @pool: worker_pool to put
  28652. *
  28653. - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
  28654. + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
  28655. * safe manner. get_unbound_pool() calls this function on its failure path
  28656. * and this function should be able to release pools which went through,
  28657. * successfully or not, init_worker_pool().
  28658. @@ -3474,8 +3512,8 @@
  28659. del_timer_sync(&pool->idle_timer);
  28660. del_timer_sync(&pool->mayday_timer);
  28661. - /* sched-RCU protected to allow dereferences from get_work_pool() */
  28662. - call_rcu_sched(&pool->rcu, rcu_free_pool);
  28663. + /* RCU protected to allow dereferences from get_work_pool() */
  28664. + call_rcu(&pool->rcu, rcu_free_pool);
  28665. }
  28666. /**
  28667. @@ -3580,7 +3618,7 @@
  28668. put_unbound_pool(pool);
  28669. mutex_unlock(&wq_pool_mutex);
  28670. - call_rcu_sched(&pwq->rcu, rcu_free_pwq);
  28671. + call_rcu(&pwq->rcu, rcu_free_pwq);
  28672. /*
  28673. * If we're the last pwq going away, @wq is already dead and no one
  28674. @@ -4292,7 +4330,8 @@
  28675. struct pool_workqueue *pwq;
  28676. bool ret;
  28677. - rcu_read_lock_sched();
  28678. + rcu_read_lock();
  28679. + preempt_disable();
  28680. if (cpu == WORK_CPU_UNBOUND)
  28681. cpu = smp_processor_id();
  28682. @@ -4303,7 +4342,8 @@
  28683. pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
  28684. ret = !list_empty(&pwq->delayed_works);
  28685. - rcu_read_unlock_sched();
  28686. + preempt_enable();
  28687. + rcu_read_unlock();
  28688. return ret;
  28689. }
  28690. @@ -4329,16 +4369,15 @@
  28691. if (work_pending(work))
  28692. ret |= WORK_BUSY_PENDING;
  28693. - local_irq_save(flags);
  28694. + rcu_read_lock();
  28695. pool = get_work_pool(work);
  28696. if (pool) {
  28697. - spin_lock(&pool->lock);
  28698. + spin_lock_irqsave(&pool->lock, flags);
  28699. if (find_worker_executing_work(pool, work))
  28700. ret |= WORK_BUSY_RUNNING;
  28701. - spin_unlock(&pool->lock);
  28702. + spin_unlock_irqrestore(&pool->lock, flags);
  28703. }
  28704. - local_irq_restore(flags);
  28705. -
  28706. + rcu_read_unlock();
  28707. return ret;
  28708. }
  28709. EXPORT_SYMBOL_GPL(work_busy);
  28710. @@ -4767,16 +4806,16 @@
  28711. * nr_active is monotonically decreasing. It's safe
  28712. * to peek without lock.
  28713. */
  28714. - rcu_read_lock_sched();
  28715. + rcu_read_lock();
  28716. for_each_pwq(pwq, wq) {
  28717. WARN_ON_ONCE(pwq->nr_active < 0);
  28718. if (pwq->nr_active) {
  28719. busy = true;
  28720. - rcu_read_unlock_sched();
  28721. + rcu_read_unlock();
  28722. goto out_unlock;
  28723. }
  28724. }
  28725. - rcu_read_unlock_sched();
  28726. + rcu_read_unlock();
  28727. }
  28728. out_unlock:
  28729. mutex_unlock(&wq_pool_mutex);
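
The pendingb_lock conversions above all follow the locallock pattern this patch set introduces via include/linux/locallock.h: local_lock_irqsave() behaves like local_irq_save() on non-RT kernels, but takes a per-CPU sleeping lock on PREEMPT_RT, so the protected section stays preemptible. A minimal usage sketch (example_lock and the function name are illustrative, not taken from the patch):

#include <linux/locallock.h>

static DEFINE_LOCAL_IRQ_LOCK(example_lock);

static void example_update_percpu_state(void)
{
	unsigned long flags;

	/* non-RT: IRQs off on this CPU; RT: per-CPU lock, still preemptible */
	local_lock_irqsave(example_lock, flags);
	/* ... touch per-CPU state that previously relied on local_irq_save() ... */
	local_unlock_irqrestore(example_lock, flags);
}
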
  28730. diff -Nur linux-3.18.14.orig/kernel/workqueue_internal.h linux-3.18.14-rt/kernel/workqueue_internal.h
  28731. --- linux-3.18.14.orig/kernel/workqueue_internal.h 2015-05-20 10:04:50.000000000 -0500
  28732. +++ linux-3.18.14-rt/kernel/workqueue_internal.h 2015-05-31 15:32:49.069635361 -0500
  28733. @@ -43,6 +43,7 @@
  28734. unsigned long last_active; /* L: last active timestamp */
  28735. unsigned int flags; /* X: flags */
  28736. int id; /* I: worker id */
  28737. + int sleeping; /* None */
  28738. /*
  28739. * Opaque string set with work_set_desc(). Printed out with task
  28740. @@ -68,7 +69,7 @@
  28741. * Scheduler hooks for concurrency managed workqueue. Only to be used from
  28742. * sched/core.c and workqueue.c.
  28743. */
  28744. -void wq_worker_waking_up(struct task_struct *task, int cpu);
  28745. -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
  28746. +void wq_worker_running(struct task_struct *task);
  28747. +void wq_worker_sleeping(struct task_struct *task);
  28748. #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
  28749. diff -Nur linux-3.18.14.orig/lib/debugobjects.c linux-3.18.14-rt/lib/debugobjects.c
  28750. --- linux-3.18.14.orig/lib/debugobjects.c 2015-05-20 10:04:50.000000000 -0500
  28751. +++ linux-3.18.14-rt/lib/debugobjects.c 2015-05-31 15:32:49.113635361 -0500
  28752. @@ -309,7 +309,10 @@
  28753. struct debug_obj *obj;
  28754. unsigned long flags;
  28755. - fill_pool();
  28756. +#ifdef CONFIG_PREEMPT_RT_FULL
  28757. + if (preempt_count() == 0 && !irqs_disabled())
  28758. +#endif
  28759. + fill_pool();
  28760. db = get_bucket((unsigned long) addr);
  28761. diff -Nur linux-3.18.14.orig/lib/idr.c linux-3.18.14-rt/lib/idr.c
  28762. --- linux-3.18.14.orig/lib/idr.c 2015-05-20 10:04:50.000000000 -0500
  28763. +++ linux-3.18.14-rt/lib/idr.c 2015-05-31 15:32:49.141635361 -0500
  28764. @@ -31,6 +31,7 @@
  28765. #include <linux/spinlock.h>
  28766. #include <linux/percpu.h>
  28767. #include <linux/hardirq.h>
  28768. +#include <linux/locallock.h>
  28769. #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1)
  28770. #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
  28771. @@ -367,6 +368,35 @@
  28772. idr_mark_full(pa, id);
  28773. }
  28774. +#ifdef CONFIG_PREEMPT_RT_FULL
  28775. +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
  28776. +
  28777. +static inline void idr_preload_lock(void)
  28778. +{
  28779. + local_lock(idr_lock);
  28780. +}
  28781. +
  28782. +static inline void idr_preload_unlock(void)
  28783. +{
  28784. + local_unlock(idr_lock);
  28785. +}
  28786. +
  28787. +void idr_preload_end(void)
  28788. +{
  28789. + idr_preload_unlock();
  28790. +}
  28791. +EXPORT_SYMBOL(idr_preload_end);
  28792. +#else
  28793. +static inline void idr_preload_lock(void)
  28794. +{
  28795. + preempt_disable();
  28796. +}
  28797. +
  28798. +static inline void idr_preload_unlock(void)
  28799. +{
  28800. + preempt_enable();
  28801. +}
  28802. +#endif
  28803. /**
  28804. * idr_preload - preload for idr_alloc()
  28805. @@ -402,7 +432,7 @@
  28806. WARN_ON_ONCE(in_interrupt());
  28807. might_sleep_if(gfp_mask & __GFP_WAIT);
  28808. - preempt_disable();
  28809. + idr_preload_lock();
  28810. /*
  28811. * idr_alloc() is likely to succeed w/o full idr_layer buffer and
  28812. @@ -414,9 +444,9 @@
  28813. while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
  28814. struct idr_layer *new;
  28815. - preempt_enable();
  28816. + idr_preload_unlock();
  28817. new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
  28818. - preempt_disable();
  28819. + idr_preload_lock();
  28820. if (!new)
  28821. break;
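
The reason idr_preload_end() has to become a real unlock on RT is visible in the preload API's intended calling pattern, roughly as documented in lib/idr.c (my_idr, my_lock and ptr are placeholder names):

	idr_preload(GFP_KERNEL);
	spin_lock(&my_lock);

	/* the GFP_NOWAIT allocation is satisfied from the per-CPU preload buffer */
	id = idr_alloc(&my_idr, ptr, 0, 0, GFP_NOWAIT);

	spin_unlock(&my_lock);
	idr_preload_end();

On PREEMPT_RT_FULL the caller's spin_lock() may sleep, so the preload section cannot stay under preempt_disable(); switching it to the idr_lock local lock keeps the same per-CPU exclusion while remaining preemptible.
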
  28822. diff -Nur linux-3.18.14.orig/lib/Kconfig linux-3.18.14-rt/lib/Kconfig
  28823. --- linux-3.18.14.orig/lib/Kconfig 2015-05-20 10:04:50.000000000 -0500
  28824. +++ linux-3.18.14-rt/lib/Kconfig 2015-05-31 15:32:49.085635361 -0500
  28825. @@ -383,6 +383,7 @@
  28826. config CPUMASK_OFFSTACK
  28827. bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
  28828. + depends on !PREEMPT_RT_FULL
  28829. help
  28830. Use dynamic allocation for cpumask_var_t, instead of putting
  28831. them on the stack. This is a bit more expensive, but avoids
  28832. diff -Nur linux-3.18.14.orig/lib/Kconfig.debug linux-3.18.14-rt/lib/Kconfig.debug
  28833. --- linux-3.18.14.orig/lib/Kconfig.debug 2015-05-20 10:04:50.000000000 -0500
  28834. +++ linux-3.18.14-rt/lib/Kconfig.debug 2015-05-31 15:32:49.097635361 -0500
  28835. @@ -639,7 +639,7 @@
  28836. config DEBUG_SHIRQ
  28837. bool "Debug shared IRQ handlers"
  28838. - depends on DEBUG_KERNEL
  28839. + depends on DEBUG_KERNEL && !PREEMPT_RT_BASE
  28840. help
  28841. Enable this to generate a spurious interrupt as soon as a shared
  28842. interrupt handler is registered, and just before one is deregistered.
  28843. diff -Nur linux-3.18.14.orig/lib/locking-selftest.c linux-3.18.14-rt/lib/locking-selftest.c
  28844. --- linux-3.18.14.orig/lib/locking-selftest.c 2015-05-20 10:04:50.000000000 -0500
  28845. +++ linux-3.18.14-rt/lib/locking-selftest.c 2015-05-31 15:32:49.141635361 -0500
  28846. @@ -590,6 +590,8 @@
  28847. #include "locking-selftest-spin-hardirq.h"
  28848. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
  28849. +#ifndef CONFIG_PREEMPT_RT_FULL
  28850. +
  28851. #include "locking-selftest-rlock-hardirq.h"
  28852. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
  28853. @@ -605,9 +607,12 @@
  28854. #include "locking-selftest-wlock-softirq.h"
  28855. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
  28856. +#endif
  28857. +
  28858. #undef E1
  28859. #undef E2
  28860. +#ifndef CONFIG_PREEMPT_RT_FULL
  28861. /*
  28862. * Enabling hardirqs with a softirq-safe lock held:
  28863. */
  28864. @@ -640,6 +645,8 @@
  28865. #undef E1
  28866. #undef E2
  28867. +#endif
  28868. +
  28869. /*
  28870. * Enabling irqs with an irq-safe lock held:
  28871. */
  28872. @@ -663,6 +670,8 @@
  28873. #include "locking-selftest-spin-hardirq.h"
  28874. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
  28875. +#ifndef CONFIG_PREEMPT_RT_FULL
  28876. +
  28877. #include "locking-selftest-rlock-hardirq.h"
  28878. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
  28879. @@ -678,6 +687,8 @@
  28880. #include "locking-selftest-wlock-softirq.h"
  28881. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
  28882. +#endif
  28883. +
  28884. #undef E1
  28885. #undef E2
  28886. @@ -709,6 +720,8 @@
  28887. #include "locking-selftest-spin-hardirq.h"
  28888. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
  28889. +#ifndef CONFIG_PREEMPT_RT_FULL
  28890. +
  28891. #include "locking-selftest-rlock-hardirq.h"
  28892. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
  28893. @@ -724,6 +737,8 @@
  28894. #include "locking-selftest-wlock-softirq.h"
  28895. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
  28896. +#endif
  28897. +
  28898. #undef E1
  28899. #undef E2
  28900. #undef E3
  28901. @@ -757,6 +772,8 @@
  28902. #include "locking-selftest-spin-hardirq.h"
  28903. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
  28904. +#ifndef CONFIG_PREEMPT_RT_FULL
  28905. +
  28906. #include "locking-selftest-rlock-hardirq.h"
  28907. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
  28908. @@ -772,10 +789,14 @@
  28909. #include "locking-selftest-wlock-softirq.h"
  28910. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
  28911. +#endif
  28912. +
  28913. #undef E1
  28914. #undef E2
  28915. #undef E3
  28916. +#ifndef CONFIG_PREEMPT_RT_FULL
  28917. +
  28918. /*
  28919. * read-lock / write-lock irq inversion.
  28920. *
  28921. @@ -838,6 +859,10 @@
  28922. #undef E2
  28923. #undef E3
  28924. +#endif
  28925. +
  28926. +#ifndef CONFIG_PREEMPT_RT_FULL
  28927. +
  28928. /*
  28929. * read-lock / write-lock recursion that is actually safe.
  28930. */
  28931. @@ -876,6 +901,8 @@
  28932. #undef E2
  28933. #undef E3
  28934. +#endif
  28935. +
  28936. /*
  28937. * read-lock / write-lock recursion that is unsafe.
  28938. */
  28939. @@ -1858,6 +1885,7 @@
  28940. printk(" --------------------------------------------------------------------------\n");
  28941. +#ifndef CONFIG_PREEMPT_RT_FULL
  28942. /*
  28943. * irq-context testcases:
  28944. */
  28945. @@ -1870,6 +1898,28 @@
  28946. DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
  28947. // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
  28948. +#else
  28949. + /* On -rt, we only do hardirq context test for raw spinlock */
  28950. + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
  28951. + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
  28952. +
  28953. + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
  28954. + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
  28955. +
  28956. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
  28957. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
  28958. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
  28959. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
  28960. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
  28961. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
  28962. +
  28963. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
  28964. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
  28965. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
  28966. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
  28967. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
  28968. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
  28969. +#endif
  28970. ww_tests();
  28971. diff -Nur linux-3.18.14.orig/lib/percpu_ida.c linux-3.18.14-rt/lib/percpu_ida.c
  28972. --- linux-3.18.14.orig/lib/percpu_ida.c 2015-05-20 10:04:50.000000000 -0500
  28973. +++ linux-3.18.14-rt/lib/percpu_ida.c 2015-05-31 15:32:49.161635360 -0500
  28974. @@ -29,6 +29,9 @@
  28975. #include <linux/string.h>
  28976. #include <linux/spinlock.h>
  28977. #include <linux/percpu_ida.h>
  28978. +#include <linux/locallock.h>
  28979. +
  28980. +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
  28981. struct percpu_ida_cpu {
  28982. /*
  28983. @@ -151,13 +154,13 @@
  28984. unsigned long flags;
  28985. int tag;
  28986. - local_irq_save(flags);
  28987. + local_lock_irqsave(irq_off_lock, flags);
  28988. tags = this_cpu_ptr(pool->tag_cpu);
  28989. /* Fastpath */
  28990. tag = alloc_local_tag(tags);
  28991. if (likely(tag >= 0)) {
  28992. - local_irq_restore(flags);
  28993. + local_unlock_irqrestore(irq_off_lock, flags);
  28994. return tag;
  28995. }
  28996. @@ -176,6 +179,7 @@
  28997. if (!tags->nr_free)
  28998. alloc_global_tags(pool, tags);
  28999. +
  29000. if (!tags->nr_free)
  29001. steal_tags(pool, tags);
  29002. @@ -187,7 +191,7 @@
  29003. }
  29004. spin_unlock(&pool->lock);
  29005. - local_irq_restore(flags);
  29006. + local_unlock_irqrestore(irq_off_lock, flags);
  29007. if (tag >= 0 || state == TASK_RUNNING)
  29008. break;
  29009. @@ -199,7 +203,7 @@
  29010. schedule();
  29011. - local_irq_save(flags);
  29012. + local_lock_irqsave(irq_off_lock, flags);
  29013. tags = this_cpu_ptr(pool->tag_cpu);
  29014. }
  29015. if (state != TASK_RUNNING)
  29016. @@ -224,7 +228,7 @@
  29017. BUG_ON(tag >= pool->nr_tags);
  29018. - local_irq_save(flags);
  29019. + local_lock_irqsave(irq_off_lock, flags);
  29020. tags = this_cpu_ptr(pool->tag_cpu);
  29021. spin_lock(&tags->lock);
  29022. @@ -256,7 +260,7 @@
  29023. spin_unlock(&pool->lock);
  29024. }
  29025. - local_irq_restore(flags);
  29026. + local_unlock_irqrestore(irq_off_lock, flags);
  29027. }
  29028. EXPORT_SYMBOL_GPL(percpu_ida_free);
  29029. @@ -348,7 +352,7 @@
  29030. struct percpu_ida_cpu *remote;
  29031. unsigned cpu, i, err = 0;
  29032. - local_irq_save(flags);
  29033. + local_lock_irqsave(irq_off_lock, flags);
  29034. for_each_possible_cpu(cpu) {
  29035. remote = per_cpu_ptr(pool->tag_cpu, cpu);
  29036. spin_lock(&remote->lock);
  29037. @@ -370,7 +374,7 @@
  29038. }
  29039. spin_unlock(&pool->lock);
  29040. out:
  29041. - local_irq_restore(flags);
  29042. + local_unlock_irqrestore(irq_off_lock, flags);
  29043. return err;
  29044. }
  29045. EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
  29046. diff -Nur linux-3.18.14.orig/lib/radix-tree.c linux-3.18.14-rt/lib/radix-tree.c
  29047. --- linux-3.18.14.orig/lib/radix-tree.c 2015-05-20 10:04:50.000000000 -0500
  29048. +++ linux-3.18.14-rt/lib/radix-tree.c 2015-05-31 15:32:49.161635360 -0500
  29049. @@ -195,12 +195,13 @@
  29050. * succeed in getting a node here (and never reach
  29051. * kmem_cache_alloc)
  29052. */
  29053. - rtp = this_cpu_ptr(&radix_tree_preloads);
  29054. + rtp = &get_cpu_var(radix_tree_preloads);
  29055. if (rtp->nr) {
  29056. ret = rtp->nodes[rtp->nr - 1];
  29057. rtp->nodes[rtp->nr - 1] = NULL;
  29058. rtp->nr--;
  29059. }
  29060. + put_cpu_var(radix_tree_preloads);
  29061. /*
  29062. * Update the allocation stack trace as this is more useful
  29063. * for debugging.
  29064. @@ -240,6 +241,7 @@
  29065. call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
  29066. }
  29067. +#ifndef CONFIG_PREEMPT_RT_FULL
  29068. /*
  29069. * Load up this CPU's radix_tree_node buffer with sufficient objects to
  29070. * ensure that the addition of a single element in the tree cannot fail. On
  29071. @@ -305,6 +307,7 @@
  29072. return 0;
  29073. }
  29074. EXPORT_SYMBOL(radix_tree_maybe_preload);
  29075. +#endif
  29076. /*
  29077. * Return the maximum key which can be store into a
  29078. diff -Nur linux-3.18.14.orig/lib/scatterlist.c linux-3.18.14-rt/lib/scatterlist.c
  29079. --- linux-3.18.14.orig/lib/scatterlist.c 2015-05-20 10:04:50.000000000 -0500
  29080. +++ linux-3.18.14-rt/lib/scatterlist.c 2015-05-31 15:32:49.161635360 -0500
  29081. @@ -592,7 +592,7 @@
  29082. flush_kernel_dcache_page(miter->page);
  29083. if (miter->__flags & SG_MITER_ATOMIC) {
  29084. - WARN_ON_ONCE(preemptible());
  29085. + WARN_ON_ONCE(!pagefault_disabled());
  29086. kunmap_atomic(miter->addr);
  29087. } else
  29088. kunmap(miter->page);
  29089. @@ -637,7 +637,7 @@
  29090. if (!sg_miter_skip(&miter, skip))
  29091. return false;
  29092. - local_irq_save(flags);
  29093. + local_irq_save_nort(flags);
  29094. while (sg_miter_next(&miter) && offset < buflen) {
  29095. unsigned int len;
  29096. @@ -654,7 +654,7 @@
  29097. sg_miter_stop(&miter);
  29098. - local_irq_restore(flags);
  29099. + local_irq_restore_nort(flags);
  29100. return offset;
  29101. }
  29102. diff -Nur linux-3.18.14.orig/lib/smp_processor_id.c linux-3.18.14-rt/lib/smp_processor_id.c
  29103. --- linux-3.18.14.orig/lib/smp_processor_id.c 2015-05-20 10:04:50.000000000 -0500
  29104. +++ linux-3.18.14-rt/lib/smp_processor_id.c 2015-05-31 15:32:49.161635360 -0500
  29105. @@ -39,8 +39,9 @@
  29106. if (!printk_ratelimit())
  29107. goto out_enable;
  29108. - printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
  29109. - what1, what2, preempt_count() - 1, current->comm, current->pid);
  29110. + printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
  29111. + what1, what2, preempt_count() - 1, __migrate_disabled(current),
  29112. + current->comm, current->pid);
  29113. print_symbol("caller is %s\n", (long)__builtin_return_address(0));
  29114. dump_stack();
  29115. diff -Nur linux-3.18.14.orig/mm/filemap.c linux-3.18.14-rt/mm/filemap.c
  29116. --- linux-3.18.14.orig/mm/filemap.c 2015-05-20 10:04:50.000000000 -0500
  29117. +++ linux-3.18.14-rt/mm/filemap.c 2015-05-31 15:32:49.181635360 -0500
  29118. @@ -168,7 +168,9 @@
  29119. if (!workingset_node_pages(node) &&
  29120. list_empty(&node->private_list)) {
  29121. node->private_data = mapping;
  29122. - list_lru_add(&workingset_shadow_nodes, &node->private_list);
  29123. + local_lock(workingset_shadow_lock);
  29124. + list_lru_add(&__workingset_shadow_nodes, &node->private_list);
  29125. + local_unlock(workingset_shadow_lock);
  29126. }
  29127. }
  29128. @@ -535,9 +537,12 @@
  29129. * node->private_list is protected by
  29130. * mapping->tree_lock.
  29131. */
  29132. - if (!list_empty(&node->private_list))
  29133. - list_lru_del(&workingset_shadow_nodes,
  29134. + if (!list_empty(&node->private_list)) {
  29135. + local_lock(workingset_shadow_lock);
  29136. + list_lru_del(&__workingset_shadow_nodes,
  29137. &node->private_list);
  29138. + local_unlock(workingset_shadow_lock);
  29139. + }
  29140. }
  29141. return 0;
  29142. }
  29143. diff -Nur linux-3.18.14.orig/mm/highmem.c linux-3.18.14-rt/mm/highmem.c
  29144. --- linux-3.18.14.orig/mm/highmem.c 2015-05-20 10:04:50.000000000 -0500
  29145. +++ linux-3.18.14-rt/mm/highmem.c 2015-05-31 15:32:49.201635360 -0500
  29146. @@ -29,10 +29,11 @@
  29147. #include <linux/kgdb.h>
  29148. #include <asm/tlbflush.h>
  29149. -
  29150. +#ifndef CONFIG_PREEMPT_RT_FULL
  29151. #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
  29152. DEFINE_PER_CPU(int, __kmap_atomic_idx);
  29153. #endif
  29154. +#endif
  29155. /*
  29156. * Virtual_count is not a pure "count".
  29157. @@ -107,8 +108,9 @@
  29158. unsigned long totalhigh_pages __read_mostly;
  29159. EXPORT_SYMBOL(totalhigh_pages);
  29160. -
  29161. +#ifndef CONFIG_PREEMPT_RT_FULL
  29162. EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
  29163. +#endif
  29164. unsigned int nr_free_highpages (void)
  29165. {
  29166. diff -Nur linux-3.18.14.orig/mm/Kconfig linux-3.18.14-rt/mm/Kconfig
  29167. --- linux-3.18.14.orig/mm/Kconfig 2015-05-20 10:04:50.000000000 -0500
  29168. +++ linux-3.18.14-rt/mm/Kconfig 2015-05-31 15:32:49.177635360 -0500
  29169. @@ -408,7 +408,7 @@
  29170. config TRANSPARENT_HUGEPAGE
  29171. bool "Transparent Hugepage Support"
  29172. - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
  29173. + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
  29174. select COMPACTION
  29175. help
  29176. Transparent Hugepages allows the kernel to use huge pages and
  29177. diff -Nur linux-3.18.14.orig/mm/memcontrol.c linux-3.18.14-rt/mm/memcontrol.c
  29178. --- linux-3.18.14.orig/mm/memcontrol.c 2015-05-20 10:04:50.000000000 -0500
  29179. +++ linux-3.18.14-rt/mm/memcontrol.c 2015-05-31 15:32:49.213635360 -0500
  29180. @@ -60,6 +60,8 @@
  29181. #include <net/sock.h>
  29182. #include <net/ip.h>
  29183. #include <net/tcp_memcontrol.h>
  29184. +#include <linux/locallock.h>
  29185. +
  29186. #include "slab.h"
  29187. #include <asm/uaccess.h>
  29188. @@ -87,6 +89,7 @@
  29189. #define do_swap_account 0
  29190. #endif
  29191. +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
  29192. static const char * const mem_cgroup_stat_names[] = {
  29193. "cache",
  29194. @@ -2376,14 +2379,17 @@
  29195. */
  29196. static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
  29197. {
  29198. - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
  29199. + struct memcg_stock_pcp *stock;
  29200. + int cpu = get_cpu_light();
  29201. +
  29202. + stock = &per_cpu(memcg_stock, cpu);
  29203. if (stock->cached != memcg) { /* reset if necessary */
  29204. drain_stock(stock);
  29205. stock->cached = memcg;
  29206. }
  29207. stock->nr_pages += nr_pages;
  29208. - put_cpu_var(memcg_stock);
  29209. + put_cpu_light();
  29210. }
  29211. /*
  29212. @@ -2397,7 +2403,7 @@
  29213. /* Notify other cpus that system-wide "drain" is running */
  29214. get_online_cpus();
  29215. - curcpu = get_cpu();
  29216. + curcpu = get_cpu_light();
  29217. for_each_online_cpu(cpu) {
  29218. struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
  29219. struct mem_cgroup *memcg;
  29220. @@ -2414,7 +2420,7 @@
  29221. schedule_work_on(cpu, &stock->work);
  29222. }
  29223. }
  29224. - put_cpu();
  29225. + put_cpu_light();
  29226. if (!sync)
  29227. goto out;
  29228. @@ -3419,12 +3425,12 @@
  29229. move_unlock_mem_cgroup(from, &flags);
  29230. ret = 0;
  29231. - local_irq_disable();
  29232. + local_lock_irq(event_lock);
  29233. mem_cgroup_charge_statistics(to, page, nr_pages);
  29234. memcg_check_events(to, page);
  29235. mem_cgroup_charge_statistics(from, page, -nr_pages);
  29236. memcg_check_events(from, page);
  29237. - local_irq_enable();
  29238. + local_unlock_irq(event_lock);
  29239. out_unlock:
  29240. unlock_page(page);
  29241. out:
  29242. @@ -6406,10 +6412,10 @@
  29243. VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  29244. }
  29245. - local_irq_disable();
  29246. + local_lock_irq(event_lock);
  29247. mem_cgroup_charge_statistics(memcg, page, nr_pages);
  29248. memcg_check_events(memcg, page);
  29249. - local_irq_enable();
  29250. + local_unlock_irq(event_lock);
  29251. if (do_swap_account && PageSwapCache(page)) {
  29252. swp_entry_t entry = { .val = page_private(page) };
  29253. @@ -6468,14 +6474,14 @@
  29254. memcg_oom_recover(memcg);
  29255. }
  29256. - local_irq_save(flags);
  29257. + local_lock_irqsave(event_lock, flags);
  29258. __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
  29259. __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
  29260. __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
  29261. __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
  29262. __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
  29263. memcg_check_events(memcg, dummy_page);
  29264. - local_irq_restore(flags);
  29265. + local_unlock_irqrestore(event_lock, flags);
  29266. }
  29267. static void uncharge_list(struct list_head *page_list)
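
get_cpu_light()/put_cpu_light(), used in refill_stock() and the drain path above, are RT-tree helpers; the sketch below shows the assumed mapping (treat the exact definitions as an assumption, they are not part of these hunks). On RT the task is only pinned to its CPU via migrate_disable() instead of having preemption disabled outright.

#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()	({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()	migrate_enable()
#else
# define get_cpu_light()	get_cpu()
# define put_cpu_light()	put_cpu()
#endif
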
  29268. diff -Nur linux-3.18.14.orig/mm/memory.c linux-3.18.14-rt/mm/memory.c
  29269. --- linux-3.18.14.orig/mm/memory.c 2015-05-20 10:04:50.000000000 -0500
  29270. +++ linux-3.18.14-rt/mm/memory.c 2015-05-31 15:32:49.229635360 -0500
  29271. @@ -3244,6 +3244,32 @@
  29272. return 0;
  29273. }
  29274. +#ifdef CONFIG_PREEMPT_RT_FULL
  29275. +void pagefault_disable(void)
  29276. +{
  29277. + migrate_disable();
  29278. + current->pagefault_disabled++;
  29279. + /*
  29280. + * make sure to have issued the store before a pagefault
  29281. + * can hit.
  29282. + */
  29283. + barrier();
  29284. +}
  29285. +EXPORT_SYMBOL(pagefault_disable);
  29286. +
  29287. +void pagefault_enable(void)
  29288. +{
  29289. + /*
  29290. + * make sure to issue those last loads/stores before enabling
  29291. + * the pagefault handler again.
  29292. + */
  29293. + barrier();
  29294. + current->pagefault_disabled--;
  29295. + migrate_enable();
  29296. +}
  29297. +EXPORT_SYMBOL(pagefault_enable);
  29298. +#endif
  29299. +
  29300. /*
  29301. * By the time we get here, we already hold the mm semaphore
  29302. *
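
The RT variants of pagefault_disable()/pagefault_enable() added above keep the usual calling convention of the atomic user-access helpers. A generic caller sketch, with a hypothetical wrapper name and placeholder parameters:

#include <linux/uaccess.h>

static unsigned long example_copy_from_user_atomic(void *dst,
						    const void __user *src,
						    size_t len)
{
	unsigned long ret;

	pagefault_disable();
	/* does not sleep; returns the number of bytes left uncopied on fault */
	ret = __copy_from_user_inatomic(dst, src, len);
	pagefault_enable();

	return ret;
}

Only the bookkeeping changes: instead of bumping preempt_count(), the RT version counts in current->pagefault_disabled and uses migrate_disable(), so the section stays preemptible while the fault handler can still detect it.
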
  29303. diff -Nur linux-3.18.14.orig/mm/mmu_context.c linux-3.18.14-rt/mm/mmu_context.c
  29304. --- linux-3.18.14.orig/mm/mmu_context.c 2015-05-20 10:04:50.000000000 -0500
  29305. +++ linux-3.18.14-rt/mm/mmu_context.c 2015-05-31 15:32:49.249635360 -0500
  29306. @@ -23,6 +23,7 @@
  29307. struct task_struct *tsk = current;
  29308. task_lock(tsk);
  29309. + preempt_disable_rt();
  29310. active_mm = tsk->active_mm;
  29311. if (active_mm != mm) {
  29312. atomic_inc(&mm->mm_count);
  29313. @@ -30,6 +31,7 @@
  29314. }
  29315. tsk->mm = mm;
  29316. switch_mm(active_mm, mm, tsk);
  29317. + preempt_enable_rt();
  29318. task_unlock(tsk);
  29319. #ifdef finish_arch_post_lock_switch
  29320. finish_arch_post_lock_switch();
  29321. diff -Nur linux-3.18.14.orig/mm/page_alloc.c linux-3.18.14-rt/mm/page_alloc.c
  29322. --- linux-3.18.14.orig/mm/page_alloc.c 2015-05-20 10:04:50.000000000 -0500
  29323. +++ linux-3.18.14-rt/mm/page_alloc.c 2015-05-31 15:32:49.253635359 -0500
  29324. @@ -59,6 +59,7 @@
  29325. #include <linux/page-debug-flags.h>
  29326. #include <linux/hugetlb.h>
  29327. #include <linux/sched/rt.h>
  29328. +#include <linux/locallock.h>
  29329. #include <asm/sections.h>
  29330. #include <asm/tlbflush.h>
  29331. @@ -230,6 +231,18 @@
  29332. EXPORT_SYMBOL(nr_online_nodes);
  29333. #endif
  29334. +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
  29335. +
  29336. +#ifdef CONFIG_PREEMPT_RT_BASE
  29337. +# define cpu_lock_irqsave(cpu, flags) \
  29338. + local_lock_irqsave_on(pa_lock, flags, cpu)
  29339. +# define cpu_unlock_irqrestore(cpu, flags) \
  29340. + local_unlock_irqrestore_on(pa_lock, flags, cpu)
  29341. +#else
  29342. +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
  29343. +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
  29344. +#endif
  29345. +
  29346. int page_group_by_mobility_disabled __read_mostly;
  29347. void set_pageblock_migratetype(struct page *page, int migratetype)
  29348. @@ -654,7 +667,7 @@
  29349. }
  29350. /*
  29351. - * Frees a number of pages from the PCP lists
  29352. + * Frees a number of pages which have been collected from the pcp lists.
  29353. * Assumes all pages on list are in same zone, and of same order.
  29354. * count is the number of pages to free.
  29355. *
  29356. @@ -665,18 +678,51 @@
  29357. * pinned" detection logic.
  29358. */
  29359. static void free_pcppages_bulk(struct zone *zone, int count,
  29360. - struct per_cpu_pages *pcp)
  29361. + struct list_head *list)
  29362. {
  29363. - int migratetype = 0;
  29364. - int batch_free = 0;
  29365. int to_free = count;
  29366. unsigned long nr_scanned;
  29367. + unsigned long flags;
  29368. +
  29369. + spin_lock_irqsave(&zone->lock, flags);
  29370. - spin_lock(&zone->lock);
  29371. nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
  29372. if (nr_scanned)
  29373. __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  29374. + while (!list_empty(list)) {
  29375. + struct page *page = list_first_entry(list, struct page, lru);
  29376. + int mt; /* migratetype of the to-be-freed page */
  29377. +
  29378. + /* must delete as __free_one_page list manipulates */
  29379. + list_del(&page->lru);
  29380. +
  29381. + mt = get_freepage_migratetype(page);
  29382. + if (unlikely(has_isolate_pageblock(zone)))
  29383. + mt = get_pageblock_migratetype(page);
  29384. +
  29385. + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  29386. + __free_one_page(page, page_to_pfn(page), zone, 0, mt);
  29387. + trace_mm_page_pcpu_drain(page, 0, mt);
  29388. + to_free--;
  29389. + }
  29390. + WARN_ON(to_free != 0);
  29391. + spin_unlock_irqrestore(&zone->lock, flags);
  29392. +}
  29393. +
  29394. +/*
  29395. + * Moves a number of pages from the PCP lists to free list which
  29396. + * is freed outside of the locked region.
  29397. + *
  29398. + * Assumes all pages on list are in same zone, and of same order.
  29399. + * count is the number of pages to free.
  29400. + */
  29401. +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
  29402. + struct list_head *dst)
  29403. +{
  29404. + int migratetype = 0;
  29405. + int batch_free = 0;
  29406. +
  29407. while (to_free) {
  29408. struct page *page;
  29409. struct list_head *list;
  29410. @@ -692,7 +738,7 @@
  29411. batch_free++;
  29412. if (++migratetype == MIGRATE_PCPTYPES)
  29413. migratetype = 0;
  29414. - list = &pcp->lists[migratetype];
  29415. + list = &src->lists[migratetype];
  29416. } while (list_empty(list));
  29417. /* This is the only non-empty list. Free them all. */
  29418. @@ -700,21 +746,11 @@
  29419. batch_free = to_free;
  29420. do {
  29421. - int mt; /* migratetype of the to-be-freed page */
  29422. -
  29423. - page = list_entry(list->prev, struct page, lru);
  29424. - /* must delete as __free_one_page list manipulates */
  29425. + page = list_last_entry(list, struct page, lru);
  29426. list_del(&page->lru);
  29427. - mt = get_freepage_migratetype(page);
  29428. - if (unlikely(has_isolate_pageblock(zone)))
  29429. - mt = get_pageblock_migratetype(page);
  29430. -
  29431. - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  29432. - __free_one_page(page, page_to_pfn(page), zone, 0, mt);
  29433. - trace_mm_page_pcpu_drain(page, 0, mt);
  29434. + list_add(&page->lru, dst);
  29435. } while (--to_free && --batch_free && !list_empty(list));
  29436. }
  29437. - spin_unlock(&zone->lock);
  29438. }
  29439. static void free_one_page(struct zone *zone,
  29440. @@ -723,7 +759,9 @@
  29441. int migratetype)
  29442. {
  29443. unsigned long nr_scanned;
  29444. - spin_lock(&zone->lock);
  29445. + unsigned long flags;
  29446. +
  29447. + spin_lock_irqsave(&zone->lock, flags);
  29448. nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
  29449. if (nr_scanned)
  29450. __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  29451. @@ -733,7 +771,7 @@
  29452. migratetype = get_pfnblock_migratetype(page, pfn);
  29453. }
  29454. __free_one_page(page, pfn, zone, order, migratetype);
  29455. - spin_unlock(&zone->lock);
  29456. + spin_unlock_irqrestore(&zone->lock, flags);
  29457. }
  29458. static bool free_pages_prepare(struct page *page, unsigned int order)
  29459. @@ -773,11 +811,11 @@
  29460. return;
  29461. migratetype = get_pfnblock_migratetype(page, pfn);
  29462. - local_irq_save(flags);
  29463. + local_lock_irqsave(pa_lock, flags);
  29464. __count_vm_events(PGFREE, 1 << order);
  29465. set_freepage_migratetype(page, migratetype);
  29466. free_one_page(page_zone(page), page, pfn, order, migratetype);
  29467. - local_irq_restore(flags);
  29468. + local_unlock_irqrestore(pa_lock, flags);
  29469. }
  29470. void __init __free_pages_bootmem(struct page *page, unsigned int order)
  29471. @@ -1251,16 +1289,18 @@
  29472. void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  29473. {
  29474. unsigned long flags;
  29475. + LIST_HEAD(dst);
  29476. int to_drain, batch;
  29477. - local_irq_save(flags);
  29478. + local_lock_irqsave(pa_lock, flags);
  29479. batch = ACCESS_ONCE(pcp->batch);
  29480. to_drain = min(pcp->count, batch);
  29481. if (to_drain > 0) {
  29482. - free_pcppages_bulk(zone, to_drain, pcp);
  29483. + isolate_pcp_pages(to_drain, pcp, &dst);
  29484. pcp->count -= to_drain;
  29485. }
  29486. - local_irq_restore(flags);
  29487. + local_unlock_irqrestore(pa_lock, flags);
  29488. + free_pcppages_bulk(zone, to_drain, &dst);
  29489. }
  29490. #endif
  29491. @@ -1279,16 +1319,21 @@
  29492. for_each_populated_zone(zone) {
  29493. struct per_cpu_pageset *pset;
  29494. struct per_cpu_pages *pcp;
  29495. + LIST_HEAD(dst);
  29496. + int count;
  29497. - local_irq_save(flags);
  29498. + cpu_lock_irqsave(cpu, flags);
  29499. pset = per_cpu_ptr(zone->pageset, cpu);
  29500. pcp = &pset->pcp;
  29501. - if (pcp->count) {
  29502. - free_pcppages_bulk(zone, pcp->count, pcp);
  29503. + count = pcp->count;
  29504. + if (count) {
  29505. + isolate_pcp_pages(count, pcp, &dst);
  29506. pcp->count = 0;
  29507. }
  29508. - local_irq_restore(flags);
  29509. + cpu_unlock_irqrestore(cpu, flags);
  29510. + if (count)
  29511. + free_pcppages_bulk(zone, count, &dst);
  29512. }
  29513. }
  29514. @@ -1341,7 +1386,12 @@
  29515. else
  29516. cpumask_clear_cpu(cpu, &cpus_with_pcps);
  29517. }
  29518. +#ifndef CONFIG_PREEMPT_RT_BASE
  29519. on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
  29520. +#else
  29521. + for_each_cpu(cpu, &cpus_with_pcps)
  29522. + drain_pages(cpu);
  29523. +#endif
  29524. }
  29525. #ifdef CONFIG_HIBERNATION
  29526. @@ -1397,7 +1447,7 @@
  29527. migratetype = get_pfnblock_migratetype(page, pfn);
  29528. set_freepage_migratetype(page, migratetype);
  29529. - local_irq_save(flags);
  29530. + local_lock_irqsave(pa_lock, flags);
  29531. __count_vm_event(PGFREE);
  29532. /*
  29533. @@ -1423,12 +1473,17 @@
  29534. pcp->count++;
  29535. if (pcp->count >= pcp->high) {
  29536. unsigned long batch = ACCESS_ONCE(pcp->batch);
  29537. - free_pcppages_bulk(zone, batch, pcp);
  29538. + LIST_HEAD(dst);
  29539. +
  29540. + isolate_pcp_pages(batch, pcp, &dst);
  29541. pcp->count -= batch;
  29542. + local_unlock_irqrestore(pa_lock, flags);
  29543. + free_pcppages_bulk(zone, batch, &dst);
  29544. + return;
  29545. }
  29546. out:
  29547. - local_irq_restore(flags);
  29548. + local_unlock_irqrestore(pa_lock, flags);
  29549. }
  29550. /*
  29551. @@ -1558,7 +1613,7 @@
  29552. struct per_cpu_pages *pcp;
  29553. struct list_head *list;
  29554. - local_irq_save(flags);
  29555. + local_lock_irqsave(pa_lock, flags);
  29556. pcp = &this_cpu_ptr(zone->pageset)->pcp;
  29557. list = &pcp->lists[migratetype];
  29558. if (list_empty(list)) {
  29559. @@ -1590,13 +1645,15 @@
  29560. */
  29561. WARN_ON_ONCE(order > 1);
  29562. }
  29563. - spin_lock_irqsave(&zone->lock, flags);
  29564. + local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
  29565. page = __rmqueue(zone, order, migratetype);
  29566. - spin_unlock(&zone->lock);
  29567. - if (!page)
  29568. + if (!page) {
  29569. + spin_unlock(&zone->lock);
  29570. goto failed;
  29571. + }
  29572. __mod_zone_freepage_state(zone, -(1 << order),
  29573. get_freepage_migratetype(page));
  29574. + spin_unlock(&zone->lock);
  29575. }
  29576. __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
  29577. @@ -1606,7 +1663,7 @@
  29578. __count_zone_vm_events(PGALLOC, zone, 1 << order);
  29579. zone_statistics(preferred_zone, zone, gfp_flags);
  29580. - local_irq_restore(flags);
  29581. + local_unlock_irqrestore(pa_lock, flags);
  29582. VM_BUG_ON_PAGE(bad_range(zone, page), page);
  29583. if (prep_new_page(page, order, gfp_flags))
  29584. @@ -1614,7 +1671,7 @@
  29585. return page;
  29586. failed:
  29587. - local_irq_restore(flags);
  29588. + local_unlock_irqrestore(pa_lock, flags);
  29589. return NULL;
  29590. }
  29591. @@ -2325,8 +2382,8 @@
  29592. count_vm_event(COMPACTSTALL);
  29593. /* Page migration frees to the PCP lists but we want merging */
  29594. - drain_pages(get_cpu());
  29595. - put_cpu();
  29596. + drain_pages(get_cpu_light());
  29597. + put_cpu_light();
  29598. page = get_page_from_freelist(gfp_mask, nodemask,
  29599. order, zonelist, high_zoneidx,
  29600. @@ -5565,6 +5622,7 @@
  29601. void __init page_alloc_init(void)
  29602. {
  29603. hotcpu_notifier(page_alloc_cpu_notify, 0);
  29604. + local_irq_lock_init(pa_lock);
  29605. }
  29606. /*
  29607. @@ -6459,7 +6517,7 @@
  29608. struct per_cpu_pageset *pset;
  29609. /* avoid races with drain_pages() */
  29610. - local_irq_save(flags);
  29611. + local_lock_irqsave(pa_lock, flags);
  29612. if (zone->pageset != &boot_pageset) {
  29613. for_each_online_cpu(cpu) {
  29614. pset = per_cpu_ptr(zone->pageset, cpu);
  29615. @@ -6468,7 +6526,7 @@
  29616. free_percpu(zone->pageset);
  29617. zone->pageset = &boot_pageset;
  29618. }
  29619. - local_irq_restore(flags);
  29620. + local_unlock_irqrestore(pa_lock, flags);
  29621. }
  29622. #ifdef CONFIG_MEMORY_HOTREMOVE
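[Editor's note] The page_alloc.c hunks above all follow one shape: the bare local_irq_save()/local_irq_restore() around the per-CPU pagesets becomes a named local lock (pa_lock), and the potentially long free_pcppages_bulk() walk is pushed outside the locked region by first isolating the pages onto a private list. A minimal sketch of that isolate-then-free shape follows. It is illustrative only, not additional patch content: drain_one_zone() and my_lock are made-up names, isolate_pcp_pages() and the list-taking free_pcppages_bulk() are the static helpers this patch introduces in mm/page_alloc.c, and building it assumes an RT-patched tree that provides <linux/locallock.h>.

/* Sketch: drain a per-CPU page list without holding the lock
 * across the expensive bulk free.
 */
#include <linux/list.h>
#include <linux/locallock.h>
#include <linux/mmzone.h>

static DEFINE_LOCAL_IRQ_LOCK(my_lock);		/* plays the role of pa_lock */

static void drain_one_zone(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	LIST_HEAD(dst);				/* private list, only we touch it */
	int count;

	local_lock_irqsave(my_lock, flags);	/* irq-off on !RT, per-CPU rtmutex on RT */
	count = pcp->count;
	if (count) {
		isolate_pcp_pages(count, pcp, &dst);	/* detach pages under the lock */
		pcp->count = 0;
	}
	local_unlock_irqrestore(my_lock, flags);

	if (count)
		free_pcppages_bulk(zone, count, &dst);	/* long loop, now preemptible */
}

The split is purely about latency: once the pages sit on a private list nobody else can see, the bulk free no longer needs to run with interrupts disabled or the local lock held.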
  29623. diff -Nur linux-3.18.14.orig/mm/slab.h linux-3.18.14-rt/mm/slab.h
  29624. --- linux-3.18.14.orig/mm/slab.h 2015-05-20 10:04:50.000000000 -0500
  29625. +++ linux-3.18.14-rt/mm/slab.h 2015-05-31 15:32:49.257635359 -0500
  29626. @@ -315,7 +315,11 @@
  29627. * The slab lists for all objects.
  29628. */
  29629. struct kmem_cache_node {
  29630. +#ifdef CONFIG_SLUB
  29631. + raw_spinlock_t list_lock;
  29632. +#else
  29633. spinlock_t list_lock;
  29634. +#endif
  29635. #ifdef CONFIG_SLAB
  29636. struct list_head slabs_partial; /* partial list first, better asm code */
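[Editor's note] The slab.h change above picks the lock type per allocator: under CONFIG_SLUB the per-node list_lock becomes a raw_spinlock_t, because SLUB takes it from contexts that must not sleep and its hold times are short and bounded, while a plain spinlock_t would turn into a sleeping rtmutex on PREEMPT_RT. The fragment below only restates that declaration pattern with an invented example_node type; it is not extra patch content.

#include <linux/list.h>
#include <linux/spinlock.h>

struct example_node {
#ifdef CONFIG_SLUB
	raw_spinlock_t	list_lock;	/* keeps spinning even on PREEMPT_RT */
#else
	spinlock_t	list_lock;	/* becomes an rtmutex on PREEMPT_RT */
#endif
	struct list_head partial;
};

static void example_node_init(struct example_node *n)
{
#ifdef CONFIG_SLUB
	raw_spin_lock_init(&n->list_lock);
#else
	spin_lock_init(&n->list_lock);
#endif
	INIT_LIST_HEAD(&n->partial);
}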
  29637. diff -Nur linux-3.18.14.orig/mm/slub.c linux-3.18.14-rt/mm/slub.c
  29638. --- linux-3.18.14.orig/mm/slub.c 2015-05-20 10:04:50.000000000 -0500
  29639. +++ linux-3.18.14-rt/mm/slub.c 2015-05-31 15:32:49.257635359 -0500
  29640. @@ -1044,7 +1044,7 @@
  29641. {
  29642. struct kmem_cache_node *n = get_node(s, page_to_nid(page));
  29643. - spin_lock_irqsave(&n->list_lock, *flags);
  29644. + raw_spin_lock_irqsave(&n->list_lock, *flags);
  29645. slab_lock(page);
  29646. if (!check_slab(s, page))
  29647. @@ -1091,7 +1091,7 @@
  29648. fail:
  29649. slab_unlock(page);
  29650. - spin_unlock_irqrestore(&n->list_lock, *flags);
  29651. + raw_spin_unlock_irqrestore(&n->list_lock, *flags);
  29652. slab_fix(s, "Object at 0x%p not freed", object);
  29653. return NULL;
  29654. }
  29655. @@ -1219,6 +1219,12 @@
  29656. #endif /* CONFIG_SLUB_DEBUG */
  29657. +struct slub_free_list {
  29658. + raw_spinlock_t lock;
  29659. + struct list_head list;
  29660. +};
  29661. +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
  29662. +
  29663. /*
  29664. * Hooks for other subsystems that check memory allocations. In a typical
  29665. * production configuration these hooks all should produce no code at all.
  29666. @@ -1303,10 +1309,15 @@
  29667. struct page *page;
  29668. struct kmem_cache_order_objects oo = s->oo;
  29669. gfp_t alloc_gfp;
  29670. + bool enableirqs;
  29671. flags &= gfp_allowed_mask;
  29672. - if (flags & __GFP_WAIT)
  29673. + enableirqs = (flags & __GFP_WAIT) != 0;
  29674. +#ifdef CONFIG_PREEMPT_RT_FULL
  29675. + enableirqs |= system_state == SYSTEM_RUNNING;
  29676. +#endif
  29677. + if (enableirqs)
  29678. local_irq_enable();
  29679. flags |= s->allocflags;
  29680. @@ -1347,7 +1358,7 @@
  29681. kmemcheck_mark_unallocated_pages(page, pages);
  29682. }
  29683. - if (flags & __GFP_WAIT)
  29684. + if (enableirqs)
  29685. local_irq_disable();
  29686. if (!page)
  29687. return NULL;
  29688. @@ -1365,8 +1376,10 @@
  29689. void *object)
  29690. {
  29691. setup_object_debug(s, page, object);
  29692. +#ifndef CONFIG_PREEMPT_RT_FULL
  29693. if (unlikely(s->ctor))
  29694. s->ctor(object);
  29695. +#endif
  29696. }
  29697. static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
  29698. @@ -1442,6 +1455,16 @@
  29699. memcg_uncharge_slab(s, order);
  29700. }
  29701. +static void free_delayed(struct list_head *h)
  29702. +{
29703. + while (!list_empty(h)) {
  29704. + struct page *page = list_first_entry(h, struct page, lru);
  29705. +
  29706. + list_del(&page->lru);
  29707. + __free_slab(page->slab_cache, page);
  29708. + }
  29709. +}
  29710. +
  29711. #define need_reserve_slab_rcu \
  29712. (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
  29713. @@ -1476,6 +1499,12 @@
  29714. }
  29715. call_rcu(head, rcu_free_slab);
  29716. + } else if (irqs_disabled()) {
  29717. + struct slub_free_list *f = &__get_cpu_var(slub_free_list);
  29718. +
  29719. + raw_spin_lock(&f->lock);
  29720. + list_add(&page->lru, &f->list);
  29721. + raw_spin_unlock(&f->lock);
  29722. } else
  29723. __free_slab(s, page);
  29724. }
  29725. @@ -1589,7 +1618,7 @@
  29726. if (!n || !n->nr_partial)
  29727. return NULL;
  29728. - spin_lock(&n->list_lock);
  29729. + raw_spin_lock(&n->list_lock);
  29730. list_for_each_entry_safe(page, page2, &n->partial, lru) {
  29731. void *t;
  29732. @@ -1614,7 +1643,7 @@
  29733. break;
  29734. }
  29735. - spin_unlock(&n->list_lock);
  29736. + raw_spin_unlock(&n->list_lock);
  29737. return object;
  29738. }
  29739. @@ -1860,7 +1889,7 @@
  29740. * that acquire_slab() will see a slab page that
  29741. * is frozen
  29742. */
  29743. - spin_lock(&n->list_lock);
  29744. + raw_spin_lock(&n->list_lock);
  29745. }
  29746. } else {
  29747. m = M_FULL;
  29748. @@ -1871,7 +1900,7 @@
  29749. * slabs from diagnostic functions will not see
  29750. * any frozen slabs.
  29751. */
  29752. - spin_lock(&n->list_lock);
  29753. + raw_spin_lock(&n->list_lock);
  29754. }
  29755. }
  29756. @@ -1906,7 +1935,7 @@
  29757. goto redo;
  29758. if (lock)
  29759. - spin_unlock(&n->list_lock);
  29760. + raw_spin_unlock(&n->list_lock);
  29761. if (m == M_FREE) {
  29762. stat(s, DEACTIVATE_EMPTY);
  29763. @@ -1938,10 +1967,10 @@
  29764. n2 = get_node(s, page_to_nid(page));
  29765. if (n != n2) {
  29766. if (n)
  29767. - spin_unlock(&n->list_lock);
  29768. + raw_spin_unlock(&n->list_lock);
  29769. n = n2;
  29770. - spin_lock(&n->list_lock);
  29771. + raw_spin_lock(&n->list_lock);
  29772. }
  29773. do {
  29774. @@ -1970,7 +1999,7 @@
  29775. }
  29776. if (n)
  29777. - spin_unlock(&n->list_lock);
  29778. + raw_spin_unlock(&n->list_lock);
  29779. while (discard_page) {
  29780. page = discard_page;
  29781. @@ -2008,14 +2037,21 @@
  29782. pobjects = oldpage->pobjects;
  29783. pages = oldpage->pages;
  29784. if (drain && pobjects > s->cpu_partial) {
  29785. + struct slub_free_list *f;
  29786. unsigned long flags;
  29787. + LIST_HEAD(tofree);
  29788. /*
  29789. * partial array is full. Move the existing
  29790. * set to the per node partial list.
  29791. */
  29792. local_irq_save(flags);
  29793. unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
  29794. + f = &__get_cpu_var(slub_free_list);
  29795. + raw_spin_lock(&f->lock);
  29796. + list_splice_init(&f->list, &tofree);
  29797. + raw_spin_unlock(&f->lock);
  29798. local_irq_restore(flags);
  29799. + free_delayed(&tofree);
  29800. oldpage = NULL;
  29801. pobjects = 0;
  29802. pages = 0;
  29803. @@ -2079,7 +2115,22 @@
  29804. static void flush_all(struct kmem_cache *s)
  29805. {
  29806. + LIST_HEAD(tofree);
  29807. + int cpu;
  29808. +
  29809. on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
  29810. + for_each_online_cpu(cpu) {
  29811. + struct slub_free_list *f;
  29812. +
  29813. + if (!has_cpu_slab(cpu, s))
  29814. + continue;
  29815. +
  29816. + f = &per_cpu(slub_free_list, cpu);
  29817. + raw_spin_lock_irq(&f->lock);
  29818. + list_splice_init(&f->list, &tofree);
  29819. + raw_spin_unlock_irq(&f->lock);
  29820. + free_delayed(&tofree);
  29821. + }
  29822. }
  29823. /*
  29824. @@ -2115,10 +2166,10 @@
  29825. unsigned long x = 0;
  29826. struct page *page;
  29827. - spin_lock_irqsave(&n->list_lock, flags);
  29828. + raw_spin_lock_irqsave(&n->list_lock, flags);
  29829. list_for_each_entry(page, &n->partial, lru)
  29830. x += get_count(page);
  29831. - spin_unlock_irqrestore(&n->list_lock, flags);
  29832. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29833. return x;
  29834. }
  29835. #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
  29836. @@ -2255,9 +2306,11 @@
  29837. static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  29838. unsigned long addr, struct kmem_cache_cpu *c)
  29839. {
  29840. + struct slub_free_list *f;
  29841. void *freelist;
  29842. struct page *page;
  29843. unsigned long flags;
  29844. + LIST_HEAD(tofree);
  29845. local_irq_save(flags);
  29846. #ifdef CONFIG_PREEMPT
  29847. @@ -2325,7 +2378,13 @@
  29848. VM_BUG_ON(!c->page->frozen);
  29849. c->freelist = get_freepointer(s, freelist);
  29850. c->tid = next_tid(c->tid);
  29851. +out:
  29852. + f = &__get_cpu_var(slub_free_list);
  29853. + raw_spin_lock(&f->lock);
  29854. + list_splice_init(&f->list, &tofree);
  29855. + raw_spin_unlock(&f->lock);
  29856. local_irq_restore(flags);
  29857. + free_delayed(&tofree);
  29858. return freelist;
  29859. new_slab:
  29860. @@ -2342,8 +2401,7 @@
  29861. if (unlikely(!freelist)) {
  29862. slab_out_of_memory(s, gfpflags, node);
  29863. - local_irq_restore(flags);
  29864. - return NULL;
  29865. + goto out;
  29866. }
  29867. page = c->page;
  29868. @@ -2358,8 +2416,7 @@
  29869. deactivate_slab(s, page, get_freepointer(s, freelist));
  29870. c->page = NULL;
  29871. c->freelist = NULL;
  29872. - local_irq_restore(flags);
  29873. - return freelist;
  29874. + goto out;
  29875. }
  29876. /*
  29877. @@ -2444,6 +2501,10 @@
  29878. if (unlikely(gfpflags & __GFP_ZERO) && object)
  29879. memset(object, 0, s->object_size);
  29880. +#ifdef CONFIG_PREEMPT_RT_FULL
  29881. + if (unlikely(s->ctor) && object)
  29882. + s->ctor(object);
  29883. +#endif
  29884. slab_post_alloc_hook(s, gfpflags, object);
  29885. @@ -2531,7 +2592,7 @@
  29886. do {
  29887. if (unlikely(n)) {
  29888. - spin_unlock_irqrestore(&n->list_lock, flags);
  29889. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29890. n = NULL;
  29891. }
  29892. prior = page->freelist;
  29893. @@ -2563,7 +2624,7 @@
  29894. * Otherwise the list_lock will synchronize with
  29895. * other processors updating the list of slabs.
  29896. */
  29897. - spin_lock_irqsave(&n->list_lock, flags);
  29898. + raw_spin_lock_irqsave(&n->list_lock, flags);
  29899. }
  29900. }
  29901. @@ -2605,7 +2666,7 @@
  29902. add_partial(n, page, DEACTIVATE_TO_TAIL);
  29903. stat(s, FREE_ADD_PARTIAL);
  29904. }
  29905. - spin_unlock_irqrestore(&n->list_lock, flags);
  29906. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29907. return;
  29908. slab_empty:
  29909. @@ -2620,7 +2681,7 @@
  29910. remove_full(s, n, page);
  29911. }
  29912. - spin_unlock_irqrestore(&n->list_lock, flags);
  29913. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29914. stat(s, FREE_SLAB);
  29915. discard_slab(s, page);
  29916. }
  29917. @@ -2816,7 +2877,7 @@
  29918. init_kmem_cache_node(struct kmem_cache_node *n)
  29919. {
  29920. n->nr_partial = 0;
  29921. - spin_lock_init(&n->list_lock);
  29922. + raw_spin_lock_init(&n->list_lock);
  29923. INIT_LIST_HEAD(&n->partial);
  29924. #ifdef CONFIG_SLUB_DEBUG
  29925. atomic_long_set(&n->nr_slabs, 0);
  29926. @@ -3373,7 +3434,7 @@
  29927. for (i = 0; i < objects; i++)
  29928. INIT_LIST_HEAD(slabs_by_inuse + i);
  29929. - spin_lock_irqsave(&n->list_lock, flags);
  29930. + raw_spin_lock_irqsave(&n->list_lock, flags);
  29931. /*
  29932. * Build lists indexed by the items in use in each slab.
  29933. @@ -3394,7 +3455,7 @@
  29934. for (i = objects - 1; i > 0; i--)
  29935. list_splice(slabs_by_inuse + i, n->partial.prev);
  29936. - spin_unlock_irqrestore(&n->list_lock, flags);
  29937. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29938. /* Release empty slabs */
  29939. list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
  29940. @@ -3567,6 +3628,12 @@
  29941. {
  29942. static __initdata struct kmem_cache boot_kmem_cache,
  29943. boot_kmem_cache_node;
  29944. + int cpu;
  29945. +
  29946. + for_each_possible_cpu(cpu) {
  29947. + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
  29948. + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
  29949. + }
  29950. if (debug_guardpage_minorder())
  29951. slub_max_order = 0;
  29952. @@ -3815,7 +3882,7 @@
  29953. struct page *page;
  29954. unsigned long flags;
  29955. - spin_lock_irqsave(&n->list_lock, flags);
  29956. + raw_spin_lock_irqsave(&n->list_lock, flags);
  29957. list_for_each_entry(page, &n->partial, lru) {
  29958. validate_slab_slab(s, page, map);
  29959. @@ -3837,7 +3904,7 @@
  29960. s->name, count, atomic_long_read(&n->nr_slabs));
  29961. out:
  29962. - spin_unlock_irqrestore(&n->list_lock, flags);
  29963. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29964. return count;
  29965. }
  29966. @@ -4025,12 +4092,12 @@
  29967. if (!atomic_long_read(&n->nr_slabs))
  29968. continue;
  29969. - spin_lock_irqsave(&n->list_lock, flags);
  29970. + raw_spin_lock_irqsave(&n->list_lock, flags);
  29971. list_for_each_entry(page, &n->partial, lru)
  29972. process_slab(&t, s, page, alloc, map);
  29973. list_for_each_entry(page, &n->full, lru)
  29974. process_slab(&t, s, page, alloc, map);
  29975. - spin_unlock_irqrestore(&n->list_lock, flags);
  29976. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  29977. }
  29978. for (i = 0; i < t.count; i++) {
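[Editor's note] Most of the slub.c diff is the mechanical spin_lock to raw_spin_lock conversion of list_lock; the structural addition is the per-CPU slub_free_list. Whenever __free_slab() would run with interrupts disabled (which on RT may no longer call into the now-sleeping page allocator path), the page is parked on that list and freed later via free_delayed() from a context that may sleep. A condensed sketch of that defer/flush pair follows; deferred_pages, defer_free_page() and flush_deferred() are illustrative names, and __free_slab() is the static helper inside mm/slub.c, so this is a sketch of the pattern rather than code that links on its own.

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct deferred_pages {
	raw_spinlock_t	lock;
	struct list_head list;
};
static DEFINE_PER_CPU(struct deferred_pages, deferred_pages);

/* Called with IRQs off: only queue the page, never free it here. */
static void defer_free_page(struct page *page)
{
	struct deferred_pages *f = this_cpu_ptr(&deferred_pages);

	raw_spin_lock(&f->lock);
	list_add(&page->lru, &f->list);
	raw_spin_unlock(&f->lock);
}

/* Called from a preemptible context: drain the list and really free. */
static void flush_deferred(void)
{
	struct deferred_pages *f;
	LIST_HEAD(tofree);

	local_irq_disable();
	f = this_cpu_ptr(&deferred_pages);
	raw_spin_lock(&f->lock);
	list_splice_init(&f->list, &tofree);	/* steal the whole list */
	raw_spin_unlock(&f->lock);
	local_irq_enable();

	while (!list_empty(&tofree)) {
		struct page *page = list_first_entry(&tofree, struct page, lru);

		list_del(&page->lru);
		__free_slab(page->slab_cache, page);	/* may sleep on RT */
	}
}

This is the same two-step used in __slab_alloc(), put_cpu_partial() and flush_all() above: splice the per-CPU list onto a local one under the raw lock, drop the lock (and IRQs), then free.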
  29979. diff -Nur linux-3.18.14.orig/mm/swap.c linux-3.18.14-rt/mm/swap.c
  29980. --- linux-3.18.14.orig/mm/swap.c 2015-05-20 10:04:50.000000000 -0500
  29981. +++ linux-3.18.14-rt/mm/swap.c 2015-05-31 15:32:49.285635359 -0500
  29982. @@ -31,6 +31,7 @@
  29983. #include <linux/memcontrol.h>
  29984. #include <linux/gfp.h>
  29985. #include <linux/uio.h>
  29986. +#include <linux/locallock.h>
  29987. #include "internal.h"
  29988. @@ -44,6 +45,9 @@
  29989. static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
  29990. static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
  29991. +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
  29992. +static DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
  29993. +
  29994. /*
  29995. * This path almost never happens for VM activity - pages are normally
  29996. * freed via pagevecs. But it gets used by networking.
  29997. @@ -473,11 +477,11 @@
  29998. unsigned long flags;
  29999. page_cache_get(page);
  30000. - local_irq_save(flags);
  30001. + local_lock_irqsave(rotate_lock, flags);
  30002. pvec = this_cpu_ptr(&lru_rotate_pvecs);
  30003. if (!pagevec_add(pvec, page))
  30004. pagevec_move_tail(pvec);
  30005. - local_irq_restore(flags);
  30006. + local_unlock_irqrestore(rotate_lock, flags);
  30007. }
  30008. }
  30009. @@ -528,12 +532,13 @@
  30010. void activate_page(struct page *page)
  30011. {
  30012. if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
  30013. - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
  30014. + struct pagevec *pvec = &get_locked_var(swapvec_lock,
  30015. + activate_page_pvecs);
  30016. page_cache_get(page);
  30017. if (!pagevec_add(pvec, page))
  30018. pagevec_lru_move_fn(pvec, __activate_page, NULL);
  30019. - put_cpu_var(activate_page_pvecs);
  30020. + put_locked_var(swapvec_lock, activate_page_pvecs);
  30021. }
  30022. }
  30023. @@ -559,7 +564,7 @@
  30024. static void __lru_cache_activate_page(struct page *page)
  30025. {
  30026. - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
  30027. + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
  30028. int i;
  30029. /*
  30030. @@ -581,7 +586,7 @@
  30031. }
  30032. }
  30033. - put_cpu_var(lru_add_pvec);
  30034. + put_locked_var(swapvec_lock, lru_add_pvec);
  30035. }
  30036. /*
  30037. @@ -620,13 +625,13 @@
  30038. static void __lru_cache_add(struct page *page)
  30039. {
  30040. - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
  30041. + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
  30042. page_cache_get(page);
  30043. if (!pagevec_space(pvec))
  30044. __pagevec_lru_add(pvec);
  30045. pagevec_add(pvec, page);
  30046. - put_cpu_var(lru_add_pvec);
  30047. + put_locked_var(swapvec_lock, lru_add_pvec);
  30048. }
  30049. /**
  30050. @@ -806,9 +811,9 @@
  30051. unsigned long flags;
  30052. /* No harm done if a racing interrupt already did this */
  30053. - local_irq_save(flags);
  30054. + local_lock_irqsave(rotate_lock, flags);
  30055. pagevec_move_tail(pvec);
  30056. - local_irq_restore(flags);
  30057. + local_unlock_irqrestore(rotate_lock, flags);
  30058. }
  30059. pvec = &per_cpu(lru_deactivate_pvecs, cpu);
  30060. @@ -836,18 +841,19 @@
  30061. return;
  30062. if (likely(get_page_unless_zero(page))) {
  30063. - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
  30064. + struct pagevec *pvec = &get_locked_var(swapvec_lock,
  30065. + lru_deactivate_pvecs);
  30066. if (!pagevec_add(pvec, page))
  30067. pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
  30068. - put_cpu_var(lru_deactivate_pvecs);
  30069. + put_locked_var(swapvec_lock, lru_deactivate_pvecs);
  30070. }
  30071. }
  30072. void lru_add_drain(void)
  30073. {
  30074. - lru_add_drain_cpu(get_cpu());
  30075. - put_cpu();
  30076. + lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
  30077. + local_unlock_cpu(swapvec_lock);
  30078. }
  30079. static void lru_add_drain_per_cpu(struct work_struct *dummy)
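[Editor's note] In swap.c every get_cpu_var()/put_cpu_var() pair around the per-CPU pagevecs becomes get_locked_var()/put_locked_var() on a named local lock (swapvec_lock, or rotate_lock for the irq-save cases). On !RT that is the same preempt or irq disable as before; on RT it is a per-CPU sleeping lock, so the section stays preemptible yet still serialises against every other user of the same pagevec. A minimal sketch of that access pattern, with an invented pagevec my_pvecs and lock my_swap_lock, assuming an RT tree with <linux/locallock.h>:

#include <linux/locallock.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct pagevec, my_pvecs);
static DEFINE_LOCAL_IRQ_LOCK(my_swap_lock);

static void queue_page_for_lru(struct page *page)
{
	/* take the local lock, then hand back this CPU's pagevec */
	struct pagevec *pvec = &get_locked_var(my_swap_lock, my_pvecs);

	page_cache_get(page);			/* pin the page while it is queued */
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);	/* flush a full pagevec */
	pagevec_add(pvec, page);

	put_locked_var(my_swap_lock, my_pvecs);
}

This mirrors __lru_cache_add() after the patch; the only difference from the mainline code is which primitive provides the mutual exclusion.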
  30080. diff -Nur linux-3.18.14.orig/mm/truncate.c linux-3.18.14-rt/mm/truncate.c
  30081. --- linux-3.18.14.orig/mm/truncate.c 2015-05-20 10:04:50.000000000 -0500
  30082. +++ linux-3.18.14-rt/mm/truncate.c 2015-05-31 15:32:49.293635359 -0500
  30083. @@ -56,8 +56,11 @@
  30084. * protected by mapping->tree_lock.
  30085. */
  30086. if (!workingset_node_shadows(node) &&
  30087. - !list_empty(&node->private_list))
  30088. - list_lru_del(&workingset_shadow_nodes, &node->private_list);
  30089. + !list_empty(&node->private_list)) {
  30090. + local_lock(workingset_shadow_lock);
  30091. + list_lru_del(&__workingset_shadow_nodes, &node->private_list);
  30092. + local_unlock(workingset_shadow_lock);
  30093. + }
  30094. __radix_tree_delete_node(&mapping->page_tree, node);
  30095. unlock:
  30096. spin_unlock_irq(&mapping->tree_lock);
  30097. diff -Nur linux-3.18.14.orig/mm/vmalloc.c linux-3.18.14-rt/mm/vmalloc.c
  30098. --- linux-3.18.14.orig/mm/vmalloc.c 2015-05-20 10:04:50.000000000 -0500
  30099. +++ linux-3.18.14-rt/mm/vmalloc.c 2015-05-31 15:32:49.297635359 -0500
  30100. @@ -798,7 +798,7 @@
  30101. struct vmap_block *vb;
  30102. struct vmap_area *va;
  30103. unsigned long vb_idx;
  30104. - int node, err;
  30105. + int node, err, cpu;
  30106. node = numa_node_id();
  30107. @@ -836,11 +836,12 @@
  30108. BUG_ON(err);
  30109. radix_tree_preload_end();
  30110. - vbq = &get_cpu_var(vmap_block_queue);
  30111. + cpu = get_cpu_light();
  30112. + vbq = &__get_cpu_var(vmap_block_queue);
  30113. spin_lock(&vbq->lock);
  30114. list_add_rcu(&vb->free_list, &vbq->free);
  30115. spin_unlock(&vbq->lock);
  30116. - put_cpu_var(vmap_block_queue);
  30117. + put_cpu_light();
  30118. return vb;
  30119. }
  30120. @@ -908,6 +909,7 @@
  30121. struct vmap_block *vb;
  30122. unsigned long addr = 0;
  30123. unsigned int order;
  30124. + int cpu = 0;
  30125. BUG_ON(size & ~PAGE_MASK);
  30126. BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
  30127. @@ -923,7 +925,8 @@
  30128. again:
  30129. rcu_read_lock();
  30130. - vbq = &get_cpu_var(vmap_block_queue);
  30131. + cpu = get_cpu_light();
  30132. + vbq = &__get_cpu_var(vmap_block_queue);
  30133. list_for_each_entry_rcu(vb, &vbq->free, free_list) {
  30134. int i;
  30135. @@ -947,7 +950,7 @@
  30136. spin_unlock(&vb->lock);
  30137. }
  30138. - put_cpu_var(vmap_block_queue);
  30139. + put_cpu_light();
  30140. rcu_read_unlock();
  30141. if (!addr) {
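[Editor's note] get_cpu_var() implies preempt_disable(), and the code right after it takes spin_lock(&vbq->lock) or vb->lock, which are sleeping locks on RT; that combination is illegal. The vmalloc.c hunks therefore use get_cpu_light()/put_cpu_light(), which pin the task to the current CPU but leave it preemptible. A small sketch of the shape, with an invented per-CPU queue (my_queues) assumed to be initialised elsewhere, again requiring an RT-patched tree for the _light helpers:

#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct my_queue {
	spinlock_t	 lock;		/* sleeping lock on PREEMPT_RT */
	struct list_head items;
};
static DEFINE_PER_CPU(struct my_queue, my_queues);

static void queue_item(struct list_head *item)
{
	struct my_queue *q;
	int cpu;

	cpu = get_cpu_light();		/* stay on this CPU, stay preemptible */
	q = &per_cpu(my_queues, cpu);
	spin_lock(&q->lock);		/* fine on RT: we are allowed to sleep here */
	list_add_tail(item, &q->items);
	spin_unlock(&q->lock);
	put_cpu_light();
}

The same substitution shows up again in net/core/dev.c further down (enqueue_to_backlog()) and in the drain_pages() call in page_alloc.c above.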
  30142. diff -Nur linux-3.18.14.orig/mm/vmstat.c linux-3.18.14-rt/mm/vmstat.c
  30143. --- linux-3.18.14.orig/mm/vmstat.c 2015-05-20 10:04:50.000000000 -0500
  30144. +++ linux-3.18.14-rt/mm/vmstat.c 2015-05-31 15:32:49.297635359 -0500
  30145. @@ -221,6 +221,7 @@
  30146. long x;
  30147. long t;
  30148. + preempt_disable_rt();
  30149. x = delta + __this_cpu_read(*p);
  30150. t = __this_cpu_read(pcp->stat_threshold);
  30151. @@ -230,6 +231,7 @@
  30152. x = 0;
  30153. }
  30154. __this_cpu_write(*p, x);
  30155. + preempt_enable_rt();
  30156. }
  30157. EXPORT_SYMBOL(__mod_zone_page_state);
  30158. @@ -262,6 +264,7 @@
  30159. s8 __percpu *p = pcp->vm_stat_diff + item;
  30160. s8 v, t;
  30161. + preempt_disable_rt();
  30162. v = __this_cpu_inc_return(*p);
  30163. t = __this_cpu_read(pcp->stat_threshold);
  30164. if (unlikely(v > t)) {
  30165. @@ -270,6 +273,7 @@
  30166. zone_page_state_add(v + overstep, zone, item);
  30167. __this_cpu_write(*p, -overstep);
  30168. }
  30169. + preempt_enable_rt();
  30170. }
  30171. void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
  30172. @@ -284,6 +288,7 @@
  30173. s8 __percpu *p = pcp->vm_stat_diff + item;
  30174. s8 v, t;
  30175. + preempt_disable_rt();
  30176. v = __this_cpu_dec_return(*p);
  30177. t = __this_cpu_read(pcp->stat_threshold);
  30178. if (unlikely(v < - t)) {
  30179. @@ -292,6 +297,7 @@
  30180. zone_page_state_add(v - overstep, zone, item);
  30181. __this_cpu_write(*p, overstep);
  30182. }
  30183. + preempt_enable_rt();
  30184. }
  30185. void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
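[Editor's note] The vmstat counters are per-CPU and updated with non-atomic __this_cpu ops, so an updater must not migrate or be preempted by another updater in the middle of the read-modify-write. On mainline the callers guarantee that implicitly; on RT several of them become preemptible, so the patch brackets each update with preempt_disable_rt()/preempt_enable_rt(), which compile to nothing unless PREEMPT_RT is enabled (the helpers come from the RT-patched preempt headers). A stripped-down sketch of the same bracket around a hypothetical per-CPU counter:

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(long, my_counter);

static void my_counter_add(long delta)
{
	long x;

	preempt_disable_rt();			/* no-op on !RT, preempt_disable() on RT */
	x = delta + __this_cpu_read(my_counter);
	__this_cpu_write(my_counter, x);	/* RMW must not be interleaved */
	preempt_enable_rt();
}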
  30186. diff -Nur linux-3.18.14.orig/mm/workingset.c linux-3.18.14-rt/mm/workingset.c
  30187. --- linux-3.18.14.orig/mm/workingset.c 2015-05-20 10:04:50.000000000 -0500
  30188. +++ linux-3.18.14-rt/mm/workingset.c 2015-05-31 15:32:49.321635359 -0500
  30189. @@ -264,7 +264,8 @@
  30190. * point where they would still be useful.
  30191. */
  30192. -struct list_lru workingset_shadow_nodes;
  30193. +struct list_lru __workingset_shadow_nodes;
  30194. +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
  30195. static unsigned long count_shadow_nodes(struct shrinker *shrinker,
  30196. struct shrink_control *sc)
  30197. @@ -274,9 +275,9 @@
  30198. unsigned long pages;
  30199. /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
  30200. - local_irq_disable();
  30201. - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
  30202. - local_irq_enable();
  30203. + local_lock_irq(workingset_shadow_lock);
  30204. + shadow_nodes = list_lru_count_node(&__workingset_shadow_nodes, sc->nid);
  30205. + local_unlock_irq(workingset_shadow_lock);
  30206. pages = node_present_pages(sc->nid);
  30207. /*
  30208. @@ -362,9 +363,9 @@
  30209. spin_unlock(&mapping->tree_lock);
  30210. ret = LRU_REMOVED_RETRY;
  30211. out:
  30212. - local_irq_enable();
  30213. + local_unlock_irq(workingset_shadow_lock);
  30214. cond_resched();
  30215. - local_irq_disable();
  30216. + local_lock_irq(workingset_shadow_lock);
  30217. spin_lock(lru_lock);
  30218. return ret;
  30219. }
  30220. @@ -375,10 +376,10 @@
  30221. unsigned long ret;
  30222. /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
  30223. - local_irq_disable();
  30224. - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
  30225. + local_lock_irq(workingset_shadow_lock);
  30226. + ret = list_lru_walk_node(&__workingset_shadow_nodes, sc->nid,
  30227. shadow_lru_isolate, NULL, &sc->nr_to_scan);
  30228. - local_irq_enable();
  30229. + local_unlock_irq(workingset_shadow_lock);
  30230. return ret;
  30231. }
  30232. @@ -399,7 +400,7 @@
  30233. {
  30234. int ret;
  30235. - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
  30236. + ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
  30237. if (ret)
  30238. goto err;
  30239. ret = register_shrinker(&workingset_shadow_shrinker);
  30240. @@ -407,7 +408,7 @@
  30241. goto err_list_lru;
  30242. return 0;
  30243. err_list_lru:
  30244. - list_lru_destroy(&workingset_shadow_nodes);
  30245. + list_lru_destroy(&__workingset_shadow_nodes);
  30246. err:
  30247. return ret;
  30248. }
  30249. diff -Nur linux-3.18.14.orig/net/core/dev.c linux-3.18.14-rt/net/core/dev.c
  30250. --- linux-3.18.14.orig/net/core/dev.c 2015-05-20 10:04:50.000000000 -0500
  30251. +++ linux-3.18.14-rt/net/core/dev.c 2015-05-31 15:32:49.369635359 -0500
  30252. @@ -182,6 +182,7 @@
  30253. static DEFINE_HASHTABLE(napi_hash, 8);
  30254. static seqcount_t devnet_rename_seq;
  30255. +static DEFINE_MUTEX(devnet_rename_mutex);
  30256. static inline void dev_base_seq_inc(struct net *net)
  30257. {
  30258. @@ -203,14 +204,14 @@
  30259. static inline void rps_lock(struct softnet_data *sd)
  30260. {
  30261. #ifdef CONFIG_RPS
  30262. - spin_lock(&sd->input_pkt_queue.lock);
  30263. + raw_spin_lock(&sd->input_pkt_queue.raw_lock);
  30264. #endif
  30265. }
  30266. static inline void rps_unlock(struct softnet_data *sd)
  30267. {
  30268. #ifdef CONFIG_RPS
  30269. - spin_unlock(&sd->input_pkt_queue.lock);
  30270. + raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
  30271. #endif
  30272. }
  30273. @@ -832,7 +833,8 @@
  30274. strcpy(name, dev->name);
  30275. rcu_read_unlock();
  30276. if (read_seqcount_retry(&devnet_rename_seq, seq)) {
  30277. - cond_resched();
  30278. + mutex_lock(&devnet_rename_mutex);
  30279. + mutex_unlock(&devnet_rename_mutex);
  30280. goto retry;
  30281. }
  30282. @@ -1101,20 +1103,17 @@
  30283. if (dev->flags & IFF_UP)
  30284. return -EBUSY;
  30285. - write_seqcount_begin(&devnet_rename_seq);
  30286. + mutex_lock(&devnet_rename_mutex);
  30287. + __raw_write_seqcount_begin(&devnet_rename_seq);
  30288. - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
  30289. - write_seqcount_end(&devnet_rename_seq);
  30290. - return 0;
  30291. - }
  30292. + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
  30293. + goto outunlock;
  30294. memcpy(oldname, dev->name, IFNAMSIZ);
  30295. err = dev_get_valid_name(net, dev, newname);
  30296. - if (err < 0) {
  30297. - write_seqcount_end(&devnet_rename_seq);
  30298. - return err;
  30299. - }
  30300. + if (err < 0)
  30301. + goto outunlock;
  30302. if (oldname[0] && !strchr(oldname, '%'))
  30303. netdev_info(dev, "renamed from %s\n", oldname);
  30304. @@ -1127,11 +1126,12 @@
  30305. if (ret) {
  30306. memcpy(dev->name, oldname, IFNAMSIZ);
  30307. dev->name_assign_type = old_assign_type;
  30308. - write_seqcount_end(&devnet_rename_seq);
  30309. - return ret;
  30310. + err = ret;
  30311. + goto outunlock;
  30312. }
  30313. - write_seqcount_end(&devnet_rename_seq);
  30314. + __raw_write_seqcount_end(&devnet_rename_seq);
  30315. + mutex_unlock(&devnet_rename_mutex);
  30316. netdev_adjacent_rename_links(dev, oldname);
  30317. @@ -1152,7 +1152,8 @@
  30318. /* err >= 0 after dev_alloc_name() or stores the first errno */
  30319. if (err >= 0) {
  30320. err = ret;
  30321. - write_seqcount_begin(&devnet_rename_seq);
  30322. + mutex_lock(&devnet_rename_mutex);
  30323. + __raw_write_seqcount_begin(&devnet_rename_seq);
  30324. memcpy(dev->name, oldname, IFNAMSIZ);
  30325. memcpy(oldname, newname, IFNAMSIZ);
  30326. dev->name_assign_type = old_assign_type;
  30327. @@ -1165,6 +1166,11 @@
  30328. }
  30329. return err;
  30330. +
  30331. +outunlock:
  30332. + __raw_write_seqcount_end(&devnet_rename_seq);
  30333. + mutex_unlock(&devnet_rename_mutex);
  30334. + return err;
  30335. }
  30336. /**
  30337. @@ -2160,6 +2166,7 @@
  30338. sd->output_queue_tailp = &q->next_sched;
  30339. raise_softirq_irqoff(NET_TX_SOFTIRQ);
  30340. local_irq_restore(flags);
  30341. + preempt_check_resched_rt();
  30342. }
  30343. void __netif_schedule(struct Qdisc *q)
  30344. @@ -2241,6 +2248,7 @@
  30345. __this_cpu_write(softnet_data.completion_queue, skb);
  30346. raise_softirq_irqoff(NET_TX_SOFTIRQ);
  30347. local_irq_restore(flags);
  30348. + preempt_check_resched_rt();
  30349. }
  30350. EXPORT_SYMBOL(__dev_kfree_skb_irq);
  30351. @@ -3334,6 +3342,7 @@
  30352. rps_unlock(sd);
  30353. local_irq_restore(flags);
  30354. + preempt_check_resched_rt();
  30355. atomic_long_inc(&skb->dev->rx_dropped);
  30356. kfree_skb(skb);
  30357. @@ -3352,7 +3361,7 @@
  30358. struct rps_dev_flow voidflow, *rflow = &voidflow;
  30359. int cpu;
  30360. - preempt_disable();
  30361. + migrate_disable();
  30362. rcu_read_lock();
  30363. cpu = get_rps_cpu(skb->dev, skb, &rflow);
  30364. @@ -3362,13 +3371,13 @@
  30365. ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
  30366. rcu_read_unlock();
  30367. - preempt_enable();
  30368. + migrate_enable();
  30369. } else
  30370. #endif
  30371. {
  30372. unsigned int qtail;
  30373. - ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
  30374. - put_cpu();
  30375. + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
  30376. + put_cpu_light();
  30377. }
  30378. return ret;
  30379. }
  30380. @@ -3402,16 +3411,44 @@
  30381. trace_netif_rx_ni_entry(skb);
  30382. - preempt_disable();
  30383. + local_bh_disable();
  30384. err = netif_rx_internal(skb);
  30385. - if (local_softirq_pending())
  30386. - do_softirq();
  30387. - preempt_enable();
  30388. + local_bh_enable();
  30389. return err;
  30390. }
  30391. EXPORT_SYMBOL(netif_rx_ni);
  30392. +#ifdef CONFIG_PREEMPT_RT_FULL
  30393. +/*
  30394. + * RT runs ksoftirqd as a real time thread and the root_lock is a
  30395. + * "sleeping spinlock". If the trylock fails then we can go into an
  30396. + * infinite loop when ksoftirqd preempted the task which actually
  30397. + * holds the lock, because we requeue q and raise NET_TX softirq
  30398. + * causing ksoftirqd to loop forever.
  30399. + *
  30400. + * It's safe to use spin_lock on RT here as softirqs run in thread
  30401. + * context and cannot deadlock against the thread which is holding
  30402. + * root_lock.
  30403. + *
  30404. + * On !RT the trylock might fail, but there we bail out from the
  30405. + * softirq loop after 10 attempts which we can't do on RT. And the
  30406. + * task holding root_lock cannot be preempted, so the only downside of
  30407. + * that trylock is that we need 10 loops to decide that we should have
  30408. + * given up in the first one :)
  30409. + */
  30410. +static inline int take_root_lock(spinlock_t *lock)
  30411. +{
  30412. + spin_lock(lock);
  30413. + return 1;
  30414. +}
  30415. +#else
  30416. +static inline int take_root_lock(spinlock_t *lock)
  30417. +{
  30418. + return spin_trylock(lock);
  30419. +}
  30420. +#endif
  30421. +
  30422. static void net_tx_action(struct softirq_action *h)
  30423. {
  30424. struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  30425. @@ -3453,7 +3490,7 @@
  30426. head = head->next_sched;
  30427. root_lock = qdisc_lock(q);
  30428. - if (spin_trylock(root_lock)) {
  30429. + if (take_root_lock(root_lock)) {
  30430. smp_mb__before_atomic();
  30431. clear_bit(__QDISC_STATE_SCHED,
  30432. &q->state);
  30433. @@ -3846,7 +3883,7 @@
  30434. skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
  30435. if (skb->dev == dev) {
  30436. __skb_unlink(skb, &sd->input_pkt_queue);
  30437. - kfree_skb(skb);
  30438. + __skb_queue_tail(&sd->tofree_queue, skb);
  30439. input_queue_head_incr(sd);
  30440. }
  30441. }
  30442. @@ -3855,10 +3892,13 @@
  30443. skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
  30444. if (skb->dev == dev) {
  30445. __skb_unlink(skb, &sd->process_queue);
  30446. - kfree_skb(skb);
  30447. + __skb_queue_tail(&sd->tofree_queue, skb);
  30448. input_queue_head_incr(sd);
  30449. }
  30450. }
  30451. +
  30452. + if (!skb_queue_empty(&sd->tofree_queue))
  30453. + raise_softirq_irqoff(NET_RX_SOFTIRQ);
  30454. }
  30455. static int napi_gro_complete(struct sk_buff *skb)
  30456. @@ -4321,6 +4361,7 @@
  30457. } else
  30458. #endif
  30459. local_irq_enable();
  30460. + preempt_check_resched_rt();
  30461. }
  30462. static int process_backlog(struct napi_struct *napi, int quota)
  30463. @@ -4392,6 +4433,7 @@
  30464. local_irq_save(flags);
  30465. ____napi_schedule(this_cpu_ptr(&softnet_data), n);
  30466. local_irq_restore(flags);
  30467. + preempt_check_resched_rt();
  30468. }
  30469. EXPORT_SYMBOL(__napi_schedule);
  30470. @@ -4514,10 +4556,17 @@
  30471. struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  30472. unsigned long time_limit = jiffies + 2;
  30473. int budget = netdev_budget;
  30474. + struct sk_buff *skb;
  30475. void *have;
  30476. local_irq_disable();
  30477. + while ((skb = __skb_dequeue(&sd->tofree_queue))) {
  30478. + local_irq_enable();
  30479. + kfree_skb(skb);
  30480. + local_irq_disable();
  30481. + }
  30482. +
  30483. while (!list_empty(&sd->poll_list)) {
  30484. struct napi_struct *n;
  30485. int work, weight;
  30486. @@ -7006,6 +7055,7 @@
  30487. raise_softirq_irqoff(NET_TX_SOFTIRQ);
  30488. local_irq_enable();
  30489. + preempt_check_resched_rt();
  30490. /* Process offline CPU's input_pkt_queue */
  30491. while ((skb = __skb_dequeue(&oldsd->process_queue))) {
  30492. @@ -7016,6 +7066,9 @@
  30493. netif_rx_internal(skb);
  30494. input_queue_head_incr(oldsd);
  30495. }
  30496. + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
  30497. + kfree_skb(skb);
  30498. + }
  30499. return NOTIFY_OK;
  30500. }
  30501. @@ -7317,8 +7370,9 @@
  30502. for_each_possible_cpu(i) {
  30503. struct softnet_data *sd = &per_cpu(softnet_data, i);
  30504. - skb_queue_head_init(&sd->input_pkt_queue);
  30505. - skb_queue_head_init(&sd->process_queue);
  30506. + skb_queue_head_init_raw(&sd->input_pkt_queue);
  30507. + skb_queue_head_init_raw(&sd->process_queue);
  30508. + skb_queue_head_init_raw(&sd->tofree_queue);
  30509. INIT_LIST_HEAD(&sd->poll_list);
  30510. sd->output_queue_tailp = &sd->output_queue;
  30511. #ifdef CONFIG_RPS
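[Editor's note] Besides the get_cpu_light() and tofree_queue changes, the dev.c diff reworks the interface-rename seqcount: writers now serialise on devnet_rename_mutex and use the __raw_write_seqcount_begin()/end() variants (safe because the mutex already excludes concurrent writers, and no preemption disabling is wanted on RT), and a reader that sees a retry briefly takes and releases the same mutex instead of calling cond_resched(), so it genuinely waits for the writer rather than spinning against a preempted one. The sketch below restates that writer/reader pairing on an invented "current name" record; it is a pattern sketch, not the dev.c code.

#include <linux/mutex.h>
#include <linux/seqlock.h>
#include <linux/string.h>

static seqcount_t name_seq;
static DEFINE_MUTEX(name_mutex);
static char cur_name[32];

static void set_name(const char *new)
{
	mutex_lock(&name_mutex);		/* writers serialised by the mutex */
	__raw_write_seqcount_begin(&name_seq);	/* no preempt_disable needed */
	strlcpy(cur_name, new, sizeof(cur_name));
	__raw_write_seqcount_end(&name_seq);
	mutex_unlock(&name_mutex);
}

static void get_name(char *buf, size_t len)
{
	unsigned int seq;
retry:
	seq = raw_seqcount_begin(&name_seq);	/* do not spin waiting for an even count */
	strlcpy(buf, cur_name, len);
	if (read_seqcount_retry(&name_seq, seq)) {
		/* wait for the writer by taking the mutex, then retry */
		mutex_lock(&name_mutex);
		mutex_unlock(&name_mutex);
		goto retry;
	}
}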
  30512. diff -Nur linux-3.18.14.orig/net/core/skbuff.c linux-3.18.14-rt/net/core/skbuff.c
  30513. --- linux-3.18.14.orig/net/core/skbuff.c 2015-05-20 10:04:50.000000000 -0500
  30514. +++ linux-3.18.14-rt/net/core/skbuff.c 2015-05-31 15:32:49.393635358 -0500
  30515. @@ -63,6 +63,7 @@
  30516. #include <linux/errqueue.h>
  30517. #include <linux/prefetch.h>
  30518. #include <linux/if_vlan.h>
  30519. +#include <linux/locallock.h>
  30520. #include <net/protocol.h>
  30521. #include <net/dst.h>
  30522. @@ -353,6 +354,7 @@
  30523. unsigned int pagecnt_bias;
  30524. };
  30525. static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
  30526. +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
  30527. static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  30528. {
  30529. @@ -361,7 +363,7 @@
  30530. int order;
  30531. unsigned long flags;
  30532. - local_irq_save(flags);
  30533. + local_lock_irqsave(netdev_alloc_lock, flags);
  30534. nc = this_cpu_ptr(&netdev_alloc_cache);
  30535. if (unlikely(!nc->frag.page)) {
  30536. refill:
  30537. @@ -407,7 +409,7 @@
  30538. nc->frag.offset += fragsz;
  30539. nc->pagecnt_bias--;
  30540. end:
  30541. - local_irq_restore(flags);
  30542. + local_unlock_irqrestore(netdev_alloc_lock, flags);
  30543. return data;
  30544. }
  30545. diff -Nur linux-3.18.14.orig/net/core/skbuff.c.orig linux-3.18.14-rt/net/core/skbuff.c.orig
  30546. --- linux-3.18.14.orig/net/core/skbuff.c.orig 1969-12-31 18:00:00.000000000 -0600
  30547. +++ linux-3.18.14-rt/net/core/skbuff.c.orig 2015-05-20 10:04:50.000000000 -0500
  30548. @@ -0,0 +1,4231 @@
  30549. +/*
  30550. + * Routines having to do with the 'struct sk_buff' memory handlers.
  30551. + *
  30552. + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
  30553. + * Florian La Roche <rzsfl@rz.uni-sb.de>
  30554. + *
  30555. + * Fixes:
  30556. + * Alan Cox : Fixed the worst of the load
  30557. + * balancer bugs.
  30558. + * Dave Platt : Interrupt stacking fix.
  30559. + * Richard Kooijman : Timestamp fixes.
  30560. + * Alan Cox : Changed buffer format.
  30561. + * Alan Cox : destructor hook for AF_UNIX etc.
  30562. + * Linus Torvalds : Better skb_clone.
  30563. + * Alan Cox : Added skb_copy.
  30564. + * Alan Cox : Added all the changed routines Linus
  30565. + * only put in the headers
  30566. + * Ray VanTassle : Fixed --skb->lock in free
  30567. + * Alan Cox : skb_copy copy arp field
  30568. + * Andi Kleen : slabified it.
  30569. + * Robert Olsson : Removed skb_head_pool
  30570. + *
  30571. + * NOTE:
  30572. + * The __skb_ routines should be called with interrupts
  30573. + * disabled, or you better be *real* sure that the operation is atomic
  30574. + * with respect to whatever list is being frobbed (e.g. via lock_sock()
  30575. + * or via disabling bottom half handlers, etc).
  30576. + *
  30577. + * This program is free software; you can redistribute it and/or
  30578. + * modify it under the terms of the GNU General Public License
  30579. + * as published by the Free Software Foundation; either version
  30580. + * 2 of the License, or (at your option) any later version.
  30581. + */
  30582. +
  30583. +/*
  30584. + * The functions in this file will not compile correctly with gcc 2.4.x
  30585. + */
  30586. +
  30587. +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30588. +
  30589. +#include <linux/module.h>
  30590. +#include <linux/types.h>
  30591. +#include <linux/kernel.h>
  30592. +#include <linux/kmemcheck.h>
  30593. +#include <linux/mm.h>
  30594. +#include <linux/interrupt.h>
  30595. +#include <linux/in.h>
  30596. +#include <linux/inet.h>
  30597. +#include <linux/slab.h>
  30598. +#include <linux/tcp.h>
  30599. +#include <linux/udp.h>
  30600. +#include <linux/netdevice.h>
  30601. +#ifdef CONFIG_NET_CLS_ACT
  30602. +#include <net/pkt_sched.h>
  30603. +#endif
  30604. +#include <linux/string.h>
  30605. +#include <linux/skbuff.h>
  30606. +#include <linux/splice.h>
  30607. +#include <linux/cache.h>
  30608. +#include <linux/rtnetlink.h>
  30609. +#include <linux/init.h>
  30610. +#include <linux/scatterlist.h>
  30611. +#include <linux/errqueue.h>
  30612. +#include <linux/prefetch.h>
  30613. +#include <linux/if_vlan.h>
  30614. +
  30615. +#include <net/protocol.h>
  30616. +#include <net/dst.h>
  30617. +#include <net/sock.h>
  30618. +#include <net/checksum.h>
  30619. +#include <net/ip6_checksum.h>
  30620. +#include <net/xfrm.h>
  30621. +
  30622. +#include <asm/uaccess.h>
  30623. +#include <trace/events/skb.h>
  30624. +#include <linux/highmem.h>
  30625. +
  30626. +struct kmem_cache *skbuff_head_cache __read_mostly;
  30627. +static struct kmem_cache *skbuff_fclone_cache __read_mostly;
  30628. +
  30629. +/**
  30630. + * skb_panic - private function for out-of-line support
  30631. + * @skb: buffer
  30632. + * @sz: size
  30633. + * @addr: address
  30634. + * @msg: skb_over_panic or skb_under_panic
  30635. + *
  30636. + * Out-of-line support for skb_put() and skb_push().
  30637. + * Called via the wrapper skb_over_panic() or skb_under_panic().
  30638. + * Keep out of line to prevent kernel bloat.
  30639. + * __builtin_return_address is not used because it is not always reliable.
  30640. + */
  30641. +static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
  30642. + const char msg[])
  30643. +{
  30644. + pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
  30645. + msg, addr, skb->len, sz, skb->head, skb->data,
  30646. + (unsigned long)skb->tail, (unsigned long)skb->end,
  30647. + skb->dev ? skb->dev->name : "<NULL>");
  30648. + BUG();
  30649. +}
  30650. +
  30651. +static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
  30652. +{
  30653. + skb_panic(skb, sz, addr, __func__);
  30654. +}
  30655. +
  30656. +static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
  30657. +{
  30658. + skb_panic(skb, sz, addr, __func__);
  30659. +}
  30660. +
  30661. +/*
  30662. + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
  30663. + * the caller if emergency pfmemalloc reserves are being used. If it is and
  30664. + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
  30665. + * may be used. Otherwise, the packet data may be discarded until enough
  30666. + * memory is free
  30667. + */
  30668. +#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
  30669. + __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
  30670. +
  30671. +static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
  30672. + unsigned long ip, bool *pfmemalloc)
  30673. +{
  30674. + void *obj;
  30675. + bool ret_pfmemalloc = false;
  30676. +
  30677. + /*
  30678. + * Try a regular allocation, when that fails and we're not entitled
  30679. + * to the reserves, fail.
  30680. + */
  30681. + obj = kmalloc_node_track_caller(size,
  30682. + flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
  30683. + node);
  30684. + if (obj || !(gfp_pfmemalloc_allowed(flags)))
  30685. + goto out;
  30686. +
  30687. + /* Try again but now we are using pfmemalloc reserves */
  30688. + ret_pfmemalloc = true;
  30689. + obj = kmalloc_node_track_caller(size, flags, node);
  30690. +
  30691. +out:
  30692. + if (pfmemalloc)
  30693. + *pfmemalloc = ret_pfmemalloc;
  30694. +
  30695. + return obj;
  30696. +}
  30697. +
  30698. +/* Allocate a new skbuff. We do this ourselves so we can fill in a few
  30699. + * 'private' fields and also do memory statistics to find all the
  30700. + * [BEEP] leaks.
  30701. + *
  30702. + */
  30703. +
  30704. +struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
  30705. +{
  30706. + struct sk_buff *skb;
  30707. +
  30708. + /* Get the HEAD */
  30709. + skb = kmem_cache_alloc_node(skbuff_head_cache,
  30710. + gfp_mask & ~__GFP_DMA, node);
  30711. + if (!skb)
  30712. + goto out;
  30713. +
  30714. + /*
  30715. + * Only clear those fields we need to clear, not those that we will
  30716. + * actually initialise below. Hence, don't put any more fields after
  30717. + * the tail pointer in struct sk_buff!
  30718. + */
  30719. + memset(skb, 0, offsetof(struct sk_buff, tail));
  30720. + skb->head = NULL;
  30721. + skb->truesize = sizeof(struct sk_buff);
  30722. + atomic_set(&skb->users, 1);
  30723. +
  30724. + skb->mac_header = (typeof(skb->mac_header))~0U;
  30725. +out:
  30726. + return skb;
  30727. +}
  30728. +
  30729. +/**
  30730. + * __alloc_skb - allocate a network buffer
  30731. + * @size: size to allocate
  30732. + * @gfp_mask: allocation mask
  30733. + * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
  30734. + * instead of head cache and allocate a cloned (child) skb.
  30735. + * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
  30736. + * allocations in case the data is required for writeback
  30737. + * @node: numa node to allocate memory on
  30738. + *
  30739. + * Allocate a new &sk_buff. The returned buffer has no headroom and a
  30740. + * tail room of at least size bytes. The object has a reference count
  30741. + * of one. The return is the buffer. On a failure the return is %NULL.
  30742. + *
  30743. + * Buffers may only be allocated from interrupts using a @gfp_mask of
  30744. + * %GFP_ATOMIC.
  30745. + */
  30746. +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
  30747. + int flags, int node)
  30748. +{
  30749. + struct kmem_cache *cache;
  30750. + struct skb_shared_info *shinfo;
  30751. + struct sk_buff *skb;
  30752. + u8 *data;
  30753. + bool pfmemalloc;
  30754. +
  30755. + cache = (flags & SKB_ALLOC_FCLONE)
  30756. + ? skbuff_fclone_cache : skbuff_head_cache;
  30757. +
  30758. + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
  30759. + gfp_mask |= __GFP_MEMALLOC;
  30760. +
  30761. + /* Get the HEAD */
  30762. + skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
  30763. + if (!skb)
  30764. + goto out;
  30765. + prefetchw(skb);
  30766. +
  30767. + /* We do our best to align skb_shared_info on a separate cache
  30768. + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
  30769. + * aligned memory blocks, unless SLUB/SLAB debug is enabled.
  30770. + * Both skb->head and skb_shared_info are cache line aligned.
  30771. + */
  30772. + size = SKB_DATA_ALIGN(size);
  30773. + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  30774. + data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
  30775. + if (!data)
  30776. + goto nodata;
  30777. + /* kmalloc(size) might give us more room than requested.
  30778. + * Put skb_shared_info exactly at the end of allocated zone,
  30779. + * to allow max possible filling before reallocation.
  30780. + */
  30781. + size = SKB_WITH_OVERHEAD(ksize(data));
  30782. + prefetchw(data + size);
  30783. +
  30784. + /*
  30785. + * Only clear those fields we need to clear, not those that we will
  30786. + * actually initialise below. Hence, don't put any more fields after
  30787. + * the tail pointer in struct sk_buff!
  30788. + */
  30789. + memset(skb, 0, offsetof(struct sk_buff, tail));
  30790. + /* Account for allocated memory : skb + skb->head */
  30791. + skb->truesize = SKB_TRUESIZE(size);
  30792. + skb->pfmemalloc = pfmemalloc;
  30793. + atomic_set(&skb->users, 1);
  30794. + skb->head = data;
  30795. + skb->data = data;
  30796. + skb_reset_tail_pointer(skb);
  30797. + skb->end = skb->tail + size;
  30798. + skb->mac_header = (typeof(skb->mac_header))~0U;
  30799. + skb->transport_header = (typeof(skb->transport_header))~0U;
  30800. +
  30801. + /* make sure we initialize shinfo sequentially */
  30802. + shinfo = skb_shinfo(skb);
  30803. + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
  30804. + atomic_set(&shinfo->dataref, 1);
  30805. + kmemcheck_annotate_variable(shinfo->destructor_arg);
  30806. +
  30807. + if (flags & SKB_ALLOC_FCLONE) {
  30808. + struct sk_buff_fclones *fclones;
  30809. +
  30810. + fclones = container_of(skb, struct sk_buff_fclones, skb1);
  30811. +
  30812. + kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
  30813. + skb->fclone = SKB_FCLONE_ORIG;
  30814. + atomic_set(&fclones->fclone_ref, 1);
  30815. +
  30816. + fclones->skb2.fclone = SKB_FCLONE_FREE;
  30817. + fclones->skb2.pfmemalloc = pfmemalloc;
  30818. + }
  30819. +out:
  30820. + return skb;
  30821. +nodata:
  30822. + kmem_cache_free(cache, skb);
  30823. + skb = NULL;
  30824. + goto out;
  30825. +}
  30826. +EXPORT_SYMBOL(__alloc_skb);
  30827. +
  30828. +/**
  30829. + * __build_skb - build a network buffer
  30830. + * @data: data buffer provided by caller
  30831. + * @frag_size: size of data, or 0 if head was kmalloced
  30832. + *
  30833. + * Allocate a new &sk_buff. Caller provides space holding head and
  30834. + * skb_shared_info. @data must have been allocated by kmalloc() only if
  30835. + * @frag_size is 0, otherwise data should come from the page allocator
  30836. + * or vmalloc()
  30837. + * The return is the new skb buffer.
  30838. + * On a failure the return is %NULL, and @data is not freed.
  30839. + * Notes :
  30840. + * Before IO, driver allocates only data buffer where NIC put incoming frame
  30841. + * Driver should add room at head (NET_SKB_PAD) and
  30842. + * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
  30843. + * After IO, driver calls build_skb(), to allocate sk_buff and populate it
  30844. + * before giving packet to stack.
  30845. + * RX rings only contains data buffers, not full skbs.
  30846. + */
  30847. +struct sk_buff *__build_skb(void *data, unsigned int frag_size)
  30848. +{
  30849. + struct skb_shared_info *shinfo;
  30850. + struct sk_buff *skb;
  30851. + unsigned int size = frag_size ? : ksize(data);
  30852. +
  30853. + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
  30854. + if (!skb)
  30855. + return NULL;
  30856. +
  30857. + size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  30858. +
  30859. + memset(skb, 0, offsetof(struct sk_buff, tail));
  30860. + skb->truesize = SKB_TRUESIZE(size);
  30861. + atomic_set(&skb->users, 1);
  30862. + skb->head = data;
  30863. + skb->data = data;
  30864. + skb_reset_tail_pointer(skb);
  30865. + skb->end = skb->tail + size;
  30866. + skb->mac_header = (typeof(skb->mac_header))~0U;
  30867. + skb->transport_header = (typeof(skb->transport_header))~0U;
  30868. +
  30869. + /* make sure we initialize shinfo sequentially */
  30870. + shinfo = skb_shinfo(skb);
  30871. + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
  30872. + atomic_set(&shinfo->dataref, 1);
  30873. + kmemcheck_annotate_variable(shinfo->destructor_arg);
  30874. +
  30875. + return skb;
  30876. +}
  30877. +
  30878. +/* build_skb() is wrapper over __build_skb(), that specifically
  30879. + * takes care of skb->head and skb->pfmemalloc
  30880. + * This means that if @frag_size is not zero, then @data must be backed
  30881. + * by a page fragment, not kmalloc() or vmalloc()
  30882. + */
  30883. +struct sk_buff *build_skb(void *data, unsigned int frag_size)
  30884. +{
  30885. + struct sk_buff *skb = __build_skb(data, frag_size);
  30886. +
  30887. + if (skb && frag_size) {
  30888. + skb->head_frag = 1;
  30889. + if (virt_to_head_page(data)->pfmemalloc)
  30890. + skb->pfmemalloc = 1;
  30891. + }
  30892. + return skb;
  30893. +}
  30894. +EXPORT_SYMBOL(build_skb);
  30895. +
  30896. +struct netdev_alloc_cache {
  30897. + struct page_frag frag;
  30898. + /* we maintain a pagecount bias, so that we dont dirty cache line
  30899. + * containing page->_count every time we allocate a fragment.
  30900. + */
  30901. + unsigned int pagecnt_bias;
  30902. +};
  30903. +static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
  30904. +
  30905. +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  30906. +{
  30907. + struct netdev_alloc_cache *nc;
  30908. + void *data = NULL;
  30909. + int order;
  30910. + unsigned long flags;
  30911. +
  30912. + local_irq_save(flags);
  30913. + nc = this_cpu_ptr(&netdev_alloc_cache);
  30914. + if (unlikely(!nc->frag.page)) {
  30915. +refill:
  30916. + for (order = NETDEV_FRAG_PAGE_MAX_ORDER; ;) {
  30917. + gfp_t gfp = gfp_mask;
  30918. +
  30919. + if (order)
  30920. + gfp |= __GFP_COMP | __GFP_NOWARN |
  30921. + __GFP_NOMEMALLOC;
  30922. + nc->frag.page = alloc_pages(gfp, order);
  30923. + if (likely(nc->frag.page))
  30924. + break;
  30925. + if (--order < 0)
  30926. + goto end;
  30927. + }
  30928. + nc->frag.size = PAGE_SIZE << order;
  30929. + /* Even if we own the page, we do not use atomic_set().
  30930. + * This would break get_page_unless_zero() users.
  30931. + */
  30932. + atomic_add(NETDEV_PAGECNT_MAX_BIAS - 1,
  30933. + &nc->frag.page->_count);
  30934. + nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
  30935. + nc->frag.offset = 0;
  30936. + }
  30937. +
  30938. + if (nc->frag.offset + fragsz > nc->frag.size) {
  30939. + if (atomic_read(&nc->frag.page->_count) != nc->pagecnt_bias) {
  30940. + if (!atomic_sub_and_test(nc->pagecnt_bias,
  30941. + &nc->frag.page->_count))
  30942. + goto refill;
  30943. + /* OK, page count is 0, we can safely set it */
  30944. + atomic_set(&nc->frag.page->_count,
  30945. + NETDEV_PAGECNT_MAX_BIAS);
  30946. + } else {
  30947. + atomic_add(NETDEV_PAGECNT_MAX_BIAS - nc->pagecnt_bias,
  30948. + &nc->frag.page->_count);
  30949. + }
  30950. + nc->pagecnt_bias = NETDEV_PAGECNT_MAX_BIAS;
  30951. + nc->frag.offset = 0;
  30952. + }
  30953. +
  30954. + data = page_address(nc->frag.page) + nc->frag.offset;
  30955. + nc->frag.offset += fragsz;
  30956. + nc->pagecnt_bias--;
  30957. +end:
  30958. + local_irq_restore(flags);
  30959. + return data;
  30960. +}
  30961. +
  30962. +/**
  30963. + * netdev_alloc_frag - allocate a page fragment
  30964. + * @fragsz: fragment size
  30965. + *
  30966. + * Allocates a frag from a page for receive buffer.
  30967. + * Uses GFP_ATOMIC allocations.
  30968. + */
  30969. +void *netdev_alloc_frag(unsigned int fragsz)
  30970. +{
  30971. + return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
  30972. +}
  30973. +EXPORT_SYMBOL(netdev_alloc_frag);
  30974. +
  30975. +/**
  30976. + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
  30977. + * @dev: network device to receive on
  30978. + * @length: length to allocate
  30979. + * @gfp_mask: get_free_pages mask, passed to alloc_skb
  30980. + *
  30981. + * Allocate a new &sk_buff and assign it a usage count of one. The
  30982. + * buffer has unspecified headroom built in. Users should allocate
  30983. + * the headroom they think they need without accounting for the
  30984. + * built in space. The built in space is used for optimisations.
  30985. + *
  30986. + * %NULL is returned if there is no free memory.
  30987. + */
  30988. +struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
  30989. + unsigned int length, gfp_t gfp_mask)
  30990. +{
  30991. + struct sk_buff *skb = NULL;
  30992. + unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) +
  30993. + SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  30994. +
  30995. + if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
  30996. + void *data;
  30997. +
  30998. + if (sk_memalloc_socks())
  30999. + gfp_mask |= __GFP_MEMALLOC;
  31000. +
  31001. + data = __netdev_alloc_frag(fragsz, gfp_mask);
  31002. +
  31003. + if (likely(data)) {
  31004. + skb = build_skb(data, fragsz);
  31005. + if (unlikely(!skb))
  31006. + put_page(virt_to_head_page(data));
  31007. + }
  31008. + } else {
  31009. + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
  31010. + SKB_ALLOC_RX, NUMA_NO_NODE);
  31011. + }
  31012. + if (likely(skb)) {
  31013. + skb_reserve(skb, NET_SKB_PAD);
  31014. + skb->dev = dev;
  31015. + }
  31016. + return skb;
  31017. +}
  31018. +EXPORT_SYMBOL(__netdev_alloc_skb);
  31019. +
  31020. +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
  31021. + int size, unsigned int truesize)
  31022. +{
  31023. + skb_fill_page_desc(skb, i, page, off, size);
  31024. + skb->len += size;
  31025. + skb->data_len += size;
  31026. + skb->truesize += truesize;
  31027. +}
  31028. +EXPORT_SYMBOL(skb_add_rx_frag);
  31029. +
  31030. +void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
  31031. + unsigned int truesize)
  31032. +{
  31033. + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  31034. +
  31035. + skb_frag_size_add(frag, size);
  31036. + skb->len += size;
  31037. + skb->data_len += size;
  31038. + skb->truesize += truesize;
  31039. +}
  31040. +EXPORT_SYMBOL(skb_coalesce_rx_frag);
  31041. +
  31042. +static void skb_drop_list(struct sk_buff **listp)
  31043. +{
  31044. + kfree_skb_list(*listp);
  31045. + *listp = NULL;
  31046. +}
  31047. +
  31048. +static inline void skb_drop_fraglist(struct sk_buff *skb)
  31049. +{
  31050. + skb_drop_list(&skb_shinfo(skb)->frag_list);
  31051. +}
  31052. +
  31053. +static void skb_clone_fraglist(struct sk_buff *skb)
  31054. +{
  31055. + struct sk_buff *list;
  31056. +
  31057. + skb_walk_frags(skb, list)
  31058. + skb_get(list);
  31059. +}
  31060. +
  31061. +static void skb_free_head(struct sk_buff *skb)
  31062. +{
  31063. + if (skb->head_frag)
  31064. + put_page(virt_to_head_page(skb->head));
  31065. + else
  31066. + kfree(skb->head);
  31067. +}
  31068. +
  31069. +static void skb_release_data(struct sk_buff *skb)
  31070. +{
  31071. + struct skb_shared_info *shinfo = skb_shinfo(skb);
  31072. + int i;
  31073. +
  31074. + if (skb->cloned &&
  31075. + atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
  31076. + &shinfo->dataref))
  31077. + return;
  31078. +
  31079. + for (i = 0; i < shinfo->nr_frags; i++)
  31080. + __skb_frag_unref(&shinfo->frags[i]);
  31081. +
  31082. + /*
  31083. + * If skb buf is from userspace, we need to notify the caller
  31084. + * the lower device DMA has done;
  31085. + */
  31086. + if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) {
  31087. + struct ubuf_info *uarg;
  31088. +
  31089. + uarg = shinfo->destructor_arg;
  31090. + if (uarg->callback)
  31091. + uarg->callback(uarg, true);
  31092. + }
  31093. +
  31094. + if (shinfo->frag_list)
  31095. + kfree_skb_list(shinfo->frag_list);
  31096. +
  31097. + skb_free_head(skb);
  31098. +}
  31099. +
  31100. +/*
  31101. + * Free an skbuff by memory without cleaning the state.
  31102. + */
  31103. +static void kfree_skbmem(struct sk_buff *skb)
  31104. +{
  31105. + struct sk_buff_fclones *fclones;
  31106. +
  31107. + switch (skb->fclone) {
  31108. + case SKB_FCLONE_UNAVAILABLE:
  31109. + kmem_cache_free(skbuff_head_cache, skb);
  31110. + break;
  31111. +
  31112. + case SKB_FCLONE_ORIG:
  31113. + fclones = container_of(skb, struct sk_buff_fclones, skb1);
  31114. + if (atomic_dec_and_test(&fclones->fclone_ref))
  31115. + kmem_cache_free(skbuff_fclone_cache, fclones);
  31116. + break;
  31117. +
  31118. + case SKB_FCLONE_CLONE:
  31119. + fclones = container_of(skb, struct sk_buff_fclones, skb2);
  31120. +
  31121. + /* The clone portion is available for
  31122. + * fast-cloning again.
  31123. + */
  31124. + skb->fclone = SKB_FCLONE_FREE;
  31125. +
  31126. + if (atomic_dec_and_test(&fclones->fclone_ref))
  31127. + kmem_cache_free(skbuff_fclone_cache, fclones);
  31128. + break;
  31129. + }
  31130. +}
  31131. +
  31132. +static void skb_release_head_state(struct sk_buff *skb)
  31133. +{
  31134. + skb_dst_drop(skb);
  31135. +#ifdef CONFIG_XFRM
  31136. + secpath_put(skb->sp);
  31137. +#endif
  31138. + if (skb->destructor) {
  31139. + WARN_ON(in_irq());
  31140. + skb->destructor(skb);
  31141. + }
  31142. +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
  31143. + nf_conntrack_put(skb->nfct);
  31144. +#endif
  31145. +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
  31146. + nf_bridge_put(skb->nf_bridge);
  31147. +#endif
31148. +/* XXX: Is this still necessary? - JHS */
  31149. +#ifdef CONFIG_NET_SCHED
  31150. + skb->tc_index = 0;
  31151. +#ifdef CONFIG_NET_CLS_ACT
  31152. + skb->tc_verd = 0;
  31153. +#endif
  31154. +#endif
  31155. +}
  31156. +
  31157. +/* Free everything but the sk_buff shell. */
  31158. +static void skb_release_all(struct sk_buff *skb)
  31159. +{
  31160. + skb_release_head_state(skb);
  31161. + if (likely(skb->head))
  31162. + skb_release_data(skb);
  31163. +}
  31164. +
  31165. +/**
  31166. + * __kfree_skb - private function
  31167. + * @skb: buffer
  31168. + *
  31169. + * Free an sk_buff. Release anything attached to the buffer.
  31170. + * Clean the state. This is an internal helper function. Users should
31171. + * always call kfree_skb() instead.
  31172. + */
  31173. +
  31174. +void __kfree_skb(struct sk_buff *skb)
  31175. +{
  31176. + skb_release_all(skb);
  31177. + kfree_skbmem(skb);
  31178. +}
  31179. +EXPORT_SYMBOL(__kfree_skb);
  31180. +
  31181. +/**
  31182. + * kfree_skb - free an sk_buff
  31183. + * @skb: buffer to free
  31184. + *
  31185. + * Drop a reference to the buffer and free it if the usage count has
  31186. + * hit zero.
  31187. + */
  31188. +void kfree_skb(struct sk_buff *skb)
  31189. +{
  31190. + if (unlikely(!skb))
  31191. + return;
  31192. + if (likely(atomic_read(&skb->users) == 1))
  31193. + smp_rmb();
  31194. + else if (likely(!atomic_dec_and_test(&skb->users)))
  31195. + return;
  31196. + trace_kfree_skb(skb, __builtin_return_address(0));
  31197. + __kfree_skb(skb);
  31198. +}
  31199. +EXPORT_SYMBOL(kfree_skb);
  31200. +
  31201. +void kfree_skb_list(struct sk_buff *segs)
  31202. +{
  31203. + while (segs) {
  31204. + struct sk_buff *next = segs->next;
  31205. +
  31206. + kfree_skb(segs);
  31207. + segs = next;
  31208. + }
  31209. +}
  31210. +EXPORT_SYMBOL(kfree_skb_list);
  31211. +
  31212. +/**
  31213. + * skb_tx_error - report an sk_buff xmit error
  31214. + * @skb: buffer that triggered an error
  31215. + *
  31216. + * Report xmit error if a device callback is tracking this skb.
  31217. + * skb must be freed afterwards.
  31218. + */
  31219. +void skb_tx_error(struct sk_buff *skb)
  31220. +{
  31221. + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
  31222. + struct ubuf_info *uarg;
  31223. +
  31224. + uarg = skb_shinfo(skb)->destructor_arg;
  31225. + if (uarg->callback)
  31226. + uarg->callback(uarg, false);
  31227. + skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
  31228. + }
  31229. +}
  31230. +EXPORT_SYMBOL(skb_tx_error);
  31231. +
  31232. +/**
  31233. + * consume_skb - free an skbuff
  31234. + * @skb: buffer to free
  31235. + *
31236. + * Drop a reference to the buffer and free it if the usage count has hit zero.
31237. + * It functions identically to kfree_skb(), except that kfree_skb() assumes
31238. + * the frame is being dropped after a failure and notes that in its tracing.
  31239. + */
  31240. +void consume_skb(struct sk_buff *skb)
  31241. +{
  31242. + if (unlikely(!skb))
  31243. + return;
  31244. + if (likely(atomic_read(&skb->users) == 1))
  31245. + smp_rmb();
  31246. + else if (likely(!atomic_dec_and_test(&skb->users)))
  31247. + return;
  31248. + trace_consume_skb(skb);
  31249. + __kfree_skb(skb);
  31250. +}
  31251. +EXPORT_SYMBOL(consume_skb);
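To make the kfree_skb()/consume_skb() distinction above concrete, here is a small sketch of a hypothetical TX-completion handler; the function and its tx_ok flag are illustrative assumptions.

#include <linux/skbuff.h>

/* Hypothetical completion handler: packets that finished their journey
 * are released with consume_skb(), while genuine drops go through
 * kfree_skb() so the kfree_skb tracepoint (watched by drop monitors)
 * records them.
 */
static void example_tx_complete(struct sk_buff *skb, bool tx_ok)
{
	if (tx_ok)
		consume_skb(skb);	/* successful transmit, not a drop */
	else
		kfree_skb(skb);		/* shows up as a drop in tracing */
}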
  31252. +
  31253. +/* Make sure a field is enclosed inside headers_start/headers_end section */
  31254. +#define CHECK_SKB_FIELD(field) \
  31255. + BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
  31256. + offsetof(struct sk_buff, headers_start)); \
  31257. + BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
  31258. + offsetof(struct sk_buff, headers_end)); \
  31259. +
  31260. +static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  31261. +{
  31262. + new->tstamp = old->tstamp;
  31263. + /* We do not copy old->sk */
  31264. + new->dev = old->dev;
  31265. + memcpy(new->cb, old->cb, sizeof(old->cb));
  31266. + skb_dst_copy(new, old);
  31267. +#ifdef CONFIG_XFRM
  31268. + new->sp = secpath_get(old->sp);
  31269. +#endif
  31270. + __nf_copy(new, old, false);
  31271. +
31272. + /* Note: this field could live in the headers_start/headers_end section.
31273. + * It does not yet, because we do not want to introduce a 16-bit hole.
31274. + */
  31275. + new->queue_mapping = old->queue_mapping;
  31276. +
  31277. + memcpy(&new->headers_start, &old->headers_start,
  31278. + offsetof(struct sk_buff, headers_end) -
  31279. + offsetof(struct sk_buff, headers_start));
  31280. + CHECK_SKB_FIELD(protocol);
  31281. + CHECK_SKB_FIELD(csum);
  31282. + CHECK_SKB_FIELD(hash);
  31283. + CHECK_SKB_FIELD(priority);
  31284. + CHECK_SKB_FIELD(skb_iif);
  31285. + CHECK_SKB_FIELD(vlan_proto);
  31286. + CHECK_SKB_FIELD(vlan_tci);
  31287. + CHECK_SKB_FIELD(transport_header);
  31288. + CHECK_SKB_FIELD(network_header);
  31289. + CHECK_SKB_FIELD(mac_header);
  31290. + CHECK_SKB_FIELD(inner_protocol);
  31291. + CHECK_SKB_FIELD(inner_transport_header);
  31292. + CHECK_SKB_FIELD(inner_network_header);
  31293. + CHECK_SKB_FIELD(inner_mac_header);
  31294. + CHECK_SKB_FIELD(mark);
  31295. +#ifdef CONFIG_NETWORK_SECMARK
  31296. + CHECK_SKB_FIELD(secmark);
  31297. +#endif
  31298. +#ifdef CONFIG_NET_RX_BUSY_POLL
  31299. + CHECK_SKB_FIELD(napi_id);
  31300. +#endif
  31301. +#ifdef CONFIG_NET_SCHED
  31302. + CHECK_SKB_FIELD(tc_index);
  31303. +#ifdef CONFIG_NET_CLS_ACT
  31304. + CHECK_SKB_FIELD(tc_verd);
  31305. +#endif
  31306. +#endif
  31307. +
  31308. +}
  31309. +
  31310. +/*
  31311. + * You should not add any new code to this function. Add it to
  31312. + * __copy_skb_header above instead.
  31313. + */
  31314. +static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
  31315. +{
  31316. +#define C(x) n->x = skb->x
  31317. +
  31318. + n->next = n->prev = NULL;
  31319. + n->sk = NULL;
  31320. + __copy_skb_header(n, skb);
  31321. +
  31322. + C(len);
  31323. + C(data_len);
  31324. + C(mac_len);
  31325. + n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
  31326. + n->cloned = 1;
  31327. + n->nohdr = 0;
  31328. + n->destructor = NULL;
  31329. + C(tail);
  31330. + C(end);
  31331. + C(head);
  31332. + C(head_frag);
  31333. + C(data);
  31334. + C(truesize);
  31335. + atomic_set(&n->users, 1);
  31336. +
  31337. + atomic_inc(&(skb_shinfo(skb)->dataref));
  31338. + skb->cloned = 1;
  31339. +
  31340. + return n;
  31341. +#undef C
  31342. +}
  31343. +
  31344. +/**
  31345. + * skb_morph - morph one skb into another
  31346. + * @dst: the skb to receive the contents
  31347. + * @src: the skb to supply the contents
  31348. + *
  31349. + * This is identical to skb_clone except that the target skb is
  31350. + * supplied by the user.
  31351. + *
  31352. + * The target skb is returned upon exit.
  31353. + */
  31354. +struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
  31355. +{
  31356. + skb_release_all(dst);
  31357. + return __skb_clone(dst, src);
  31358. +}
  31359. +EXPORT_SYMBOL_GPL(skb_morph);
  31360. +
  31361. +/**
  31362. + * skb_copy_ubufs - copy userspace skb frags buffers to kernel
  31363. + * @skb: the skb to modify
  31364. + * @gfp_mask: allocation priority
  31365. + *
31366. + * This must be called on an SKBTX_DEV_ZEROCOPY skb.
31367. + * It copies all frags into kernel memory and drops the reference
31368. + * to the userspace pages.
31369. + *
31370. + * If this function is called from an interrupt, @gfp_mask must be
31371. + * %GFP_ATOMIC.
  31372. + *
  31373. + * Returns 0 on success or a negative error code on failure
  31374. + * to allocate kernel memory to copy to.
  31375. + */
  31376. +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
  31377. +{
  31378. + int i;
  31379. + int num_frags = skb_shinfo(skb)->nr_frags;
  31380. + struct page *page, *head = NULL;
  31381. + struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
  31382. +
  31383. + for (i = 0; i < num_frags; i++) {
  31384. + u8 *vaddr;
  31385. + skb_frag_t *f = &skb_shinfo(skb)->frags[i];
  31386. +
  31387. + page = alloc_page(gfp_mask);
  31388. + if (!page) {
  31389. + while (head) {
  31390. + struct page *next = (struct page *)page_private(head);
  31391. + put_page(head);
  31392. + head = next;
  31393. + }
  31394. + return -ENOMEM;
  31395. + }
  31396. + vaddr = kmap_atomic(skb_frag_page(f));
  31397. + memcpy(page_address(page),
  31398. + vaddr + f->page_offset, skb_frag_size(f));
  31399. + kunmap_atomic(vaddr);
  31400. + set_page_private(page, (unsigned long)head);
  31401. + head = page;
  31402. + }
  31403. +
  31404. + /* skb frags release userspace buffers */
  31405. + for (i = 0; i < num_frags; i++)
  31406. + skb_frag_unref(skb, i);
  31407. +
  31408. + uarg->callback(uarg, false);
  31409. +
  31410. + /* skb frags point to kernel buffers */
  31411. + for (i = num_frags - 1; i >= 0; i--) {
  31412. + __skb_fill_page_desc(skb, i, head, 0,
  31413. + skb_shinfo(skb)->frags[i].size);
  31414. + head = (struct page *)page_private(head);
  31415. + }
  31416. +
  31417. + skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
  31418. + return 0;
  31419. +}
  31420. +EXPORT_SYMBOL_GPL(skb_copy_ubufs);
  31421. +
  31422. +/**
  31423. + * skb_clone - duplicate an sk_buff
  31424. + * @skb: buffer to clone
  31425. + * @gfp_mask: allocation priority
  31426. + *
  31427. + * Duplicate an &sk_buff. The new one is not owned by a socket. Both
  31428. + * copies share the same packet data but not structure. The new
  31429. + * buffer has a reference count of 1. If the allocation fails the
  31430. + * function returns %NULL otherwise the new buffer is returned.
  31431. + *
31432. + * If this function is called from an interrupt, @gfp_mask must be
31433. + * %GFP_ATOMIC.
  31434. + */
  31435. +
  31436. +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
  31437. +{
  31438. + struct sk_buff_fclones *fclones = container_of(skb,
  31439. + struct sk_buff_fclones,
  31440. + skb1);
  31441. + struct sk_buff *n = &fclones->skb2;
  31442. +
  31443. + if (skb_orphan_frags(skb, gfp_mask))
  31444. + return NULL;
  31445. +
  31446. + if (skb->fclone == SKB_FCLONE_ORIG &&
  31447. + n->fclone == SKB_FCLONE_FREE) {
  31448. + n->fclone = SKB_FCLONE_CLONE;
  31449. + atomic_inc(&fclones->fclone_ref);
  31450. + } else {
  31451. + if (skb_pfmemalloc(skb))
  31452. + gfp_mask |= __GFP_MEMALLOC;
  31453. +
  31454. + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
  31455. + if (!n)
  31456. + return NULL;
  31457. +
  31458. + kmemcheck_annotate_bitfield(n, flags1);
  31459. + n->fclone = SKB_FCLONE_UNAVAILABLE;
  31460. + }
  31461. +
  31462. + return __skb_clone(n, skb);
  31463. +}
  31464. +EXPORT_SYMBOL(skb_clone);
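A minimal sketch of the typical skb_clone() use case described above: handing the same packet data to a second consumer without copying the payload. The mirror helper and its callback are hypothetical.

#include <linux/skbuff.h>

/* Hypothetical mirror path: the clone shares the packet data with the
 * original, so neither side may modify the payload afterwards.
 */
static void example_mirror(struct sk_buff *skb,
			   void (*deliver)(struct sk_buff *))
{
	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

	if (clone)
		deliver(clone);		/* the consumer now owns the clone */
}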
  31465. +
  31466. +static void skb_headers_offset_update(struct sk_buff *skb, int off)
  31467. +{
  31468. + /* Only adjust this if it actually is csum_start rather than csum */
  31469. + if (skb->ip_summed == CHECKSUM_PARTIAL)
  31470. + skb->csum_start += off;
  31471. + /* {transport,network,mac}_header and tail are relative to skb->head */
  31472. + skb->transport_header += off;
  31473. + skb->network_header += off;
  31474. + if (skb_mac_header_was_set(skb))
  31475. + skb->mac_header += off;
  31476. + skb->inner_transport_header += off;
  31477. + skb->inner_network_header += off;
  31478. + skb->inner_mac_header += off;
  31479. +}
  31480. +
  31481. +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  31482. +{
  31483. + __copy_skb_header(new, old);
  31484. +
  31485. + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
  31486. + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
  31487. + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
  31488. +}
  31489. +
  31490. +static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
  31491. +{
  31492. + if (skb_pfmemalloc(skb))
  31493. + return SKB_ALLOC_RX;
  31494. + return 0;
  31495. +}
  31496. +
  31497. +/**
  31498. + * skb_copy - create private copy of an sk_buff
  31499. + * @skb: buffer to copy
  31500. + * @gfp_mask: allocation priority
  31501. + *
  31502. + * Make a copy of both an &sk_buff and its data. This is used when the
  31503. + * caller wishes to modify the data and needs a private copy of the
  31504. + * data to alter. Returns %NULL on failure or the pointer to the buffer
  31505. + * on success. The returned buffer has a reference count of 1.
  31506. + *
31507. + * As a by-product, this function converts a non-linear &sk_buff into a
31508. + * linear one, so the &sk_buff becomes completely private and the caller
31509. + * may modify all the data of the returned buffer. This means the
31510. + * function is not recommended when only the header is going to be
31511. + * modified. Use pskb_copy() instead.
  31512. + */
  31513. +
  31514. +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
  31515. +{
  31516. + int headerlen = skb_headroom(skb);
  31517. + unsigned int size = skb_end_offset(skb) + skb->data_len;
  31518. + struct sk_buff *n = __alloc_skb(size, gfp_mask,
  31519. + skb_alloc_rx_flag(skb), NUMA_NO_NODE);
  31520. +
  31521. + if (!n)
  31522. + return NULL;
  31523. +
  31524. + /* Set the data pointer */
  31525. + skb_reserve(n, headerlen);
  31526. + /* Set the tail pointer and length */
  31527. + skb_put(n, skb->len);
  31528. +
  31529. + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
  31530. + BUG();
  31531. +
  31532. + copy_skb_header(n, skb);
  31533. + return n;
  31534. +}
  31535. +EXPORT_SYMBOL(skb_copy);
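A sketch of when skb_copy() is preferred over skb_clone(): the caller needs to rewrite payload bytes and therefore needs a fully private, linear copy. The helper name and the byte it flips are purely illustrative.

#include <linux/skbuff.h>

/* Hypothetical: obtain a writable duplicate before touching the data.
 * Unlike skb_clone(), the payload itself is copied, so the original
 * (possibly shared) buffer is left untouched.
 */
static struct sk_buff *example_private_copy(const struct sk_buff *skb)
{
	struct sk_buff *copy = skb_copy(skb, GFP_ATOMIC);

	if (copy && copy->len)
		copy->data[0] ^= 0x01;	/* safe: the copy is private and linear */
	return copy;
}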
  31536. +
  31537. +/**
  31538. + * __pskb_copy_fclone - create copy of an sk_buff with private head.
  31539. + * @skb: buffer to copy
  31540. + * @headroom: headroom of new skb
  31541. + * @gfp_mask: allocation priority
  31542. + * @fclone: if true allocate the copy of the skb from the fclone
  31543. + * cache instead of the head cache; it is recommended to set this
  31544. + * to true for the cases where the copy will likely be cloned
  31545. + *
31546. + * Make a copy of both an &sk_buff and part of its data, located
31547. + * in the header. Fragmented data remains shared. This is used when
31548. + * the caller wishes to modify only the header of an &sk_buff and needs
31549. + * a private copy of the header to alter. Returns %NULL on failure
31550. + * or the pointer to the buffer on success.
  31551. + * The returned buffer has a reference count of 1.
  31552. + */
  31553. +
  31554. +struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
  31555. + gfp_t gfp_mask, bool fclone)
  31556. +{
  31557. + unsigned int size = skb_headlen(skb) + headroom;
  31558. + int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
  31559. + struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);
  31560. +
  31561. + if (!n)
  31562. + goto out;
  31563. +
  31564. + /* Set the data pointer */
  31565. + skb_reserve(n, headroom);
  31566. + /* Set the tail pointer and length */
  31567. + skb_put(n, skb_headlen(skb));
  31568. + /* Copy the bytes */
  31569. + skb_copy_from_linear_data(skb, n->data, n->len);
  31570. +
  31571. + n->truesize += skb->data_len;
  31572. + n->data_len = skb->data_len;
  31573. + n->len = skb->len;
  31574. +
  31575. + if (skb_shinfo(skb)->nr_frags) {
  31576. + int i;
  31577. +
  31578. + if (skb_orphan_frags(skb, gfp_mask)) {
  31579. + kfree_skb(n);
  31580. + n = NULL;
  31581. + goto out;
  31582. + }
  31583. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  31584. + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
  31585. + skb_frag_ref(skb, i);
  31586. + }
  31587. + skb_shinfo(n)->nr_frags = i;
  31588. + }
  31589. +
  31590. + if (skb_has_frag_list(skb)) {
  31591. + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
  31592. + skb_clone_fraglist(n);
  31593. + }
  31594. +
  31595. + copy_skb_header(n, skb);
  31596. +out:
  31597. + return n;
  31598. +}
  31599. +EXPORT_SYMBOL(__pskb_copy_fclone);
  31600. +
  31601. +/**
  31602. + * pskb_expand_head - reallocate header of &sk_buff
  31603. + * @skb: buffer to reallocate
  31604. + * @nhead: room to add at head
  31605. + * @ntail: room to add at tail
  31606. + * @gfp_mask: allocation priority
  31607. + *
31608. + * Expands (or creates an identical copy, if @nhead and @ntail are zero)
31609. + * the header of @skb. The &sk_buff itself is not changed and MUST have
31610. + * a reference count of 1. Returns zero on success or a negative error
31611. + * code if expansion failed; in that case the &sk_buff is left unchanged.
  31612. + *
  31613. + * All the pointers pointing into skb header may change and must be
  31614. + * reloaded after call to this function.
  31615. + */
  31616. +
  31617. +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
  31618. + gfp_t gfp_mask)
  31619. +{
  31620. + int i;
  31621. + u8 *data;
  31622. + int size = nhead + skb_end_offset(skb) + ntail;
  31623. + long off;
  31624. +
  31625. + BUG_ON(nhead < 0);
  31626. +
  31627. + if (skb_shared(skb))
  31628. + BUG();
  31629. +
  31630. + size = SKB_DATA_ALIGN(size);
  31631. +
  31632. + if (skb_pfmemalloc(skb))
  31633. + gfp_mask |= __GFP_MEMALLOC;
  31634. + data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
  31635. + gfp_mask, NUMA_NO_NODE, NULL);
  31636. + if (!data)
  31637. + goto nodata;
  31638. + size = SKB_WITH_OVERHEAD(ksize(data));
  31639. +
31640. + /* Copy only the real data... and, alas, the header. This should be
31641. + * optimized for the case when the header is empty.
31642. + */
  31643. + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
  31644. +
  31645. + memcpy((struct skb_shared_info *)(data + size),
  31646. + skb_shinfo(skb),
  31647. + offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
  31648. +
31649. + /*
31650. + * If shinfo is shared, we must drop the old head gracefully; if it
31651. + * is not, we can simply drop the old head and leave the existing
31652. + * refcount alone, since all we did was relocate the values.
31653. + */
  31654. + if (skb_cloned(skb)) {
  31655. + /* copy this zero copy skb frags */
  31656. + if (skb_orphan_frags(skb, gfp_mask))
  31657. + goto nofrags;
  31658. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  31659. + skb_frag_ref(skb, i);
  31660. +
  31661. + if (skb_has_frag_list(skb))
  31662. + skb_clone_fraglist(skb);
  31663. +
  31664. + skb_release_data(skb);
  31665. + } else {
  31666. + skb_free_head(skb);
  31667. + }
  31668. + off = (data + nhead) - skb->head;
  31669. +
  31670. + skb->head = data;
  31671. + skb->head_frag = 0;
  31672. + skb->data += off;
  31673. +#ifdef NET_SKBUFF_DATA_USES_OFFSET
  31674. + skb->end = size;
  31675. + off = nhead;
  31676. +#else
  31677. + skb->end = skb->head + size;
  31678. +#endif
  31679. + skb->tail += off;
  31680. + skb_headers_offset_update(skb, nhead);
  31681. + skb->cloned = 0;
  31682. + skb->hdr_len = 0;
  31683. + skb->nohdr = 0;
  31684. + atomic_set(&skb_shinfo(skb)->dataref, 1);
  31685. + return 0;
  31686. +
  31687. +nofrags:
  31688. + kfree(data);
  31689. +nodata:
  31690. + return -ENOMEM;
  31691. +}
  31692. +EXPORT_SYMBOL(pskb_expand_head);
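A sketch of the common pattern built on pskb_expand_head(): make sure there is enough private headroom before pushing a new header. This mirrors what helpers such as skb_cow() do internally; the function name below is hypothetical.

#include <linux/skbuff.h>

/* Hypothetical: guarantee @needed bytes of writable headroom. After a
 * successful pskb_expand_head(), every cached pointer into the old head
 * (header pointers taken earlier, etc.) is stale and must be re-derived
 * from skb->data.
 */
static int example_make_headroom(struct sk_buff *skb, unsigned int needed)
{
	if (skb_headroom(skb) < needed || skb_cloned(skb))
		return pskb_expand_head(skb, SKB_DATA_ALIGN(needed), 0,
					GFP_ATOMIC);
	return 0;
}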
  31693. +
  31694. +/* Make private copy of skb with writable head and some headroom */
  31695. +
  31696. +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
  31697. +{
  31698. + struct sk_buff *skb2;
  31699. + int delta = headroom - skb_headroom(skb);
  31700. +
  31701. + if (delta <= 0)
  31702. + skb2 = pskb_copy(skb, GFP_ATOMIC);
  31703. + else {
  31704. + skb2 = skb_clone(skb, GFP_ATOMIC);
  31705. + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
  31706. + GFP_ATOMIC)) {
  31707. + kfree_skb(skb2);
  31708. + skb2 = NULL;
  31709. + }
  31710. + }
  31711. + return skb2;
  31712. +}
  31713. +EXPORT_SYMBOL(skb_realloc_headroom);
  31714. +
  31715. +/**
  31716. + * skb_copy_expand - copy and expand sk_buff
  31717. + * @skb: buffer to copy
  31718. + * @newheadroom: new free bytes at head
  31719. + * @newtailroom: new free bytes at tail
  31720. + * @gfp_mask: allocation priority
  31721. + *
  31722. + * Make a copy of both an &sk_buff and its data and while doing so
  31723. + * allocate additional space.
  31724. + *
  31725. + * This is used when the caller wishes to modify the data and needs a
  31726. + * private copy of the data to alter as well as more space for new fields.
  31727. + * Returns %NULL on failure or the pointer to the buffer
  31728. + * on success. The returned buffer has a reference count of 1.
  31729. + *
  31730. + * You must pass %GFP_ATOMIC as the allocation priority if this function
  31731. + * is called from an interrupt.
  31732. + */
  31733. +struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
  31734. + int newheadroom, int newtailroom,
  31735. + gfp_t gfp_mask)
  31736. +{
  31737. + /*
  31738. + * Allocate the copy buffer
  31739. + */
  31740. + struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
  31741. + gfp_mask, skb_alloc_rx_flag(skb),
  31742. + NUMA_NO_NODE);
  31743. + int oldheadroom = skb_headroom(skb);
  31744. + int head_copy_len, head_copy_off;
  31745. +
  31746. + if (!n)
  31747. + return NULL;
  31748. +
  31749. + skb_reserve(n, newheadroom);
  31750. +
  31751. + /* Set the tail pointer and length */
  31752. + skb_put(n, skb->len);
  31753. +
  31754. + head_copy_len = oldheadroom;
  31755. + head_copy_off = 0;
  31756. + if (newheadroom <= head_copy_len)
  31757. + head_copy_len = newheadroom;
  31758. + else
  31759. + head_copy_off = newheadroom - head_copy_len;
  31760. +
  31761. + /* Copy the linear header and data. */
  31762. + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
  31763. + skb->len + head_copy_len))
  31764. + BUG();
  31765. +
  31766. + copy_skb_header(n, skb);
  31767. +
  31768. + skb_headers_offset_update(n, newheadroom - oldheadroom);
  31769. +
  31770. + return n;
  31771. +}
  31772. +EXPORT_SYMBOL(skb_copy_expand);
  31773. +
  31774. +/**
  31775. + * skb_pad - zero pad the tail of an skb
  31776. + * @skb: buffer to pad
  31777. + * @pad: space to pad
  31778. + *
31779. + * Ensure that a buffer is followed by a padding area that is
31780. + * zero-filled. Used by network drivers which may DMA or transfer data
31781. + * beyond the buffer end onto the wire.
31782. + *
31783. + * May return an error in out-of-memory cases. The skb is freed on error.
  31784. + */
  31785. +
  31786. +int skb_pad(struct sk_buff *skb, int pad)
  31787. +{
  31788. + int err;
  31789. + int ntail;
  31790. +
31791. + /* If the skbuff is non-linear, tailroom is always zero. */
31792. + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
31793. + memset(skb->data + skb->len, 0, pad);
  31794. + return 0;
  31795. + }
  31796. +
  31797. + ntail = skb->data_len + pad - (skb->end - skb->tail);
  31798. + if (likely(skb_cloned(skb) || ntail > 0)) {
  31799. + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);
  31800. + if (unlikely(err))
  31801. + goto free_skb;
  31802. + }
  31803. +
31804. + /* FIXME: The use of this function with non-linear skbs really needs
31805. + * to be audited.
31806. + */
  31807. + err = skb_linearize(skb);
  31808. + if (unlikely(err))
  31809. + goto free_skb;
  31810. +
  31811. + memset(skb->data + skb->len, 0, pad);
  31812. + return 0;
  31813. +
  31814. +free_skb:
  31815. + kfree_skb(skb);
  31816. + return err;
  31817. +}
  31818. +EXPORT_SYMBOL(skb_pad);
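A sketch of the classic skb_pad() caller: a driver padding short frames up to the 60-byte Ethernet minimum before handing them to hardware that does not pad by itself. As documented above, skb_pad() zeroes the tailroom but does not change skb->len, and it frees the skb on error; the helper below is hypothetical.

#include <linux/errno.h>
#include <linux/if_ether.h>
#include <linux/skbuff.h>

static int example_pad_min_eth(struct sk_buff *skb)
{
	unsigned int pad;
	int err;

	if (skb->len >= ETH_ZLEN)
		return 0;

	pad = ETH_ZLEN - skb->len;
	err = skb_pad(skb, pad);
	if (err)
		return err;	/* skb has already been freed by skb_pad() */

	/* The padding bytes are zeroed but not yet accounted for in len. */
	skb_put(skb, pad);
	return 0;
}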
  31819. +
  31820. +/**
  31821. + * pskb_put - add data to the tail of a potentially fragmented buffer
  31822. + * @skb: start of the buffer to use
  31823. + * @tail: tail fragment of the buffer to use
  31824. + * @len: amount of data to add
  31825. + *
  31826. + * This function extends the used data area of the potentially
  31827. + * fragmented buffer. @tail must be the last fragment of @skb -- or
  31828. + * @skb itself. If this would exceed the total buffer size the kernel
  31829. + * will panic. A pointer to the first byte of the extra data is
  31830. + * returned.
  31831. + */
  31832. +
  31833. +unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
  31834. +{
  31835. + if (tail != skb) {
  31836. + skb->data_len += len;
  31837. + skb->len += len;
  31838. + }
  31839. + return skb_put(tail, len);
  31840. +}
  31841. +EXPORT_SYMBOL_GPL(pskb_put);
  31842. +
  31843. +/**
  31844. + * skb_put - add data to a buffer
  31845. + * @skb: buffer to use
  31846. + * @len: amount of data to add
  31847. + *
  31848. + * This function extends the used data area of the buffer. If this would
  31849. + * exceed the total buffer size the kernel will panic. A pointer to the
  31850. + * first byte of the extra data is returned.
  31851. + */
  31852. +unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
  31853. +{
  31854. + unsigned char *tmp = skb_tail_pointer(skb);
  31855. + SKB_LINEAR_ASSERT(skb);
  31856. + skb->tail += len;
  31857. + skb->len += len;
  31858. + if (unlikely(skb->tail > skb->end))
  31859. + skb_over_panic(skb, len, __builtin_return_address(0));
  31860. + return tmp;
  31861. +}
  31862. +EXPORT_SYMBOL(skb_put);
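skb_put() is usually paired with skb_reserve() and skb_push() when constructing a frame; a combined sketch follows the skb_push() definition below.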
  31863. +
  31864. +/**
  31865. + * skb_push - add data to the start of a buffer
  31866. + * @skb: buffer to use
  31867. + * @len: amount of data to add
  31868. + *
  31869. + * This function extends the used data area of the buffer at the buffer
  31870. + * start. If this would exceed the total buffer headroom the kernel will
  31871. + * panic. A pointer to the first byte of the extra data is returned.
  31872. + */
  31873. +unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
  31874. +{
  31875. + skb->data -= len;
  31876. + skb->len += len;
31877. + if (unlikely(skb->data < skb->head))
  31878. + skb_under_panic(skb, len, __builtin_return_address(0));
  31879. + return skb->data;
  31880. +}
  31881. +EXPORT_SYMBOL(skb_push);
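As promised above, here is a minimal sketch of the usual transmit-side construction pattern: reserve headroom first, skb_put() the payload, then skb_push() the header in front of the data already there. The frame layout (broadcast Ethernet carrying IP) and the helper name are illustrative assumptions.

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *example_build_frame(struct net_device *dev,
					   const void *payload,
					   unsigned int len)
{
	struct sk_buff *skb;
	struct ethhdr *eth;

	skb = alloc_skb(ETH_HLEN + len, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, ETH_HLEN);		 /* leave room for the header */
	memcpy(skb_put(skb, len), payload, len); /* append the payload */

	eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
	eth->h_proto = htons(ETH_P_IP);		 /* illustrative protocol */
	memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
	memset(eth->h_dest, 0xff, ETH_ALEN);	 /* broadcast, for the sketch */

	skb->dev = dev;
	skb_reset_mac_header(skb);
	return skb;
}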
  31882. +
  31883. +/**
  31884. + * skb_pull - remove data from the start of a buffer
  31885. + * @skb: buffer to use
  31886. + * @len: amount of data to remove
  31887. + *
  31888. + * This function removes data from the start of a buffer, returning
  31889. + * the memory to the headroom. A pointer to the next data in the buffer
31890. + * is returned. Once the data has been pulled, future pushes will overwrite
  31891. + * the old data.
  31892. + */
  31893. +unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
  31894. +{
  31895. + return skb_pull_inline(skb, len);
  31896. +}
  31897. +EXPORT_SYMBOL(skb_pull);
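The receive-side counterpart of the transmit sketch above: make sure the header bytes are in the linear area with pskb_may_pull(), read them, then skb_pull() past them so skb->data points at the payload. This is a sketch only; the helper name is made up and real input paths perform more validation.

#include <linux/errno.h>
#include <linux/ip.h>
#include <linux/skbuff.h>

static int example_strip_ip_header(struct sk_buff *skb)
{
	const struct iphdr *iph;
	unsigned int hlen;

	if (!pskb_may_pull(skb, sizeof(*iph)))
		return -EINVAL;

	iph = (const struct iphdr *)skb->data;
	hlen = iph->ihl * 4;
	if (hlen < sizeof(*iph) || !pskb_may_pull(skb, hlen))
		return -EINVAL;

	/* pskb_may_pull() may have reallocated the head: reload the pointer */
	iph = (const struct iphdr *)skb->data;
	skb_pull(skb, hlen);		/* skb->data now points at the payload */
	return iph->protocol;
}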
  31898. +
  31899. +/**
  31900. + * skb_trim - remove end from a buffer
  31901. + * @skb: buffer to alter
  31902. + * @len: new length
  31903. + *
  31904. + * Cut the length of a buffer down by removing data from the tail. If
  31905. + * the buffer is already under the length specified it is not modified.
  31906. + * The skb must be linear.
  31907. + */
  31908. +void skb_trim(struct sk_buff *skb, unsigned int len)
  31909. +{
  31910. + if (skb->len > len)
  31911. + __skb_trim(skb, len);
  31912. +}
  31913. +EXPORT_SYMBOL(skb_trim);
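A small sketch of the trimming helpers in practice: chopping a fixed-size trailer (for example an FCS the hardware left on the frame) off the end of a possibly fragmented skb. pskb_trim() is used rather than skb_trim() because the latter is only valid for linear skbs; the helper name is hypothetical.

#include <linux/errno.h>
#include <linux/skbuff.h>

static int example_strip_trailer(struct sk_buff *skb, unsigned int trailer)
{
	if (skb->len <= trailer)
		return -EINVAL;

	/* May reallocate for cloned/non-linear skbs, hence the return code. */
	return pskb_trim(skb, skb->len - trailer);
}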
  31914. +
  31915. +/* Trims skb to length len. It can change skb pointers.
  31916. + */
  31917. +
  31918. +int ___pskb_trim(struct sk_buff *skb, unsigned int len)
  31919. +{
  31920. + struct sk_buff **fragp;
  31921. + struct sk_buff *frag;
  31922. + int offset = skb_headlen(skb);
  31923. + int nfrags = skb_shinfo(skb)->nr_frags;
  31924. + int i;
  31925. + int err;
  31926. +
  31927. + if (skb_cloned(skb) &&
  31928. + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
  31929. + return err;
  31930. +
  31931. + i = 0;
  31932. + if (offset >= len)
  31933. + goto drop_pages;
  31934. +
  31935. + for (; i < nfrags; i++) {
  31936. + int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
  31937. +
  31938. + if (end < len) {
  31939. + offset = end;
  31940. + continue;
  31941. + }
  31942. +
  31943. + skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
  31944. +
  31945. +drop_pages:
  31946. + skb_shinfo(skb)->nr_frags = i;
  31947. +
  31948. + for (; i < nfrags; i++)
  31949. + skb_frag_unref(skb, i);
  31950. +
  31951. + if (skb_has_frag_list(skb))
  31952. + skb_drop_fraglist(skb);
  31953. + goto done;
  31954. + }
  31955. +
  31956. + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
  31957. + fragp = &frag->next) {
  31958. + int end = offset + frag->len;
  31959. +
  31960. + if (skb_shared(frag)) {
  31961. + struct sk_buff *nfrag;
  31962. +
  31963. + nfrag = skb_clone(frag, GFP_ATOMIC);
  31964. + if (unlikely(!nfrag))
  31965. + return -ENOMEM;
  31966. +
  31967. + nfrag->next = frag->next;
  31968. + consume_skb(frag);
  31969. + frag = nfrag;
  31970. + *fragp = frag;
  31971. + }
  31972. +
  31973. + if (end < len) {
  31974. + offset = end;
  31975. + continue;
  31976. + }
  31977. +
  31978. + if (end > len &&
  31979. + unlikely((err = pskb_trim(frag, len - offset))))
  31980. + return err;
  31981. +
  31982. + if (frag->next)
  31983. + skb_drop_list(&frag->next);
  31984. + break;
  31985. + }
  31986. +
  31987. +done:
  31988. + if (len > skb_headlen(skb)) {
  31989. + skb->data_len -= skb->len - len;
  31990. + skb->len = len;
  31991. + } else {
  31992. + skb->len = len;
  31993. + skb->data_len = 0;
  31994. + skb_set_tail_pointer(skb, len);
  31995. + }
  31996. +
  31997. + return 0;
  31998. +}
  31999. +EXPORT_SYMBOL(___pskb_trim);
  32000. +
  32001. +/**
  32002. + * __pskb_pull_tail - advance tail of skb header
  32003. + * @skb: buffer to reallocate
  32004. + * @delta: number of bytes to advance tail
  32005. + *
32006. + * The function makes sense only on a fragmented &sk_buff: it expands
32007. + * the header, moving its tail forward and copying the necessary data
32008. + * from the fragmented part.
32009. + *
32010. + * The &sk_buff MUST have a reference count of 1.
32011. + *
32012. + * Returns %NULL (and the &sk_buff is unchanged) if the pull failed,
32013. + * or the value of the new tail of the skb on success.
  32014. + *
  32015. + * All the pointers pointing into skb header may change and must be
  32016. + * reloaded after call to this function.
  32017. + */
  32018. +
32019. +/* Moves the tail of the skb head forward, copying data from the
32020. + * fragmented part when necessary.
32021. + * 1. It may fail due to an allocation failure.
32022. + * 2. It may change the skb pointers.
  32023. + *
  32024. + * It is pretty complicated. Luckily, it is called only in exceptional cases.
  32025. + */
  32026. +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
  32027. +{
32028. + /* If the skb does not have enough free space at the tail, get a new
32029. + * buffer plus 128 bytes for future expansions. If we do have enough
32030. + * tailroom, reallocate without expansion only if the skb is cloned.
  32031. + */
  32032. + int i, k, eat = (skb->tail + delta) - skb->end;
  32033. +
  32034. + if (eat > 0 || skb_cloned(skb)) {
  32035. + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
  32036. + GFP_ATOMIC))
  32037. + return NULL;
  32038. + }
  32039. +
  32040. + if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
  32041. + BUG();
  32042. +
32043. + /* Optimization: no fragments, so there is no reason to pre-estimate
32044. + * the size of the pulled pages. Superb.
  32045. + */
  32046. + if (!skb_has_frag_list(skb))
  32047. + goto pull_pages;
  32048. +
  32049. + /* Estimate size of pulled pages. */
  32050. + eat = delta;
  32051. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  32052. + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  32053. +
  32054. + if (size >= eat)
  32055. + goto pull_pages;
  32056. + eat -= size;
  32057. + }
  32058. +
32059. + /* If we need to update the frag list, we are in trouble.
32060. + * Certainly, it is possible to add an offset to the skb data,
32061. + * but given that pulling is expected to be a very rare
32062. + * operation, it is worth fighting against further bloating of
32063. + * the skb head and crucifying ourselves here instead.
32064. + * Pure masochism, indeed. 8)8)
  32065. + */
  32066. + if (eat) {
  32067. + struct sk_buff *list = skb_shinfo(skb)->frag_list;
  32068. + struct sk_buff *clone = NULL;
  32069. + struct sk_buff *insp = NULL;
  32070. +
  32071. + do {
  32072. + BUG_ON(!list);
  32073. +
  32074. + if (list->len <= eat) {
  32075. + /* Eaten as whole. */
  32076. + eat -= list->len;
  32077. + list = list->next;
  32078. + insp = list;
  32079. + } else {
  32080. + /* Eaten partially. */
  32081. +
  32082. + if (skb_shared(list)) {
  32083. + /* Sucks! We need to fork list. :-( */
  32084. + clone = skb_clone(list, GFP_ATOMIC);
  32085. + if (!clone)
  32086. + return NULL;
  32087. + insp = list->next;
  32088. + list = clone;
  32089. + } else {
  32090. + /* This may be pulled without
  32091. + * problems. */
  32092. + insp = list;
  32093. + }
  32094. + if (!pskb_pull(list, eat)) {
  32095. + kfree_skb(clone);
  32096. + return NULL;
  32097. + }
  32098. + break;
  32099. + }
  32100. + } while (eat);
  32101. +
  32102. + /* Free pulled out fragments. */
  32103. + while ((list = skb_shinfo(skb)->frag_list) != insp) {
  32104. + skb_shinfo(skb)->frag_list = list->next;
  32105. + kfree_skb(list);
  32106. + }
  32107. + /* And insert new clone at head. */
  32108. + if (clone) {
  32109. + clone->next = list;
  32110. + skb_shinfo(skb)->frag_list = clone;
  32111. + }
  32112. + }
  32113. + /* Success! Now we may commit changes to skb data. */
  32114. +
  32115. +pull_pages:
  32116. + eat = delta;
  32117. + k = 0;
  32118. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  32119. + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  32120. +
  32121. + if (size <= eat) {
  32122. + skb_frag_unref(skb, i);
  32123. + eat -= size;
  32124. + } else {
  32125. + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
  32126. + if (eat) {
  32127. + skb_shinfo(skb)->frags[k].page_offset += eat;
  32128. + skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
  32129. + eat = 0;
  32130. + }
  32131. + k++;
  32132. + }
  32133. + }
  32134. + skb_shinfo(skb)->nr_frags = k;
  32135. +
  32136. + skb->tail += delta;
  32137. + skb->data_len -= delta;
  32138. +
  32139. + return skb_tail_pointer(skb);
  32140. +}
  32141. +EXPORT_SYMBOL(__pskb_pull_tail);
  32142. +
  32143. +/**
  32144. + * skb_copy_bits - copy bits from skb to kernel buffer
  32145. + * @skb: source skb
  32146. + * @offset: offset in source
  32147. + * @to: destination buffer
  32148. + * @len: number of bytes to copy
  32149. + *
  32150. + * Copy the specified number of bytes from the source skb to the
  32151. + * destination buffer.
  32152. + *
32153. + * CAUTION:
  32154. + * If its prototype is ever changed,
  32155. + * check arch/{*}/net/{*}.S files,
  32156. + * since it is called from BPF assembly code.
  32157. + */
  32158. +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
  32159. +{
  32160. + int start = skb_headlen(skb);
  32161. + struct sk_buff *frag_iter;
  32162. + int i, copy;
  32163. +
  32164. + if (offset > (int)skb->len - len)
  32165. + goto fault;
  32166. +
  32167. + /* Copy header. */
  32168. + if ((copy = start - offset) > 0) {
  32169. + if (copy > len)
  32170. + copy = len;
  32171. + skb_copy_from_linear_data_offset(skb, offset, to, copy);
  32172. + if ((len -= copy) == 0)
  32173. + return 0;
  32174. + offset += copy;
  32175. + to += copy;
  32176. + }
  32177. +
  32178. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  32179. + int end;
  32180. + skb_frag_t *f = &skb_shinfo(skb)->frags[i];
  32181. +
  32182. + WARN_ON(start > offset + len);
  32183. +
  32184. + end = start + skb_frag_size(f);
  32185. + if ((copy = end - offset) > 0) {
  32186. + u8 *vaddr;
  32187. +
  32188. + if (copy > len)
  32189. + copy = len;
  32190. +
  32191. + vaddr = kmap_atomic(skb_frag_page(f));
  32192. + memcpy(to,
  32193. + vaddr + f->page_offset + offset - start,
  32194. + copy);
  32195. + kunmap_atomic(vaddr);
  32196. +
  32197. + if ((len -= copy) == 0)
  32198. + return 0;
  32199. + offset += copy;
  32200. + to += copy;
  32201. + }
  32202. + start = end;
  32203. + }
  32204. +
  32205. + skb_walk_frags(skb, frag_iter) {
  32206. + int end;
  32207. +
  32208. + WARN_ON(start > offset + len);
  32209. +
  32210. + end = start + frag_iter->len;
  32211. + if ((copy = end - offset) > 0) {
  32212. + if (copy > len)
  32213. + copy = len;
  32214. + if (skb_copy_bits(frag_iter, offset - start, to, copy))
  32215. + goto fault;
  32216. + if ((len -= copy) == 0)
  32217. + return 0;
  32218. + offset += copy;
  32219. + to += copy;
  32220. + }
  32221. + start = end;
  32222. + }
  32223. +
  32224. + if (!len)
  32225. + return 0;
  32226. +
  32227. +fault:
  32228. + return -EFAULT;
  32229. +}
  32230. +EXPORT_SYMBOL(skb_copy_bits);
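A sketch of the usual skb_copy_bits() pattern: pulling a fixed-size header at a known offset into a local structure without caring whether those bytes live in the linear area, in page frags, or on the frag list. The UDP header and the helper name are illustrative.

#include <linux/skbuff.h>
#include <linux/udp.h>

static int example_peek_udp_header(const struct sk_buff *skb, int offset,
				   struct udphdr *uh)
{
	/* Returns 0 on success, -EFAULT if the requested range does not
	 * fit inside the skb.
	 */
	return skb_copy_bits(skb, offset, uh, sizeof(*uh));
}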
  32231. +
  32232. +/*
32233. + * Callback from splice_to_pipe(), used to release pages left at the
32234. + * end of the spd if we erred out while filling the pipe.
  32235. + */
  32236. +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
  32237. +{
  32238. + put_page(spd->pages[i]);
  32239. +}
  32240. +
  32241. +static struct page *linear_to_page(struct page *page, unsigned int *len,
  32242. + unsigned int *offset,
  32243. + struct sock *sk)
  32244. +{
  32245. + struct page_frag *pfrag = sk_page_frag(sk);
  32246. +
  32247. + if (!sk_page_frag_refill(sk, pfrag))
  32248. + return NULL;
  32249. +
  32250. + *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
  32251. +
  32252. + memcpy(page_address(pfrag->page) + pfrag->offset,
  32253. + page_address(page) + *offset, *len);
  32254. + *offset = pfrag->offset;
  32255. + pfrag->offset += *len;
  32256. +
  32257. + return pfrag->page;
  32258. +}
  32259. +
  32260. +static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
  32261. + struct page *page,
  32262. + unsigned int offset)
  32263. +{
  32264. + return spd->nr_pages &&
  32265. + spd->pages[spd->nr_pages - 1] == page &&
  32266. + (spd->partial[spd->nr_pages - 1].offset +
  32267. + spd->partial[spd->nr_pages - 1].len == offset);
  32268. +}
  32269. +
  32270. +/*
  32271. + * Fill page/offset/length into spd, if it can hold more pages.
  32272. + */
  32273. +static bool spd_fill_page(struct splice_pipe_desc *spd,
  32274. + struct pipe_inode_info *pipe, struct page *page,
  32275. + unsigned int *len, unsigned int offset,
  32276. + bool linear,
  32277. + struct sock *sk)
  32278. +{
  32279. + if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
  32280. + return true;
  32281. +
  32282. + if (linear) {
  32283. + page = linear_to_page(page, len, &offset, sk);
  32284. + if (!page)
  32285. + return true;
  32286. + }
  32287. + if (spd_can_coalesce(spd, page, offset)) {
  32288. + spd->partial[spd->nr_pages - 1].len += *len;
  32289. + return false;
  32290. + }
  32291. + get_page(page);
  32292. + spd->pages[spd->nr_pages] = page;
  32293. + spd->partial[spd->nr_pages].len = *len;
  32294. + spd->partial[spd->nr_pages].offset = offset;
  32295. + spd->nr_pages++;
  32296. +
  32297. + return false;
  32298. +}
  32299. +
  32300. +static bool __splice_segment(struct page *page, unsigned int poff,
  32301. + unsigned int plen, unsigned int *off,
  32302. + unsigned int *len,
  32303. + struct splice_pipe_desc *spd, bool linear,
  32304. + struct sock *sk,
  32305. + struct pipe_inode_info *pipe)
  32306. +{
  32307. + if (!*len)
  32308. + return true;
  32309. +
  32310. + /* skip this segment if already processed */
  32311. + if (*off >= plen) {
  32312. + *off -= plen;
  32313. + return false;
  32314. + }
  32315. +
  32316. + /* ignore any bits we already processed */
  32317. + poff += *off;
  32318. + plen -= *off;
  32319. + *off = 0;
  32320. +
  32321. + do {
  32322. + unsigned int flen = min(*len, plen);
  32323. +
  32324. + if (spd_fill_page(spd, pipe, page, &flen, poff,
  32325. + linear, sk))
  32326. + return true;
  32327. + poff += flen;
  32328. + plen -= flen;
  32329. + *len -= flen;
  32330. + } while (*len && plen);
  32331. +
  32332. + return false;
  32333. +}
  32334. +
  32335. +/*
  32336. + * Map linear and fragment data from the skb to spd. It reports true if the
  32337. + * pipe is full or if we already spliced the requested length.
  32338. + */
  32339. +static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
  32340. + unsigned int *offset, unsigned int *len,
  32341. + struct splice_pipe_desc *spd, struct sock *sk)
  32342. +{
  32343. + int seg;
  32344. +
  32345. + /* map the linear part :
  32346. + * If skb->head_frag is set, this 'linear' part is backed by a
  32347. + * fragment, and if the head is not shared with any clones then
  32348. + * we can avoid a copy since we own the head portion of this page.
  32349. + */
  32350. + if (__splice_segment(virt_to_page(skb->data),
  32351. + (unsigned long) skb->data & (PAGE_SIZE - 1),
  32352. + skb_headlen(skb),
  32353. + offset, len, spd,
  32354. + skb_head_is_locked(skb),
  32355. + sk, pipe))
  32356. + return true;
  32357. +
  32358. + /*
  32359. + * then map the fragments
  32360. + */
  32361. + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
  32362. + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
  32363. +
  32364. + if (__splice_segment(skb_frag_page(f),
  32365. + f->page_offset, skb_frag_size(f),
  32366. + offset, len, spd, false, sk, pipe))
  32367. + return true;
  32368. + }
  32369. +
  32370. + return false;
  32371. +}
  32372. +
  32373. +/*
  32374. + * Map data from the skb to a pipe. Should handle both the linear part,
  32375. + * the fragments, and the frag list. It does NOT handle frag lists within
  32376. + * the frag list, if such a thing exists. We'd probably need to recurse to
  32377. + * handle that cleanly.
  32378. + */
  32379. +int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
  32380. + struct pipe_inode_info *pipe, unsigned int tlen,
  32381. + unsigned int flags)
  32382. +{
  32383. + struct partial_page partial[MAX_SKB_FRAGS];
  32384. + struct page *pages[MAX_SKB_FRAGS];
  32385. + struct splice_pipe_desc spd = {
  32386. + .pages = pages,
  32387. + .partial = partial,
  32388. + .nr_pages_max = MAX_SKB_FRAGS,
  32389. + .flags = flags,
  32390. + .ops = &nosteal_pipe_buf_ops,
  32391. + .spd_release = sock_spd_release,
  32392. + };
  32393. + struct sk_buff *frag_iter;
  32394. + struct sock *sk = skb->sk;
  32395. + int ret = 0;
  32396. +
  32397. + /*
  32398. + * __skb_splice_bits() only fails if the output has no room left,
  32399. + * so no point in going over the frag_list for the error case.
  32400. + */
  32401. + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
  32402. + goto done;
  32403. + else if (!tlen)
  32404. + goto done;
  32405. +
  32406. + /*
  32407. + * now see if we have a frag_list to map
  32408. + */
  32409. + skb_walk_frags(skb, frag_iter) {
  32410. + if (!tlen)
  32411. + break;
  32412. + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
  32413. + break;
  32414. + }
  32415. +
  32416. +done:
  32417. + if (spd.nr_pages) {
  32418. + /*
  32419. + * Drop the socket lock, otherwise we have reverse
  32420. + * locking dependencies between sk_lock and i_mutex
  32421. + * here as compared to sendfile(). We enter here
  32422. + * with the socket lock held, and splice_to_pipe() will
  32423. + * grab the pipe inode lock. For sendfile() emulation,
  32424. + * we call into ->sendpage() with the i_mutex lock held
  32425. + * and networking will grab the socket lock.
  32426. + */
  32427. + release_sock(sk);
  32428. + ret = splice_to_pipe(pipe, &spd);
  32429. + lock_sock(sk);
  32430. + }
  32431. +
  32432. + return ret;
  32433. +}
  32434. +
  32435. +/**
  32436. + * skb_store_bits - store bits from kernel buffer to skb
  32437. + * @skb: destination buffer
  32438. + * @offset: offset in destination
  32439. + * @from: source buffer
  32440. + * @len: number of bytes to copy
  32441. + *
  32442. + * Copy the specified number of bytes from the source buffer to the
  32443. + * destination skb. This function handles all the messy bits of
  32444. + * traversing fragment lists and such.
  32445. + */
  32446. +
  32447. +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
  32448. +{
  32449. + int start = skb_headlen(skb);
  32450. + struct sk_buff *frag_iter;
  32451. + int i, copy;
  32452. +
  32453. + if (offset > (int)skb->len - len)
  32454. + goto fault;
  32455. +
  32456. + if ((copy = start - offset) > 0) {
  32457. + if (copy > len)
  32458. + copy = len;
  32459. + skb_copy_to_linear_data_offset(skb, offset, from, copy);
  32460. + if ((len -= copy) == 0)
  32461. + return 0;
  32462. + offset += copy;
  32463. + from += copy;
  32464. + }
  32465. +
  32466. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  32467. + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  32468. + int end;
  32469. +
  32470. + WARN_ON(start > offset + len);
  32471. +
  32472. + end = start + skb_frag_size(frag);
  32473. + if ((copy = end - offset) > 0) {
  32474. + u8 *vaddr;
  32475. +
  32476. + if (copy > len)
  32477. + copy = len;
  32478. +
  32479. + vaddr = kmap_atomic(skb_frag_page(frag));
  32480. + memcpy(vaddr + frag->page_offset + offset - start,
  32481. + from, copy);
  32482. + kunmap_atomic(vaddr);
  32483. +
  32484. + if ((len -= copy) == 0)
  32485. + return 0;
  32486. + offset += copy;
  32487. + from += copy;
  32488. + }
  32489. + start = end;
  32490. + }
  32491. +
  32492. + skb_walk_frags(skb, frag_iter) {
  32493. + int end;
  32494. +
  32495. + WARN_ON(start > offset + len);
  32496. +
  32497. + end = start + frag_iter->len;
  32498. + if ((copy = end - offset) > 0) {
  32499. + if (copy > len)
  32500. + copy = len;
  32501. + if (skb_store_bits(frag_iter, offset - start,
  32502. + from, copy))
  32503. + goto fault;
  32504. + if ((len -= copy) == 0)
  32505. + return 0;
  32506. + offset += copy;
  32507. + from += copy;
  32508. + }
  32509. + start = end;
  32510. + }
  32511. + if (!len)
  32512. + return 0;
  32513. +
  32514. +fault:
  32515. + return -EFAULT;
  32516. +}
  32517. +EXPORT_SYMBOL(skb_store_bits);
  32518. +
  32519. +/* Checksum skb data. */
  32520. +__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
  32521. + __wsum csum, const struct skb_checksum_ops *ops)
  32522. +{
  32523. + int start = skb_headlen(skb);
  32524. + int i, copy = start - offset;
  32525. + struct sk_buff *frag_iter;
  32526. + int pos = 0;
  32527. +
  32528. + /* Checksum header. */
  32529. + if (copy > 0) {
  32530. + if (copy > len)
  32531. + copy = len;
  32532. + csum = ops->update(skb->data + offset, copy, csum);
  32533. + if ((len -= copy) == 0)
  32534. + return csum;
  32535. + offset += copy;
  32536. + pos = copy;
  32537. + }
  32538. +
  32539. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  32540. + int end;
  32541. + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  32542. +
  32543. + WARN_ON(start > offset + len);
  32544. +
  32545. + end = start + skb_frag_size(frag);
  32546. + if ((copy = end - offset) > 0) {
  32547. + __wsum csum2;
  32548. + u8 *vaddr;
  32549. +
  32550. + if (copy > len)
  32551. + copy = len;
  32552. + vaddr = kmap_atomic(skb_frag_page(frag));
  32553. + csum2 = ops->update(vaddr + frag->page_offset +
  32554. + offset - start, copy, 0);
  32555. + kunmap_atomic(vaddr);
  32556. + csum = ops->combine(csum, csum2, pos, copy);
  32557. + if (!(len -= copy))
  32558. + return csum;
  32559. + offset += copy;
  32560. + pos += copy;
  32561. + }
  32562. + start = end;
  32563. + }
  32564. +
  32565. + skb_walk_frags(skb, frag_iter) {
  32566. + int end;
  32567. +
  32568. + WARN_ON(start > offset + len);
  32569. +
  32570. + end = start + frag_iter->len;
  32571. + if ((copy = end - offset) > 0) {
  32572. + __wsum csum2;
  32573. + if (copy > len)
  32574. + copy = len;
  32575. + csum2 = __skb_checksum(frag_iter, offset - start,
  32576. + copy, 0, ops);
  32577. + csum = ops->combine(csum, csum2, pos, copy);
  32578. + if ((len -= copy) == 0)
  32579. + return csum;
  32580. + offset += copy;
  32581. + pos += copy;
  32582. + }
  32583. + start = end;
  32584. + }
  32585. + BUG_ON(len);
  32586. +
  32587. + return csum;
  32588. +}
  32589. +EXPORT_SYMBOL(__skb_checksum);
  32590. +
  32591. +__wsum skb_checksum(const struct sk_buff *skb, int offset,
  32592. + int len, __wsum csum)
  32593. +{
  32594. + const struct skb_checksum_ops ops = {
  32595. + .update = csum_partial_ext,
  32596. + .combine = csum_block_add_ext,
  32597. + };
  32598. +
  32599. + return __skb_checksum(skb, offset, len, csum, &ops);
  32600. +}
  32601. +EXPORT_SYMBOL(skb_checksum);
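A short sketch of skb_checksum() in use: folding a full Internet checksum over everything from a given offset to the end of the packet, walking frags and the frag list just as the walker above does. It assumes the caller guarantees offset <= skb->len; the helper name is made up.

#include <linux/skbuff.h>
#include <net/checksum.h>

static __sum16 example_payload_csum(const struct sk_buff *skb, int offset)
{
	__wsum csum = skb_checksum(skb, offset, skb->len - offset, 0);

	return csum_fold(csum);		/* 32-bit partial sum -> final 16 bits */
}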
  32602. +
  32603. +/* Both of above in one bottle. */
  32604. +
  32605. +__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
  32606. + u8 *to, int len, __wsum csum)
  32607. +{
  32608. + int start = skb_headlen(skb);
  32609. + int i, copy = start - offset;
  32610. + struct sk_buff *frag_iter;
  32611. + int pos = 0;
  32612. +
  32613. + /* Copy header. */
  32614. + if (copy > 0) {
  32615. + if (copy > len)
  32616. + copy = len;
  32617. + csum = csum_partial_copy_nocheck(skb->data + offset, to,
  32618. + copy, csum);
  32619. + if ((len -= copy) == 0)
  32620. + return csum;
  32621. + offset += copy;
  32622. + to += copy;
  32623. + pos = copy;
  32624. + }
  32625. +
  32626. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  32627. + int end;
  32628. +
  32629. + WARN_ON(start > offset + len);
  32630. +
  32631. + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
  32632. + if ((copy = end - offset) > 0) {
  32633. + __wsum csum2;
  32634. + u8 *vaddr;
  32635. + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  32636. +
  32637. + if (copy > len)
  32638. + copy = len;
  32639. + vaddr = kmap_atomic(skb_frag_page(frag));
  32640. + csum2 = csum_partial_copy_nocheck(vaddr +
  32641. + frag->page_offset +
  32642. + offset - start, to,
  32643. + copy, 0);
  32644. + kunmap_atomic(vaddr);
  32645. + csum = csum_block_add(csum, csum2, pos);
  32646. + if (!(len -= copy))
  32647. + return csum;
  32648. + offset += copy;
  32649. + to += copy;
  32650. + pos += copy;
  32651. + }
  32652. + start = end;
  32653. + }
  32654. +
  32655. + skb_walk_frags(skb, frag_iter) {
  32656. + __wsum csum2;
  32657. + int end;
  32658. +
  32659. + WARN_ON(start > offset + len);
  32660. +
  32661. + end = start + frag_iter->len;
  32662. + if ((copy = end - offset) > 0) {
  32663. + if (copy > len)
  32664. + copy = len;
  32665. + csum2 = skb_copy_and_csum_bits(frag_iter,
  32666. + offset - start,
  32667. + to, copy, 0);
  32668. + csum = csum_block_add(csum, csum2, pos);
  32669. + if ((len -= copy) == 0)
  32670. + return csum;
  32671. + offset += copy;
  32672. + to += copy;
  32673. + pos += copy;
  32674. + }
  32675. + start = end;
  32676. + }
  32677. + BUG_ON(len);
  32678. + return csum;
  32679. +}
  32680. +EXPORT_SYMBOL(skb_copy_and_csum_bits);
  32681. +
  32682. + /**
  32683. + * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
  32684. + * @from: source buffer
  32685. + *
  32686. + * Calculates the amount of linear headroom needed in the 'to' skb passed
  32687. + * into skb_zerocopy().
  32688. + */
  32689. +unsigned int
  32690. +skb_zerocopy_headlen(const struct sk_buff *from)
  32691. +{
  32692. + unsigned int hlen = 0;
  32693. +
  32694. + if (!from->head_frag ||
  32695. + skb_headlen(from) < L1_CACHE_BYTES ||
  32696. + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
  32697. + hlen = skb_headlen(from);
  32698. +
  32699. + if (skb_has_frag_list(from))
  32700. + hlen = from->len;
  32701. +
  32702. + return hlen;
  32703. +}
  32704. +EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
  32705. +
  32706. +/**
  32707. + * skb_zerocopy - Zero copy skb to skb
  32708. + * @to: destination buffer
  32709. + * @from: source buffer
  32710. + * @len: number of bytes to copy from source buffer
  32711. + * @hlen: size of linear headroom in destination buffer
  32712. + *
32713. + * Copies up to @len bytes from @from to @to by creating references
32714. + * to the frags in the source buffer.
32715. + *
32716. + * The @hlen, as calculated by skb_zerocopy_headlen(), specifies the
32717. + * headroom in the @to buffer.
  32718. + *
  32719. + * Return value:
  32720. + * 0: everything is OK
  32721. + * -ENOMEM: couldn't orphan frags of @from due to lack of memory
  32722. + * -EFAULT: skb_copy_bits() found some problem with skb geometry
  32723. + */
  32724. +int
  32725. +skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
  32726. +{
  32727. + int i, j = 0;
  32728. + int plen = 0; /* length of skb->head fragment */
  32729. + int ret;
  32730. + struct page *page;
  32731. + unsigned int offset;
  32732. +
  32733. + BUG_ON(!from->head_frag && !hlen);
  32734. +
32735. + /* don't bother with small payloads */
  32736. + if (len <= skb_tailroom(to))
  32737. + return skb_copy_bits(from, 0, skb_put(to, len), len);
  32738. +
  32739. + if (hlen) {
  32740. + ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
  32741. + if (unlikely(ret))
  32742. + return ret;
  32743. + len -= hlen;
  32744. + } else {
  32745. + plen = min_t(int, skb_headlen(from), len);
  32746. + if (plen) {
  32747. + page = virt_to_head_page(from->head);
  32748. + offset = from->data - (unsigned char *)page_address(page);
  32749. + __skb_fill_page_desc(to, 0, page, offset, plen);
  32750. + get_page(page);
  32751. + j = 1;
  32752. + len -= plen;
  32753. + }
  32754. + }
  32755. +
  32756. + to->truesize += len + plen;
  32757. + to->len += len + plen;
  32758. + to->data_len += len + plen;
  32759. +
  32760. + if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
  32761. + skb_tx_error(from);
  32762. + return -ENOMEM;
  32763. + }
  32764. +
  32765. + for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
  32766. + if (!len)
  32767. + break;
  32768. + skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
  32769. + skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
  32770. + len -= skb_shinfo(to)->frags[j].size;
  32771. + skb_frag_ref(to, j);
  32772. + j++;
  32773. + }
  32774. + skb_shinfo(to)->nr_frags = j;
  32775. +
  32776. + return 0;
  32777. +}
  32778. +EXPORT_SYMBOL_GPL(skb_zerocopy);
  32779. +
  32780. +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
  32781. +{
  32782. + __wsum csum;
  32783. + long csstart;
  32784. +
  32785. + if (skb->ip_summed == CHECKSUM_PARTIAL)
  32786. + csstart = skb_checksum_start_offset(skb);
  32787. + else
  32788. + csstart = skb_headlen(skb);
  32789. +
  32790. + BUG_ON(csstart > skb_headlen(skb));
  32791. +
  32792. + skb_copy_from_linear_data(skb, to, csstart);
  32793. +
  32794. + csum = 0;
  32795. + if (csstart != skb->len)
  32796. + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
  32797. + skb->len - csstart, 0);
  32798. +
  32799. + if (skb->ip_summed == CHECKSUM_PARTIAL) {
  32800. + long csstuff = csstart + skb->csum_offset;
  32801. +
  32802. + *((__sum16 *)(to + csstuff)) = csum_fold(csum);
  32803. + }
  32804. +}
  32805. +EXPORT_SYMBOL(skb_copy_and_csum_dev);
  32806. +
  32807. +/**
  32808. + * skb_dequeue - remove from the head of the queue
  32809. + * @list: list to dequeue from
  32810. + *
  32811. + * Remove the head of the list. The list lock is taken so the function
  32812. + * may be used safely with other locking list functions. The head item is
  32813. + * returned or %NULL if the list is empty.
  32814. + */
  32815. +
  32816. +struct sk_buff *skb_dequeue(struct sk_buff_head *list)
  32817. +{
  32818. + unsigned long flags;
  32819. + struct sk_buff *result;
  32820. +
  32821. + spin_lock_irqsave(&list->lock, flags);
  32822. + result = __skb_dequeue(list);
  32823. + spin_unlock_irqrestore(&list->lock, flags);
  32824. + return result;
  32825. +}
  32826. +EXPORT_SYMBOL(skb_dequeue);
  32827. +
  32828. +/**
  32829. + * skb_dequeue_tail - remove from the tail of the queue
  32830. + * @list: list to dequeue from
  32831. + *
  32832. + * Remove the tail of the list. The list lock is taken so the function
  32833. + * may be used safely with other locking list functions. The tail item is
  32834. + * returned or %NULL if the list is empty.
  32835. + */
  32836. +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
  32837. +{
  32838. + unsigned long flags;
  32839. + struct sk_buff *result;
  32840. +
  32841. + spin_lock_irqsave(&list->lock, flags);
  32842. + result = __skb_dequeue_tail(list);
  32843. + spin_unlock_irqrestore(&list->lock, flags);
  32844. + return result;
  32845. +}
  32846. +EXPORT_SYMBOL(skb_dequeue_tail);
  32847. +
  32848. +/**
  32849. + * skb_queue_purge - empty a list
  32850. + * @list: list to empty
  32851. + *
  32852. + * Delete all buffers on an &sk_buff list. Each buffer is removed from
  32853. + * the list and one reference dropped. This function takes the list
  32854. + * lock and is atomic with respect to other list locking functions.
  32855. + */
  32856. +void skb_queue_purge(struct sk_buff_head *list)
  32857. +{
  32858. + struct sk_buff *skb;
  32859. + while ((skb = skb_dequeue(list)) != NULL)
  32860. + kfree_skb(skb);
  32861. +}
  32862. +EXPORT_SYMBOL(skb_queue_purge);
  32863. +
  32864. +/**
  32865. + * skb_queue_head - queue a buffer at the list head
  32866. + * @list: list to use
  32867. + * @newsk: buffer to queue
  32868. + *
  32869. + * Queue a buffer at the start of the list. This function takes the
  32870. + * list lock and can be used safely with other locking &sk_buff
  32871. + * functions.
  32872. + *
  32873. + * A buffer cannot be placed on two lists at the same time.
  32874. + */
  32875. +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
  32876. +{
  32877. + unsigned long flags;
  32878. +
  32879. + spin_lock_irqsave(&list->lock, flags);
  32880. + __skb_queue_head(list, newsk);
  32881. + spin_unlock_irqrestore(&list->lock, flags);
  32882. +}
  32883. +EXPORT_SYMBOL(skb_queue_head);
  32884. +
  32885. +/**
  32886. + * skb_queue_tail - queue a buffer at the list tail
  32887. + * @list: list to use
  32888. + * @newsk: buffer to queue
  32889. + *
  32890. + * Queue a buffer at the tail of the list. This function takes the
  32891. + * list lock and can be used safely with other locking &sk_buff
  32892. + * functions.
  32893. + *
  32894. + * A buffer cannot be placed on two lists at the same time.
  32895. + */
  32896. +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
  32897. +{
  32898. + unsigned long flags;
  32899. +
  32900. + spin_lock_irqsave(&list->lock, flags);
  32901. + __skb_queue_tail(list, newsk);
  32902. + spin_unlock_irqrestore(&list->lock, flags);
  32903. +}
  32904. +EXPORT_SYMBOL(skb_queue_tail);
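/*
 * Usage sketch for the locked queue helpers above: a simple
 * producer/consumer FIFO.  The queue and function names are illustrative
 * only.
 */
static struct sk_buff_head example_rxq;

static void example_queue_init(void)
{
        skb_queue_head_init(&example_rxq);
}

static void example_produce(struct sk_buff *skb)
{
        skb_queue_tail(&example_rxq, skb);      /* takes list->lock */
}

static void example_consume(void)
{
        struct sk_buff *skb;

        while ((skb = skb_dequeue(&example_rxq)) != NULL)
                kfree_skb(skb);                 /* process, then free */
}

static void example_teardown(void)
{
        skb_queue_purge(&example_rxq);          /* drop whatever is left */
}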
  32905. +
  32906. +/**
  32907. + * skb_unlink - remove a buffer from a list
  32908. + * @skb: buffer to remove
  32909. + * @list: list to use
  32910. + *
  32911. + * Remove a packet from a list. The list locks are taken and this
  32912. + * function is atomic with respect to other list locked calls.
  32913. + *
  32914. + * You must know what list the SKB is on.
  32915. + */
  32916. +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
  32917. +{
  32918. + unsigned long flags;
  32919. +
  32920. + spin_lock_irqsave(&list->lock, flags);
  32921. + __skb_unlink(skb, list);
  32922. + spin_unlock_irqrestore(&list->lock, flags);
  32923. +}
  32924. +EXPORT_SYMBOL(skb_unlink);
  32925. +
  32926. +/**
  32927. + * skb_append - append a buffer
  32928. + * @old: buffer to insert after
  32929. + * @newsk: buffer to insert
  32930. + * @list: list to use
  32931. + *
  32932. + * Place a packet after a given packet in a list. The list locks are taken
  32933. + * and this function is atomic with respect to other list locked calls.
  32934. + * A buffer cannot be placed on two lists at the same time.
  32935. + */
  32936. +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
  32937. +{
  32938. + unsigned long flags;
  32939. +
  32940. + spin_lock_irqsave(&list->lock, flags);
  32941. + __skb_queue_after(list, old, newsk);
  32942. + spin_unlock_irqrestore(&list->lock, flags);
  32943. +}
  32944. +EXPORT_SYMBOL(skb_append);
  32945. +
  32946. +/**
  32947. + * skb_insert - insert a buffer
  32948. + * @old: buffer to insert before
  32949. + * @newsk: buffer to insert
  32950. + * @list: list to use
  32951. + *
  32952. + * Place a packet before a given packet in a list. The list locks are
  32953. + * taken and this function is atomic with respect to other list locked
  32954. + * calls.
  32955. + *
  32956. + * A buffer cannot be placed on two lists at the same time.
  32957. + */
  32958. +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
  32959. +{
  32960. + unsigned long flags;
  32961. +
  32962. + spin_lock_irqsave(&list->lock, flags);
  32963. + __skb_insert(newsk, old->prev, old, list);
  32964. + spin_unlock_irqrestore(&list->lock, flags);
  32965. +}
  32966. +EXPORT_SYMBOL(skb_insert);
  32967. +
  32968. +static inline void skb_split_inside_header(struct sk_buff *skb,
  32969. + struct sk_buff* skb1,
  32970. + const u32 len, const int pos)
  32971. +{
  32972. + int i;
  32973. +
  32974. + skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
  32975. + pos - len);
  32976. + /* And move data appendix as is. */
  32977. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
  32978. + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
  32979. +
  32980. + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
  32981. + skb_shinfo(skb)->nr_frags = 0;
  32982. + skb1->data_len = skb->data_len;
  32983. + skb1->len += skb1->data_len;
  32984. + skb->data_len = 0;
  32985. + skb->len = len;
  32986. + skb_set_tail_pointer(skb, len);
  32987. +}
  32988. +
  32989. +static inline void skb_split_no_header(struct sk_buff *skb,
  32990. + struct sk_buff* skb1,
  32991. + const u32 len, int pos)
  32992. +{
  32993. + int i, k = 0;
  32994. + const int nfrags = skb_shinfo(skb)->nr_frags;
  32995. +
  32996. + skb_shinfo(skb)->nr_frags = 0;
  32997. + skb1->len = skb1->data_len = skb->len - len;
  32998. + skb->len = len;
  32999. + skb->data_len = len - pos;
  33000. +
  33001. + for (i = 0; i < nfrags; i++) {
  33002. + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
  33003. +
  33004. + if (pos + size > len) {
  33005. + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
  33006. +
  33007. + if (pos < len) {
  33008. + /* Split frag.
  33009. + * We have two options in this case:
  33010. + * 1. Move the whole frag to the second
  33011. + * part, if possible. For example, this
  33012. + * approach is mandatory for TUX, where
  33013. + * splitting is expensive.
  33014. + * 2. Split the frag exactly at the boundary. That is what we do here.
  33015. + */
  33016. + skb_frag_ref(skb, i);
  33017. + skb_shinfo(skb1)->frags[0].page_offset += len - pos;
  33018. + skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
  33019. + skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
  33020. + skb_shinfo(skb)->nr_frags++;
  33021. + }
  33022. + k++;
  33023. + } else
  33024. + skb_shinfo(skb)->nr_frags++;
  33025. + pos += size;
  33026. + }
  33027. + skb_shinfo(skb1)->nr_frags = k;
  33028. +}
  33029. +
  33030. +/**
  33031. + * skb_split - Split fragmented skb to two parts at length len.
  33032. + * @skb: the buffer to split
  33033. + * @skb1: the buffer to receive the second part
  33034. + * @len: new length for skb
  33035. + */
  33036. +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
  33037. +{
  33038. + int pos = skb_headlen(skb);
  33039. +
  33040. + skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
  33041. + if (len < pos) /* Split line is inside header. */
  33042. + skb_split_inside_header(skb, skb1, len, pos);
  33043. + else /* Second chunk has no header, nothing to copy. */
  33044. + skb_split_no_header(skb, skb1, len, pos);
  33045. +}
  33046. +EXPORT_SYMBOL(skb_split);
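/*
 * Usage sketch for skb_split(): the caller allocates the second buffer
 * itself and then splits at the chosen length.  The allocation size and
 * GFP flag are assumptions of this illustration.
 */
static struct sk_buff *split_example(struct sk_buff *skb, u32 len)
{
        struct sk_buff *skb1;

        if (len >= skb->len)
                return NULL;

        /* skb1 only needs room for the part of the header it may receive */
        skb1 = alloc_skb(skb_headlen(skb), GFP_ATOMIC);
        if (!skb1)
                return NULL;

        skb_split(skb, skb1, len);      /* skb keeps the first len bytes */
        return skb1;
}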
  33047. +
  33048. +/* Shifting from/to a cloned skb is a no-go.
  33049. + *
  33050. + * Caller cannot keep skb_shinfo related pointers past calling here!
  33051. + */
  33052. +static int skb_prepare_for_shift(struct sk_buff *skb)
  33053. +{
  33054. + return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
  33055. +}
  33056. +
  33057. +/**
  33058. + * skb_shift - Shifts paged data partially from skb to another
  33059. + * @tgt: buffer into which tail data gets added
  33060. + * @skb: buffer from which the paged data comes from
  33061. + * @shiftlen: shift up to this many bytes
  33062. + *
  33063. + * Attempts to shift up to shiftlen worth of bytes, which may be less than
  33064. + * the length of the skb, from skb to tgt. Returns the number of bytes shifted.
  33065. + * It's up to caller to free skb if everything was shifted.
  33066. + *
  33067. + * If @tgt runs out of frags, the whole operation is aborted.
  33068. + *
  33069. + * Skb cannot include anything else but paged data while tgt is allowed
  33070. + * to have non-paged data as well.
  33071. + *
  33072. + * TODO: full sized shift could be optimized but that would need
  33073. + * specialized skb free'er to handle frags without up-to-date nr_frags.
  33074. + */
  33075. +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
  33076. +{
  33077. + int from, to, merge, todo;
  33078. + struct skb_frag_struct *fragfrom, *fragto;
  33079. +
  33080. + BUG_ON(shiftlen > skb->len);
  33081. + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */
  33082. +
  33083. + todo = shiftlen;
  33084. + from = 0;
  33085. + to = skb_shinfo(tgt)->nr_frags;
  33086. + fragfrom = &skb_shinfo(skb)->frags[from];
  33087. +
  33088. + /* Actual merge is delayed until the point when we know we can
  33089. + * commit all, so that we don't have to undo partial changes
  33090. + */
  33091. + if (!to ||
  33092. + !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
  33093. + fragfrom->page_offset)) {
  33094. + merge = -1;
  33095. + } else {
  33096. + merge = to - 1;
  33097. +
  33098. + todo -= skb_frag_size(fragfrom);
  33099. + if (todo < 0) {
  33100. + if (skb_prepare_for_shift(skb) ||
  33101. + skb_prepare_for_shift(tgt))
  33102. + return 0;
  33103. +
  33104. + /* All previous frag pointers might be stale! */
  33105. + fragfrom = &skb_shinfo(skb)->frags[from];
  33106. + fragto = &skb_shinfo(tgt)->frags[merge];
  33107. +
  33108. + skb_frag_size_add(fragto, shiftlen);
  33109. + skb_frag_size_sub(fragfrom, shiftlen);
  33110. + fragfrom->page_offset += shiftlen;
  33111. +
  33112. + goto onlymerged;
  33113. + }
  33114. +
  33115. + from++;
  33116. + }
  33117. +
  33118. + /* Skip full, not-fitting skb to avoid expensive operations */
  33119. + if ((shiftlen == skb->len) &&
  33120. + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
  33121. + return 0;
  33122. +
  33123. + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
  33124. + return 0;
  33125. +
  33126. + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
  33127. + if (to == MAX_SKB_FRAGS)
  33128. + return 0;
  33129. +
  33130. + fragfrom = &skb_shinfo(skb)->frags[from];
  33131. + fragto = &skb_shinfo(tgt)->frags[to];
  33132. +
  33133. + if (todo >= skb_frag_size(fragfrom)) {
  33134. + *fragto = *fragfrom;
  33135. + todo -= skb_frag_size(fragfrom);
  33136. + from++;
  33137. + to++;
  33138. +
  33139. + } else {
  33140. + __skb_frag_ref(fragfrom);
  33141. + fragto->page = fragfrom->page;
  33142. + fragto->page_offset = fragfrom->page_offset;
  33143. + skb_frag_size_set(fragto, todo);
  33144. +
  33145. + fragfrom->page_offset += todo;
  33146. + skb_frag_size_sub(fragfrom, todo);
  33147. + todo = 0;
  33148. +
  33149. + to++;
  33150. + break;
  33151. + }
  33152. + }
  33153. +
  33154. + /* Ready to "commit" this state change to tgt */
  33155. + skb_shinfo(tgt)->nr_frags = to;
  33156. +
  33157. + if (merge >= 0) {
  33158. + fragfrom = &skb_shinfo(skb)->frags[0];
  33159. + fragto = &skb_shinfo(tgt)->frags[merge];
  33160. +
  33161. + skb_frag_size_add(fragto, skb_frag_size(fragfrom));
  33162. + __skb_frag_unref(fragfrom);
  33163. + }
  33164. +
  33165. + /* Reposition in the original skb */
  33166. + to = 0;
  33167. + while (from < skb_shinfo(skb)->nr_frags)
  33168. + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
  33169. + skb_shinfo(skb)->nr_frags = to;
  33170. +
  33171. + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
  33172. +
  33173. +onlymerged:
  33174. + /* Most likely the tgt won't ever need its checksum anymore, skb on
  33175. + * the other hand might need it if it needs to be resent
  33176. + */
  33177. + tgt->ip_summed = CHECKSUM_PARTIAL;
  33178. + skb->ip_summed = CHECKSUM_PARTIAL;
  33179. +
  33180. + /* Yak, is it really working this way? Some helper please? */
  33181. + skb->len -= shiftlen;
  33182. + skb->data_len -= shiftlen;
  33183. + skb->truesize -= shiftlen;
  33184. + tgt->len += shiftlen;
  33185. + tgt->data_len += shiftlen;
  33186. + tgt->truesize += shiftlen;
  33187. +
  33188. + return shiftlen;
  33189. +}
  33190. +
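/*
 * Calling-convention sketch for skb_shift(), loosely following the TCP
 * collapse path: the source must be purely paged, and the caller frees it
 * once everything was moved.  Names are illustrative only.
 */
static void shift_example(struct sk_buff *tgt, struct sk_buff *skb, int want)
{
        if (skb_headlen(skb))           /* skb_shift() needs paged-only data */
                return;

        skb_shift(tgt, skb, min_t(int, want, skb->len));

        if (!skb->len)                  /* fully drained: source can go */
                kfree_skb(skb);
}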
  33191. +/**
  33192. + * skb_prepare_seq_read - Prepare a sequential read of skb data
  33193. + * @skb: the buffer to read
  33194. + * @from: lower offset of data to be read
  33195. + * @to: upper offset of data to be read
  33196. + * @st: state variable
  33197. + *
  33198. + * Initializes the specified state variable. Must be called before
  33199. + * invoking skb_seq_read() for the first time.
  33200. + */
  33201. +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
  33202. + unsigned int to, struct skb_seq_state *st)
  33203. +{
  33204. + st->lower_offset = from;
  33205. + st->upper_offset = to;
  33206. + st->root_skb = st->cur_skb = skb;
  33207. + st->frag_idx = st->stepped_offset = 0;
  33208. + st->frag_data = NULL;
  33209. +}
  33210. +EXPORT_SYMBOL(skb_prepare_seq_read);
  33211. +
  33212. +/**
  33213. + * skb_seq_read - Sequentially read skb data
  33214. + * @consumed: number of bytes consumed by the caller so far
  33215. + * @data: destination pointer for data to be returned
  33216. + * @st: state variable
  33217. + *
  33218. + * Reads a block of skb data at @consumed relative to the
  33219. + * lower offset specified to skb_prepare_seq_read(). Assigns
  33220. + * the head of the data block to @data and returns the length
  33221. + * of the block or 0 if the end of the skb data or the upper
  33222. + * offset has been reached.
  33223. + *
  33224. + * The caller is not required to consume all of the data
  33225. + * returned, i.e. @consumed is typically set to the number
  33226. + * of bytes already consumed and the next call to
  33227. + * skb_seq_read() will return the remaining part of the block.
  33228. + *
  33229. + * Note 1: The size of each block of data returned can be arbitrary;
  33230. + * this limitation is the cost of zerocopy sequential
  33231. + * reads of potentially non-linear data.
  33232. + *
  33233. + * Note 2: Fragment lists within fragments are not implemented
  33234. + * at the moment, state->root_skb could be replaced with
  33235. + * a stack for this purpose.
  33236. + */
  33237. +unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
  33238. + struct skb_seq_state *st)
  33239. +{
  33240. + unsigned int block_limit, abs_offset = consumed + st->lower_offset;
  33241. + skb_frag_t *frag;
  33242. +
  33243. + if (unlikely(abs_offset >= st->upper_offset)) {
  33244. + if (st->frag_data) {
  33245. + kunmap_atomic(st->frag_data);
  33246. + st->frag_data = NULL;
  33247. + }
  33248. + return 0;
  33249. + }
  33250. +
  33251. +next_skb:
  33252. + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
  33253. +
  33254. + if (abs_offset < block_limit && !st->frag_data) {
  33255. + *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
  33256. + return block_limit - abs_offset;
  33257. + }
  33258. +
  33259. + if (st->frag_idx == 0 && !st->frag_data)
  33260. + st->stepped_offset += skb_headlen(st->cur_skb);
  33261. +
  33262. + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
  33263. + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
  33264. + block_limit = skb_frag_size(frag) + st->stepped_offset;
  33265. +
  33266. + if (abs_offset < block_limit) {
  33267. + if (!st->frag_data)
  33268. + st->frag_data = kmap_atomic(skb_frag_page(frag));
  33269. +
  33270. + *data = (u8 *) st->frag_data + frag->page_offset +
  33271. + (abs_offset - st->stepped_offset);
  33272. +
  33273. + return block_limit - abs_offset;
  33274. + }
  33275. +
  33276. + if (st->frag_data) {
  33277. + kunmap_atomic(st->frag_data);
  33278. + st->frag_data = NULL;
  33279. + }
  33280. +
  33281. + st->frag_idx++;
  33282. + st->stepped_offset += skb_frag_size(frag);
  33283. + }
  33284. +
  33285. + if (st->frag_data) {
  33286. + kunmap_atomic(st->frag_data);
  33287. + st->frag_data = NULL;
  33288. + }
  33289. +
  33290. + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
  33291. + st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
  33292. + st->frag_idx = 0;
  33293. + goto next_skb;
  33294. + } else if (st->cur_skb->next) {
  33295. + st->cur_skb = st->cur_skb->next;
  33296. + st->frag_idx = 0;
  33297. + goto next_skb;
  33298. + }
  33299. +
  33300. + return 0;
  33301. +}
  33302. +EXPORT_SYMBOL(skb_seq_read);
  33303. +
  33304. +/**
  33305. + * skb_abort_seq_read - Abort a sequential read of skb data
  33306. + * @st: state variable
  33307. + *
  33308. + * Must be called if the sequential read was abandoned before
  33309. + * skb_seq_read() returned 0.
  33310. + */
  33311. +void skb_abort_seq_read(struct skb_seq_state *st)
  33312. +{
  33313. + if (st->frag_data)
  33314. + kunmap_atomic(st->frag_data);
  33315. +}
  33316. +EXPORT_SYMBOL(skb_abort_seq_read);
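/*
 * Sketch of a complete sequential read with the three helpers above.
 * The per-block processing is left as a placeholder.
 */
static void seq_read_example(struct sk_buff *skb)
{
        struct skb_seq_state st;
        const u8 *data;
        unsigned int consumed = 0, len;

        skb_prepare_seq_read(skb, 0, skb->len, &st);

        while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
                /* process len bytes at data here */
                consumed += len;
        }
        /* skb_seq_read() returned 0, so skb_abort_seq_read() is not needed */
}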
  33317. +
  33318. +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
  33319. +
  33320. +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
  33321. + struct ts_config *conf,
  33322. + struct ts_state *state)
  33323. +{
  33324. + return skb_seq_read(offset, text, TS_SKB_CB(state));
  33325. +}
  33326. +
  33327. +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
  33328. +{
  33329. + skb_abort_seq_read(TS_SKB_CB(state));
  33330. +}
  33331. +
  33332. +/**
  33333. + * skb_find_text - Find a text pattern in skb data
  33334. + * @skb: the buffer to look in
  33335. + * @from: search offset
  33336. + * @to: search limit
  33337. + * @config: textsearch configuration
  33338. + * @state: uninitialized textsearch state variable
  33339. + *
  33340. + * Finds a pattern in the skb data according to the specified
  33341. + * textsearch configuration. Use textsearch_next() to retrieve
  33342. + * subsequent occurrences of the pattern. Returns the offset
  33343. + * to the first occurrence or UINT_MAX if no match was found.
  33344. + */
  33345. +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
  33346. + unsigned int to, struct ts_config *config,
  33347. + struct ts_state *state)
  33348. +{
  33349. + unsigned int ret;
  33350. +
  33351. + config->get_next_block = skb_ts_get_next_block;
  33352. + config->finish = skb_ts_finish;
  33353. +
  33354. + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
  33355. +
  33356. + ret = textsearch_find(config, state);
  33357. + return (ret <= to - from ? ret : UINT_MAX);
  33358. +}
  33359. +EXPORT_SYMBOL(skb_find_text);
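/*
 * Usage sketch for skb_find_text().  The search algorithm, the pattern
 * and the error mapping are assumptions of this illustration; note that
 * at this point the caller still supplies the ts_state.
 */
static int find_text_example(struct sk_buff *skb)
{
        struct ts_config *conf;
        struct ts_state state;
        unsigned int pos;

        conf = textsearch_prepare("kmp", "needle", 6, GFP_ATOMIC, TS_AUTOLOAD);
        if (IS_ERR(conf))
                return PTR_ERR(conf);

        pos = skb_find_text(skb, 0, skb->len, conf, &state);
        textsearch_destroy(conf);

        return pos == UINT_MAX ? -ENOENT : (int)pos;
}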
  33360. +
  33361. +/**
  33362. + * skb_append_datato_frags - append the user data to a skb
  33363. + * @sk: sock structure
  33364. + * @skb: skb structure to be appended with user data.
  33365. + * @getfrag: callback function to be used for getting the user data
  33366. + * @from: pointer to user message iov
  33367. + * @length: length of the iov message
  33368. + *
  33369. + * Description: This procedure appends the user data to the fragment part
  33370. + * of the skb. If any page allocation fails, it returns -ENOMEM.
  33371. + */
  33372. +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
  33373. + int (*getfrag)(void *from, char *to, int offset,
  33374. + int len, int odd, struct sk_buff *skb),
  33375. + void *from, int length)
  33376. +{
  33377. + int frg_cnt = skb_shinfo(skb)->nr_frags;
  33378. + int copy;
  33379. + int offset = 0;
  33380. + int ret;
  33381. + struct page_frag *pfrag = &current->task_frag;
  33382. +
  33383. + do {
  33384. + /* Return error if we don't have space for new frag */
  33385. + if (frg_cnt >= MAX_SKB_FRAGS)
  33386. + return -EMSGSIZE;
  33387. +
  33388. + if (!sk_page_frag_refill(sk, pfrag))
  33389. + return -ENOMEM;
  33390. +
  33391. + /* copy the user data to page */
  33392. + copy = min_t(int, length, pfrag->size - pfrag->offset);
  33393. +
  33394. + ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
  33395. + offset, copy, 0, skb);
  33396. + if (ret < 0)
  33397. + return -EFAULT;
  33398. +
  33399. + /* copy was successful so update the size parameters */
  33400. + skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
  33401. + copy);
  33402. + frg_cnt++;
  33403. + pfrag->offset += copy;
  33404. + get_page(pfrag->page);
  33405. +
  33406. + skb->truesize += copy;
  33407. + atomic_add(copy, &sk->sk_wmem_alloc);
  33408. + skb->len += copy;
  33409. + skb->data_len += copy;
  33410. + offset += copy;
  33411. + length -= copy;
  33412. +
  33413. + } while (length > 0);
  33414. +
  33415. + return 0;
  33416. +}
  33417. +EXPORT_SYMBOL(skb_append_datato_frags);
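/*
 * Sketch of the getfrag contract used by skb_append_datato_frags(): with a
 * plain kernel buffer as the source the callback is just a memcpy.  Both
 * helper names are inventions of this illustration.
 */
static int kbuf_getfrag(void *from, char *to, int offset, int len, int odd,
                        struct sk_buff *skb)
{
        memcpy(to, (char *)from + offset, len);
        return 0;
}

static int append_kbuf_example(struct sock *sk, struct sk_buff *skb,
                               void *buf, int len)
{
        return skb_append_datato_frags(sk, skb, kbuf_getfrag, buf, len);
}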
  33418. +
  33419. +/**
  33420. + * skb_pull_rcsum - pull skb and update receive checksum
  33421. + * @skb: buffer to update
  33422. + * @len: length of data pulled
  33423. + *
  33424. + * This function performs an skb_pull on the packet and updates
  33425. + * the CHECKSUM_COMPLETE checksum. It should be used on
  33426. + * receive path processing instead of skb_pull unless you know
  33427. + * that the checksum difference is zero (e.g., a valid IP header)
  33428. + * or you are setting ip_summed to CHECKSUM_NONE.
  33429. + */
  33430. +unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
  33431. +{
  33432. + BUG_ON(len > skb->len);
  33433. + skb->len -= len;
  33434. + BUG_ON(skb->len < skb->data_len);
  33435. + skb_postpull_rcsum(skb, skb->data, len);
  33436. + return skb->data += len;
  33437. +}
  33438. +EXPORT_SYMBOL_GPL(skb_pull_rcsum);
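/*
 * Sketch of the receive-path pattern described above: strip a small
 * encapsulation header while keeping a CHECKSUM_COMPLETE value valid.
 * The 4-byte header length is only an example.
 */
static void pull_rcsum_example(struct sk_buff *skb)
{
        if (pskb_may_pull(skb, 4))
                skb_pull_rcsum(skb, 4); /* skb->csum is adjusted as needed */
}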
  33439. +
  33440. +/**
  33441. + * skb_segment - Perform protocol segmentation on skb.
  33442. + * @head_skb: buffer to segment
  33443. + * @features: features for the output path (see dev->features)
  33444. + *
  33445. + * This function performs segmentation on the given skb. It returns
  33446. + * a pointer to the first in a list of new skbs for the segments.
  33447. + * In case of error it returns ERR_PTR(err).
  33448. + */
  33449. +struct sk_buff *skb_segment(struct sk_buff *head_skb,
  33450. + netdev_features_t features)
  33451. +{
  33452. + struct sk_buff *segs = NULL;
  33453. + struct sk_buff *tail = NULL;
  33454. + struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
  33455. + skb_frag_t *frag = skb_shinfo(head_skb)->frags;
  33456. + unsigned int mss = skb_shinfo(head_skb)->gso_size;
  33457. + unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
  33458. + struct sk_buff *frag_skb = head_skb;
  33459. + unsigned int offset = doffset;
  33460. + unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
  33461. + unsigned int headroom;
  33462. + unsigned int len;
  33463. + __be16 proto;
  33464. + bool csum;
  33465. + int sg = !!(features & NETIF_F_SG);
  33466. + int nfrags = skb_shinfo(head_skb)->nr_frags;
  33467. + int err = -ENOMEM;
  33468. + int i = 0;
  33469. + int pos;
  33470. + int dummy;
  33471. +
  33472. + __skb_push(head_skb, doffset);
  33473. + proto = skb_network_protocol(head_skb, &dummy);
  33474. + if (unlikely(!proto))
  33475. + return ERR_PTR(-EINVAL);
  33476. +
  33477. + csum = !head_skb->encap_hdr_csum &&
  33478. + !!can_checksum_protocol(features, proto);
  33479. +
  33480. + headroom = skb_headroom(head_skb);
  33481. + pos = skb_headlen(head_skb);
  33482. +
  33483. + do {
  33484. + struct sk_buff *nskb;
  33485. + skb_frag_t *nskb_frag;
  33486. + int hsize;
  33487. + int size;
  33488. +
  33489. + len = head_skb->len - offset;
  33490. + if (len > mss)
  33491. + len = mss;
  33492. +
  33493. + hsize = skb_headlen(head_skb) - offset;
  33494. + if (hsize < 0)
  33495. + hsize = 0;
  33496. + if (hsize > len || !sg)
  33497. + hsize = len;
  33498. +
  33499. + if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
  33500. + (skb_headlen(list_skb) == len || sg)) {
  33501. + BUG_ON(skb_headlen(list_skb) > len);
  33502. +
  33503. + i = 0;
  33504. + nfrags = skb_shinfo(list_skb)->nr_frags;
  33505. + frag = skb_shinfo(list_skb)->frags;
  33506. + frag_skb = list_skb;
  33507. + pos += skb_headlen(list_skb);
  33508. +
  33509. + while (pos < offset + len) {
  33510. + BUG_ON(i >= nfrags);
  33511. +
  33512. + size = skb_frag_size(frag);
  33513. + if (pos + size > offset + len)
  33514. + break;
  33515. +
  33516. + i++;
  33517. + pos += size;
  33518. + frag++;
  33519. + }
  33520. +
  33521. + nskb = skb_clone(list_skb, GFP_ATOMIC);
  33522. + list_skb = list_skb->next;
  33523. +
  33524. + if (unlikely(!nskb))
  33525. + goto err;
  33526. +
  33527. + if (unlikely(pskb_trim(nskb, len))) {
  33528. + kfree_skb(nskb);
  33529. + goto err;
  33530. + }
  33531. +
  33532. + hsize = skb_end_offset(nskb);
  33533. + if (skb_cow_head(nskb, doffset + headroom)) {
  33534. + kfree_skb(nskb);
  33535. + goto err;
  33536. + }
  33537. +
  33538. + nskb->truesize += skb_end_offset(nskb) - hsize;
  33539. + skb_release_head_state(nskb);
  33540. + __skb_push(nskb, doffset);
  33541. + } else {
  33542. + nskb = __alloc_skb(hsize + doffset + headroom,
  33543. + GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
  33544. + NUMA_NO_NODE);
  33545. +
  33546. + if (unlikely(!nskb))
  33547. + goto err;
  33548. +
  33549. + skb_reserve(nskb, headroom);
  33550. + __skb_put(nskb, doffset);
  33551. + }
  33552. +
  33553. + if (segs)
  33554. + tail->next = nskb;
  33555. + else
  33556. + segs = nskb;
  33557. + tail = nskb;
  33558. +
  33559. + __copy_skb_header(nskb, head_skb);
  33560. +
  33561. + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
  33562. + skb_reset_mac_len(nskb);
  33563. +
  33564. + skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
  33565. + nskb->data - tnl_hlen,
  33566. + doffset + tnl_hlen);
  33567. +
  33568. + if (nskb->len == len + doffset)
  33569. + goto perform_csum_check;
  33570. +
  33571. + if (!sg) {
  33572. + nskb->ip_summed = CHECKSUM_NONE;
  33573. + nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
  33574. + skb_put(nskb, len),
  33575. + len, 0);
  33576. + SKB_GSO_CB(nskb)->csum_start =
  33577. + skb_headroom(nskb) + doffset;
  33578. + continue;
  33579. + }
  33580. +
  33581. + nskb_frag = skb_shinfo(nskb)->frags;
  33582. +
  33583. + skb_copy_from_linear_data_offset(head_skb, offset,
  33584. + skb_put(nskb, hsize), hsize);
  33585. +
  33586. + skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
  33587. + SKBTX_SHARED_FRAG;
  33588. +
  33589. + while (pos < offset + len) {
  33590. + if (i >= nfrags) {
  33591. + BUG_ON(skb_headlen(list_skb));
  33592. +
  33593. + i = 0;
  33594. + nfrags = skb_shinfo(list_skb)->nr_frags;
  33595. + frag = skb_shinfo(list_skb)->frags;
  33596. + frag_skb = list_skb;
  33597. +
  33598. + BUG_ON(!nfrags);
  33599. +
  33600. + list_skb = list_skb->next;
  33601. + }
  33602. +
  33603. + if (unlikely(skb_shinfo(nskb)->nr_frags >=
  33604. + MAX_SKB_FRAGS)) {
  33605. + net_warn_ratelimited(
  33606. + "skb_segment: too many frags: %u %u\n",
  33607. + pos, mss);
  33608. + goto err;
  33609. + }
  33610. +
  33611. + if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
  33612. + goto err;
  33613. +
  33614. + *nskb_frag = *frag;
  33615. + __skb_frag_ref(nskb_frag);
  33616. + size = skb_frag_size(nskb_frag);
  33617. +
  33618. + if (pos < offset) {
  33619. + nskb_frag->page_offset += offset - pos;
  33620. + skb_frag_size_sub(nskb_frag, offset - pos);
  33621. + }
  33622. +
  33623. + skb_shinfo(nskb)->nr_frags++;
  33624. +
  33625. + if (pos + size <= offset + len) {
  33626. + i++;
  33627. + frag++;
  33628. + pos += size;
  33629. + } else {
  33630. + skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
  33631. + goto skip_fraglist;
  33632. + }
  33633. +
  33634. + nskb_frag++;
  33635. + }
  33636. +
  33637. +skip_fraglist:
  33638. + nskb->data_len = len - hsize;
  33639. + nskb->len += nskb->data_len;
  33640. + nskb->truesize += nskb->data_len;
  33641. +
  33642. +perform_csum_check:
  33643. + if (!csum) {
  33644. + nskb->csum = skb_checksum(nskb, doffset,
  33645. + nskb->len - doffset, 0);
  33646. + nskb->ip_summed = CHECKSUM_NONE;
  33647. + SKB_GSO_CB(nskb)->csum_start =
  33648. + skb_headroom(nskb) + doffset;
  33649. + }
  33650. + } while ((offset += len) < head_skb->len);
  33651. +
  33652. + /* Some callers want to get the end of the list.
  33653. + * Put it in segs->prev to avoid walking the list.
  33654. + * (see validate_xmit_skb_list() for example)
  33655. + */
  33656. + segs->prev = tail;
  33657. + return segs;
  33658. +
  33659. +err:
  33660. + kfree_skb_list(segs);
  33661. + return ERR_PTR(err);
  33662. +}
  33663. +EXPORT_SYMBOL_GPL(skb_segment);
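/*
 * Sketch of how skb_segment() output is consumed, loosely following the
 * software GSO transmit path.  The xmit callback is a placeholder of this
 * illustration.
 */
static int segment_example(struct sk_buff *skb, netdev_features_t features,
                           int (*xmit)(struct sk_buff *))
{
        struct sk_buff *segs, *nskb;

        segs = skb_segment(skb, features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);

        consume_skb(skb);               /* the original is no longer needed */

        while (segs) {
                nskb = segs;
                segs = segs->next;
                nskb->next = NULL;
                xmit(nskb);             /* hand each segment on */
        }
        return 0;
}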
  33664. +
  33665. +int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
  33666. +{
  33667. + struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
  33668. + unsigned int offset = skb_gro_offset(skb);
  33669. + unsigned int headlen = skb_headlen(skb);
  33670. + struct sk_buff *nskb, *lp, *p = *head;
  33671. + unsigned int len = skb_gro_len(skb);
  33672. + unsigned int delta_truesize;
  33673. + unsigned int headroom;
  33674. +
  33675. + if (unlikely(p->len + len >= 65536))
  33676. + return -E2BIG;
  33677. +
  33678. + lp = NAPI_GRO_CB(p)->last;
  33679. + pinfo = skb_shinfo(lp);
  33680. +
  33681. + if (headlen <= offset) {
  33682. + skb_frag_t *frag;
  33683. + skb_frag_t *frag2;
  33684. + int i = skbinfo->nr_frags;
  33685. + int nr_frags = pinfo->nr_frags + i;
  33686. +
  33687. + if (nr_frags > MAX_SKB_FRAGS)
  33688. + goto merge;
  33689. +
  33690. + offset -= headlen;
  33691. + pinfo->nr_frags = nr_frags;
  33692. + skbinfo->nr_frags = 0;
  33693. +
  33694. + frag = pinfo->frags + nr_frags;
  33695. + frag2 = skbinfo->frags + i;
  33696. + do {
  33697. + *--frag = *--frag2;
  33698. + } while (--i);
  33699. +
  33700. + frag->page_offset += offset;
  33701. + skb_frag_size_sub(frag, offset);
  33702. +
  33703. + /* all fragments truesize : remove (head size + sk_buff) */
  33704. + delta_truesize = skb->truesize -
  33705. + SKB_TRUESIZE(skb_end_offset(skb));
  33706. +
  33707. + skb->truesize -= skb->data_len;
  33708. + skb->len -= skb->data_len;
  33709. + skb->data_len = 0;
  33710. +
  33711. + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
  33712. + goto done;
  33713. + } else if (skb->head_frag) {
  33714. + int nr_frags = pinfo->nr_frags;
  33715. + skb_frag_t *frag = pinfo->frags + nr_frags;
  33716. + struct page *page = virt_to_head_page(skb->head);
  33717. + unsigned int first_size = headlen - offset;
  33718. + unsigned int first_offset;
  33719. +
  33720. + if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
  33721. + goto merge;
  33722. +
  33723. + first_offset = skb->data -
  33724. + (unsigned char *)page_address(page) +
  33725. + offset;
  33726. +
  33727. + pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
  33728. +
  33729. + frag->page.p = page;
  33730. + frag->page_offset = first_offset;
  33731. + skb_frag_size_set(frag, first_size);
  33732. +
  33733. + memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
  33734. + /* We don't need to clear skbinfo->nr_frags here */
  33735. +
  33736. + delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
  33737. + NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
  33738. + goto done;
  33739. + }
  33740. + /* switch back to head shinfo */
  33741. + pinfo = skb_shinfo(p);
  33742. +
  33743. + if (pinfo->frag_list)
  33744. + goto merge;
  33745. + if (skb_gro_len(p) != pinfo->gso_size)
  33746. + return -E2BIG;
  33747. +
  33748. + headroom = skb_headroom(p);
  33749. + nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC);
  33750. + if (unlikely(!nskb))
  33751. + return -ENOMEM;
  33752. +
  33753. + __copy_skb_header(nskb, p);
  33754. + nskb->mac_len = p->mac_len;
  33755. +
  33756. + skb_reserve(nskb, headroom);
  33757. + __skb_put(nskb, skb_gro_offset(p));
  33758. +
  33759. + skb_set_mac_header(nskb, skb_mac_header(p) - p->data);
  33760. + skb_set_network_header(nskb, skb_network_offset(p));
  33761. + skb_set_transport_header(nskb, skb_transport_offset(p));
  33762. +
  33763. + __skb_pull(p, skb_gro_offset(p));
  33764. + memcpy(skb_mac_header(nskb), skb_mac_header(p),
  33765. + p->data - skb_mac_header(p));
  33766. +
  33767. + skb_shinfo(nskb)->frag_list = p;
  33768. + skb_shinfo(nskb)->gso_size = pinfo->gso_size;
  33769. + pinfo->gso_size = 0;
  33770. + __skb_header_release(p);
  33771. + NAPI_GRO_CB(nskb)->last = p;
  33772. +
  33773. + nskb->data_len += p->len;
  33774. + nskb->truesize += p->truesize;
  33775. + nskb->len += p->len;
  33776. +
  33777. + *head = nskb;
  33778. + nskb->next = p->next;
  33779. + p->next = NULL;
  33780. +
  33781. + p = nskb;
  33782. +
  33783. +merge:
  33784. + delta_truesize = skb->truesize;
  33785. + if (offset > headlen) {
  33786. + unsigned int eat = offset - headlen;
  33787. +
  33788. + skbinfo->frags[0].page_offset += eat;
  33789. + skb_frag_size_sub(&skbinfo->frags[0], eat);
  33790. + skb->data_len -= eat;
  33791. + skb->len -= eat;
  33792. + offset = headlen;
  33793. + }
  33794. +
  33795. + __skb_pull(skb, offset);
  33796. +
  33797. + if (NAPI_GRO_CB(p)->last == p)
  33798. + skb_shinfo(p)->frag_list = skb;
  33799. + else
  33800. + NAPI_GRO_CB(p)->last->next = skb;
  33801. + NAPI_GRO_CB(p)->last = skb;
  33802. + __skb_header_release(skb);
  33803. + lp = p;
  33804. +
  33805. +done:
  33806. + NAPI_GRO_CB(p)->count++;
  33807. + p->data_len += len;
  33808. + p->truesize += delta_truesize;
  33809. + p->len += len;
  33810. + if (lp != p) {
  33811. + lp->data_len += len;
  33812. + lp->truesize += delta_truesize;
  33813. + lp->len += len;
  33814. + }
  33815. + NAPI_GRO_CB(skb)->same_flow = 1;
  33816. + return 0;
  33817. +}
  33818. +
  33819. +void __init skb_init(void)
  33820. +{
  33821. + skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
  33822. + sizeof(struct sk_buff),
  33823. + 0,
  33824. + SLAB_HWCACHE_ALIGN|SLAB_PANIC,
  33825. + NULL);
  33826. + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
  33827. + sizeof(struct sk_buff_fclones),
  33828. + 0,
  33829. + SLAB_HWCACHE_ALIGN|SLAB_PANIC,
  33830. + NULL);
  33831. +}
  33832. +
  33833. +/**
  33834. + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
  33835. + * @skb: Socket buffer containing the buffers to be mapped
  33836. + * @sg: The scatter-gather list to map into
  33837. + * @offset: The offset into the buffer's contents to start mapping
  33838. + * @len: Length of buffer space to be mapped
  33839. + *
  33840. + * Fill the specified scatter-gather list with mappings/pointers into a
  33841. + * region of the buffer space attached to a socket buffer.
  33842. + */
  33843. +static int
  33844. +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
  33845. +{
  33846. + int start = skb_headlen(skb);
  33847. + int i, copy = start - offset;
  33848. + struct sk_buff *frag_iter;
  33849. + int elt = 0;
  33850. +
  33851. + if (copy > 0) {
  33852. + if (copy > len)
  33853. + copy = len;
  33854. + sg_set_buf(sg, skb->data + offset, copy);
  33855. + elt++;
  33856. + if ((len -= copy) == 0)
  33857. + return elt;
  33858. + offset += copy;
  33859. + }
  33860. +
  33861. + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  33862. + int end;
  33863. +
  33864. + WARN_ON(start > offset + len);
  33865. +
  33866. + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
  33867. + if ((copy = end - offset) > 0) {
  33868. + skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  33869. +
  33870. + if (copy > len)
  33871. + copy = len;
  33872. + sg_set_page(&sg[elt], skb_frag_page(frag), copy,
  33873. + frag->page_offset+offset-start);
  33874. + elt++;
  33875. + if (!(len -= copy))
  33876. + return elt;
  33877. + offset += copy;
  33878. + }
  33879. + start = end;
  33880. + }
  33881. +
  33882. + skb_walk_frags(skb, frag_iter) {
  33883. + int end;
  33884. +
  33885. + WARN_ON(start > offset + len);
  33886. +
  33887. + end = start + frag_iter->len;
  33888. + if ((copy = end - offset) > 0) {
  33889. + if (copy > len)
  33890. + copy = len;
  33891. + elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
  33892. + copy);
  33893. + if ((len -= copy) == 0)
  33894. + return elt;
  33895. + offset += copy;
  33896. + }
  33897. + start = end;
  33898. + }
  33899. + BUG_ON(len);
  33900. + return elt;
  33901. +}
  33902. +
  33903. +/* Compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
  33904. + * given sglist without marking the sg entry that contains the last skb data
  33905. + * as the end. So the caller can manipulate the sg list at will when appending
  33906. + * new data after the first call, without calling sg_unmark_end to extend it.
  33907. + *
  33908. + * Scenario to use skb_to_sgvec_nomark:
  33909. + * 1. sg_init_table
  33910. + * 2. skb_to_sgvec_nomark(payload1)
  33911. + * 3. skb_to_sgvec_nomark(payload2)
  33912. + *
  33913. + * This is equivalent to:
  33914. + * 1. sg_init_table
  33915. + * 2. skb_to_sgvec(payload1)
  33916. + * 3. sg_unmark_end
  33917. + * 4. skb_to_sgvec(payload2)
  33918. + *
  33919. + * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
  33920. + * is preferable.
  33921. + */
  33922. +int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
  33923. + int offset, int len)
  33924. +{
  33925. + return __skb_to_sgvec(skb, sg, offset, len);
  33926. +}
  33927. +EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
  33928. +
  33929. +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
  33930. +{
  33931. + int nsg = __skb_to_sgvec(skb, sg, offset, len);
  33932. +
  33933. + sg_mark_end(&sg[nsg - 1]);
  33934. +
  33935. + return nsg;
  33936. +}
  33937. +EXPORT_SYMBOL_GPL(skb_to_sgvec);
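/*
 * Mapping sketch for skb_to_sgvec(): the caller provides a scatterlist
 * that is large enough (typically sized via skb_cow_data()); that sizing
 * is assumed to be done by the caller of this illustration.
 */
static int sgvec_example(struct sk_buff *skb, struct scatterlist *sg,
                         int nents)
{
        int nsg;

        sg_init_table(sg, nents);
        nsg = skb_to_sgvec(skb, sg, 0, skb->len);

        /* sg[0..nsg-1] now reference the skb head and its page frags,
         * with the last used entry marked as the end of the list.
         */
        return nsg;
}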
  33938. +
  33939. +/**
  33940. + * skb_cow_data - Check that a socket buffer's data buffers are writable
  33941. + * @skb: The socket buffer to check.
  33942. + * @tailbits: Amount of trailing space to be added
  33943. + * @trailer: Returned pointer to the skb where the @tailbits space begins
  33944. + *
  33945. + * Make sure that the data buffers attached to a socket buffer are
  33946. + * writable. If they are not, private copies are made of the data buffers
  33947. + * and the socket buffer is set to use these instead.
  33948. + *
  33949. + * If @tailbits is given, make sure that there is space to write @tailbits
  33950. + * bytes of data beyond current end of socket buffer. @trailer will be
  33951. + * set to point to the skb in which this space begins.
  33952. + *
  33953. + * The number of scatterlist elements required to completely map the
  33954. + * COW'd and extended socket buffer will be returned.
  33955. + */
  33956. +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
  33957. +{
  33958. + int copyflag;
  33959. + int elt;
  33960. + struct sk_buff *skb1, **skb_p;
  33961. +
  33962. + /* If skb is cloned or its head is paged, reallocate
  33963. + * head pulling out all the pages (pages are considered not writable
  33964. + * at the moment even if they are anonymous).
  33965. + */
  33966. + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
  33967. + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
  33968. + return -ENOMEM;
  33969. +
  33970. + /* Easy case. Most of packets will go this way. */
  33971. + if (!skb_has_frag_list(skb)) {
  33972. + /* A little trouble: not enough space for the trailer.
  33973. + * This should not happen when the stack is tuned to generate
  33974. + * good frames. OK, on a miss we reallocate and reserve even more
  33975. + * space; 128 bytes is fair. */
  33976. +
  33977. + if (skb_tailroom(skb) < tailbits &&
  33978. + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
  33979. + return -ENOMEM;
  33980. +
  33981. + /* Voila! */
  33982. + *trailer = skb;
  33983. + return 1;
  33984. + }
  33985. +
  33986. + /* Misery. We are in trouble, going to mince the fragments... */
  33987. +
  33988. + elt = 1;
  33989. + skb_p = &skb_shinfo(skb)->frag_list;
  33990. + copyflag = 0;
  33991. +
  33992. + while ((skb1 = *skb_p) != NULL) {
  33993. + int ntail = 0;
  33994. +
  33995. + /* The fragment is partially pulled by someone,
  33996. + * this can happen on input. Copy it and everything
  33997. + * after it. */
  33998. +
  33999. + if (skb_shared(skb1))
  34000. + copyflag = 1;
  34001. +
  34002. + /* If the skb is the last, worry about trailer. */
  34003. +
  34004. + if (skb1->next == NULL && tailbits) {
  34005. + if (skb_shinfo(skb1)->nr_frags ||
  34006. + skb_has_frag_list(skb1) ||
  34007. + skb_tailroom(skb1) < tailbits)
  34008. + ntail = tailbits + 128;
  34009. + }
  34010. +
  34011. + if (copyflag ||
  34012. + skb_cloned(skb1) ||
  34013. + ntail ||
  34014. + skb_shinfo(skb1)->nr_frags ||
  34015. + skb_has_frag_list(skb1)) {
  34016. + struct sk_buff *skb2;
  34017. +
  34018. + /* Fuck, we are miserable poor guys... */
  34019. + if (ntail == 0)
  34020. + skb2 = skb_copy(skb1, GFP_ATOMIC);
  34021. + else
  34022. + skb2 = skb_copy_expand(skb1,
  34023. + skb_headroom(skb1),
  34024. + ntail,
  34025. + GFP_ATOMIC);
  34026. + if (unlikely(skb2 == NULL))
  34027. + return -ENOMEM;
  34028. +
  34029. + if (skb1->sk)
  34030. + skb_set_owner_w(skb2, skb1->sk);
  34031. +
  34032. + /* Looking around. Are we still alive?
  34033. + * OK, link new skb, drop old one */
  34034. +
  34035. + skb2->next = skb1->next;
  34036. + *skb_p = skb2;
  34037. + kfree_skb(skb1);
  34038. + skb1 = skb2;
  34039. + }
  34040. + elt++;
  34041. + *trailer = skb1;
  34042. + skb_p = &skb1->next;
  34043. + }
  34044. +
  34045. + return elt;
  34046. +}
  34047. +EXPORT_SYMBOL_GPL(skb_cow_data);
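/*
 * Sketch of the IPsec-style pattern skb_cow_data() exists for: make the
 * buffers writable, learn how many scatterlist entries are needed, then
 * map them.  The allocation details are assumptions of this illustration.
 */
static int cow_and_map_example(struct sk_buff *skb, int tailbits)
{
        struct sk_buff *trailer;
        struct scatterlist *sg;
        int nfrags;

        nfrags = skb_cow_data(skb, tailbits, &trailer);
        if (nfrags < 0)
                return nfrags;

        sg = kmalloc_array(nfrags, sizeof(*sg), GFP_ATOMIC);
        if (!sg)
                return -ENOMEM;

        sg_init_table(sg, nfrags);
        skb_to_sgvec(skb, sg, 0, skb->len);

        /* ... pass sg to the transform here ... */

        kfree(sg);
        return 0;
}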
  34048. +
  34049. +static void sock_rmem_free(struct sk_buff *skb)
  34050. +{
  34051. + struct sock *sk = skb->sk;
  34052. +
  34053. + atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
  34054. +}
  34055. +
  34056. +/*
  34057. + * Note: We don't mem charge error packets (no sk_forward_alloc changes)
  34058. + */
  34059. +int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
  34060. +{
  34061. + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
  34062. + (unsigned int)sk->sk_rcvbuf)
  34063. + return -ENOMEM;
  34064. +
  34065. + skb_orphan(skb);
  34066. + skb->sk = sk;
  34067. + skb->destructor = sock_rmem_free;
  34068. + atomic_add(skb->truesize, &sk->sk_rmem_alloc);
  34069. +
  34070. + /* before exiting rcu section, make sure dst is refcounted */
  34071. + skb_dst_force(skb);
  34072. +
  34073. + skb_queue_tail(&sk->sk_error_queue, skb);
  34074. + if (!sock_flag(sk, SOCK_DEAD))
  34075. + sk->sk_data_ready(sk);
  34076. + return 0;
  34077. +}
  34078. +EXPORT_SYMBOL(sock_queue_err_skb);
  34079. +
  34080. +struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
  34081. +{
  34082. + struct sk_buff_head *q = &sk->sk_error_queue;
  34083. + struct sk_buff *skb, *skb_next;
  34084. + unsigned long flags;
  34085. + int err = 0;
  34086. +
  34087. + spin_lock_irqsave(&q->lock, flags);
  34088. + skb = __skb_dequeue(q);
  34089. + if (skb && (skb_next = skb_peek(q)))
  34090. + err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
  34091. + spin_unlock_irqrestore(&q->lock, flags);
  34092. +
  34093. + sk->sk_err = err;
  34094. + if (err)
  34095. + sk->sk_error_report(sk);
  34096. +
  34097. + return skb;
  34098. +}
  34099. +EXPORT_SYMBOL(sock_dequeue_err_skb);
  34100. +
  34101. +/**
  34102. + * skb_clone_sk - create clone of skb, and take reference to socket
  34103. + * @skb: the skb to clone
  34104. + *
  34105. + * This function creates a clone of a buffer that holds a reference on
  34106. + * sk_refcnt. Buffers created via this function are meant to be
  34107. + * returned using sock_queue_err_skb, or freed via kfree_skb.
  34108. + *
  34109. + * When passing buffers allocated with this function to sock_queue_err_skb
  34110. + * it is necessary to wrap the call with sock_hold/sock_put in order to
  34111. + * prevent the socket from being released prior to being enqueued on
  34112. + * the sk_error_queue.
  34113. + */
  34114. +struct sk_buff *skb_clone_sk(struct sk_buff *skb)
  34115. +{
  34116. + struct sock *sk = skb->sk;
  34117. + struct sk_buff *clone;
  34118. +
  34119. + if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
  34120. + return NULL;
  34121. +
  34122. + clone = skb_clone(skb, GFP_ATOMIC);
  34123. + if (!clone) {
  34124. + sock_put(sk);
  34125. + return NULL;
  34126. + }
  34127. +
  34128. + clone->sk = sk;
  34129. + clone->destructor = sock_efree;
  34130. +
  34131. + return clone;
  34132. +}
  34133. +EXPORT_SYMBOL(skb_clone_sk);
  34134. +
  34135. +static void __skb_complete_tx_timestamp(struct sk_buff *skb,
  34136. + struct sock *sk,
  34137. + int tstype)
  34138. +{
  34139. + struct sock_exterr_skb *serr;
  34140. + int err;
  34141. +
  34142. + serr = SKB_EXT_ERR(skb);
  34143. + memset(serr, 0, sizeof(*serr));
  34144. + serr->ee.ee_errno = ENOMSG;
  34145. + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
  34146. + serr->ee.ee_info = tstype;
  34147. + if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
  34148. + serr->ee.ee_data = skb_shinfo(skb)->tskey;
  34149. + if (sk->sk_protocol == IPPROTO_TCP)
  34150. + serr->ee.ee_data -= sk->sk_tskey;
  34151. + }
  34152. +
  34153. + err = sock_queue_err_skb(sk, skb);
  34154. +
  34155. + if (err)
  34156. + kfree_skb(skb);
  34157. +}
  34158. +
  34159. +void skb_complete_tx_timestamp(struct sk_buff *skb,
  34160. + struct skb_shared_hwtstamps *hwtstamps)
  34161. +{
  34162. + struct sock *sk = skb->sk;
  34163. +
  34164. + /* take a reference to prevent skb_orphan() from freeing the socket */
  34165. + sock_hold(sk);
  34166. +
  34167. + *skb_hwtstamps(skb) = *hwtstamps;
  34168. + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
  34169. +
  34170. + sock_put(sk);
  34171. +}
  34172. +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
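/*
 * Driver-side sketch of the flow described for skb_clone_sk(): clone at
 * transmit time, then complete the timestamp from the TX-done path.  The
 * hardware time source is a placeholder of this illustration.
 */
static void tx_tstamp_example(struct sk_buff *skb, ktime_t hwtime)
{
        struct skb_shared_hwtstamps hwts = { .hwtstamp = hwtime };
        struct sk_buff *clone;

        clone = skb_clone_sk(skb);      /* takes a reference on skb->sk */
        if (!clone)
                return;

        /* queues the clone on the socket error queue, or frees it on failure */
        skb_complete_tx_timestamp(clone, &hwts);
}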
  34173. +
  34174. +void __skb_tstamp_tx(struct sk_buff *orig_skb,
  34175. + struct skb_shared_hwtstamps *hwtstamps,
  34176. + struct sock *sk, int tstype)
  34177. +{
  34178. + struct sk_buff *skb;
  34179. +
  34180. + if (!sk)
  34181. + return;
  34182. +
  34183. + if (hwtstamps)
  34184. + *skb_hwtstamps(orig_skb) = *hwtstamps;
  34185. + else
  34186. + orig_skb->tstamp = ktime_get_real();
  34187. +
  34188. + skb = skb_clone(orig_skb, GFP_ATOMIC);
  34189. + if (!skb)
  34190. + return;
  34191. +
  34192. + __skb_complete_tx_timestamp(skb, sk, tstype);
  34193. +}
  34194. +EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
  34195. +
  34196. +void skb_tstamp_tx(struct sk_buff *orig_skb,
  34197. + struct skb_shared_hwtstamps *hwtstamps)
  34198. +{
  34199. + return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
  34200. + SCM_TSTAMP_SND);
  34201. +}
  34202. +EXPORT_SYMBOL_GPL(skb_tstamp_tx);
  34203. +
  34204. +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
  34205. +{
  34206. + struct sock *sk = skb->sk;
  34207. + struct sock_exterr_skb *serr;
  34208. + int err;
  34209. +
  34210. + skb->wifi_acked_valid = 1;
  34211. + skb->wifi_acked = acked;
  34212. +
  34213. + serr = SKB_EXT_ERR(skb);
  34214. + memset(serr, 0, sizeof(*serr));
  34215. + serr->ee.ee_errno = ENOMSG;
  34216. + serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
  34217. +
  34218. + /* take a reference to prevent skb_orphan() from freeing the socket */
  34219. + sock_hold(sk);
  34220. +
  34221. + err = sock_queue_err_skb(sk, skb);
  34222. + if (err)
  34223. + kfree_skb(skb);
  34224. +
  34225. + sock_put(sk);
  34226. +}
  34227. +EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
  34228. +
  34229. +
  34230. +/**
  34231. + * skb_partial_csum_set - set up and verify partial csum values for packet
  34232. + * @skb: the skb to set
  34233. + * @start: the number of bytes after skb->data to start checksumming.
  34234. + * @off: the offset from start to place the checksum.
  34235. + *
  34236. + * For untrusted partially-checksummed packets, we need to make sure the values
  34237. + * for skb->csum_start and skb->csum_offset are valid so we don't oops.
  34238. + *
  34239. + * This function checks and sets those values and skb->ip_summed: if this
  34240. + * returns false you should drop the packet.
  34241. + */
  34242. +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
  34243. +{
  34244. + if (unlikely(start > skb_headlen(skb)) ||
  34245. + unlikely((int)start + off > skb_headlen(skb) - 2)) {
  34246. + net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
  34247. + start, off, skb_headlen(skb));
  34248. + return false;
  34249. + }
  34250. + skb->ip_summed = CHECKSUM_PARTIAL;
  34251. + skb->csum_start = skb_headroom(skb) + start;
  34252. + skb->csum_offset = off;
  34253. + skb_set_transport_header(skb, start);
  34254. + return true;
  34255. +}
  34256. +EXPORT_SYMBOL_GPL(skb_partial_csum_set);
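/*
 * Sketch of the untrusted-metadata check described above, in the spirit
 * of a virtio-style receive path.  The offsets come from the
 * (illustrative) caller.
 */
static int partial_csum_example(struct sk_buff *skb, u16 csum_start,
                                u16 csum_offset)
{
        if (!skb_partial_csum_set(skb, csum_start, csum_offset)) {
                kfree_skb(skb);         /* bogus offsets: drop the packet */
                return -EINVAL;
        }
        return 0;
}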
  34257. +
  34258. +static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
  34259. + unsigned int max)
  34260. +{
  34261. + if (skb_headlen(skb) >= len)
  34262. + return 0;
  34263. +
  34264. + /* If we need to pullup then pullup to the max, so we
  34265. + * won't need to do it again.
  34266. + */
  34267. + if (max > skb->len)
  34268. + max = skb->len;
  34269. +
  34270. + if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL)
  34271. + return -ENOMEM;
  34272. +
  34273. + if (skb_headlen(skb) < len)
  34274. + return -EPROTO;
  34275. +
  34276. + return 0;
  34277. +}
  34278. +
  34279. +#define MAX_TCP_HDR_LEN (15 * 4)
  34280. +
  34281. +static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
  34282. + typeof(IPPROTO_IP) proto,
  34283. + unsigned int off)
  34284. +{
  34285. + switch (proto) {
  34286. + int err;
  34287. +
  34288. + case IPPROTO_TCP:
  34289. + err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
  34290. + off + MAX_TCP_HDR_LEN);
  34291. + if (!err && !skb_partial_csum_set(skb, off,
  34292. + offsetof(struct tcphdr,
  34293. + check)))
  34294. + err = -EPROTO;
  34295. + return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;
  34296. +
  34297. + case IPPROTO_UDP:
  34298. + err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
  34299. + off + sizeof(struct udphdr));
  34300. + if (!err && !skb_partial_csum_set(skb, off,
  34301. + offsetof(struct udphdr,
  34302. + check)))
  34303. + err = -EPROTO;
  34304. + return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
  34305. + }
  34306. +
  34307. + return ERR_PTR(-EPROTO);
  34308. +}
  34309. +
  34310. +/* This value should be large enough to cover a tagged ethernet header plus
  34311. + * maximally sized IP and TCP or UDP headers.
  34312. + */
  34313. +#define MAX_IP_HDR_LEN 128
  34314. +
  34315. +static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
  34316. +{
  34317. + unsigned int off;
  34318. + bool fragment;
  34319. + __sum16 *csum;
  34320. + int err;
  34321. +
  34322. + fragment = false;
  34323. +
  34324. + err = skb_maybe_pull_tail(skb,
  34325. + sizeof(struct iphdr),
  34326. + MAX_IP_HDR_LEN);
  34327. + if (err < 0)
  34328. + goto out;
  34329. +
  34330. + if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
  34331. + fragment = true;
  34332. +
  34333. + off = ip_hdrlen(skb);
  34334. +
  34335. + err = -EPROTO;
  34336. +
  34337. + if (fragment)
  34338. + goto out;
  34339. +
  34340. + csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
  34341. + if (IS_ERR(csum))
  34342. + return PTR_ERR(csum);
  34343. +
  34344. + if (recalculate)
  34345. + *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
  34346. + ip_hdr(skb)->daddr,
  34347. + skb->len - off,
  34348. + ip_hdr(skb)->protocol, 0);
  34349. + err = 0;
  34350. +
  34351. +out:
  34352. + return err;
  34353. +}
  34354. +
  34355. +/* This value should be large enough to cover a tagged ethernet header plus
  34356. + * an IPv6 header, all options, and a maximal TCP or UDP header.
  34357. + */
  34358. +#define MAX_IPV6_HDR_LEN 256
  34359. +
  34360. +#define OPT_HDR(type, skb, off) \
  34361. + (type *)(skb_network_header(skb) + (off))
  34362. +
  34363. +static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
  34364. +{
  34365. + int err;
  34366. + u8 nexthdr;
  34367. + unsigned int off;
  34368. + unsigned int len;
  34369. + bool fragment;
  34370. + bool done;
  34371. + __sum16 *csum;
  34372. +
  34373. + fragment = false;
  34374. + done = false;
  34375. +
  34376. + off = sizeof(struct ipv6hdr);
  34377. +
  34378. + err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
  34379. + if (err < 0)
  34380. + goto out;
  34381. +
  34382. + nexthdr = ipv6_hdr(skb)->nexthdr;
  34383. +
  34384. + len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
  34385. + while (off <= len && !done) {
  34386. + switch (nexthdr) {
  34387. + case IPPROTO_DSTOPTS:
  34388. + case IPPROTO_HOPOPTS:
  34389. + case IPPROTO_ROUTING: {
  34390. + struct ipv6_opt_hdr *hp;
  34391. +
  34392. + err = skb_maybe_pull_tail(skb,
  34393. + off +
  34394. + sizeof(struct ipv6_opt_hdr),
  34395. + MAX_IPV6_HDR_LEN);
  34396. + if (err < 0)
  34397. + goto out;
  34398. +
  34399. + hp = OPT_HDR(struct ipv6_opt_hdr, skb, off);
  34400. + nexthdr = hp->nexthdr;
  34401. + off += ipv6_optlen(hp);
  34402. + break;
  34403. + }
  34404. + case IPPROTO_AH: {
  34405. + struct ip_auth_hdr *hp;
  34406. +
  34407. + err = skb_maybe_pull_tail(skb,
  34408. + off +
  34409. + sizeof(struct ip_auth_hdr),
  34410. + MAX_IPV6_HDR_LEN);
  34411. + if (err < 0)
  34412. + goto out;
  34413. +
  34414. + hp = OPT_HDR(struct ip_auth_hdr, skb, off);
  34415. + nexthdr = hp->nexthdr;
  34416. + off += ipv6_authlen(hp);
  34417. + break;
  34418. + }
  34419. + case IPPROTO_FRAGMENT: {
  34420. + struct frag_hdr *hp;
  34421. +
  34422. + err = skb_maybe_pull_tail(skb,
  34423. + off +
  34424. + sizeof(struct frag_hdr),
  34425. + MAX_IPV6_HDR_LEN);
  34426. + if (err < 0)
  34427. + goto out;
  34428. +
  34429. + hp = OPT_HDR(struct frag_hdr, skb, off);
  34430. +
  34431. + if (hp->frag_off & htons(IP6_OFFSET | IP6_MF))
  34432. + fragment = true;
  34433. +
  34434. + nexthdr = hp->nexthdr;
  34435. + off += sizeof(struct frag_hdr);
  34436. + break;
  34437. + }
  34438. + default:
  34439. + done = true;
  34440. + break;
  34441. + }
  34442. + }
  34443. +
  34444. + err = -EPROTO;
  34445. +
  34446. + if (!done || fragment)
  34447. + goto out;
  34448. +
  34449. + csum = skb_checksum_setup_ip(skb, nexthdr, off);
  34450. + if (IS_ERR(csum))
  34451. + return PTR_ERR(csum);
  34452. +
  34453. + if (recalculate)
  34454. + *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
  34455. + &ipv6_hdr(skb)->daddr,
  34456. + skb->len - off, nexthdr, 0);
  34457. + err = 0;
  34458. +
  34459. +out:
  34460. + return err;
  34461. +}
  34462. +
  34463. +/**
  34464. + * skb_checksum_setup - set up partial checksum offset
  34465. + * @skb: the skb to set up
  34466. + * @recalculate: if true the pseudo-header checksum will be recalculated
  34467. + */
  34468. +int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
  34469. +{
  34470. + int err;
  34471. +
  34472. + switch (skb->protocol) {
  34473. + case htons(ETH_P_IP):
  34474. + err = skb_checksum_setup_ipv4(skb, recalculate);
  34475. + break;
  34476. +
  34477. + case htons(ETH_P_IPV6):
  34478. + err = skb_checksum_setup_ipv6(skb, recalculate);
  34479. + break;
  34480. +
  34481. + default:
  34482. + err = -EPROTO;
  34483. + break;
  34484. + }
  34485. +
  34486. + return err;
  34487. +}
  34488. +EXPORT_SYMBOL(skb_checksum_setup);
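/*
 * Illustrative sketch, not part of this patch: one way a receive path that
 * gets packets with an unfinished checksum (e.g. from a virtual interface)
 * might call skb_checksum_setup() above; the wrapper name is an assumption.
 */
static int example_fixup_partial_csum(struct sk_buff *skb)
{
	/* true: also recalculate the TCP/UDP pseudo-header checksum */
	return skb_checksum_setup(skb, true);	/* -EPROTO if unsupported */
}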
  34489. +
  34490. +void __skb_warn_lro_forwarding(const struct sk_buff *skb)
  34491. +{
  34492. + net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
  34493. + skb->dev->name);
  34494. +}
  34495. +EXPORT_SYMBOL(__skb_warn_lro_forwarding);
  34496. +
  34497. +void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
  34498. +{
  34499. + if (head_stolen) {
  34500. + skb_release_head_state(skb);
  34501. + kmem_cache_free(skbuff_head_cache, skb);
  34502. + } else {
  34503. + __kfree_skb(skb);
  34504. + }
  34505. +}
  34506. +EXPORT_SYMBOL(kfree_skb_partial);
  34507. +
  34508. +/**
  34509. + * skb_try_coalesce - try to merge skb to prior one
  34510. + * @to: prior buffer
  34511. + * @from: buffer to add
34512. + * @fragstolen: pointer to boolean, set true if @from's head data was moved into a page fragment of @to
  34513. + * @delta_truesize: how much more was allocated than was requested
  34514. + */
  34515. +bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
  34516. + bool *fragstolen, int *delta_truesize)
  34517. +{
  34518. + int i, delta, len = from->len;
  34519. +
  34520. + *fragstolen = false;
  34521. +
  34522. + if (skb_cloned(to))
  34523. + return false;
  34524. +
  34525. + if (len <= skb_tailroom(to)) {
  34526. + if (len)
  34527. + BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
  34528. + *delta_truesize = 0;
  34529. + return true;
  34530. + }
  34531. +
  34532. + if (skb_has_frag_list(to) || skb_has_frag_list(from))
  34533. + return false;
  34534. +
  34535. + if (skb_headlen(from) != 0) {
  34536. + struct page *page;
  34537. + unsigned int offset;
  34538. +
  34539. + if (skb_shinfo(to)->nr_frags +
  34540. + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
  34541. + return false;
  34542. +
  34543. + if (skb_head_is_locked(from))
  34544. + return false;
  34545. +
  34546. + delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
  34547. +
  34548. + page = virt_to_head_page(from->head);
  34549. + offset = from->data - (unsigned char *)page_address(page);
  34550. +
  34551. + skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
  34552. + page, offset, skb_headlen(from));
  34553. + *fragstolen = true;
  34554. + } else {
  34555. + if (skb_shinfo(to)->nr_frags +
  34556. + skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
  34557. + return false;
  34558. +
  34559. + delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
  34560. + }
  34561. +
  34562. + WARN_ON_ONCE(delta < len);
  34563. +
  34564. + memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
  34565. + skb_shinfo(from)->frags,
  34566. + skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
  34567. + skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
  34568. +
  34569. + if (!skb_cloned(from))
  34570. + skb_shinfo(from)->nr_frags = 0;
  34571. +
  34572. + /* if the skb is not cloned this does nothing
  34573. + * since we set nr_frags to 0.
  34574. + */
  34575. + for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
  34576. + skb_frag_ref(from, i);
  34577. +
  34578. + to->truesize += delta;
  34579. + to->len += len;
  34580. + to->data_len += len;
  34581. +
  34582. + *delta_truesize = delta;
  34583. + return true;
  34584. +}
  34585. +EXPORT_SYMBOL(skb_try_coalesce);
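/*
 * Illustrative sketch, not part of this patch: the usual pairing of
 * skb_try_coalesce() and kfree_skb_partial() from above when appending a new
 * buffer to the tail of a receive queue; "tail" and the truesize bookkeeping
 * are assumed context.
 */
static bool example_queue_coalesce(struct sk_buff *tail, struct sk_buff *skb,
				   unsigned int *rmem)
{
	bool fragstolen;
	int delta;

	if (!skb_try_coalesce(tail, skb, &fragstolen, &delta))
		return false;

	/* The payload now lives in "tail"; free what is left of "skb". */
	kfree_skb_partial(skb, fragstolen);
	*rmem += delta;	/* account only for the extra truesize */
	return true;
}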
  34586. +
  34587. +/**
  34588. + * skb_scrub_packet - scrub an skb
  34589. + *
  34590. + * @skb: buffer to clean
  34591. + * @xnet: packet is crossing netns
  34592. + *
34593. + * skb_scrub_packet can be used after encapsulating or decapsulating a packet
34594. + * into/from a tunnel. Some information has to be cleared during these
34595. + * operations.
34596. + * skb_scrub_packet can also be used to clean an skb before injecting it into
  34597. + * another namespace (@xnet == true). We have to clear all information in the
  34598. + * skb that could impact namespace isolation.
  34599. + */
  34600. +void skb_scrub_packet(struct sk_buff *skb, bool xnet)
  34601. +{
  34602. + skb->tstamp.tv64 = 0;
  34603. + skb->pkt_type = PACKET_HOST;
  34604. + skb->skb_iif = 0;
  34605. + skb->ignore_df = 0;
  34606. + skb_dst_drop(skb);
  34607. + secpath_reset(skb);
  34608. + nf_reset(skb);
  34609. + nf_reset_trace(skb);
  34610. +
  34611. + if (!xnet)
  34612. + return;
  34613. +
  34614. + skb_orphan(skb);
  34615. + skb->mark = 0;
  34616. +}
  34617. +EXPORT_SYMBOL_GPL(skb_scrub_packet);
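/*
 * Illustrative sketch, not part of this patch: scrubbing before handing an skb
 * to a device that may live in another network namespace; the helper name and
 * the direct dev_queue_xmit() are assumptions for the example.
 */
static int example_forward_to_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_scrub_packet(skb, !net_eq(dev_net(skb->dev), dev_net(dev)));
	skb->dev = dev;
	return dev_queue_xmit(skb);
}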
  34618. +
  34619. +/**
  34620. + * skb_gso_transport_seglen - Return length of individual segments of a gso packet
  34621. + *
  34622. + * @skb: GSO skb
  34623. + *
  34624. + * skb_gso_transport_seglen is used to determine the real size of the
34625. + * individual segments, including the Layer 4 (TCP/UDP) headers.
  34626. + *
  34627. + * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
  34628. + */
  34629. +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
  34630. +{
  34631. + const struct skb_shared_info *shinfo = skb_shinfo(skb);
  34632. + unsigned int thlen = 0;
  34633. +
  34634. + if (skb->encapsulation) {
  34635. + thlen = skb_inner_transport_header(skb) -
  34636. + skb_transport_header(skb);
  34637. +
  34638. + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
  34639. + thlen += inner_tcp_hdrlen(skb);
  34640. + } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
  34641. + thlen = tcp_hdrlen(skb);
  34642. + }
  34643. + /* UFO sets gso_size to the size of the fragmentation
  34644. + * payload, i.e. the size of the L4 (UDP) header is already
  34645. + * accounted for.
  34646. + */
  34647. + return thlen + shinfo->gso_size;
  34648. +}
  34649. +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
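/*
 * Illustrative sketch, not part of this patch: using skb_gso_transport_seglen()
 * above to check that every segment produced from a GSO skb respects a
 * transport-level payload limit; it assumes skb_is_gso(skb) and "limit" are
 * provided by the caller.
 */
static bool example_gso_segments_fit(const struct sk_buff *skb,
				     unsigned int limit)
{
	/* L4 header length + gso_size is what each resulting segment carries. */
	return skb_gso_transport_seglen(skb) <= limit;
}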
  34650. +
  34651. +static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
  34652. +{
  34653. + if (skb_cow(skb, skb_headroom(skb)) < 0) {
  34654. + kfree_skb(skb);
  34655. + return NULL;
  34656. + }
  34657. +
  34658. + memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
  34659. + skb->mac_header += VLAN_HLEN;
  34660. + return skb;
  34661. +}
  34662. +
  34663. +struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
  34664. +{
  34665. + struct vlan_hdr *vhdr;
  34666. + u16 vlan_tci;
  34667. +
  34668. + if (unlikely(vlan_tx_tag_present(skb))) {
  34669. + /* vlan_tci is already set-up so leave this for another time */
  34670. + return skb;
  34671. + }
  34672. +
  34673. + skb = skb_share_check(skb, GFP_ATOMIC);
  34674. + if (unlikely(!skb))
  34675. + goto err_free;
  34676. +
  34677. + if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
  34678. + goto err_free;
  34679. +
  34680. + vhdr = (struct vlan_hdr *)skb->data;
  34681. + vlan_tci = ntohs(vhdr->h_vlan_TCI);
  34682. + __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
  34683. +
  34684. + skb_pull_rcsum(skb, VLAN_HLEN);
  34685. + vlan_set_encap_proto(skb, vhdr);
  34686. +
  34687. + skb = skb_reorder_vlan_header(skb);
  34688. + if (unlikely(!skb))
  34689. + goto err_free;
  34690. +
  34691. + skb_reset_network_header(skb);
  34692. + skb_reset_transport_header(skb);
  34693. + skb_reset_mac_len(skb);
  34694. +
  34695. + return skb;
  34696. +
  34697. +err_free:
  34698. + kfree_skb(skb);
  34699. + return NULL;
  34700. +}
  34701. +EXPORT_SYMBOL(skb_vlan_untag);
  34702. +
  34703. +/**
  34704. + * alloc_skb_with_frags - allocate skb with page frags
  34705. + *
  34706. + * @header_len: size of linear part
  34707. + * @data_len: needed length in frags
  34708. + * @max_page_order: max page order desired.
  34709. + * @errcode: pointer to error code if any
  34710. + * @gfp_mask: allocation mask
  34711. + *
  34712. + * This can be used to allocate a paged skb, given a maximal order for frags.
  34713. + */
  34714. +struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
  34715. + unsigned long data_len,
  34716. + int max_page_order,
  34717. + int *errcode,
  34718. + gfp_t gfp_mask)
  34719. +{
  34720. + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
  34721. + unsigned long chunk;
  34722. + struct sk_buff *skb;
  34723. + struct page *page;
  34724. + gfp_t gfp_head;
  34725. + int i;
  34726. +
  34727. + *errcode = -EMSGSIZE;
34728. + /* Note this test could be relaxed if we manage to allocate
  34729. + * high order pages...
  34730. + */
  34731. + if (npages > MAX_SKB_FRAGS)
  34732. + return NULL;
  34733. +
  34734. + gfp_head = gfp_mask;
  34735. + if (gfp_head & __GFP_WAIT)
  34736. + gfp_head |= __GFP_REPEAT;
  34737. +
  34738. + *errcode = -ENOBUFS;
  34739. + skb = alloc_skb(header_len, gfp_head);
  34740. + if (!skb)
  34741. + return NULL;
  34742. +
  34743. + skb->truesize += npages << PAGE_SHIFT;
  34744. +
  34745. + for (i = 0; npages > 0; i++) {
  34746. + int order = max_page_order;
  34747. +
  34748. + while (order) {
  34749. + if (npages >= 1 << order) {
  34750. + page = alloc_pages(gfp_mask |
  34751. + __GFP_COMP |
  34752. + __GFP_NOWARN |
  34753. + __GFP_NORETRY,
  34754. + order);
  34755. + if (page)
  34756. + goto fill_page;
  34757. + /* Do not retry other high order allocations */
  34758. + order = 1;
  34759. + max_page_order = 0;
  34760. + }
  34761. + order--;
  34762. + }
  34763. + page = alloc_page(gfp_mask);
  34764. + if (!page)
  34765. + goto failure;
  34766. +fill_page:
  34767. + chunk = min_t(unsigned long, data_len,
  34768. + PAGE_SIZE << order);
  34769. + skb_fill_page_desc(skb, i, page, 0, chunk);
  34770. + data_len -= chunk;
  34771. + npages -= 1 << order;
  34772. + }
  34773. + return skb;
  34774. +
  34775. +failure:
  34776. + kfree_skb(skb);
  34777. + return NULL;
  34778. +}
  34779. +EXPORT_SYMBOL(alloc_skb_with_frags);
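/*
 * Illustrative sketch, not part of this patch: the calling convention of
 * alloc_skb_with_frags() above. The PAGE_ALLOC_COSTLY_ORDER cap and the
 * header/data split are arbitrary choices for the example.
 */
static struct sk_buff *example_alloc_paged_skb(unsigned long header_len,
					       unsigned long data_len)
{
	int err;
	struct sk_buff *skb = alloc_skb_with_frags(header_len, data_len,
						   PAGE_ALLOC_COSTLY_ORDER,
						   &err, GFP_KERNEL);

	return skb ? skb : ERR_PTR(err);	/* -EMSGSIZE or -ENOBUFS */
}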
  34780. diff -Nur linux-3.18.14.orig/net/core/sock.c linux-3.18.14-rt/net/core/sock.c
  34781. --- linux-3.18.14.orig/net/core/sock.c 2015-05-20 10:04:50.000000000 -0500
  34782. +++ linux-3.18.14-rt/net/core/sock.c 2015-05-31 15:32:49.433635358 -0500
  34783. @@ -2345,12 +2345,11 @@
  34784. if (sk->sk_lock.owned)
  34785. __lock_sock(sk);
  34786. sk->sk_lock.owned = 1;
  34787. - spin_unlock(&sk->sk_lock.slock);
  34788. + spin_unlock_bh(&sk->sk_lock.slock);
  34789. /*
  34790. * The sk_lock has mutex_lock() semantics here:
  34791. */
  34792. mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
  34793. - local_bh_enable();
  34794. }
  34795. EXPORT_SYMBOL(lock_sock_nested);
  34796. diff -Nur linux-3.18.14.orig/net/ipv4/icmp.c linux-3.18.14-rt/net/ipv4/icmp.c
  34797. --- linux-3.18.14.orig/net/ipv4/icmp.c 2015-05-20 10:04:50.000000000 -0500
  34798. +++ linux-3.18.14-rt/net/ipv4/icmp.c 2015-05-31 15:32:49.457635357 -0500
  34799. @@ -69,6 +69,7 @@
  34800. #include <linux/jiffies.h>
  34801. #include <linux/kernel.h>
  34802. #include <linux/fcntl.h>
  34803. +#include <linux/sysrq.h>
  34804. #include <linux/socket.h>
  34805. #include <linux/in.h>
  34806. #include <linux/inet.h>
  34807. @@ -864,6 +865,30 @@
  34808. }
  34809. /*
34810. + * 32-bit and 64-bit systems have different timestamp lengths, so we check for
  34811. + * the cookie at offset 20 and verify it is repeated at offset 50
  34812. + */
  34813. +#define CO_POS0 20
  34814. +#define CO_POS1 50
  34815. +#define CO_SIZE sizeof(int)
  34816. +#define ICMP_SYSRQ_SIZE 57
  34817. +
  34818. +/*
34819. + * We got an ICMP_SYSRQ_SIZE-sized ping request. Check for the cookie
34820. + * pattern and, if it matches, send the next byte as a trigger to sysrq.
  34821. + */
  34822. +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
  34823. +{
  34824. + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
  34825. + char *p = skb->data;
  34826. +
  34827. + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
  34828. + !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
  34829. + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
  34830. + handle_sysrq(p[CO_POS0 + CO_SIZE]);
  34831. +}
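/*
 * Illustrative sketch, not part of this patch: how a sender could lay out the
 * 57-byte echo payload that icmp_check_sysrq() above looks for. Offsets are
 * relative to the start of the echo payload; "cookie" is the value configured
 * via the icmp_echo_sysrq sysctl and "key" the sysrq command character.
 */
static void example_fill_sysrq_payload(unsigned char *buf, u32 cookie, char key)
{
	u32 pattern = htonl(cookie);

	memset(buf, 0, ICMP_SYSRQ_SIZE);
	memcpy(buf + CO_POS0, &pattern, CO_SIZE);	/* cookie at offset 20 */
	memcpy(buf + CO_POS1, &pattern, CO_SIZE);	/* repeated at offset 50 */
	buf[CO_POS0 + CO_SIZE] = key;			/* trigger byte ...    */
	buf[CO_POS1 + CO_SIZE] = key;			/* ... must match too  */
}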
  34832. +
  34833. +/*
  34834. * Handle ICMP_ECHO ("ping") requests.
  34835. *
  34836. * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
  34837. @@ -890,6 +915,11 @@
  34838. icmp_param.data_len = skb->len;
  34839. icmp_param.head_len = sizeof(struct icmphdr);
  34840. icmp_reply(&icmp_param, skb);
  34841. +
  34842. + if (skb->len == ICMP_SYSRQ_SIZE &&
  34843. + net->ipv4.sysctl_icmp_echo_sysrq) {
  34844. + icmp_check_sysrq(net, skb);
  34845. + }
  34846. }
  34847. }
  34848. diff -Nur linux-3.18.14.orig/net/ipv4/sysctl_net_ipv4.c linux-3.18.14-rt/net/ipv4/sysctl_net_ipv4.c
  34849. --- linux-3.18.14.orig/net/ipv4/sysctl_net_ipv4.c 2015-05-20 10:04:50.000000000 -0500
  34850. +++ linux-3.18.14-rt/net/ipv4/sysctl_net_ipv4.c 2015-05-31 15:32:49.485635357 -0500
  34851. @@ -779,6 +779,13 @@
  34852. .proc_handler = proc_dointvec
  34853. },
  34854. {
  34855. + .procname = "icmp_echo_sysrq",
  34856. + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq,
  34857. + .maxlen = sizeof(int),
  34858. + .mode = 0644,
  34859. + .proc_handler = proc_dointvec
  34860. + },
  34861. + {
  34862. .procname = "icmp_ignore_bogus_error_responses",
  34863. .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
  34864. .maxlen = sizeof(int),
  34865. diff -Nur linux-3.18.14.orig/net/mac80211/rx.c linux-3.18.14-rt/net/mac80211/rx.c
  34866. --- linux-3.18.14.orig/net/mac80211/rx.c 2015-05-20 10:04:50.000000000 -0500
  34867. +++ linux-3.18.14-rt/net/mac80211/rx.c 2015-05-31 15:32:49.501635357 -0500
  34868. @@ -3360,7 +3360,7 @@
  34869. struct ieee80211_supported_band *sband;
  34870. struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  34871. - WARN_ON_ONCE(softirq_count() == 0);
  34872. + WARN_ON_ONCE_NONRT(softirq_count() == 0);
  34873. if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
  34874. goto drop;
  34875. diff -Nur linux-3.18.14.orig/net/netfilter/core.c linux-3.18.14-rt/net/netfilter/core.c
  34876. --- linux-3.18.14.orig/net/netfilter/core.c 2015-05-20 10:04:50.000000000 -0500
  34877. +++ linux-3.18.14-rt/net/netfilter/core.c 2015-05-31 15:32:49.549635357 -0500
  34878. @@ -21,11 +21,17 @@
  34879. #include <linux/proc_fs.h>
  34880. #include <linux/mutex.h>
  34881. #include <linux/slab.h>
  34882. +#include <linux/locallock.h>
  34883. #include <net/net_namespace.h>
  34884. #include <net/sock.h>
  34885. #include "nf_internals.h"
  34886. +#ifdef CONFIG_PREEMPT_RT_BASE
  34887. +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
  34888. +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
  34889. +#endif
  34890. +
  34891. static DEFINE_MUTEX(afinfo_mutex);
  34892. const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
  34893. diff -Nur linux-3.18.14.orig/net/packet/af_packet.c linux-3.18.14-rt/net/packet/af_packet.c
  34894. --- linux-3.18.14.orig/net/packet/af_packet.c 2015-05-20 10:04:50.000000000 -0500
  34895. +++ linux-3.18.14-rt/net/packet/af_packet.c 2015-05-31 15:32:49.557635357 -0500
  34896. @@ -63,6 +63,7 @@
  34897. #include <linux/if_packet.h>
  34898. #include <linux/wireless.h>
  34899. #include <linux/kernel.h>
  34900. +#include <linux/delay.h>
  34901. #include <linux/kmod.h>
  34902. #include <linux/slab.h>
  34903. #include <linux/vmalloc.h>
  34904. @@ -692,7 +693,7 @@
  34905. if (BLOCK_NUM_PKTS(pbd)) {
  34906. while (atomic_read(&pkc->blk_fill_in_prog)) {
  34907. /* Waiting for skb_copy_bits to finish... */
  34908. - cpu_relax();
  34909. + cpu_chill();
  34910. }
  34911. }
  34912. @@ -943,7 +944,7 @@
  34913. if (!(status & TP_STATUS_BLK_TMO)) {
  34914. while (atomic_read(&pkc->blk_fill_in_prog)) {
  34915. /* Waiting for skb_copy_bits to finish... */
  34916. - cpu_relax();
  34917. + cpu_chill();
  34918. }
  34919. }
  34920. prb_close_block(pkc, pbd, po, status);
  34921. diff -Nur linux-3.18.14.orig/net/rds/ib_rdma.c linux-3.18.14-rt/net/rds/ib_rdma.c
  34922. --- linux-3.18.14.orig/net/rds/ib_rdma.c 2015-05-20 10:04:50.000000000 -0500
  34923. +++ linux-3.18.14-rt/net/rds/ib_rdma.c 2015-05-31 15:32:49.573635357 -0500
  34924. @@ -34,6 +34,7 @@
  34925. #include <linux/slab.h>
  34926. #include <linux/rculist.h>
  34927. #include <linux/llist.h>
  34928. +#include <linux/delay.h>
  34929. #include "rds.h"
  34930. #include "ib.h"
  34931. @@ -286,7 +287,7 @@
  34932. for_each_online_cpu(cpu) {
  34933. flag = &per_cpu(clean_list_grace, cpu);
  34934. while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
  34935. - cpu_relax();
  34936. + cpu_chill();
  34937. }
  34938. }
  34939. diff -Nur linux-3.18.14.orig/net/sched/sch_generic.c linux-3.18.14-rt/net/sched/sch_generic.c
  34940. --- linux-3.18.14.orig/net/sched/sch_generic.c 2015-05-20 10:04:50.000000000 -0500
  34941. +++ linux-3.18.14-rt/net/sched/sch_generic.c 2015-05-31 15:32:49.593635356 -0500
  34942. @@ -894,7 +894,7 @@
  34943. /* Wait for outstanding qdisc_run calls. */
  34944. list_for_each_entry(dev, head, close_list)
  34945. while (some_qdisc_is_busy(dev))
  34946. - yield();
  34947. + msleep(1);
  34948. }
  34949. void dev_deactivate(struct net_device *dev)
  34950. diff -Nur linux-3.18.14.orig/net/sunrpc/svc_xprt.c linux-3.18.14-rt/net/sunrpc/svc_xprt.c
  34951. --- linux-3.18.14.orig/net/sunrpc/svc_xprt.c 2015-05-20 10:04:50.000000000 -0500
  34952. +++ linux-3.18.14-rt/net/sunrpc/svc_xprt.c 2015-05-31 15:32:49.617635356 -0500
  34953. @@ -357,7 +357,7 @@
  34954. return;
  34955. }
  34956. - cpu = get_cpu();
  34957. + cpu = get_cpu_light();
  34958. pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
  34959. spin_lock_bh(&pool->sp_lock);
  34960. @@ -390,7 +390,7 @@
  34961. }
  34962. spin_unlock_bh(&pool->sp_lock);
  34963. - put_cpu();
  34964. + put_cpu_light();
  34965. }
  34966. /*
  34967. diff -Nur linux-3.18.14.orig/scripts/mkcompile_h linux-3.18.14-rt/scripts/mkcompile_h
  34968. --- linux-3.18.14.orig/scripts/mkcompile_h 2015-05-20 10:04:50.000000000 -0500
  34969. +++ linux-3.18.14-rt/scripts/mkcompile_h 2015-05-31 15:32:49.641635356 -0500
  34970. @@ -4,7 +4,8 @@
  34971. ARCH=$2
  34972. SMP=$3
  34973. PREEMPT=$4
  34974. -CC=$5
  34975. +RT=$5
  34976. +CC=$6
  34977. vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
  34978. @@ -57,6 +58,7 @@
  34979. CONFIG_FLAGS=""
  34980. if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
  34981. if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
  34982. +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
  34983. UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
  34984. # Truncate to maximum length
  34985. diff -Nur linux-3.18.14.orig/sound/core/pcm_native.c linux-3.18.14-rt/sound/core/pcm_native.c
  34986. --- linux-3.18.14.orig/sound/core/pcm_native.c 2015-05-20 10:04:50.000000000 -0500
  34987. +++ linux-3.18.14-rt/sound/core/pcm_native.c 2015-05-31 15:32:49.661635356 -0500
  34988. @@ -104,7 +104,7 @@
  34989. void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
  34990. {
  34991. if (!substream->pcm->nonatomic)
  34992. - local_irq_disable();
  34993. + local_irq_disable_nort();
  34994. snd_pcm_stream_lock(substream);
  34995. }
  34996. EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
  34997. @@ -113,7 +113,7 @@
  34998. {
  34999. snd_pcm_stream_unlock(substream);
  35000. if (!substream->pcm->nonatomic)
  35001. - local_irq_enable();
  35002. + local_irq_enable_nort();
  35003. }
  35004. EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
  35005. @@ -121,7 +121,7 @@
  35006. {
  35007. unsigned long flags = 0;
  35008. if (!substream->pcm->nonatomic)
  35009. - local_irq_save(flags);
  35010. + local_irq_save_nort(flags);
  35011. snd_pcm_stream_lock(substream);
  35012. return flags;
  35013. }
  35014. @@ -132,7 +132,7 @@
  35015. {
  35016. snd_pcm_stream_unlock(substream);
  35017. if (!substream->pcm->nonatomic)
  35018. - local_irq_restore(flags);
  35019. + local_irq_restore_nort(flags);
  35020. }
  35021. EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
  35022. diff -Nur linux-3.18.14.orig/virt/kvm/async_pf.c linux-3.18.14-rt/virt/kvm/async_pf.c
  35023. --- linux-3.18.14.orig/virt/kvm/async_pf.c 2015-05-20 10:04:50.000000000 -0500
  35024. +++ linux-3.18.14-rt/virt/kvm/async_pf.c 2015-05-31 15:32:49.661635356 -0500
  35025. @@ -94,8 +94,8 @@
  35026. trace_kvm_async_pf_completed(addr, gva);
  35027. - if (waitqueue_active(&vcpu->wq))
  35028. - wake_up_interruptible(&vcpu->wq);
  35029. + if (swaitqueue_active(&vcpu->wq))
  35030. + swait_wake_interruptible(&vcpu->wq);
  35031. mmput(mm);
  35032. kvm_put_kvm(vcpu->kvm);
  35033. diff -Nur linux-3.18.14.orig/virt/kvm/kvm_main.c linux-3.18.14-rt/virt/kvm/kvm_main.c
  35034. --- linux-3.18.14.orig/virt/kvm/kvm_main.c 2015-05-20 10:04:50.000000000 -0500
  35035. +++ linux-3.18.14-rt/virt/kvm/kvm_main.c 2015-05-31 15:32:49.697635356 -0500
  35036. @@ -221,7 +221,7 @@
  35037. vcpu->kvm = kvm;
  35038. vcpu->vcpu_id = id;
  35039. vcpu->pid = NULL;
  35040. - init_waitqueue_head(&vcpu->wq);
  35041. + init_swait_head(&vcpu->wq);
  35042. kvm_async_pf_vcpu_init(vcpu);
  35043. page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  35044. @@ -1741,10 +1741,10 @@
  35045. */
  35046. void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  35047. {
  35048. - DEFINE_WAIT(wait);
  35049. + DEFINE_SWAITER(wait);
  35050. for (;;) {
  35051. - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  35052. + swait_prepare(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  35053. if (kvm_arch_vcpu_runnable(vcpu)) {
  35054. kvm_make_request(KVM_REQ_UNHALT, vcpu);
  35055. @@ -1758,7 +1758,7 @@
  35056. schedule();
  35057. }
  35058. - finish_wait(&vcpu->wq, &wait);
  35059. + swait_finish(&vcpu->wq, &wait);
  35060. }
  35061. EXPORT_SYMBOL_GPL(kvm_vcpu_block);
  35062. @@ -1770,11 +1770,11 @@
  35063. {
  35064. int me;
  35065. int cpu = vcpu->cpu;
  35066. - wait_queue_head_t *wqp;
  35067. + struct swait_head *wqp;
  35068. wqp = kvm_arch_vcpu_wq(vcpu);
  35069. - if (waitqueue_active(wqp)) {
  35070. - wake_up_interruptible(wqp);
  35071. + if (swaitqueue_active(wqp)) {
  35072. + swait_wake_interruptible(wqp);
  35073. ++vcpu->stat.halt_wakeup;
  35074. }
  35075. @@ -1879,7 +1879,7 @@
  35076. continue;
  35077. if (vcpu == me)
  35078. continue;
  35079. - if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
  35080. + if (swaitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
  35081. continue;
  35082. if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
  35083. continue;