realtime.patch 1004 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
070820709207102071120712207132071420715207162071720718207192072020721207222072320724207252072620727207282072920730207312073220733207342073520736207372073820739207402074120742207432074420745207462074720748207492075020751207522075320754207552075620757207582075920760207612076220763207642076520766207672076820769207702077120772207732077420775207762077720778207792078020781207822078320784207852078620787207882078920790207912079220793207942079520796207972079820799208002080120802208032080420805208062080720808208092081020811208122081320814208152081620817208182081920820208212082220823208242082520826208272082820829208302083120832208332083420835208362083720838208392084020841208422084320844208452084620847208482084920850208512085220853208542085520856208572085820859208602086120862208632086420865208662086720868208692087020871208722087320874208752087620877208782087920880208812088220883208842088520886208872088820889208902089120892208932089420895208962089720898208992090020901209022090320904209052090620907209082090920910209112091220913209142091520916209172091820919209202092120922209232092420925209262092720928209292093020931209322093320934209352093620937209382093920940209412094220943209442094520946209472094820949209502095120952209532095420955209562095720958209592096020961209622096320964209652096620967209682096920970209712097220973209742097520976209772097820979209802098120982209832098420985209862098720988209892099020991209922099320994209952099620997209982099921000210012100221003210042100521006210072100821009210102101121012210132101421015210162101721018210192102021021210222102321024210252102621027210282102921030210312103221033210342103521036210372103821039210402104121042210432104421045210462104721048210492105021051210522105321054210552105621057210582105921060210612106221063210642106521066210672106821069210702107121072210732107421075210762107721078210792108021081210822108321084210852108621087210882108921090210912109221093210942109521096210972109821099211002110121102211032110421105211062110721108211092111021111211122111321114211152111621117211182111921120211212112221123211242112521126211272112821129211302113121132211332113421135211362113721138211392114021141211422114321144211452114621147211482114921150211512115221153211542115521156211572115821159211602116121162211632116421165211662116721168211692117021171211722117321174211752117621177211782117921180211812118221183211842118521186211872118821189211902119121192211932119421195211962119721198211992120021201212022120321204212052120621207212082120921210212112121221213212142121521216212172121821219212202122121222212232122421225212262122721228212292123021231212322123321234212352123621237212382123921240212412124221243212442124521246212472124821249212502125121252212532125421255212562125721258212592126021261212622126321264212652126621267212682126921270212712127221273212742127521276212772127821279212802128121282212832128421285212862128721288212892129021291212922129321294212952129621297212982129921300213012130221303213042130521306213072130821309213102131121312213132131421315213162131721318213192132021321213222132321324213252132621327213282132921330213312133221333213342133521336213372133821339213402134121342213432134421345213462134721348213492135021351213522135321354213552135621357213582135921360213612136221363213642136521366213672136821369213702137121372213732137421375213762137721378213792138021381213822138321384213852138621387213882138921390213912139221393213942139521396213972139821399214002140121402214032140421405214062140721408214092141021411214122141321414214152141621417214182
141921420214212142221423214242142521426214272142821429214302143121432214332143421435214362143721438214392144021441214422144321444214452144621447214482144921450214512145221453214542145521456214572145821459214602146121462214632146421465214662146721468214692147021471214722147321474214752147621477214782147921480214812148221483214842148521486214872148821489214902149121492214932149421495214962149721498214992150021501215022150321504215052150621507215082150921510215112151221513215142151521516215172151821519215202152121522215232152421525215262152721528215292153021531215322153321534215352153621537215382153921540215412154221543215442154521546215472154821549215502155121552215532155421555215562155721558215592156021561215622156321564215652156621567215682156921570215712157221573215742157521576215772157821579215802158121582215832158421585215862158721588215892159021591215922159321594215952159621597215982159921600216012160221603216042160521606216072160821609216102161121612216132161421615216162161721618216192162021621216222162321624216252162621627216282162921630216312163221633216342163521636216372163821639216402164121642216432164421645216462164721648216492165021651216522165321654216552165621657216582165921660216612166221663216642166521666216672166821669216702167121672216732167421675216762167721678216792168021681216822168321684216852168621687216882168921690216912169221693216942169521696216972169821699217002170121702217032170421705217062170721708217092171021711217122171321714217152171621717217182171921720217212172221723217242172521726217272172821729217302173121732217332173421735217362173721738217392174021741217422174321744217452174621747217482174921750217512175221753217542175521756217572175821759217602176121762217632176421765217662176721768217692177021771217722177321774217752177621777217782177921780217812178221783217842178521786217872178821789217902179121792217932179421795217962179721798217992180021801218022180321804218052180621807218082180921810218112181221813218142181521816218172181821819218202182121822218232182421825218262182721828218292183021831218322183321834218352183621837218382183921840218412184221843218442184521846218472184821849218502185121852218532185421855218562185721858218592186021861218622186321864218652186621867218682186921870218712187221873218742187521876218772187821879218802188121882218832188421885218862188721888218892189021891218922189321894218952189621897218982189921900219012190221903219042190521906219072190821909219102191121912219132191421915219162191721918219192192021921219222192321924219252192621927219282192921930219312193221933219342193521936219372193821939219402194121942219432194421945219462194721948219492195021951219522195321954219552195621957219582195921960219612196221963219642196521966219672196821969219702197121972219732197421975219762197721978219792198021981219822198321984219852198621987219882198921990219912199221993219942199521996219972199821999220002200122002220032200422005220062200722008220092201022011220122201322014220152201622017220182201922020220212202222023220242202522026220272202822029220302203122032220332203422035220362203722038220392204022041220422204322044220452204622047220482204922050220512205222053220542205522056220572205822059220602206122062220632206422065220662206722068220692207022071220722207322074220752207622077220782207922080220812208222083220842208522086220872208822089220902209122092220932209422095220962209722098220992210022101221022210322104221052210622107221082210922110221112211222113221142211522116221172211822119221202212122122221232212422125221262212722128221292
213022131221322213322134221352213622137221382213922140221412214222143221442214522146221472214822149221502215122152221532215422155221562215722158221592216022161221622216322164221652216622167221682216922170221712217222173221742217522176221772217822179221802218122182221832218422185221862218722188221892219022191221922219322194221952219622197221982219922200222012220222203222042220522206222072220822209222102221122212222132221422215222162221722218222192222022221222222222322224222252222622227222282222922230222312223222233222342223522236222372223822239222402224122242222432224422245222462224722248222492225022251222522225322254222552225622257222582225922260222612226222263222642226522266222672226822269222702227122272222732227422275222762227722278222792228022281222822228322284222852228622287222882228922290222912229222293222942229522296222972229822299223002230122302223032230422305223062230722308223092231022311223122231322314223152231622317223182231922320223212232222323223242232522326223272232822329223302233122332223332233422335223362233722338223392234022341223422234322344223452234622347223482234922350223512235222353223542235522356223572235822359223602236122362223632236422365223662236722368223692237022371223722237322374223752237622377223782237922380223812238222383223842238522386223872238822389223902239122392223932239422395223962239722398223992240022401224022240322404224052240622407224082240922410224112241222413224142241522416224172241822419224202242122422224232242422425224262242722428224292243022431224322243322434224352243622437224382243922440224412244222443224442244522446224472244822449224502245122452224532245422455224562245722458224592246022461224622246322464224652246622467224682246922470224712247222473224742247522476224772247822479224802248122482224832248422485224862248722488224892249022491224922249322494224952249622497224982249922500225012250222503225042250522506225072250822509225102251122512225132251422515225162251722518225192252022521225222252322524225252252622527225282252922530225312253222533225342253522536225372253822539225402254122542225432254422545225462254722548225492255022551225522255322554225552255622557225582255922560225612256222563225642256522566225672256822569225702257122572225732257422575225762257722578225792258022581225822258322584225852258622587225882258922590225912259222593225942259522596225972259822599226002260122602226032260422605226062260722608226092261022611226122261322614226152261622617226182261922620226212262222623226242262522626226272262822629226302263122632226332263422635226362263722638226392264022641226422264322644226452264622647226482264922650226512265222653226542265522656226572265822659226602266122662226632266422665226662266722668226692267022671226722267322674226752267622677226782267922680226812268222683226842268522686226872268822689226902269122692226932269422695226962269722698226992270022701227022270322704227052270622707227082270922710227112271222713227142271522716227172271822719227202272122722227232272422725227262272722728227292273022731227322273322734227352273622737227382273922740227412274222743227442274522746227472274822749227502275122752227532275422755227562275722758227592276022761227622276322764227652276622767227682276922770227712277222773227742277522776227772277822779227802278122782227832278422785227862278722788227892279022791227922279322794227952279622797227982279922800228012280222803228042280522806228072280822809228102281122812228132281422815228162281722818228192282022821228222282322824228252282622827228282282922830228312283222833228342283522836228372283822839228402
284122842228432284422845228462284722848228492285022851228522285322854228552285622857228582285922860228612286222863228642286522866228672286822869228702287122872228732287422875228762287722878228792288022881228822288322884228852288622887228882288922890228912289222893228942289522896228972289822899229002290122902229032290422905229062290722908229092291022911229122291322914229152291622917229182291922920229212292222923229242292522926229272292822929229302293122932229332293422935229362293722938229392294022941229422294322944229452294622947229482294922950229512295222953229542295522956229572295822959229602296122962229632296422965229662296722968229692297022971229722297322974229752297622977229782297922980229812298222983229842298522986229872298822989229902299122992229932299422995229962299722998229992300023001230022300323004230052300623007230082300923010230112301223013230142301523016230172301823019230202302123022230232302423025230262302723028230292303023031230322303323034230352303623037230382303923040230412304223043230442304523046230472304823049230502305123052230532305423055230562305723058230592306023061230622306323064230652306623067230682306923070230712307223073230742307523076230772307823079230802308123082230832308423085230862308723088230892309023091230922309323094230952309623097230982309923100231012310223103231042310523106231072310823109231102311123112231132311423115231162311723118231192312023121231222312323124231252312623127231282312923130231312313223133231342313523136231372313823139231402314123142231432314423145231462314723148231492315023151231522315323154231552315623157231582315923160231612316223163231642316523166231672316823169231702317123172231732317423175231762317723178231792318023181231822318323184231852318623187231882318923190231912319223193231942319523196231972319823199232002320123202232032320423205232062320723208232092321023211232122321323214232152321623217232182321923220232212322223223232242322523226232272322823229232302323123232232332323423235232362323723238232392324023241232422324323244232452324623247232482324923250232512325223253232542325523256232572325823259232602326123262232632326423265232662326723268232692327023271232722327323274232752327623277232782327923280232812328223283232842328523286232872328823289232902329123292232932329423295232962329723298232992330023301233022330323304233052330623307233082330923310233112331223313233142331523316233172331823319233202332123322233232332423325233262332723328233292333023331233322333323334233352333623337233382333923340233412334223343233442334523346233472334823349233502335123352233532335423355233562335723358233592336023361233622336323364233652336623367233682336923370233712337223373233742337523376233772337823379233802338123382233832338423385233862338723388233892339023391233922339323394233952339623397233982339923400234012340223403234042340523406234072340823409234102341123412234132341423415234162341723418234192342023421234222342323424234252342623427234282342923430234312343223433234342343523436234372343823439234402344123442234432344423445234462344723448234492345023451234522345323454234552345623457234582345923460234612346223463234642346523466234672346823469234702347123472234732347423475234762347723478234792348023481234822348323484234852348623487234882348923490234912349223493234942349523496234972349823499235002350123502235032350423505235062350723508235092351023511235122351323514235152351623517235182351923520235212352223523235242352523526235272352823529235302353123532235332353423535235362353723538235392354023541235422354323544235452354623547235482354923550235512
355223553235542355523556235572355823559235602356123562235632356423565235662356723568235692357023571235722357323574235752357623577235782357923580235812358223583235842358523586235872358823589235902359123592235932359423595235962359723598235992360023601236022360323604236052360623607236082360923610236112361223613236142361523616236172361823619236202362123622236232362423625236262362723628236292363023631236322363323634236352363623637236382363923640236412364223643236442364523646236472364823649236502365123652236532365423655236562365723658236592366023661236622366323664236652366623667236682366923670236712367223673236742367523676236772367823679236802368123682236832368423685236862368723688236892369023691236922369323694236952369623697236982369923700237012370223703237042370523706237072370823709237102371123712237132371423715237162371723718237192372023721237222372323724237252372623727237282372923730237312373223733237342373523736237372373823739237402374123742237432374423745237462374723748237492375023751237522375323754237552375623757237582375923760237612376223763237642376523766237672376823769237702377123772237732377423775237762377723778237792378023781237822378323784237852378623787237882378923790237912379223793237942379523796237972379823799238002380123802238032380423805238062380723808238092381023811238122381323814238152381623817238182381923820238212382223823238242382523826238272382823829238302383123832238332383423835238362383723838238392384023841238422384323844238452384623847238482384923850238512385223853238542385523856238572385823859238602386123862238632386423865238662386723868238692387023871238722387323874238752387623877238782387923880238812388223883238842388523886238872388823889238902389123892238932389423895238962389723898238992390023901239022390323904239052390623907239082390923910239112391223913239142391523916239172391823919239202392123922239232392423925239262392723928239292393023931239322393323934239352393623937239382393923940239412394223943239442394523946239472394823949239502395123952239532395423955239562395723958239592396023961239622396323964239652396623967239682396923970239712397223973239742397523976239772397823979239802398123982239832398423985239862398723988239892399023991239922399323994239952399623997239982399924000240012400224003240042400524006240072400824009240102401124012240132401424015240162401724018240192402024021240222402324024240252402624027240282402924030240312403224033240342403524036240372403824039240402404124042240432404424045240462404724048240492405024051240522405324054240552405624057240582405924060240612406224063240642406524066240672406824069240702407124072240732407424075240762407724078240792408024081240822408324084240852408624087240882408924090240912409224093240942409524096240972409824099241002410124102241032410424105241062410724108241092411024111241122411324114241152411624117241182411924120241212412224123241242412524126241272412824129241302413124132241332413424135241362413724138241392414024141241422414324144241452414624147241482414924150241512415224153241542415524156241572415824159241602416124162241632416424165241662416724168241692417024171241722417324174241752417624177241782417924180241812418224183241842418524186241872418824189241902419124192241932419424195241962419724198241992420024201242022420324204242052420624207242082420924210242112421224213242142421524216242172421824219242202422124222242232422424225242262422724228242292423024231242322423324234242352423624237242382423924240242412424224243242442424524246242472424824249242502425124252242532425424255242562425724258242592426024261242622
  1. diff -Nur linux-3.18.12.orig/arch/alpha/mm/fault.c linux-3.18.12/arch/alpha/mm/fault.c
  2. --- linux-3.18.12.orig/arch/alpha/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  3. +++ linux-3.18.12/arch/alpha/mm/fault.c 2015-04-26 13:32:22.351684003 -0500
  4. @@ -107,7 +107,7 @@
  5. /* If we're in an interrupt context, or have no user context,
  6. we must not take the fault. */
  7. - if (!mm || in_atomic())
  8. + if (!mm || pagefault_disabled())
  9. goto no_context;
  10. #ifdef CONFIG_ALPHA_LARGE_VMALLOC
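The in_atomic() checks dropped here (and in the other mm/fault.c hunks below) are replaced by pagefault_disabled(), which tests a per-task counter instead of the preempt count. The real helpers are defined elsewhere in this patch; the following is only a simplified sketch of the idea, assuming the task_struct field pagefault_disabled that the s390 hunk further down dereferences directly.

#include <linux/sched.h>	/* for current */

/*
 * Simplified sketch, not the patch's actual implementation: the real
 * helpers (elsewhere in this patch) also coordinate with preempt and
 * migrate disabling. Shown only to illustrate why a per-task counter can
 * replace the in_atomic() test in the fault paths above.
 */
static inline void pagefault_disable_sketch(void)
{
	current->pagefault_disabled++;
	barrier();	/* order the counter update against the access */
}

static inline void pagefault_enable_sketch(void)
{
	barrier();
	current->pagefault_disabled--;
}

static inline int pagefault_disabled_sketch(void)
{
	return current->pagefault_disabled != 0;
}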
  11. diff -Nur linux-3.18.12.orig/arch/arm/include/asm/cmpxchg.h linux-3.18.12/arch/arm/include/asm/cmpxchg.h
  12. --- linux-3.18.12.orig/arch/arm/include/asm/cmpxchg.h 2015-04-20 14:48:02.000000000 -0500
  13. +++ linux-3.18.12/arch/arm/include/asm/cmpxchg.h 2015-04-26 13:32:22.351684003 -0500
  14. @@ -129,6 +129,8 @@
  15. #else /* min ARCH >= ARMv6 */
  16. +#define __HAVE_ARCH_CMPXCHG 1
  17. +
  18. extern void __bad_cmpxchg(volatile void *ptr, int size);
  19. /*
  20. diff -Nur linux-3.18.12.orig/arch/arm/include/asm/futex.h linux-3.18.12/arch/arm/include/asm/futex.h
  21. --- linux-3.18.12.orig/arch/arm/include/asm/futex.h 2015-04-20 14:48:02.000000000 -0500
  22. +++ linux-3.18.12/arch/arm/include/asm/futex.h 2015-04-26 13:32:22.351684003 -0500
  23. @@ -93,6 +93,8 @@
  24. if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
  25. return -EFAULT;
  26. + preempt_disable_rt();
  27. +
  28. __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
  29. "1: " TUSER(ldr) " %1, [%4]\n"
  30. " teq %1, %2\n"
  31. @@ -104,6 +106,8 @@
  32. : "cc", "memory");
  33. *uval = val;
  34. +
  35. + preempt_enable_rt();
  36. return ret;
  37. }
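preempt_disable_rt()/preempt_enable_rt() bracket the futex cmpxchg so the user-space access cannot be preempted on RT kernels. Their definitions are not part of this section; the sketch below shows one plausible shape for such helpers (no-ops unless CONFIG_PREEMPT_RT_FULL is set) and is an assumption, not the patch's own header.

/*
 * Plausible shape of the _rt helpers used above (assumption; the actual
 * definitions live in include/linux/preempt.h elsewhere in this patch):
 * they only cost anything when PREEMPT_RT_FULL is enabled.
 */
#ifdef CONFIG_PREEMPT_RT_FULL
# define preempt_disable_rt()	preempt_disable()
# define preempt_enable_rt()	preempt_enable()
#else
# define preempt_disable_rt()	barrier()
# define preempt_enable_rt()	barrier()
#endif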
  38. diff -Nur linux-3.18.12.orig/arch/arm/include/asm/switch_to.h linux-3.18.12/arch/arm/include/asm/switch_to.h
  39. --- linux-3.18.12.orig/arch/arm/include/asm/switch_to.h 2015-04-20 14:48:02.000000000 -0500
  40. +++ linux-3.18.12/arch/arm/include/asm/switch_to.h 2015-04-26 13:32:22.355684003 -0500
  41. @@ -3,6 +3,13 @@
  42. #include <linux/thread_info.h>
  43. +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
  44. +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
  45. +#else
  46. +static inline void
  47. +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
  48. +#endif
  49. +
  50. /*
  51. * For v7 SMP cores running a preemptible kernel we may be pre-empted
  52. * during a TLB maintenance operation, so execute an inner-shareable dsb
  53. @@ -22,6 +29,7 @@
  54. #define switch_to(prev,next,last) \
  55. do { \
  56. + switch_kmaps(prev, next); \
  57. last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
  58. } while (0)
  59. diff -Nur linux-3.18.12.orig/arch/arm/include/asm/thread_info.h linux-3.18.12/arch/arm/include/asm/thread_info.h
  60. --- linux-3.18.12.orig/arch/arm/include/asm/thread_info.h 2015-04-20 14:48:02.000000000 -0500
  61. +++ linux-3.18.12/arch/arm/include/asm/thread_info.h 2015-04-26 13:32:22.355684003 -0500
  62. @@ -51,6 +51,7 @@
  63. struct thread_info {
  64. unsigned long flags; /* low level flags */
  65. int preempt_count; /* 0 => preemptable, <0 => bug */
  66. + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */
  67. mm_segment_t addr_limit; /* address limit */
  68. struct task_struct *task; /* main task structure */
  69. struct exec_domain *exec_domain; /* execution domain */
  70. @@ -149,6 +150,7 @@
  71. #define TIF_SIGPENDING 0
  72. #define TIF_NEED_RESCHED 1
  73. #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */
  74. +#define TIF_NEED_RESCHED_LAZY 3
  75. #define TIF_UPROBE 7
  76. #define TIF_SYSCALL_TRACE 8
  77. #define TIF_SYSCALL_AUDIT 9
  78. @@ -162,6 +164,7 @@
  79. #define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
  80. #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
  81. #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
  82. +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
  83. #define _TIF_UPROBE (1 << TIF_UPROBE)
  84. #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
  85. #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
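TIF_NEED_RESCHED_LAZY and preempt_lazy_count add a second, deferrable reschedule request alongside the hard TIF_NEED_RESCHED. The powerpc hunks later in this patch fold the two bits into _TIF_NEED_RESCHED_MASK; a minimal illustration of that combined test is sketched below (the helper name is hypothetical).

/*
 * Illustration only: testing both reschedule bits at once, mirroring the
 * _TIF_NEED_RESCHED_MASK used by the powerpc entry code later in this
 * patch. The helper name is hypothetical.
 */
#define _TIF_NEED_RESCHED_MASK	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)

static inline bool need_resched_hard_or_lazy(struct thread_info *ti)
{
	return ti->flags & _TIF_NEED_RESCHED_MASK;
}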
  86. diff -Nur linux-3.18.12.orig/arch/arm/Kconfig linux-3.18.12/arch/arm/Kconfig
  87. --- linux-3.18.12.orig/arch/arm/Kconfig 2015-04-20 14:48:02.000000000 -0500
  88. +++ linux-3.18.12/arch/arm/Kconfig 2015-04-26 13:32:22.351684003 -0500
  89. @@ -62,6 +62,7 @@
  90. select HAVE_PERF_EVENTS
  91. select HAVE_PERF_REGS
  92. select HAVE_PERF_USER_STACK_DUMP
  93. + select HAVE_PREEMPT_LAZY
  94. select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
  95. select HAVE_REGS_AND_STACK_ACCESS_API
  96. select HAVE_SYSCALL_TRACEPOINTS
  97. diff -Nur linux-3.18.12.orig/arch/arm/kernel/asm-offsets.c linux-3.18.12/arch/arm/kernel/asm-offsets.c
  98. --- linux-3.18.12.orig/arch/arm/kernel/asm-offsets.c 2015-04-20 14:48:02.000000000 -0500
  99. +++ linux-3.18.12/arch/arm/kernel/asm-offsets.c 2015-04-26 13:32:22.355684003 -0500
  100. @@ -64,6 +64,7 @@
  101. BLANK();
  102. DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
  103. DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
  104. + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
  105. DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit));
  106. DEFINE(TI_TASK, offsetof(struct thread_info, task));
  107. DEFINE(TI_EXEC_DOMAIN, offsetof(struct thread_info, exec_domain));
  108. diff -Nur linux-3.18.12.orig/arch/arm/kernel/entry-armv.S linux-3.18.12/arch/arm/kernel/entry-armv.S
  109. --- linux-3.18.12.orig/arch/arm/kernel/entry-armv.S 2015-04-20 14:48:02.000000000 -0500
  110. +++ linux-3.18.12/arch/arm/kernel/entry-armv.S 2015-04-26 13:32:22.355684003 -0500
  111. @@ -207,11 +207,18 @@
  112. #ifdef CONFIG_PREEMPT
  113. get_thread_info tsk
  114. ldr r8, [tsk, #TI_PREEMPT] @ get preempt count
  115. - ldr r0, [tsk, #TI_FLAGS] @ get flags
  116. teq r8, #0 @ if preempt count != 0
  117. + bne 1f @ return from exception
  118. + ldr r0, [tsk, #TI_FLAGS] @ get flags
  119. + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set
  120. + blne svc_preempt @ preempt!
  121. +
  122. + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count
  123. + teq r8, #0 @ if preempt lazy count != 0
  124. movne r0, #0 @ force flags to 0
  125. - tst r0, #_TIF_NEED_RESCHED
  126. + tst r0, #_TIF_NEED_RESCHED_LAZY
  127. blne svc_preempt
  128. +1:
  129. #endif
  130. svc_exit r5, irq = 1 @ return from exception
  131. @@ -226,6 +233,8 @@
  132. 1: bl preempt_schedule_irq @ irq en/disable is done inside
  133. ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS
  134. tst r0, #_TIF_NEED_RESCHED
  135. + bne 1b
  136. + tst r0, #_TIF_NEED_RESCHED_LAZY
  137. reteq r8 @ go again
  138. b 1b
  139. #endif
  140. diff -Nur linux-3.18.12.orig/arch/arm/kernel/process.c linux-3.18.12/arch/arm/kernel/process.c
  141. --- linux-3.18.12.orig/arch/arm/kernel/process.c 2015-04-20 14:48:02.000000000 -0500
  142. +++ linux-3.18.12/arch/arm/kernel/process.c 2015-04-26 13:32:22.355684003 -0500
  143. @@ -431,6 +431,30 @@
  144. }
  145. #ifdef CONFIG_MMU
  146. +/*
  147. + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not
  148. + * initialized by pgtable_page_ctor() then a coredump of the vector page will
  149. + * fail.
  150. + */
  151. +static int __init vectors_user_mapping_init_page(void)
  152. +{
  153. + struct page *page;
  154. + unsigned long addr = 0xffff0000;
  155. + pgd_t *pgd;
  156. + pud_t *pud;
  157. + pmd_t *pmd;
  158. +
  159. + pgd = pgd_offset_k(addr);
  160. + pud = pud_offset(pgd, addr);
  161. + pmd = pmd_offset(pud, addr);
  162. + page = pmd_page(*(pmd));
  163. +
  164. + pgtable_page_ctor(page);
  165. +
  166. + return 0;
  167. +}
  168. +late_initcall(vectors_user_mapping_init_page);
  169. +
  170. #ifdef CONFIG_KUSER_HELPERS
  171. /*
  172. * The vectors page is always readable from user space for the
  173. diff -Nur linux-3.18.12.orig/arch/arm/kernel/signal.c linux-3.18.12/arch/arm/kernel/signal.c
  174. --- linux-3.18.12.orig/arch/arm/kernel/signal.c 2015-04-20 14:48:02.000000000 -0500
  175. +++ linux-3.18.12/arch/arm/kernel/signal.c 2015-04-26 13:32:22.359684003 -0500
  176. @@ -574,7 +574,8 @@
  177. do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
  178. {
  179. do {
  180. - if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  181. + if (likely(thread_flags & (_TIF_NEED_RESCHED |
  182. + _TIF_NEED_RESCHED_LAZY))) {
  183. schedule();
  184. } else {
  185. if (unlikely(!user_mode(regs)))
  186. diff -Nur linux-3.18.12.orig/arch/arm/kernel/unwind.c linux-3.18.12/arch/arm/kernel/unwind.c
  187. --- linux-3.18.12.orig/arch/arm/kernel/unwind.c 2015-04-20 14:48:02.000000000 -0500
  188. +++ linux-3.18.12/arch/arm/kernel/unwind.c 2015-04-26 13:32:22.359684003 -0500
  189. @@ -93,7 +93,7 @@
  190. static const struct unwind_idx *__origin_unwind_idx;
  191. extern const struct unwind_idx __stop_unwind_idx[];
  192. -static DEFINE_SPINLOCK(unwind_lock);
  193. +static DEFINE_RAW_SPINLOCK(unwind_lock);
  194. static LIST_HEAD(unwind_tables);
  195. /* Convert a prel31 symbol to an absolute address */
  196. @@ -201,7 +201,7 @@
  197. /* module unwind tables */
  198. struct unwind_table *table;
  199. - spin_lock_irqsave(&unwind_lock, flags);
  200. + raw_spin_lock_irqsave(&unwind_lock, flags);
  201. list_for_each_entry(table, &unwind_tables, list) {
  202. if (addr >= table->begin_addr &&
  203. addr < table->end_addr) {
  204. @@ -213,7 +213,7 @@
  205. break;
  206. }
  207. }
  208. - spin_unlock_irqrestore(&unwind_lock, flags);
  209. + raw_spin_unlock_irqrestore(&unwind_lock, flags);
  210. }
  211. pr_debug("%s: idx = %p\n", __func__, idx);
  212. @@ -530,9 +530,9 @@
  213. tab->begin_addr = text_addr;
  214. tab->end_addr = text_addr + text_size;
  215. - spin_lock_irqsave(&unwind_lock, flags);
  216. + raw_spin_lock_irqsave(&unwind_lock, flags);
  217. list_add_tail(&tab->list, &unwind_tables);
  218. - spin_unlock_irqrestore(&unwind_lock, flags);
  219. + raw_spin_unlock_irqrestore(&unwind_lock, flags);
  220. return tab;
  221. }
  222. @@ -544,9 +544,9 @@
  223. if (!tab)
  224. return;
  225. - spin_lock_irqsave(&unwind_lock, flags);
  226. + raw_spin_lock_irqsave(&unwind_lock, flags);
  227. list_del(&tab->list);
  228. - spin_unlock_irqrestore(&unwind_lock, flags);
  229. + raw_spin_unlock_irqrestore(&unwind_lock, flags);
  230. kfree(tab);
  231. }
  232. diff -Nur linux-3.18.12.orig/arch/arm/kvm/arm.c linux-3.18.12/arch/arm/kvm/arm.c
  233. --- linux-3.18.12.orig/arch/arm/kvm/arm.c 2015-04-20 14:48:02.000000000 -0500
  234. +++ linux-3.18.12/arch/arm/kvm/arm.c 2015-04-26 13:32:22.359684003 -0500
  235. @@ -441,9 +441,9 @@
  236. static void vcpu_pause(struct kvm_vcpu *vcpu)
  237. {
  238. - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
  239. + struct swait_head *wq = kvm_arch_vcpu_wq(vcpu);
  240. - wait_event_interruptible(*wq, !vcpu->arch.pause);
  241. + swait_event_interruptible(*wq, !vcpu->arch.pause);
  242. }
  243. static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
  244. diff -Nur linux-3.18.12.orig/arch/arm/kvm/psci.c linux-3.18.12/arch/arm/kvm/psci.c
  245. --- linux-3.18.12.orig/arch/arm/kvm/psci.c 2015-04-20 14:48:02.000000000 -0500
  246. +++ linux-3.18.12/arch/arm/kvm/psci.c 2015-04-26 13:32:22.359684003 -0500
  247. @@ -66,7 +66,7 @@
  248. {
  249. struct kvm *kvm = source_vcpu->kvm;
  250. struct kvm_vcpu *vcpu = NULL, *tmp;
  251. - wait_queue_head_t *wq;
  252. + struct swait_head *wq;
  253. unsigned long cpu_id;
  254. unsigned long context_id;
  255. unsigned long mpidr;
  256. @@ -123,7 +123,7 @@
  257. smp_mb(); /* Make sure the above is visible */
  258. wq = kvm_arch_vcpu_wq(vcpu);
  259. - wake_up_interruptible(wq);
  260. + swait_wake_interruptible(wq);
  261. return PSCI_RET_SUCCESS;
  262. }
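Both KVM hunks above move the vcpu wait channel from wait_queue_head_t to the simple-wait (swait) primitives introduced elsewhere in this patch. A minimal usage sketch follows; struct vcpu_like and its fields are hypothetical stand-ins, while the swait calls are the ones used in the hunks above.

/*
 * Usage sketch for the simple-wait API referenced above. The swait
 * primitives are introduced elsewhere in this patch; struct vcpu_like is
 * a made-up stand-in for the real vcpu structures.
 */
struct vcpu_like {
	struct swait_head	wq;
	bool			pause;
};

static void vcpu_like_init(struct vcpu_like *v)
{
	init_swait_head(&v->wq);
	v->pause = true;
}

static int vcpu_like_wait(struct vcpu_like *v)
{
	/* sleep until ->pause is cleared */
	return swait_event_interruptible(v->wq, !v->pause);
}

static void vcpu_like_kick(struct vcpu_like *v)
{
	v->pause = false;
	swait_wake_interruptible(&v->wq);
}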
  263. diff -Nur linux-3.18.12.orig/arch/arm/mach-at91/at91rm9200_time.c linux-3.18.12/arch/arm/mach-at91/at91rm9200_time.c
  264. --- linux-3.18.12.orig/arch/arm/mach-at91/at91rm9200_time.c 2015-04-20 14:48:02.000000000 -0500
  265. +++ linux-3.18.12/arch/arm/mach-at91/at91rm9200_time.c 2015-04-26 13:32:22.359684003 -0500
  266. @@ -135,6 +135,7 @@
  267. break;
  268. case CLOCK_EVT_MODE_SHUTDOWN:
  269. case CLOCK_EVT_MODE_UNUSED:
  270. + remove_irq(NR_IRQS_LEGACY + AT91_ID_SYS, &at91rm9200_timer_irq);
  271. case CLOCK_EVT_MODE_RESUME:
  272. irqmask = 0;
  273. break;
  274. diff -Nur linux-3.18.12.orig/arch/arm/mach-exynos/platsmp.c linux-3.18.12/arch/arm/mach-exynos/platsmp.c
  275. --- linux-3.18.12.orig/arch/arm/mach-exynos/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  276. +++ linux-3.18.12/arch/arm/mach-exynos/platsmp.c 2015-04-26 13:32:22.359684003 -0500
  277. @@ -137,7 +137,7 @@
  278. return (void __iomem *)(S5P_VA_SCU);
  279. }
  280. -static DEFINE_SPINLOCK(boot_lock);
  281. +static DEFINE_RAW_SPINLOCK(boot_lock);
  282. static void exynos_secondary_init(unsigned int cpu)
  283. {
  284. @@ -150,8 +150,8 @@
  285. /*
  286. * Synchronise with the boot thread.
  287. */
  288. - spin_lock(&boot_lock);
  289. - spin_unlock(&boot_lock);
  290. + raw_spin_lock(&boot_lock);
  291. + raw_spin_unlock(&boot_lock);
  292. }
  293. static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
  294. @@ -165,7 +165,7 @@
  295. * Set synchronisation state between this boot processor
  296. * and the secondary one
  297. */
  298. - spin_lock(&boot_lock);
  299. + raw_spin_lock(&boot_lock);
  300. /*
  301. * The secondary processor is waiting to be released from
  302. @@ -192,7 +192,7 @@
  303. if (timeout == 0) {
  304. printk(KERN_ERR "cpu1 power enable failed");
  305. - spin_unlock(&boot_lock);
  306. + raw_spin_unlock(&boot_lock);
  307. return -ETIMEDOUT;
  308. }
  309. }
  310. @@ -242,7 +242,7 @@
  311. * calibrations, then wait for it to finish
  312. */
  313. fail:
  314. - spin_unlock(&boot_lock);
  315. + raw_spin_unlock(&boot_lock);
  316. return pen_release != -1 ? ret : 0;
  317. }
  318. diff -Nur linux-3.18.12.orig/arch/arm/mach-hisi/platmcpm.c linux-3.18.12/arch/arm/mach-hisi/platmcpm.c
  319. --- linux-3.18.12.orig/arch/arm/mach-hisi/platmcpm.c 2015-04-20 14:48:02.000000000 -0500
  320. +++ linux-3.18.12/arch/arm/mach-hisi/platmcpm.c 2015-04-26 13:32:22.363684003 -0500
  321. @@ -57,7 +57,7 @@
  322. static void __iomem *sysctrl, *fabric;
  323. static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
  324. -static DEFINE_SPINLOCK(boot_lock);
  325. +static DEFINE_RAW_SPINLOCK(boot_lock);
  326. static u32 fabric_phys_addr;
  327. /*
  328. * [0]: bootwrapper physical address
  329. @@ -104,7 +104,7 @@
  330. if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
  331. return -EINVAL;
  332. - spin_lock_irq(&boot_lock);
  333. + raw_spin_lock_irq(&boot_lock);
  334. if (hip04_cpu_table[cluster][cpu])
  335. goto out;
  336. @@ -133,7 +133,7 @@
  337. udelay(20);
  338. out:
  339. hip04_cpu_table[cluster][cpu]++;
  340. - spin_unlock_irq(&boot_lock);
  341. + raw_spin_unlock_irq(&boot_lock);
  342. return 0;
  343. }
  344. @@ -149,7 +149,7 @@
  345. __mcpm_cpu_going_down(cpu, cluster);
  346. - spin_lock(&boot_lock);
  347. + raw_spin_lock(&boot_lock);
  348. BUG_ON(__mcpm_cluster_state(cluster) != CLUSTER_UP);
  349. hip04_cpu_table[cluster][cpu]--;
  350. if (hip04_cpu_table[cluster][cpu] == 1) {
  351. @@ -162,7 +162,7 @@
  352. last_man = hip04_cluster_is_down(cluster);
  353. if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
  354. - spin_unlock(&boot_lock);
  355. + raw_spin_unlock(&boot_lock);
  356. /* Since it's Cortex A15, disable L2 prefetching. */
  357. asm volatile(
  358. "mcr p15, 1, %0, c15, c0, 3 \n\t"
  359. @@ -173,7 +173,7 @@
  360. hip04_set_snoop_filter(cluster, 0);
  361. __mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
  362. } else {
  363. - spin_unlock(&boot_lock);
  364. + raw_spin_unlock(&boot_lock);
  365. v7_exit_coherency_flush(louis);
  366. }
  367. @@ -192,7 +192,7 @@
  368. cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
  369. count = TIMEOUT_MSEC / POLL_MSEC;
  370. - spin_lock_irq(&boot_lock);
  371. + raw_spin_lock_irq(&boot_lock);
  372. for (tries = 0; tries < count; tries++) {
  373. if (hip04_cpu_table[cluster][cpu]) {
  374. ret = -EBUSY;
  375. @@ -202,10 +202,10 @@
  376. data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
  377. if (data & CORE_WFI_STATUS(cpu))
  378. break;
  379. - spin_unlock_irq(&boot_lock);
  380. + raw_spin_unlock_irq(&boot_lock);
  381. /* Wait for clean L2 when the whole cluster is down. */
  382. msleep(POLL_MSEC);
  383. - spin_lock_irq(&boot_lock);
  384. + raw_spin_lock_irq(&boot_lock);
  385. }
  386. if (tries >= count)
  387. goto err;
  388. @@ -220,10 +220,10 @@
  389. }
  390. if (tries >= count)
  391. goto err;
  392. - spin_unlock_irq(&boot_lock);
  393. + raw_spin_unlock_irq(&boot_lock);
  394. return 0;
  395. err:
  396. - spin_unlock_irq(&boot_lock);
  397. + raw_spin_unlock_irq(&boot_lock);
  398. return ret;
  399. }
  400. @@ -235,10 +235,10 @@
  401. cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
  402. cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
  403. - spin_lock(&boot_lock);
  404. + raw_spin_lock(&boot_lock);
  405. if (!hip04_cpu_table[cluster][cpu])
  406. hip04_cpu_table[cluster][cpu] = 1;
  407. - spin_unlock(&boot_lock);
  408. + raw_spin_unlock(&boot_lock);
  409. }
  410. static void __naked hip04_mcpm_power_up_setup(unsigned int affinity_level)
  411. diff -Nur linux-3.18.12.orig/arch/arm/mach-omap2/omap-smp.c linux-3.18.12/arch/arm/mach-omap2/omap-smp.c
  412. --- linux-3.18.12.orig/arch/arm/mach-omap2/omap-smp.c 2015-04-20 14:48:02.000000000 -0500
  413. +++ linux-3.18.12/arch/arm/mach-omap2/omap-smp.c 2015-04-26 13:32:22.363684003 -0500
  414. @@ -43,7 +43,7 @@
  415. /* SCU base address */
  416. static void __iomem *scu_base;
  417. -static DEFINE_SPINLOCK(boot_lock);
  418. +static DEFINE_RAW_SPINLOCK(boot_lock);
  419. void __iomem *omap4_get_scu_base(void)
  420. {
  421. @@ -74,8 +74,8 @@
  422. /*
  423. * Synchronise with the boot thread.
  424. */
  425. - spin_lock(&boot_lock);
  426. - spin_unlock(&boot_lock);
  427. + raw_spin_lock(&boot_lock);
  428. + raw_spin_unlock(&boot_lock);
  429. }
  430. static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
  431. @@ -89,7 +89,7 @@
  432. * Set synchronisation state between this boot processor
  433. * and the secondary one
  434. */
  435. - spin_lock(&boot_lock);
  436. + raw_spin_lock(&boot_lock);
  437. /*
  438. * Update the AuxCoreBoot0 with boot state for secondary core.
  439. @@ -166,7 +166,7 @@
  440. * Now the secondary core is starting up let it run its
  441. * calibrations, then wait for it to finish
  442. */
  443. - spin_unlock(&boot_lock);
  444. + raw_spin_unlock(&boot_lock);
  445. return 0;
  446. }
  447. diff -Nur linux-3.18.12.orig/arch/arm/mach-prima2/platsmp.c linux-3.18.12/arch/arm/mach-prima2/platsmp.c
  448. --- linux-3.18.12.orig/arch/arm/mach-prima2/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  449. +++ linux-3.18.12/arch/arm/mach-prima2/platsmp.c 2015-04-26 13:32:22.363684003 -0500
  450. @@ -23,7 +23,7 @@
  451. static void __iomem *scu_base;
  452. static void __iomem *rsc_base;
  453. -static DEFINE_SPINLOCK(boot_lock);
  454. +static DEFINE_RAW_SPINLOCK(boot_lock);
  455. static struct map_desc scu_io_desc __initdata = {
  456. .length = SZ_4K,
  457. @@ -56,8 +56,8 @@
  458. /*
  459. * Synchronise with the boot thread.
  460. */
  461. - spin_lock(&boot_lock);
  462. - spin_unlock(&boot_lock);
  463. + raw_spin_lock(&boot_lock);
  464. + raw_spin_unlock(&boot_lock);
  465. }
  466. static struct of_device_id rsc_ids[] = {
  467. @@ -95,7 +95,7 @@
  468. /* make sure write buffer is drained */
  469. mb();
  470. - spin_lock(&boot_lock);
  471. + raw_spin_lock(&boot_lock);
  472. /*
  473. * The secondary processor is waiting to be released from
  474. @@ -127,7 +127,7 @@
  475. * now the secondary core is starting up let it run its
  476. * calibrations, then wait for it to finish
  477. */
  478. - spin_unlock(&boot_lock);
  479. + raw_spin_unlock(&boot_lock);
  480. return pen_release != -1 ? -ENOSYS : 0;
  481. }
  482. diff -Nur linux-3.18.12.orig/arch/arm/mach-qcom/platsmp.c linux-3.18.12/arch/arm/mach-qcom/platsmp.c
  483. --- linux-3.18.12.orig/arch/arm/mach-qcom/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  484. +++ linux-3.18.12/arch/arm/mach-qcom/platsmp.c 2015-04-26 13:32:22.363684003 -0500
  485. @@ -46,7 +46,7 @@
  486. extern void secondary_startup(void);
  487. -static DEFINE_SPINLOCK(boot_lock);
  488. +static DEFINE_RAW_SPINLOCK(boot_lock);
  489. #ifdef CONFIG_HOTPLUG_CPU
  490. static void __ref qcom_cpu_die(unsigned int cpu)
  491. @@ -60,8 +60,8 @@
  492. /*
  493. * Synchronise with the boot thread.
  494. */
  495. - spin_lock(&boot_lock);
  496. - spin_unlock(&boot_lock);
  497. + raw_spin_lock(&boot_lock);
  498. + raw_spin_unlock(&boot_lock);
  499. }
  500. static int scss_release_secondary(unsigned int cpu)
  501. @@ -284,7 +284,7 @@
  502. * set synchronisation state between this boot processor
  503. * and the secondary one
  504. */
  505. - spin_lock(&boot_lock);
  506. + raw_spin_lock(&boot_lock);
  507. /*
  508. * Send the secondary CPU a soft interrupt, thereby causing
  509. @@ -297,7 +297,7 @@
  510. * now the secondary core is starting up let it run its
  511. * calibrations, then wait for it to finish
  512. */
  513. - spin_unlock(&boot_lock);
  514. + raw_spin_unlock(&boot_lock);
  515. return ret;
  516. }
  517. diff -Nur linux-3.18.12.orig/arch/arm/mach-spear/platsmp.c linux-3.18.12/arch/arm/mach-spear/platsmp.c
  518. --- linux-3.18.12.orig/arch/arm/mach-spear/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  519. +++ linux-3.18.12/arch/arm/mach-spear/platsmp.c 2015-04-26 13:32:22.363684003 -0500
  520. @@ -32,7 +32,7 @@
  521. sync_cache_w(&pen_release);
  522. }
  523. -static DEFINE_SPINLOCK(boot_lock);
  524. +static DEFINE_RAW_SPINLOCK(boot_lock);
  525. static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
  526. @@ -47,8 +47,8 @@
  527. /*
  528. * Synchronise with the boot thread.
  529. */
  530. - spin_lock(&boot_lock);
  531. - spin_unlock(&boot_lock);
  532. + raw_spin_lock(&boot_lock);
  533. + raw_spin_unlock(&boot_lock);
  534. }
  535. static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
  536. @@ -59,7 +59,7 @@
  537. * set synchronisation state between this boot processor
  538. * and the secondary one
  539. */
  540. - spin_lock(&boot_lock);
  541. + raw_spin_lock(&boot_lock);
  542. /*
  543. * The secondary processor is waiting to be released from
  544. @@ -84,7 +84,7 @@
  545. * now the secondary core is starting up let it run its
  546. * calibrations, then wait for it to finish
  547. */
  548. - spin_unlock(&boot_lock);
  549. + raw_spin_unlock(&boot_lock);
  550. return pen_release != -1 ? -ENOSYS : 0;
  551. }
  552. diff -Nur linux-3.18.12.orig/arch/arm/mach-sti/platsmp.c linux-3.18.12/arch/arm/mach-sti/platsmp.c
  553. --- linux-3.18.12.orig/arch/arm/mach-sti/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  554. +++ linux-3.18.12/arch/arm/mach-sti/platsmp.c 2015-04-26 13:32:22.363684003 -0500
  555. @@ -34,7 +34,7 @@
  556. sync_cache_w(&pen_release);
  557. }
  558. -static DEFINE_SPINLOCK(boot_lock);
  559. +static DEFINE_RAW_SPINLOCK(boot_lock);
  560. static void sti_secondary_init(unsigned int cpu)
  561. {
  562. @@ -49,8 +49,8 @@
  563. /*
  564. * Synchronise with the boot thread.
  565. */
  566. - spin_lock(&boot_lock);
  567. - spin_unlock(&boot_lock);
  568. + raw_spin_lock(&boot_lock);
  569. + raw_spin_unlock(&boot_lock);
  570. }
  571. static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
  572. @@ -61,7 +61,7 @@
  573. * set synchronisation state between this boot processor
  574. * and the secondary one
  575. */
  576. - spin_lock(&boot_lock);
  577. + raw_spin_lock(&boot_lock);
  578. /*
  579. * The secondary processor is waiting to be released from
  580. @@ -92,7 +92,7 @@
  581. * now the secondary core is starting up let it run its
  582. * calibrations, then wait for it to finish
  583. */
  584. - spin_unlock(&boot_lock);
  585. + raw_spin_unlock(&boot_lock);
  586. return pen_release != -1 ? -ENOSYS : 0;
  587. }
  588. diff -Nur linux-3.18.12.orig/arch/arm/mach-ux500/platsmp.c linux-3.18.12/arch/arm/mach-ux500/platsmp.c
  589. --- linux-3.18.12.orig/arch/arm/mach-ux500/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  590. +++ linux-3.18.12/arch/arm/mach-ux500/platsmp.c 2015-04-26 13:32:22.363684003 -0500
  591. @@ -51,7 +51,7 @@
  592. return NULL;
  593. }
  594. -static DEFINE_SPINLOCK(boot_lock);
  595. +static DEFINE_RAW_SPINLOCK(boot_lock);
  596. static void ux500_secondary_init(unsigned int cpu)
  597. {
  598. @@ -64,8 +64,8 @@
  599. /*
  600. * Synchronise with the boot thread.
  601. */
  602. - spin_lock(&boot_lock);
  603. - spin_unlock(&boot_lock);
  604. + raw_spin_lock(&boot_lock);
  605. + raw_spin_unlock(&boot_lock);
  606. }
  607. static int ux500_boot_secondary(unsigned int cpu, struct task_struct *idle)
  608. @@ -76,7 +76,7 @@
  609. * set synchronisation state between this boot processor
  610. * and the secondary one
  611. */
  612. - spin_lock(&boot_lock);
  613. + raw_spin_lock(&boot_lock);
  614. /*
  615. * The secondary processor is waiting to be released from
  616. @@ -97,7 +97,7 @@
  617. * now the secondary core is starting up let it run its
  618. * calibrations, then wait for it to finish
  619. */
  620. - spin_unlock(&boot_lock);
  621. + raw_spin_unlock(&boot_lock);
  622. return pen_release != -1 ? -ENOSYS : 0;
  623. }
  624. diff -Nur linux-3.18.12.orig/arch/arm/mm/fault.c linux-3.18.12/arch/arm/mm/fault.c
  625. --- linux-3.18.12.orig/arch/arm/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  626. +++ linux-3.18.12/arch/arm/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  627. @@ -277,7 +277,7 @@
  628. * If we're in an interrupt or have no user
  629. * context, we must not take the fault..
  630. */
  631. - if (in_atomic() || !mm)
  632. + if (!mm || pagefault_disabled())
  633. goto no_context;
  634. if (user_mode(regs))
  635. @@ -431,6 +431,9 @@
  636. if (addr < TASK_SIZE)
  637. return do_page_fault(addr, fsr, regs);
  638. + if (interrupts_enabled(regs))
  639. + local_irq_enable();
  640. +
  641. if (user_mode(regs))
  642. goto bad_area;
  643. @@ -498,6 +501,9 @@
  644. static int
  645. do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
  646. {
  647. + if (interrupts_enabled(regs))
  648. + local_irq_enable();
  649. +
  650. do_bad_area(addr, fsr, regs);
  651. return 0;
  652. }
  653. diff -Nur linux-3.18.12.orig/arch/arm/mm/highmem.c linux-3.18.12/arch/arm/mm/highmem.c
  654. --- linux-3.18.12.orig/arch/arm/mm/highmem.c 2015-04-20 14:48:02.000000000 -0500
  655. +++ linux-3.18.12/arch/arm/mm/highmem.c 2015-04-26 13:32:22.367684003 -0500
  656. @@ -53,6 +53,7 @@
  657. void *kmap_atomic(struct page *page)
  658. {
  659. + pte_t pte = mk_pte(page, kmap_prot);
  660. unsigned int idx;
  661. unsigned long vaddr;
  662. void *kmap;
  663. @@ -91,7 +92,10 @@
  664. * in place, so the contained TLB flush ensures the TLB is updated
  665. * with the new mapping.
  666. */
  667. - set_fixmap_pte(idx, mk_pte(page, kmap_prot));
  668. +#ifdef CONFIG_PREEMPT_RT_FULL
  669. + current->kmap_pte[type] = pte;
  670. +#endif
  671. + set_fixmap_pte(idx, pte);
  672. return (void *)vaddr;
  673. }
  674. @@ -108,12 +112,15 @@
  675. if (cache_is_vivt())
  676. __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
  677. +#ifdef CONFIG_PREEMPT_RT_FULL
  678. + current->kmap_pte[type] = __pte(0);
  679. +#endif
  680. #ifdef CONFIG_DEBUG_HIGHMEM
  681. BUG_ON(vaddr != __fix_to_virt(idx));
  682. - set_fixmap_pte(idx, __pte(0));
  683. #else
  684. (void) idx; /* to kill a warning */
  685. #endif
  686. + set_fixmap_pte(idx, __pte(0));
  687. kmap_atomic_idx_pop();
  688. } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
  689. /* this address was obtained through kmap_high_get() */
  690. @@ -125,6 +132,7 @@
  691. void *kmap_atomic_pfn(unsigned long pfn)
  692. {
  693. + pte_t pte = pfn_pte(pfn, kmap_prot);
  694. unsigned long vaddr;
  695. int idx, type;
  696. struct page *page = pfn_to_page(pfn);
  697. @@ -139,7 +147,10 @@
  698. #ifdef CONFIG_DEBUG_HIGHMEM
  699. BUG_ON(!pte_none(*(fixmap_page_table + idx)));
  700. #endif
  701. - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
  702. +#ifdef CONFIG_PREEMPT_RT_FULL
  703. + current->kmap_pte[type] = pte;
  704. +#endif
  705. + set_fixmap_pte(idx, pte);
  706. return (void *)vaddr;
  707. }
  708. @@ -153,3 +164,28 @@
  709. return pte_page(get_fixmap_pte(vaddr));
  710. }
  711. +
  712. +#if defined CONFIG_PREEMPT_RT_FULL
  713. +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
  714. +{
  715. + int i;
  716. +
  717. + /*
  718. + * Clear @prev's kmap_atomic mappings
  719. + */
  720. + for (i = 0; i < prev_p->kmap_idx; i++) {
  721. + int idx = i + KM_TYPE_NR * smp_processor_id();
  722. +
  723. + set_fixmap_pte(idx, __pte(0));
  724. + }
  725. + /*
  726. + * Restore @next_p's kmap_atomic mappings
  727. + */
  728. + for (i = 0; i < next_p->kmap_idx; i++) {
  729. + int idx = i + KM_TYPE_NR * smp_processor_id();
  730. +
  731. + if (!pte_none(next_p->kmap_pte[i]))
  732. + set_fixmap_pte(idx, next_p->kmap_pte[i]);
  733. + }
  734. +}
  735. +#endif
  736. diff -Nur linux-3.18.12.orig/arch/arm/plat-versatile/platsmp.c linux-3.18.12/arch/arm/plat-versatile/platsmp.c
  737. --- linux-3.18.12.orig/arch/arm/plat-versatile/platsmp.c 2015-04-20 14:48:02.000000000 -0500
  738. +++ linux-3.18.12/arch/arm/plat-versatile/platsmp.c 2015-04-26 13:32:22.367684003 -0500
  739. @@ -30,7 +30,7 @@
  740. sync_cache_w(&pen_release);
  741. }
  742. -static DEFINE_SPINLOCK(boot_lock);
  743. +static DEFINE_RAW_SPINLOCK(boot_lock);
  744. void versatile_secondary_init(unsigned int cpu)
  745. {
  746. @@ -43,8 +43,8 @@
  747. /*
  748. * Synchronise with the boot thread.
  749. */
  750. - spin_lock(&boot_lock);
  751. - spin_unlock(&boot_lock);
  752. + raw_spin_lock(&boot_lock);
  753. + raw_spin_unlock(&boot_lock);
  754. }
  755. int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
  756. @@ -55,7 +55,7 @@
  757. * Set synchronisation state between this boot processor
  758. * and the secondary one
  759. */
  760. - spin_lock(&boot_lock);
  761. + raw_spin_lock(&boot_lock);
  762. /*
  763. * This is really belt and braces; we hold unintended secondary
  764. @@ -85,7 +85,7 @@
  765. * now the secondary core is starting up let it run its
  766. * calibrations, then wait for it to finish
  767. */
  768. - spin_unlock(&boot_lock);
  769. + raw_spin_unlock(&boot_lock);
  770. return pen_release != -1 ? -ENOSYS : 0;
  771. }
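Every platsmp.c hunk above performs the same conversion: boot_lock becomes a raw_spinlock_t, because on PREEMPT_RT_FULL an ordinary spinlock_t turns into a sleeping lock, which the secondary-CPU bring-up handshake cannot tolerate. The pattern, using the mainline raw spinlock API, looks like the sketch below (lock and function names are illustrative).

#include <linux/spinlock.h>

/*
 * Sketch of the conversion pattern used for boot_lock above: a raw
 * spinlock keeps busy-waiting semantics even on PREEMPT_RT_FULL, which
 * the CPU bring-up handshake requires. Names are illustrative.
 */
static DEFINE_RAW_SPINLOCK(bringup_lock);

static void bringup_handshake(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&bringup_lock, flags);
	/* poke the secondary CPU; must not sleep while holding the lock */
	raw_spin_unlock_irqrestore(&bringup_lock, flags);
}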
  772. diff -Nur linux-3.18.12.orig/arch/avr32/mm/fault.c linux-3.18.12/arch/avr32/mm/fault.c
  773. --- linux-3.18.12.orig/arch/avr32/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  774. +++ linux-3.18.12/arch/avr32/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  775. @@ -81,7 +81,7 @@
  776. * If we're in an interrupt or have no user context, we must
  777. * not take the fault...
  778. */
  779. - if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
  780. + if (!mm || regs->sr & SYSREG_BIT(GM) || pagefault_disabled())
  781. goto no_context;
  782. local_irq_enable();
  783. diff -Nur linux-3.18.12.orig/arch/cris/mm/fault.c linux-3.18.12/arch/cris/mm/fault.c
  784. --- linux-3.18.12.orig/arch/cris/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  785. +++ linux-3.18.12/arch/cris/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  786. @@ -113,7 +113,7 @@
  787. * user context, we must not take the fault.
  788. */
  789. - if (in_atomic() || !mm)
  790. + if (!mm || pagefault_disabled())
  791. goto no_context;
  792. if (user_mode(regs))
  793. diff -Nur linux-3.18.12.orig/arch/frv/mm/fault.c linux-3.18.12/arch/frv/mm/fault.c
  794. --- linux-3.18.12.orig/arch/frv/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  795. +++ linux-3.18.12/arch/frv/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  796. @@ -78,7 +78,7 @@
  797. * If we're in an interrupt or have no user
  798. * context, we must not take the fault..
  799. */
  800. - if (in_atomic() || !mm)
  801. + if (!mm || pagefault_disabled())
  802. goto no_context;
  803. if (user_mode(__frame))
  804. diff -Nur linux-3.18.12.orig/arch/ia64/mm/fault.c linux-3.18.12/arch/ia64/mm/fault.c
  805. --- linux-3.18.12.orig/arch/ia64/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  806. +++ linux-3.18.12/arch/ia64/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  807. @@ -96,7 +96,7 @@
  808. /*
  809. * If we're in an interrupt or have no user context, we must not take the fault..
  810. */
  811. - if (in_atomic() || !mm)
  812. + if (!mm || pagefault_disabled())
  813. goto no_context;
  814. #ifdef CONFIG_VIRTUAL_MEM_MAP
  815. diff -Nur linux-3.18.12.orig/arch/Kconfig linux-3.18.12/arch/Kconfig
  816. --- linux-3.18.12.orig/arch/Kconfig 2015-04-20 14:48:02.000000000 -0500
  817. +++ linux-3.18.12/arch/Kconfig 2015-04-26 13:32:22.351684003 -0500
  818. @@ -6,6 +6,7 @@
  819. tristate "OProfile system profiling"
  820. depends on PROFILING
  821. depends on HAVE_OPROFILE
  822. + depends on !PREEMPT_RT_FULL
  823. select RING_BUFFER
  824. select RING_BUFFER_ALLOW_SWAP
  825. help
  826. diff -Nur linux-3.18.12.orig/arch/m32r/mm/fault.c linux-3.18.12/arch/m32r/mm/fault.c
  827. --- linux-3.18.12.orig/arch/m32r/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  828. +++ linux-3.18.12/arch/m32r/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  829. @@ -114,7 +114,7 @@
  830. * If we're in an interrupt or have no user context or are running in an
  831. * atomic region then we must not take the fault..
  832. */
  833. - if (in_atomic() || !mm)
  834. + if (!mm || pagefault_disabled())
  835. goto bad_area_nosemaphore;
  836. if (error_code & ACE_USERMODE)
  837. diff -Nur linux-3.18.12.orig/arch/m68k/mm/fault.c linux-3.18.12/arch/m68k/mm/fault.c
  838. --- linux-3.18.12.orig/arch/m68k/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  839. +++ linux-3.18.12/arch/m68k/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  840. @@ -81,7 +81,7 @@
  841. * If we're in an interrupt or have no user
  842. * context, we must not take the fault..
  843. */
  844. - if (in_atomic() || !mm)
  845. + if (!mm || pagefault_disabled())
  846. goto no_context;
  847. if (user_mode(regs))
  848. diff -Nur linux-3.18.12.orig/arch/microblaze/mm/fault.c linux-3.18.12/arch/microblaze/mm/fault.c
  849. --- linux-3.18.12.orig/arch/microblaze/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  850. +++ linux-3.18.12/arch/microblaze/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  851. @@ -107,7 +107,7 @@
  852. if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
  853. is_write = 0;
  854. - if (unlikely(in_atomic() || !mm)) {
  855. + if (unlikely(!mm || pagefault_disabled())) {
  856. if (kernel_mode(regs))
  857. goto bad_area_nosemaphore;
  858. diff -Nur linux-3.18.12.orig/arch/mips/Kconfig linux-3.18.12/arch/mips/Kconfig
  859. --- linux-3.18.12.orig/arch/mips/Kconfig 2015-04-20 14:48:02.000000000 -0500
  860. +++ linux-3.18.12/arch/mips/Kconfig 2015-04-26 13:32:22.367684003 -0500
  861. @@ -2196,7 +2196,7 @@
  862. #
  863. config HIGHMEM
  864. bool "High Memory Support"
  865. - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
  866. + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
  867. config CPU_SUPPORTS_HIGHMEM
  868. bool
  869. diff -Nur linux-3.18.12.orig/arch/mips/kernel/signal.c linux-3.18.12/arch/mips/kernel/signal.c
  870. --- linux-3.18.12.orig/arch/mips/kernel/signal.c 2015-04-20 14:48:02.000000000 -0500
  871. +++ linux-3.18.12/arch/mips/kernel/signal.c 2015-04-26 13:32:22.367684003 -0500
  872. @@ -613,6 +613,7 @@
  873. __u32 thread_info_flags)
  874. {
  875. local_irq_enable();
  876. + preempt_check_resched();
  877. user_exit();
  878. diff -Nur linux-3.18.12.orig/arch/mips/mm/fault.c linux-3.18.12/arch/mips/mm/fault.c
  879. --- linux-3.18.12.orig/arch/mips/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  880. +++ linux-3.18.12/arch/mips/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  881. @@ -89,7 +89,7 @@
  882. * If we're in an interrupt or have no user
  883. * context, we must not take the fault..
  884. */
  885. - if (in_atomic() || !mm)
  886. + if (!mm || pagefault_disabled())
  887. goto bad_area_nosemaphore;
  888. if (user_mode(regs))
  889. diff -Nur linux-3.18.12.orig/arch/mips/mm/init.c linux-3.18.12/arch/mips/mm/init.c
  890. --- linux-3.18.12.orig/arch/mips/mm/init.c 2015-04-20 14:48:02.000000000 -0500
  891. +++ linux-3.18.12/arch/mips/mm/init.c 2015-04-26 13:32:22.367684003 -0500
  892. @@ -90,7 +90,7 @@
  893. BUG_ON(Page_dcache_dirty(page));
  894. - pagefault_disable();
  895. + raw_pagefault_disable();
  896. idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
  897. idx += in_interrupt() ? FIX_N_COLOURS : 0;
  898. vaddr = __fix_to_virt(FIX_CMAP_END - idx);
  899. @@ -146,7 +146,7 @@
  900. tlbw_use_hazard();
  901. write_c0_entryhi(old_ctx);
  902. local_irq_restore(flags);
  903. - pagefault_enable();
  904. + raw_pagefault_enable();
  905. }
  906. void copy_user_highpage(struct page *to, struct page *from,
  907. diff -Nur linux-3.18.12.orig/arch/mn10300/mm/fault.c linux-3.18.12/arch/mn10300/mm/fault.c
  908. --- linux-3.18.12.orig/arch/mn10300/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  909. +++ linux-3.18.12/arch/mn10300/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  910. @@ -168,7 +168,7 @@
  911. * If we're in an interrupt or have no user
  912. * context, we must not take the fault..
  913. */
  914. - if (in_atomic() || !mm)
  915. + if (!mm || pagefault_disabled())
  916. goto no_context;
  917. if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
  918. diff -Nur linux-3.18.12.orig/arch/parisc/mm/fault.c linux-3.18.12/arch/parisc/mm/fault.c
  919. --- linux-3.18.12.orig/arch/parisc/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  920. +++ linux-3.18.12/arch/parisc/mm/fault.c 2015-04-26 13:32:22.367684003 -0500
  921. @@ -207,7 +207,7 @@
  922. int fault;
  923. unsigned int flags;
  924. - if (in_atomic())
  925. + if (pagefault_disabled())
  926. goto no_context;
  927. tsk = current;
  928. diff -Nur linux-3.18.12.orig/arch/powerpc/include/asm/kvm_host.h linux-3.18.12/arch/powerpc/include/asm/kvm_host.h
  929. --- linux-3.18.12.orig/arch/powerpc/include/asm/kvm_host.h 2015-04-20 14:48:02.000000000 -0500
  930. +++ linux-3.18.12/arch/powerpc/include/asm/kvm_host.h 2015-04-26 13:32:22.367684003 -0500
  931. @@ -296,7 +296,7 @@
  932. u8 in_guest;
  933. struct list_head runnable_threads;
  934. spinlock_t lock;
  935. - wait_queue_head_t wq;
  936. + struct swait_head wq;
  937. u64 stolen_tb;
  938. u64 preempt_tb;
  939. struct kvm_vcpu *runner;
  940. @@ -618,7 +618,7 @@
  941. u8 prodded;
  942. u32 last_inst;
  943. - wait_queue_head_t *wqp;
  944. + struct swait_head *wqp;
  945. struct kvmppc_vcore *vcore;
  946. int ret;
  947. int trap;
  948. diff -Nur linux-3.18.12.orig/arch/powerpc/include/asm/thread_info.h linux-3.18.12/arch/powerpc/include/asm/thread_info.h
  949. --- linux-3.18.12.orig/arch/powerpc/include/asm/thread_info.h 2015-04-20 14:48:02.000000000 -0500
  950. +++ linux-3.18.12/arch/powerpc/include/asm/thread_info.h 2015-04-26 13:32:22.367684003 -0500
  951. @@ -43,6 +43,8 @@
  952. int cpu; /* cpu we're on */
  953. int preempt_count; /* 0 => preemptable,
  954. <0 => BUG */
  955. + int preempt_lazy_count; /* 0 => preemptable,
  956. + <0 => BUG */
  957. struct restart_block restart_block;
  958. unsigned long local_flags; /* private flags for thread */
  959. @@ -88,8 +90,7 @@
  960. #define TIF_SYSCALL_TRACE 0 /* syscall trace active */
  961. #define TIF_SIGPENDING 1 /* signal pending */
  962. #define TIF_NEED_RESCHED 2 /* rescheduling necessary */
  963. -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling
  964. - TIF_NEED_RESCHED */
  965. +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */
  966. #define TIF_32BIT 4 /* 32 bit binary */
  967. #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */
  968. #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
  969. @@ -107,6 +108,8 @@
  970. #if defined(CONFIG_PPC64)
  971. #define TIF_ELF2ABI 18 /* function descriptors must die! */
  972. #endif
  973. +#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling
  974. + TIF_NEED_RESCHED */
  975. /* as above, but as bit values */
  976. #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
  977. @@ -125,14 +128,16 @@
  978. #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT)
  979. #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE)
  980. #define _TIF_NOHZ (1<<TIF_NOHZ)
  981. +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
  982. #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
  983. _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
  984. _TIF_NOHZ)
  985. #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
  986. _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
  987. - _TIF_RESTORE_TM)
  988. + _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
  989. #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR)
  990. +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
  991. /* Bits in local_flags */
  992. /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
  993. diff -Nur linux-3.18.12.orig/arch/powerpc/Kconfig linux-3.18.12/arch/powerpc/Kconfig
  994. --- linux-3.18.12.orig/arch/powerpc/Kconfig 2015-04-20 14:48:02.000000000 -0500
  995. +++ linux-3.18.12/arch/powerpc/Kconfig 2015-04-26 13:32:22.367684003 -0500
  996. @@ -60,10 +60,11 @@
  997. config RWSEM_GENERIC_SPINLOCK
  998. bool
  999. + default y if PREEMPT_RT_FULL
  1000. config RWSEM_XCHGADD_ALGORITHM
  1001. bool
  1002. - default y
  1003. + default y if !PREEMPT_RT_FULL
  1004. config GENERIC_LOCKBREAK
  1005. bool
  1006. @@ -136,6 +137,7 @@
  1007. select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
  1008. select GENERIC_STRNCPY_FROM_USER
  1009. select GENERIC_STRNLEN_USER
  1010. + select HAVE_PREEMPT_LAZY
  1011. select HAVE_MOD_ARCH_SPECIFIC
  1012. select MODULES_USE_ELF_RELA
  1013. select CLONE_BACKWARDS
  1014. @@ -303,7 +305,7 @@
  1015. config HIGHMEM
  1016. bool "High memory support"
  1017. - depends on PPC32
  1018. + depends on PPC32 && !PREEMPT_RT_FULL
  1019. source kernel/Kconfig.hz
  1020. source kernel/Kconfig.preempt
  1021. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/asm-offsets.c linux-3.18.12/arch/powerpc/kernel/asm-offsets.c
  1022. --- linux-3.18.12.orig/arch/powerpc/kernel/asm-offsets.c 2015-04-20 14:48:02.000000000 -0500
  1023. +++ linux-3.18.12/arch/powerpc/kernel/asm-offsets.c 2015-04-26 13:32:22.371684003 -0500
  1024. @@ -159,6 +159,7 @@
  1025. DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
  1026. DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
  1027. DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
  1028. + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
  1029. DEFINE(TI_TASK, offsetof(struct thread_info, task));
  1030. DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
  1031. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/entry_32.S linux-3.18.12/arch/powerpc/kernel/entry_32.S
  1032. --- linux-3.18.12.orig/arch/powerpc/kernel/entry_32.S 2015-04-20 14:48:02.000000000 -0500
  1033. +++ linux-3.18.12/arch/powerpc/kernel/entry_32.S 2015-04-26 13:32:22.371684003 -0500
  1034. @@ -890,7 +890,14 @@
  1035. cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
  1036. bne restore
  1037. andi. r8,r8,_TIF_NEED_RESCHED
  1038. + bne+ 1f
  1039. + lwz r0,TI_PREEMPT_LAZY(r9)
  1040. + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */
  1041. + bne restore
  1042. + lwz r0,TI_FLAGS(r9)
  1043. + andi. r0,r0,_TIF_NEED_RESCHED_LAZY
  1044. beq+ restore
  1045. +1:
  1046. lwz r3,_MSR(r1)
  1047. andi. r0,r3,MSR_EE /* interrupts off? */
  1048. beq restore /* don't schedule if so */
  1049. @@ -901,11 +908,11 @@
  1050. */
  1051. bl trace_hardirqs_off
  1052. #endif
  1053. -1: bl preempt_schedule_irq
  1054. +2: bl preempt_schedule_irq
  1055. CURRENT_THREAD_INFO(r9, r1)
  1056. lwz r3,TI_FLAGS(r9)
  1057. - andi. r0,r3,_TIF_NEED_RESCHED
  1058. - bne- 1b
  1059. + andi. r0,r3,_TIF_NEED_RESCHED_MASK
  1060. + bne- 2b
  1061. #ifdef CONFIG_TRACE_IRQFLAGS
  1062. /* And now, to properly rebalance the above, we tell lockdep they
  1063. * are being turned back on, which will happen when we return
  1064. @@ -1226,7 +1233,7 @@
  1065. #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
  1066. do_work: /* r10 contains MSR_KERNEL here */
  1067. - andi. r0,r9,_TIF_NEED_RESCHED
  1068. + andi. r0,r9,_TIF_NEED_RESCHED_MASK
  1069. beq do_user_signal
  1070. do_resched: /* r10 contains MSR_KERNEL here */
  1071. @@ -1247,7 +1254,7 @@
  1072. MTMSRD(r10) /* disable interrupts */
  1073. CURRENT_THREAD_INFO(r9, r1)
  1074. lwz r9,TI_FLAGS(r9)
  1075. - andi. r0,r9,_TIF_NEED_RESCHED
  1076. + andi. r0,r9,_TIF_NEED_RESCHED_MASK
  1077. bne- do_resched
  1078. andi. r0,r9,_TIF_USER_WORK_MASK
  1079. beq restore_user
  1080. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/entry_64.S linux-3.18.12/arch/powerpc/kernel/entry_64.S
  1081. --- linux-3.18.12.orig/arch/powerpc/kernel/entry_64.S 2015-04-20 14:48:02.000000000 -0500
  1082. +++ linux-3.18.12/arch/powerpc/kernel/entry_64.S 2015-04-26 13:32:22.371684003 -0500
  1083. @@ -644,7 +644,7 @@
  1084. #else
  1085. beq restore
  1086. #endif
  1087. -1: andi. r0,r4,_TIF_NEED_RESCHED
  1088. +1: andi. r0,r4,_TIF_NEED_RESCHED_MASK
  1089. beq 2f
  1090. bl restore_interrupts
  1091. SCHEDULE_USER
  1092. @@ -706,10 +706,18 @@
  1093. #ifdef CONFIG_PREEMPT
  1094. /* Check if we need to preempt */
  1095. + lwz r8,TI_PREEMPT(r9)
  1096. + cmpwi 0,r8,0 /* if non-zero, just restore regs and return */
  1097. + bne restore
  1098. andi. r0,r4,_TIF_NEED_RESCHED
  1099. + bne+ check_count
  1100. +
  1101. + andi. r0,r4,_TIF_NEED_RESCHED_LAZY
  1102. beq+ restore
  1103. + lwz r8,TI_PREEMPT_LAZY(r9)
  1104. +
  1105. /* Check that preempt_count() == 0 and interrupts are enabled */
  1106. - lwz r8,TI_PREEMPT(r9)
  1107. +check_count:
  1108. cmpwi cr1,r8,0
  1109. ld r0,SOFTE(r1)
  1110. cmpdi r0,0
  1111. @@ -726,7 +734,7 @@
  1112. /* Re-test flags and eventually loop */
  1113. CURRENT_THREAD_INFO(r9, r1)
  1114. ld r4,TI_FLAGS(r9)
  1115. - andi. r0,r4,_TIF_NEED_RESCHED
  1116. + andi. r0,r4,_TIF_NEED_RESCHED_MASK
  1117. bne 1b
  1118. /*
  1119. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/irq.c linux-3.18.12/arch/powerpc/kernel/irq.c
  1120. --- linux-3.18.12.orig/arch/powerpc/kernel/irq.c 2015-04-20 14:48:02.000000000 -0500
  1121. +++ linux-3.18.12/arch/powerpc/kernel/irq.c 2015-04-26 13:32:22.371684003 -0500
  1122. @@ -615,6 +615,7 @@
  1123. }
  1124. }
  1125. +#ifndef CONFIG_PREEMPT_RT_FULL
  1126. void do_softirq_own_stack(void)
  1127. {
  1128. struct thread_info *curtp, *irqtp;
  1129. @@ -632,6 +633,7 @@
  1130. if (irqtp->flags)
  1131. set_bits(irqtp->flags, &curtp->flags);
  1132. }
  1133. +#endif
  1134. irq_hw_number_t virq_to_hw(unsigned int virq)
  1135. {
  1136. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/misc_32.S linux-3.18.12/arch/powerpc/kernel/misc_32.S
  1137. --- linux-3.18.12.orig/arch/powerpc/kernel/misc_32.S 2015-04-20 14:48:02.000000000 -0500
  1138. +++ linux-3.18.12/arch/powerpc/kernel/misc_32.S 2015-04-26 13:32:22.371684003 -0500
  1139. @@ -40,6 +40,7 @@
  1140. * We store the saved ksp_limit in the unused part
  1141. * of the STACK_FRAME_OVERHEAD
  1142. */
  1143. +#ifndef CONFIG_PREEMPT_RT_FULL
  1144. _GLOBAL(call_do_softirq)
  1145. mflr r0
  1146. stw r0,4(r1)
  1147. @@ -56,6 +57,7 @@
  1148. stw r10,THREAD+KSP_LIMIT(r2)
  1149. mtlr r0
  1150. blr
  1151. +#endif
  1152. /*
  1153. * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
  1154. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/misc_64.S linux-3.18.12/arch/powerpc/kernel/misc_64.S
  1155. --- linux-3.18.12.orig/arch/powerpc/kernel/misc_64.S 2015-04-20 14:48:02.000000000 -0500
  1156. +++ linux-3.18.12/arch/powerpc/kernel/misc_64.S 2015-04-26 13:32:22.371684003 -0500
  1157. @@ -29,6 +29,7 @@
  1158. .text
  1159. +#ifndef CONFIG_PREEMPT_RT_FULL
  1160. _GLOBAL(call_do_softirq)
  1161. mflr r0
  1162. std r0,16(r1)
  1163. @@ -39,6 +40,7 @@
  1164. ld r0,16(r1)
  1165. mtlr r0
  1166. blr
  1167. +#endif
  1168. _GLOBAL(call_do_irq)
  1169. mflr r0
  1170. diff -Nur linux-3.18.12.orig/arch/powerpc/kernel/time.c linux-3.18.12/arch/powerpc/kernel/time.c
  1171. --- linux-3.18.12.orig/arch/powerpc/kernel/time.c 2015-04-20 14:48:02.000000000 -0500
  1172. +++ linux-3.18.12/arch/powerpc/kernel/time.c 2015-04-26 13:32:22.371684003 -0500
  1173. @@ -424,7 +424,7 @@
  1174. EXPORT_SYMBOL(profile_pc);
  1175. #endif
  1176. -#ifdef CONFIG_IRQ_WORK
  1177. +#if defined(CONFIG_IRQ_WORK)
  1178. /*
  1179. * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  1180. diff -Nur linux-3.18.12.orig/arch/powerpc/kvm/book3s_hv.c linux-3.18.12/arch/powerpc/kvm/book3s_hv.c
  1181. --- linux-3.18.12.orig/arch/powerpc/kvm/book3s_hv.c 2015-04-20 14:48:02.000000000 -0500
  1182. +++ linux-3.18.12/arch/powerpc/kvm/book3s_hv.c 2015-04-26 13:32:22.371684003 -0500
  1183. @@ -84,11 +84,11 @@
  1184. {
  1185. int me;
  1186. int cpu = vcpu->cpu;
  1187. - wait_queue_head_t *wqp;
  1188. + struct swait_head *wqp;
  1189. wqp = kvm_arch_vcpu_wq(vcpu);
  1190. - if (waitqueue_active(wqp)) {
  1191. - wake_up_interruptible(wqp);
  1192. + if (swaitqueue_active(wqp)) {
  1193. + swait_wake_interruptible(wqp);
  1194. ++vcpu->stat.halt_wakeup;
  1195. }
  1196. @@ -639,8 +639,8 @@
  1197. tvcpu->arch.prodded = 1;
  1198. smp_mb();
  1199. if (vcpu->arch.ceded) {
  1200. - if (waitqueue_active(&vcpu->wq)) {
  1201. - wake_up_interruptible(&vcpu->wq);
  1202. + if (swaitqueue_active(&vcpu->wq)) {
  1203. + swait_wake_interruptible(&vcpu->wq);
  1204. vcpu->stat.halt_wakeup++;
  1205. }
  1206. }
  1207. @@ -1357,7 +1357,7 @@
  1208. INIT_LIST_HEAD(&vcore->runnable_threads);
  1209. spin_lock_init(&vcore->lock);
  1210. - init_waitqueue_head(&vcore->wq);
  1211. + init_swait_head(&vcore->wq);
  1212. vcore->preempt_tb = TB_NIL;
  1213. vcore->lpcr = kvm->arch.lpcr;
  1214. vcore->first_vcpuid = core * threads_per_subcore;
  1215. @@ -1826,13 +1826,13 @@
  1216. */
  1217. static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
  1218. {
  1219. - DEFINE_WAIT(wait);
  1220. + DEFINE_SWAITER(wait);
  1221. - prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  1222. + swait_prepare(&vc->wq, &wait, TASK_INTERRUPTIBLE);
  1223. vc->vcore_state = VCORE_SLEEPING;
  1224. spin_unlock(&vc->lock);
  1225. schedule();
  1226. - finish_wait(&vc->wq, &wait);
  1227. + swait_finish(&vc->wq, &wait);
  1228. spin_lock(&vc->lock);
  1229. vc->vcore_state = VCORE_INACTIVE;
  1230. }
  1231. @@ -1873,7 +1873,7 @@
  1232. kvmppc_create_dtl_entry(vcpu, vc);
  1233. kvmppc_start_thread(vcpu);
  1234. } else if (vc->vcore_state == VCORE_SLEEPING) {
  1235. - wake_up(&vc->wq);
  1236. + swait_wake(&vc->wq);
  1237. }
  1238. }
  1239. diff -Nur linux-3.18.12.orig/arch/powerpc/mm/fault.c linux-3.18.12/arch/powerpc/mm/fault.c
  1240. --- linux-3.18.12.orig/arch/powerpc/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  1241. +++ linux-3.18.12/arch/powerpc/mm/fault.c 2015-04-26 13:32:22.371684003 -0500
  1242. @@ -273,7 +273,7 @@
  1243. if (!arch_irq_disabled_regs(regs))
  1244. local_irq_enable();
  1245. - if (in_atomic() || mm == NULL) {
  1246. + if (in_atomic() || mm == NULL || pagefault_disabled()) {
  1247. if (!user_mode(regs)) {
  1248. rc = SIGSEGV;
  1249. goto bail;
  1250. diff -Nur linux-3.18.12.orig/arch/s390/include/asm/kvm_host.h linux-3.18.12/arch/s390/include/asm/kvm_host.h
  1251. --- linux-3.18.12.orig/arch/s390/include/asm/kvm_host.h 2015-04-20 14:48:02.000000000 -0500
  1252. +++ linux-3.18.12/arch/s390/include/asm/kvm_host.h 2015-04-26 13:32:22.371684003 -0500
  1253. @@ -311,7 +311,7 @@
  1254. struct list_head list;
  1255. atomic_t active;
  1256. struct kvm_s390_float_interrupt *float_int;
  1257. - wait_queue_head_t *wq;
  1258. + struct swait_head *wq;
  1259. atomic_t *cpuflags;
  1260. unsigned int action_bits;
  1261. };
  1262. diff -Nur linux-3.18.12.orig/arch/s390/kvm/interrupt.c linux-3.18.12/arch/s390/kvm/interrupt.c
  1263. --- linux-3.18.12.orig/arch/s390/kvm/interrupt.c 2015-04-20 14:48:02.000000000 -0500
  1264. +++ linux-3.18.12/arch/s390/kvm/interrupt.c 2015-04-26 13:32:22.371684003 -0500
  1265. @@ -619,13 +619,13 @@
  1266. void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
  1267. {
  1268. - if (waitqueue_active(&vcpu->wq)) {
  1269. + if (swaitqueue_active(&vcpu->wq)) {
  1270. /*
  1271. * The vcpu gave up the cpu voluntarily, mark it as a good
  1272. * yield-candidate.
  1273. */
  1274. vcpu->preempted = true;
  1275. - wake_up_interruptible(&vcpu->wq);
  1276. + swait_wake_interruptible(&vcpu->wq);
  1277. vcpu->stat.halt_wakeup++;
  1278. }
  1279. }
  1280. @@ -746,7 +746,7 @@
  1281. spin_lock(&li->lock);
  1282. list_add(&inti->list, &li->list);
  1283. atomic_set(&li->active, 1);
  1284. - BUG_ON(waitqueue_active(li->wq));
  1285. + BUG_ON(swaitqueue_active(li->wq));
  1286. spin_unlock(&li->lock);
  1287. return 0;
  1288. }
  1289. @@ -771,7 +771,7 @@
  1290. spin_lock(&li->lock);
  1291. list_add(&inti->list, &li->list);
  1292. atomic_set(&li->active, 1);
  1293. - BUG_ON(waitqueue_active(li->wq));
  1294. + BUG_ON(swaitqueue_active(li->wq));
  1295. spin_unlock(&li->lock);
  1296. return 0;
  1297. }
  1298. diff -Nur linux-3.18.12.orig/arch/s390/mm/fault.c linux-3.18.12/arch/s390/mm/fault.c
  1299. --- linux-3.18.12.orig/arch/s390/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  1300. +++ linux-3.18.12/arch/s390/mm/fault.c 2015-04-26 13:32:22.371684003 -0500
  1301. @@ -435,7 +435,8 @@
  1302. * user context.
  1303. */
  1304. fault = VM_FAULT_BADCONTEXT;
  1305. - if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
  1306. + if (unlikely(!user_space_fault(regs) || !mm ||
  1307. + tsk->pagefault_disabled))
  1308. goto out;
  1309. address = trans_exc_code & __FAIL_ADDR_MASK;
  1310. diff -Nur linux-3.18.12.orig/arch/score/mm/fault.c linux-3.18.12/arch/score/mm/fault.c
  1311. --- linux-3.18.12.orig/arch/score/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  1312. +++ linux-3.18.12/arch/score/mm/fault.c 2015-04-26 13:32:22.371684003 -0500
  1313. @@ -73,7 +73,7 @@
  1314. * If we're in an interrupt or have no user
  1315. * context, we must not take the fault..
  1316. */
  1317. - if (in_atomic() || !mm)
  1318. + if (!mm || pagefault_disabled())
  1319. goto bad_area_nosemaphore;
  1320. if (user_mode(regs))
  1321. diff -Nur linux-3.18.12.orig/arch/sh/kernel/irq.c linux-3.18.12/arch/sh/kernel/irq.c
  1322. --- linux-3.18.12.orig/arch/sh/kernel/irq.c 2015-04-20 14:48:02.000000000 -0500
  1323. +++ linux-3.18.12/arch/sh/kernel/irq.c 2015-04-26 13:32:22.371684003 -0500
  1324. @@ -149,6 +149,7 @@
  1325. hardirq_ctx[cpu] = NULL;
  1326. }
  1327. +#ifndef CONFIG_PREEMPT_RT_FULL
  1328. void do_softirq_own_stack(void)
  1329. {
  1330. struct thread_info *curctx;
  1331. @@ -176,6 +177,7 @@
  1332. "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
  1333. );
  1334. }
  1335. +#endif
  1336. #else
  1337. static inline void handle_one_irq(unsigned int irq)
  1338. {
  1339. diff -Nur linux-3.18.12.orig/arch/sh/mm/fault.c linux-3.18.12/arch/sh/mm/fault.c
  1340. --- linux-3.18.12.orig/arch/sh/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  1341. +++ linux-3.18.12/arch/sh/mm/fault.c 2015-04-26 13:32:22.371684003 -0500
  1342. @@ -440,7 +440,7 @@
  1343. * If we're in an interrupt, have no user context or are running
  1344. * in an atomic region then we must not take the fault:
  1345. */
  1346. - if (unlikely(in_atomic() || !mm)) {
  1347. + if (unlikely(!mm || pagefault_disabled())) {
  1348. bad_area_nosemaphore(regs, error_code, address);
  1349. return;
  1350. }
  1351. diff -Nur linux-3.18.12.orig/arch/sparc/Kconfig linux-3.18.12/arch/sparc/Kconfig
  1352. --- linux-3.18.12.orig/arch/sparc/Kconfig 2015-04-20 14:48:02.000000000 -0500
  1353. +++ linux-3.18.12/arch/sparc/Kconfig 2015-04-26 13:32:22.371684003 -0500
  1354. @@ -182,12 +182,10 @@
  1355. source kernel/Kconfig.hz
  1356. config RWSEM_GENERIC_SPINLOCK
  1357. - bool
  1358. - default y if SPARC32
  1359. + def_bool PREEMPT_RT_FULL
  1360. config RWSEM_XCHGADD_ALGORITHM
  1361. - bool
  1362. - default y if SPARC64
  1363. + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
  1364. config GENERIC_HWEIGHT
  1365. bool
  1366. @@ -528,6 +526,10 @@
  1367. source "fs/Kconfig.binfmt"
  1368. +config EARLY_PRINTK
  1369. + bool
  1370. + default y
  1371. +
  1372. config COMPAT
  1373. bool
  1374. depends on SPARC64
  1375. diff -Nur linux-3.18.12.orig/arch/sparc/kernel/irq_64.c linux-3.18.12/arch/sparc/kernel/irq_64.c
  1376. --- linux-3.18.12.orig/arch/sparc/kernel/irq_64.c 2015-04-20 14:48:02.000000000 -0500
  1377. +++ linux-3.18.12/arch/sparc/kernel/irq_64.c 2015-04-26 13:32:22.375684003 -0500
  1378. @@ -849,6 +849,7 @@
  1379. set_irq_regs(old_regs);
  1380. }
  1381. +#ifndef CONFIG_PREEMPT_RT_FULL
  1382. void do_softirq_own_stack(void)
  1383. {
  1384. void *orig_sp, *sp = softirq_stack[smp_processor_id()];
  1385. @@ -863,6 +864,7 @@
  1386. __asm__ __volatile__("mov %0, %%sp"
  1387. : : "r" (orig_sp));
  1388. }
  1389. +#endif
  1390. #ifdef CONFIG_HOTPLUG_CPU
  1391. void fixup_irqs(void)
  1392. diff -Nur linux-3.18.12.orig/arch/sparc/kernel/setup_32.c linux-3.18.12/arch/sparc/kernel/setup_32.c
  1393. --- linux-3.18.12.orig/arch/sparc/kernel/setup_32.c 2015-04-20 14:48:02.000000000 -0500
  1394. +++ linux-3.18.12/arch/sparc/kernel/setup_32.c 2015-04-26 13:32:22.375684003 -0500
  1395. @@ -309,6 +309,7 @@
  1396. boot_flags_init(*cmdline_p);
  1397. + early_console = &prom_early_console;
  1398. register_console(&prom_early_console);
  1399. printk("ARCH: ");
  1400. diff -Nur linux-3.18.12.orig/arch/sparc/kernel/setup_64.c linux-3.18.12/arch/sparc/kernel/setup_64.c
  1401. --- linux-3.18.12.orig/arch/sparc/kernel/setup_64.c 2015-04-20 14:48:02.000000000 -0500
  1402. +++ linux-3.18.12/arch/sparc/kernel/setup_64.c 2015-04-26 13:32:22.375684003 -0500
  1403. @@ -563,6 +563,12 @@
  1404. pause_patch();
  1405. }
  1406. +static inline void register_prom_console(void)
  1407. +{
  1408. + early_console = &prom_early_console;
  1409. + register_console(&prom_early_console);
  1410. +}
  1411. +
  1412. void __init setup_arch(char **cmdline_p)
  1413. {
  1414. /* Initialize PROM console and command line. */
  1415. @@ -574,7 +580,7 @@
  1416. #ifdef CONFIG_EARLYFB
  1417. if (btext_find_display())
  1418. #endif
  1419. - register_console(&prom_early_console);
  1420. + register_prom_console();
  1421. if (tlb_type == hypervisor)
  1422. printk("ARCH: SUN4V\n");
  1423. diff -Nur linux-3.18.12.orig/arch/sparc/mm/fault_32.c linux-3.18.12/arch/sparc/mm/fault_32.c
  1424. --- linux-3.18.12.orig/arch/sparc/mm/fault_32.c 2015-04-20 14:48:02.000000000 -0500
  1425. +++ linux-3.18.12/arch/sparc/mm/fault_32.c 2015-04-26 13:32:22.375684003 -0500
  1426. @@ -196,7 +196,7 @@
  1427. * If we're in an interrupt or have no user
  1428. * context, we must not take the fault..
  1429. */
  1430. - if (in_atomic() || !mm)
  1431. + if (!mm || pagefault_disabled())
  1432. goto no_context;
  1433. perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
  1434. diff -Nur linux-3.18.12.orig/arch/sparc/mm/fault_64.c linux-3.18.12/arch/sparc/mm/fault_64.c
  1435. --- linux-3.18.12.orig/arch/sparc/mm/fault_64.c 2015-04-20 14:48:02.000000000 -0500
  1436. +++ linux-3.18.12/arch/sparc/mm/fault_64.c 2015-04-26 13:32:22.375684003 -0500
  1437. @@ -330,7 +330,7 @@
  1438. * If we're in an interrupt or have no user
  1439. * context, we must not take the fault..
  1440. */
  1441. - if (in_atomic() || !mm)
  1442. + if (!mm || pagefault_disabled())
  1443. goto intr_or_no_mm;
  1444. perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
  1445. diff -Nur linux-3.18.12.orig/arch/tile/mm/fault.c linux-3.18.12/arch/tile/mm/fault.c
  1446. --- linux-3.18.12.orig/arch/tile/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  1447. +++ linux-3.18.12/arch/tile/mm/fault.c 2015-04-26 13:32:22.375684003 -0500
  1448. @@ -357,7 +357,7 @@
  1449. * If we're in an interrupt, have no user context or are running in an
  1450. * atomic region then we must not take the fault.
  1451. */
  1452. - if (in_atomic() || !mm) {
  1453. + if (!mm || pagefault_disabled()) {
  1454. vma = NULL; /* happy compiler */
  1455. goto bad_area_nosemaphore;
  1456. }
  1457. diff -Nur linux-3.18.12.orig/arch/um/kernel/trap.c linux-3.18.12/arch/um/kernel/trap.c
  1458. --- linux-3.18.12.orig/arch/um/kernel/trap.c 2015-04-20 14:48:02.000000000 -0500
  1459. +++ linux-3.18.12/arch/um/kernel/trap.c 2015-04-26 13:32:22.375684003 -0500
  1460. @@ -38,7 +38,7 @@
  1461. * If the fault was during atomic operation, don't take the fault, just
  1462. * fail.
  1463. */
  1464. - if (in_atomic())
  1465. + if (pagefault_disabled())
  1466. goto out_nosemaphore;
  1467. if (is_user)
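The fault-handler hunks above (s390, score, sh, sparc, tile, um, and the x86/xtensa ones further down) all replace the in_atomic() test with a check of the per-task pagefault-disable state: on PREEMPT_RT, code that must not fault does not necessarily raise the preempt count, so the handlers test pagefault_disabled() directly. A minimal sketch of the caller-side contract those checks enforce; pagefault_disable()/pagefault_enable(), __copy_from_user_inatomic() and copy_from_user() are existing kernel interfaces, while buf and uptr are assumed locals (uptr being a __user pointer):

        char buf[64];
        unsigned long left;

        pagefault_disable();            /* faults must fail fast instead of sleeping */
        left = __copy_from_user_inatomic(buf, uptr, sizeof(buf));
        pagefault_enable();
        if (left)                       /* fall back to a path that may sleep */
                left = copy_from_user(buf, uptr, sizeof(buf));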
  1468. diff -Nur linux-3.18.12.orig/arch/x86/crypto/aesni-intel_glue.c linux-3.18.12/arch/x86/crypto/aesni-intel_glue.c
  1469. --- linux-3.18.12.orig/arch/x86/crypto/aesni-intel_glue.c 2015-04-20 14:48:02.000000000 -0500
  1470. +++ linux-3.18.12/arch/x86/crypto/aesni-intel_glue.c 2015-04-26 13:32:22.375684003 -0500
  1471. @@ -381,14 +381,14 @@
  1472. err = blkcipher_walk_virt(desc, &walk);
  1473. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1474. - kernel_fpu_begin();
  1475. while ((nbytes = walk.nbytes)) {
  1476. + kernel_fpu_begin();
  1477. aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  1478. - nbytes & AES_BLOCK_MASK);
  1479. + nbytes & AES_BLOCK_MASK);
  1480. + kernel_fpu_end();
  1481. nbytes &= AES_BLOCK_SIZE - 1;
  1482. err = blkcipher_walk_done(desc, &walk, nbytes);
  1483. }
  1484. - kernel_fpu_end();
  1485. return err;
  1486. }
  1487. @@ -405,14 +405,14 @@
  1488. err = blkcipher_walk_virt(desc, &walk);
  1489. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1490. - kernel_fpu_begin();
  1491. while ((nbytes = walk.nbytes)) {
  1492. + kernel_fpu_begin();
  1493. aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  1494. nbytes & AES_BLOCK_MASK);
  1495. + kernel_fpu_end();
  1496. nbytes &= AES_BLOCK_SIZE - 1;
  1497. err = blkcipher_walk_done(desc, &walk, nbytes);
  1498. }
  1499. - kernel_fpu_end();
  1500. return err;
  1501. }
  1502. @@ -429,14 +429,14 @@
  1503. err = blkcipher_walk_virt(desc, &walk);
  1504. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1505. - kernel_fpu_begin();
  1506. while ((nbytes = walk.nbytes)) {
  1507. + kernel_fpu_begin();
  1508. aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  1509. nbytes & AES_BLOCK_MASK, walk.iv);
  1510. + kernel_fpu_end();
  1511. nbytes &= AES_BLOCK_SIZE - 1;
  1512. err = blkcipher_walk_done(desc, &walk, nbytes);
  1513. }
  1514. - kernel_fpu_end();
  1515. return err;
  1516. }
  1517. @@ -453,14 +453,14 @@
  1518. err = blkcipher_walk_virt(desc, &walk);
  1519. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1520. - kernel_fpu_begin();
  1521. while ((nbytes = walk.nbytes)) {
  1522. + kernel_fpu_begin();
  1523. aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  1524. nbytes & AES_BLOCK_MASK, walk.iv);
  1525. + kernel_fpu_end();
  1526. nbytes &= AES_BLOCK_SIZE - 1;
  1527. err = blkcipher_walk_done(desc, &walk, nbytes);
  1528. }
  1529. - kernel_fpu_end();
  1530. return err;
  1531. }
  1532. @@ -512,18 +512,20 @@
  1533. err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
  1534. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1535. - kernel_fpu_begin();
  1536. while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
  1537. + kernel_fpu_begin();
  1538. aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
  1539. nbytes & AES_BLOCK_MASK, walk.iv);
  1540. + kernel_fpu_end();
  1541. nbytes &= AES_BLOCK_SIZE - 1;
  1542. err = blkcipher_walk_done(desc, &walk, nbytes);
  1543. }
  1544. if (walk.nbytes) {
  1545. + kernel_fpu_begin();
  1546. ctr_crypt_final(ctx, &walk);
  1547. + kernel_fpu_end();
  1548. err = blkcipher_walk_done(desc, &walk, 0);
  1549. }
  1550. - kernel_fpu_end();
  1551. return err;
  1552. }
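The aesni hunks move every kernel_fpu_begin()/kernel_fpu_end() pair inside the walk loop, so the FPU section (and the preemption-disabled region it implies) covers one chunk at a time and blkcipher_walk_done() runs with preemption enabled again. A condensed sketch of the resulting shape; process_one_chunk() is a stand-in for the aesni_ecb_enc()/aesni_cbc_dec()/... helpers in the hunks above, not a real function:

        while ((nbytes = walk.nbytes)) {
                kernel_fpu_begin();             /* short, per-chunk FPU section */
                process_one_chunk(ctx, &walk, nbytes & AES_BLOCK_MASK);
                kernel_fpu_end();
                nbytes &= AES_BLOCK_SIZE - 1;
                err = blkcipher_walk_done(desc, &walk, nbytes);
        }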
  1553. diff -Nur linux-3.18.12.orig/arch/x86/crypto/cast5_avx_glue.c linux-3.18.12/arch/x86/crypto/cast5_avx_glue.c
  1554. --- linux-3.18.12.orig/arch/x86/crypto/cast5_avx_glue.c 2015-04-20 14:48:02.000000000 -0500
  1555. +++ linux-3.18.12/arch/x86/crypto/cast5_avx_glue.c 2015-04-26 13:32:22.375684003 -0500
  1556. @@ -60,7 +60,7 @@
  1557. static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
  1558. bool enc)
  1559. {
  1560. - bool fpu_enabled = false;
  1561. + bool fpu_enabled;
  1562. struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  1563. const unsigned int bsize = CAST5_BLOCK_SIZE;
  1564. unsigned int nbytes;
  1565. @@ -76,7 +76,7 @@
  1566. u8 *wsrc = walk->src.virt.addr;
  1567. u8 *wdst = walk->dst.virt.addr;
  1568. - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
  1569. + fpu_enabled = cast5_fpu_begin(false, nbytes);
  1570. /* Process multi-block batch */
  1571. if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
  1572. @@ -104,10 +104,9 @@
  1573. } while (nbytes >= bsize);
  1574. done:
  1575. + cast5_fpu_end(fpu_enabled);
  1576. err = blkcipher_walk_done(desc, walk, nbytes);
  1577. }
  1578. -
  1579. - cast5_fpu_end(fpu_enabled);
  1580. return err;
  1581. }
  1582. @@ -228,7 +227,7 @@
  1583. static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  1584. struct scatterlist *src, unsigned int nbytes)
  1585. {
  1586. - bool fpu_enabled = false;
  1587. + bool fpu_enabled;
  1588. struct blkcipher_walk walk;
  1589. int err;
  1590. @@ -237,12 +236,11 @@
  1591. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1592. while ((nbytes = walk.nbytes)) {
  1593. - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
  1594. + fpu_enabled = cast5_fpu_begin(false, nbytes);
  1595. nbytes = __cbc_decrypt(desc, &walk);
  1596. + cast5_fpu_end(fpu_enabled);
  1597. err = blkcipher_walk_done(desc, &walk, nbytes);
  1598. }
  1599. -
  1600. - cast5_fpu_end(fpu_enabled);
  1601. return err;
  1602. }
  1603. @@ -312,7 +310,7 @@
  1604. static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  1605. struct scatterlist *src, unsigned int nbytes)
  1606. {
  1607. - bool fpu_enabled = false;
  1608. + bool fpu_enabled;
  1609. struct blkcipher_walk walk;
  1610. int err;
  1611. @@ -321,13 +319,12 @@
  1612. desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  1613. while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
  1614. - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
  1615. + fpu_enabled = cast5_fpu_begin(false, nbytes);
  1616. nbytes = __ctr_crypt(desc, &walk);
  1617. + cast5_fpu_end(fpu_enabled);
  1618. err = blkcipher_walk_done(desc, &walk, nbytes);
  1619. }
  1620. - cast5_fpu_end(fpu_enabled);
  1621. -
  1622. if (walk.nbytes) {
  1623. ctr_crypt_final(desc, &walk);
  1624. err = blkcipher_walk_done(desc, &walk, 0);
  1625. diff -Nur linux-3.18.12.orig/arch/x86/crypto/glue_helper.c linux-3.18.12/arch/x86/crypto/glue_helper.c
  1626. --- linux-3.18.12.orig/arch/x86/crypto/glue_helper.c 2015-04-20 14:48:02.000000000 -0500
  1627. +++ linux-3.18.12/arch/x86/crypto/glue_helper.c 2015-04-26 13:32:22.375684003 -0500
  1628. @@ -39,7 +39,7 @@
  1629. void *ctx = crypto_blkcipher_ctx(desc->tfm);
  1630. const unsigned int bsize = 128 / 8;
  1631. unsigned int nbytes, i, func_bytes;
  1632. - bool fpu_enabled = false;
  1633. + bool fpu_enabled;
  1634. int err;
  1635. err = blkcipher_walk_virt(desc, walk);
  1636. @@ -49,7 +49,7 @@
  1637. u8 *wdst = walk->dst.virt.addr;
  1638. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  1639. - desc, fpu_enabled, nbytes);
  1640. + desc, false, nbytes);
  1641. for (i = 0; i < gctx->num_funcs; i++) {
  1642. func_bytes = bsize * gctx->funcs[i].num_blocks;
  1643. @@ -71,10 +71,10 @@
  1644. }
  1645. done:
  1646. + glue_fpu_end(fpu_enabled);
  1647. err = blkcipher_walk_done(desc, walk, nbytes);
  1648. }
  1649. - glue_fpu_end(fpu_enabled);
  1650. return err;
  1651. }
  1652. @@ -194,7 +194,7 @@
  1653. struct scatterlist *src, unsigned int nbytes)
  1654. {
  1655. const unsigned int bsize = 128 / 8;
  1656. - bool fpu_enabled = false;
  1657. + bool fpu_enabled;
  1658. struct blkcipher_walk walk;
  1659. int err;
  1660. @@ -203,12 +203,12 @@
  1661. while ((nbytes = walk.nbytes)) {
  1662. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  1663. - desc, fpu_enabled, nbytes);
  1664. + desc, false, nbytes);
  1665. nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
  1666. + glue_fpu_end(fpu_enabled);
  1667. err = blkcipher_walk_done(desc, &walk, nbytes);
  1668. }
  1669. - glue_fpu_end(fpu_enabled);
  1670. return err;
  1671. }
  1672. EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
  1673. @@ -278,7 +278,7 @@
  1674. struct scatterlist *src, unsigned int nbytes)
  1675. {
  1676. const unsigned int bsize = 128 / 8;
  1677. - bool fpu_enabled = false;
  1678. + bool fpu_enabled;
  1679. struct blkcipher_walk walk;
  1680. int err;
  1681. @@ -287,13 +287,12 @@
  1682. while ((nbytes = walk.nbytes) >= bsize) {
  1683. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  1684. - desc, fpu_enabled, nbytes);
  1685. + desc, false, nbytes);
  1686. nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
  1687. + glue_fpu_end(fpu_enabled);
  1688. err = blkcipher_walk_done(desc, &walk, nbytes);
  1689. }
  1690. - glue_fpu_end(fpu_enabled);
  1691. -
  1692. if (walk.nbytes) {
  1693. glue_ctr_crypt_final_128bit(
  1694. gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
  1695. @@ -348,7 +347,7 @@
  1696. void *tweak_ctx, void *crypt_ctx)
  1697. {
  1698. const unsigned int bsize = 128 / 8;
  1699. - bool fpu_enabled = false;
  1700. + bool fpu_enabled;
  1701. struct blkcipher_walk walk;
  1702. int err;
  1703. @@ -361,21 +360,21 @@
  1704. /* set minimum length to bsize, for tweak_fn */
  1705. fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  1706. - desc, fpu_enabled,
  1707. + desc, false,
  1708. nbytes < bsize ? bsize : nbytes);
  1709. -
  1710. /* calculate first value of T */
  1711. tweak_fn(tweak_ctx, walk.iv, walk.iv);
  1712. + glue_fpu_end(fpu_enabled);
  1713. while (nbytes) {
  1714. + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
  1715. + desc, false, nbytes);
  1716. nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
  1717. + glue_fpu_end(fpu_enabled);
  1718. err = blkcipher_walk_done(desc, &walk, nbytes);
  1719. nbytes = walk.nbytes;
  1720. }
  1721. -
  1722. - glue_fpu_end(fpu_enabled);
  1723. -
  1724. return err;
  1725. }
  1726. EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
  1727. diff -Nur linux-3.18.12.orig/arch/x86/include/asm/preempt.h linux-3.18.12/arch/x86/include/asm/preempt.h
  1728. --- linux-3.18.12.orig/arch/x86/include/asm/preempt.h 2015-04-20 14:48:02.000000000 -0500
  1729. +++ linux-3.18.12/arch/x86/include/asm/preempt.h 2015-04-26 13:32:22.375684003 -0500
  1730. @@ -85,17 +85,33 @@
  1731. * a decrement which hits zero means we have no preempt_count and should
  1732. * reschedule.
  1733. */
  1734. -static __always_inline bool __preempt_count_dec_and_test(void)
  1735. +static __always_inline bool ____preempt_count_dec_and_test(void)
  1736. {
  1737. GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
  1738. }
  1739. +static __always_inline bool __preempt_count_dec_and_test(void)
  1740. +{
  1741. + if (____preempt_count_dec_and_test())
  1742. + return true;
  1743. +#ifdef CONFIG_PREEMPT_LAZY
  1744. + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
  1745. +#else
  1746. + return false;
  1747. +#endif
  1748. +}
  1749. +
  1750. /*
  1751. * Returns true when we need to resched and can (barring IRQ state).
  1752. */
  1753. static __always_inline bool should_resched(void)
  1754. {
  1755. +#ifdef CONFIG_PREEMPT_LAZY
  1756. + return unlikely(!raw_cpu_read_4(__preempt_count) || \
  1757. + test_thread_flag(TIF_NEED_RESCHED_LAZY));
  1758. +#else
  1759. return unlikely(!raw_cpu_read_4(__preempt_count));
  1760. +#endif
  1761. }
  1762. #ifdef CONFIG_PREEMPT
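With CONFIG_PREEMPT_LAZY, the decrement-and-test above reports "reschedule needed" not only when the hardware preempt count reaches zero with NEED_RESCHED folded in, but also when the lazy flag is pending. For context, roughly where that return value is consumed, paraphrased from memory of the generic preempt_enable() definition (the macro name is changed to make clear this is an illustration, not part of this patch):

        #define my_preempt_enable()                             \
        do {                                                    \
                barrier();                                      \
                if (unlikely(preempt_count_dec_and_test()))     \
                        __preempt_schedule();                   \
        } while (0)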
  1763. diff -Nur linux-3.18.12.orig/arch/x86/include/asm/signal.h linux-3.18.12/arch/x86/include/asm/signal.h
  1764. --- linux-3.18.12.orig/arch/x86/include/asm/signal.h 2015-04-20 14:48:02.000000000 -0500
  1765. +++ linux-3.18.12/arch/x86/include/asm/signal.h 2015-04-26 13:32:22.375684003 -0500
  1766. @@ -23,6 +23,19 @@
  1767. unsigned long sig[_NSIG_WORDS];
  1768. } sigset_t;
  1769. +/*
  1770. + * Because some traps use the IST stack, we must keep preemption
  1771. + * disabled while calling do_trap(), but do_trap() may call
  1772. + * force_sig_info() which will grab the signal spin_locks for the
  1773. + * task, which in PREEMPT_RT_FULL are mutexes. By defining
  1774. + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
  1775. + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
  1776. + * trap.
  1777. + */
  1778. +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_X86_64)
  1779. +#define ARCH_RT_DELAYS_SIGNAL_SEND
  1780. +#endif
  1781. +
  1782. #ifndef CONFIG_COMPAT
  1783. typedef sigset_t compat_sigset_t;
  1784. #endif
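The comment above describes the mechanism; the receive side is visible further down in the arch/x86/kernel/signal.c hunk, which checks current->forced_info on the way back to user space. A condensed sketch of the send side that ARCH_RT_DELAYS_SIGNAL_SEND enables; the forced_info field name is taken from that later hunk, and the rest illustrates the behaviour the comment describes rather than the patch's actual force_sig_info() change:

        /* inside force_sig_info(), when delivery cannot take sleeping locks: */
        if (delivery_must_be_delayed) {                 /* assumed predicate */
                current->forced_info = *info;           /* stash the siginfo on the task */
                set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
                return 0;                               /* deliver on exit from the trap */
        }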
  1785. diff -Nur linux-3.18.12.orig/arch/x86/include/asm/stackprotector.h linux-3.18.12/arch/x86/include/asm/stackprotector.h
  1786. --- linux-3.18.12.orig/arch/x86/include/asm/stackprotector.h 2015-04-20 14:48:02.000000000 -0500
  1787. +++ linux-3.18.12/arch/x86/include/asm/stackprotector.h 2015-04-26 13:32:22.375684003 -0500
  1788. @@ -57,7 +57,7 @@
  1789. */
  1790. static __always_inline void boot_init_stack_canary(void)
  1791. {
  1792. - u64 canary;
  1793. + u64 uninitialized_var(canary);
  1794. u64 tsc;
  1795. #ifdef CONFIG_X86_64
  1796. @@ -68,8 +68,16 @@
  1797. * of randomness. The TSC only matters for very early init,
  1798. * there it already has some randomness on most systems. Later
  1799. * on during the bootup the random pool has true entropy too.
  1800. + *
  1801. + * For preempt-rt we need to weaken the randomness a bit, as
  1802. + * we can't call into the random generator from atomic context
1803. + * due to locking constraints. We just leave the canary
1804. + * uninitialized and use the TSC-based randomness on top of
1805. + * it.
  1806. */
  1807. +#ifndef CONFIG_PREEMPT_RT_FULL
  1808. get_random_bytes(&canary, sizeof(canary));
  1809. +#endif
  1810. tsc = __native_read_tsc();
  1811. canary += tsc + (tsc << 32UL);
  1812. diff -Nur linux-3.18.12.orig/arch/x86/include/asm/thread_info.h linux-3.18.12/arch/x86/include/asm/thread_info.h
  1813. --- linux-3.18.12.orig/arch/x86/include/asm/thread_info.h 2015-04-20 14:48:02.000000000 -0500
  1814. +++ linux-3.18.12/arch/x86/include/asm/thread_info.h 2015-04-26 13:32:22.375684003 -0500
  1815. @@ -30,6 +30,8 @@
  1816. __u32 status; /* thread synchronous flags */
  1817. __u32 cpu; /* current CPU */
  1818. int saved_preempt_count;
  1819. + int preempt_lazy_count; /* 0 => lazy preemptable
  1820. + <0 => BUG */
  1821. mm_segment_t addr_limit;
  1822. struct restart_block restart_block;
  1823. void __user *sysenter_return;
  1824. @@ -75,6 +77,7 @@
  1825. #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
  1826. #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
  1827. #define TIF_SECCOMP 8 /* secure computing */
  1828. +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */
  1829. #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
  1830. #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
  1831. #define TIF_UPROBE 12 /* breakpointed or singlestepping */
  1832. @@ -100,6 +103,7 @@
  1833. #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
  1834. #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
  1835. #define _TIF_SECCOMP (1 << TIF_SECCOMP)
  1836. +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
  1837. #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
  1838. #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
  1839. #define _TIF_UPROBE (1 << TIF_UPROBE)
  1840. @@ -150,6 +154,8 @@
  1841. #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
  1842. #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
  1843. +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
  1844. +
  1845. #define STACK_WARN (THREAD_SIZE/8)
  1846. #define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
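TIF_NEED_RESCHED_LAZY gets its own flag bit so the exit paths patched below (entry_32.S/entry_64.S) can test one combined mask, _TIF_NEED_RESCHED_MASK, instead of only the hard NEED_RESCHED bit. Purely illustrative C-level equivalent of such a check:

        if (test_thread_flag(TIF_NEED_RESCHED) ||
            test_thread_flag(TIF_NEED_RESCHED_LAZY))
                schedule();     /* either flavour of resched request is pending */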
  1847. diff -Nur linux-3.18.12.orig/arch/x86/include/asm/uv/uv_bau.h linux-3.18.12/arch/x86/include/asm/uv/uv_bau.h
  1848. --- linux-3.18.12.orig/arch/x86/include/asm/uv/uv_bau.h 2015-04-20 14:48:02.000000000 -0500
  1849. +++ linux-3.18.12/arch/x86/include/asm/uv/uv_bau.h 2015-04-26 13:32:22.375684003 -0500
  1850. @@ -615,9 +615,9 @@
  1851. cycles_t send_message;
  1852. cycles_t period_end;
  1853. cycles_t period_time;
  1854. - spinlock_t uvhub_lock;
  1855. - spinlock_t queue_lock;
  1856. - spinlock_t disable_lock;
  1857. + raw_spinlock_t uvhub_lock;
  1858. + raw_spinlock_t queue_lock;
  1859. + raw_spinlock_t disable_lock;
  1860. /* tunables */
  1861. int max_concurr;
  1862. int max_concurr_const;
  1863. @@ -776,15 +776,15 @@
  1864. * to be lowered below the current 'v'. atomic_add_unless can only stop
  1865. * on equal.
  1866. */
  1867. -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
  1868. +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
  1869. {
  1870. - spin_lock(lock);
  1871. + raw_spin_lock(lock);
  1872. if (atomic_read(v) >= u) {
  1873. - spin_unlock(lock);
  1874. + raw_spin_unlock(lock);
  1875. return 0;
  1876. }
  1877. atomic_inc(v);
  1878. - spin_unlock(lock);
  1879. + raw_spin_unlock(lock);
  1880. return 1;
  1881. }
  1882. diff -Nur linux-3.18.12.orig/arch/x86/include/asm/uv/uv_hub.h linux-3.18.12/arch/x86/include/asm/uv/uv_hub.h
  1883. --- linux-3.18.12.orig/arch/x86/include/asm/uv/uv_hub.h 2015-04-20 14:48:02.000000000 -0500
  1884. +++ linux-3.18.12/arch/x86/include/asm/uv/uv_hub.h 2015-04-26 13:32:22.375684003 -0500
  1885. @@ -492,7 +492,7 @@
  1886. unsigned short nr_online_cpus;
  1887. unsigned short pnode;
  1888. short memory_nid;
  1889. - spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */
  1890. + raw_spinlock_t nmi_lock; /* obsolete, see uv_hub_nmi */
  1891. unsigned long nmi_count; /* obsolete, see uv_hub_nmi */
  1892. };
  1893. extern struct uv_blade_info *uv_blade_info;
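The uv_bau.h and uv_hub.h hunks (and the matching tlb_uv.c/uv_time.c hunks below) turn locks taken from interrupt and NMI paths into raw_spinlock_t, because an ordinary spinlock_t becomes a sleeping lock under PREEMPT_RT_FULL and may not be taken in those contexts. Minimal usage sketch of the raw variant; the lock name is made up:

        static DEFINE_RAW_SPINLOCK(demo_lock);
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_lock, flags);
        /* keep this short and non-sleeping */
        raw_spin_unlock_irqrestore(&demo_lock, flags);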
  1894. diff -Nur linux-3.18.12.orig/arch/x86/Kconfig linux-3.18.12/arch/x86/Kconfig
  1895. --- linux-3.18.12.orig/arch/x86/Kconfig 2015-04-20 14:48:02.000000000 -0500
  1896. +++ linux-3.18.12/arch/x86/Kconfig 2015-04-26 13:32:22.375684003 -0500
  1897. @@ -21,6 +21,7 @@
  1898. ### Arch settings
  1899. config X86
  1900. def_bool y
  1901. + select HAVE_PREEMPT_LAZY
  1902. select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
  1903. select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
  1904. select ARCH_HAS_FAST_MULTIPLIER
  1905. @@ -197,8 +198,11 @@
  1906. def_bool y
  1907. depends on ISA_DMA_API
  1908. +config RWSEM_GENERIC_SPINLOCK
  1909. + def_bool PREEMPT_RT_FULL
  1910. +
  1911. config RWSEM_XCHGADD_ALGORITHM
  1912. - def_bool y
  1913. + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
  1914. config GENERIC_CALIBRATE_DELAY
  1915. def_bool y
  1916. @@ -811,7 +815,7 @@
  1917. config MAXSMP
  1918. bool "Enable Maximum number of SMP Processors and NUMA Nodes"
  1919. depends on X86_64 && SMP && DEBUG_KERNEL
  1920. - select CPUMASK_OFFSTACK
  1921. + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
  1922. ---help---
  1923. Enable maximum number of CPUS and NUMA Nodes for this architecture.
  1924. If unsure, say N.
  1925. diff -Nur linux-3.18.12.orig/arch/x86/kernel/apic/io_apic.c linux-3.18.12/arch/x86/kernel/apic/io_apic.c
  1926. --- linux-3.18.12.orig/arch/x86/kernel/apic/io_apic.c 2015-04-20 14:48:02.000000000 -0500
  1927. +++ linux-3.18.12/arch/x86/kernel/apic/io_apic.c 2015-04-26 13:32:22.379684003 -0500
  1928. @@ -2494,7 +2494,8 @@
  1929. static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
  1930. {
  1931. /* If we are moving the irq we need to mask it */
  1932. - if (unlikely(irqd_is_setaffinity_pending(data))) {
  1933. + if (unlikely(irqd_is_setaffinity_pending(data) &&
  1934. + !irqd_irq_inprogress(data))) {
  1935. mask_ioapic(cfg);
  1936. return true;
  1937. }
  1938. diff -Nur linux-3.18.12.orig/arch/x86/kernel/apic/x2apic_uv_x.c linux-3.18.12/arch/x86/kernel/apic/x2apic_uv_x.c
  1939. --- linux-3.18.12.orig/arch/x86/kernel/apic/x2apic_uv_x.c 2015-04-20 14:48:02.000000000 -0500
  1940. +++ linux-3.18.12/arch/x86/kernel/apic/x2apic_uv_x.c 2015-04-26 13:32:22.379684003 -0500
  1941. @@ -918,7 +918,7 @@
  1942. uv_blade_info[blade].pnode = pnode;
  1943. uv_blade_info[blade].nr_possible_cpus = 0;
  1944. uv_blade_info[blade].nr_online_cpus = 0;
  1945. - spin_lock_init(&uv_blade_info[blade].nmi_lock);
  1946. + raw_spin_lock_init(&uv_blade_info[blade].nmi_lock);
  1947. min_pnode = min(pnode, min_pnode);
  1948. max_pnode = max(pnode, max_pnode);
  1949. blade++;
  1950. diff -Nur linux-3.18.12.orig/arch/x86/kernel/asm-offsets.c linux-3.18.12/arch/x86/kernel/asm-offsets.c
  1951. --- linux-3.18.12.orig/arch/x86/kernel/asm-offsets.c 2015-04-20 14:48:02.000000000 -0500
  1952. +++ linux-3.18.12/arch/x86/kernel/asm-offsets.c 2015-04-26 13:32:22.379684003 -0500
  1953. @@ -32,6 +32,7 @@
  1954. OFFSET(TI_flags, thread_info, flags);
  1955. OFFSET(TI_status, thread_info, status);
  1956. OFFSET(TI_addr_limit, thread_info, addr_limit);
  1957. + OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
  1958. BLANK();
  1959. OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
  1960. @@ -71,4 +72,5 @@
  1961. BLANK();
  1962. DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
  1963. + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
  1964. }
  1965. diff -Nur linux-3.18.12.orig/arch/x86/kernel/cpu/mcheck/mce.c linux-3.18.12/arch/x86/kernel/cpu/mcheck/mce.c
  1966. --- linux-3.18.12.orig/arch/x86/kernel/cpu/mcheck/mce.c 2015-04-20 14:48:02.000000000 -0500
  1967. +++ linux-3.18.12/arch/x86/kernel/cpu/mcheck/mce.c 2015-04-26 13:32:22.379684003 -0500
  1968. @@ -41,6 +41,8 @@
  1969. #include <linux/debugfs.h>
  1970. #include <linux/irq_work.h>
  1971. #include <linux/export.h>
  1972. +#include <linux/jiffies.h>
  1973. +#include <linux/work-simple.h>
  1974. #include <asm/processor.h>
  1975. #include <asm/mce.h>
  1976. @@ -1266,7 +1268,7 @@
  1977. static unsigned long check_interval = 5 * 60; /* 5 minutes */
  1978. static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
  1979. -static DEFINE_PER_CPU(struct timer_list, mce_timer);
  1980. +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
  1981. static unsigned long mce_adjust_timer_default(unsigned long interval)
  1982. {
  1983. @@ -1283,14 +1285,11 @@
  1984. return test_and_clear_bit(0, v);
  1985. }
  1986. -static void mce_timer_fn(unsigned long data)
  1987. +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
  1988. {
  1989. - struct timer_list *t = this_cpu_ptr(&mce_timer);
  1990. unsigned long iv;
  1991. int notify;
  1992. - WARN_ON(smp_processor_id() != data);
  1993. -
  1994. if (mce_available(this_cpu_ptr(&cpu_info))) {
  1995. machine_check_poll(MCP_TIMESTAMP,
  1996. this_cpu_ptr(&mce_poll_banks));
  1997. @@ -1313,9 +1312,11 @@
  1998. __this_cpu_write(mce_next_interval, iv);
  1999. /* Might have become 0 after CMCI storm subsided */
  2000. if (iv) {
  2001. - t->expires = jiffies + iv;
  2002. - add_timer_on(t, smp_processor_id());
  2003. + hrtimer_forward_now(timer, ns_to_ktime(
  2004. + jiffies_to_usecs(iv) * 1000ULL));
  2005. + return HRTIMER_RESTART;
  2006. }
  2007. + return HRTIMER_NORESTART;
  2008. }
  2009. /*
  2010. @@ -1323,28 +1324,37 @@
  2011. */
  2012. void mce_timer_kick(unsigned long interval)
  2013. {
  2014. - struct timer_list *t = this_cpu_ptr(&mce_timer);
  2015. - unsigned long when = jiffies + interval;
  2016. + struct hrtimer *t = this_cpu_ptr(&mce_timer);
  2017. unsigned long iv = __this_cpu_read(mce_next_interval);
  2018. - if (timer_pending(t)) {
  2019. - if (time_before(when, t->expires))
  2020. - mod_timer_pinned(t, when);
  2021. + if (hrtimer_active(t)) {
  2022. + s64 exp;
  2023. + s64 intv_us;
  2024. +
  2025. + intv_us = jiffies_to_usecs(interval);
  2026. + exp = ktime_to_us(hrtimer_expires_remaining(t));
  2027. + if (intv_us < exp) {
  2028. + hrtimer_cancel(t);
  2029. + hrtimer_start_range_ns(t,
  2030. + ns_to_ktime(intv_us * 1000),
  2031. + 0, HRTIMER_MODE_REL_PINNED);
  2032. + }
  2033. } else {
  2034. - t->expires = round_jiffies(when);
  2035. - add_timer_on(t, smp_processor_id());
  2036. + hrtimer_start_range_ns(t,
  2037. + ns_to_ktime(jiffies_to_usecs(interval) * 1000ULL),
  2038. + 0, HRTIMER_MODE_REL_PINNED);
  2039. }
  2040. if (interval < iv)
  2041. __this_cpu_write(mce_next_interval, interval);
  2042. }
  2043. -/* Must not be called in IRQ context where del_timer_sync() can deadlock */
  2044. +/* Must not be called in IRQ context where hrtimer_cancel() can deadlock */
  2045. static void mce_timer_delete_all(void)
  2046. {
  2047. int cpu;
  2048. for_each_online_cpu(cpu)
  2049. - del_timer_sync(&per_cpu(mce_timer, cpu));
  2050. + hrtimer_cancel(&per_cpu(mce_timer, cpu));
  2051. }
  2052. static void mce_do_trigger(struct work_struct *work)
  2053. @@ -1354,6 +1364,56 @@
  2054. static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
  2055. +static void __mce_notify_work(struct swork_event *event)
  2056. +{
  2057. + /* Not more than two messages every minute */
  2058. + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
  2059. +
  2060. + /* wake processes polling /dev/mcelog */
  2061. + wake_up_interruptible(&mce_chrdev_wait);
  2062. +
  2063. + /*
  2064. + * There is no risk of missing notifications because
  2065. + * work_pending is always cleared before the function is
  2066. + * executed.
  2067. + */
  2068. + if (mce_helper[0] && !work_pending(&mce_trigger_work))
  2069. + schedule_work(&mce_trigger_work);
  2070. +
  2071. + if (__ratelimit(&ratelimit))
  2072. + pr_info(HW_ERR "Machine check events logged\n");
  2073. +}
  2074. +
  2075. +#ifdef CONFIG_PREEMPT_RT_FULL
  2076. +static bool notify_work_ready __read_mostly;
  2077. +static struct swork_event notify_work;
  2078. +
  2079. +static int mce_notify_work_init(void)
  2080. +{
  2081. + int err;
  2082. +
  2083. + err = swork_get();
  2084. + if (err)
  2085. + return err;
  2086. +
  2087. + INIT_SWORK(&notify_work, __mce_notify_work);
  2088. + notify_work_ready = true;
  2089. + return 0;
  2090. +}
  2091. +
  2092. +static void mce_notify_work(void)
  2093. +{
  2094. + if (notify_work_ready)
  2095. + swork_queue(&notify_work);
  2096. +}
  2097. +#else
  2098. +static void mce_notify_work(void)
  2099. +{
  2100. + __mce_notify_work(NULL);
  2101. +}
  2102. +static inline int mce_notify_work_init(void) { return 0; }
  2103. +#endif
  2104. +
  2105. /*
  2106. * Notify the user(s) about new machine check events.
  2107. * Can be called from interrupt context, but not from machine check/NMI
  2108. @@ -1361,19 +1421,8 @@
  2109. */
  2110. int mce_notify_irq(void)
  2111. {
  2112. - /* Not more than two messages every minute */
  2113. - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
  2114. -
  2115. if (test_and_clear_bit(0, &mce_need_notify)) {
  2116. - /* wake processes polling /dev/mcelog */
  2117. - wake_up_interruptible(&mce_chrdev_wait);
  2118. -
  2119. - if (mce_helper[0])
  2120. - schedule_work(&mce_trigger_work);
  2121. -
  2122. - if (__ratelimit(&ratelimit))
  2123. - pr_info(HW_ERR "Machine check events logged\n");
  2124. -
  2125. + mce_notify_work();
  2126. return 1;
  2127. }
  2128. return 0;
  2129. @@ -1644,7 +1693,7 @@
  2130. }
  2131. }
  2132. -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
  2133. +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
  2134. {
  2135. unsigned long iv = check_interval * HZ;
  2136. @@ -1653,16 +1702,17 @@
  2137. per_cpu(mce_next_interval, cpu) = iv;
  2138. - t->expires = round_jiffies(jiffies + iv);
  2139. - add_timer_on(t, cpu);
  2140. + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
  2141. + 0, HRTIMER_MODE_REL_PINNED);
  2142. }
  2143. static void __mcheck_cpu_init_timer(void)
  2144. {
  2145. - struct timer_list *t = this_cpu_ptr(&mce_timer);
  2146. + struct hrtimer *t = this_cpu_ptr(&mce_timer);
  2147. unsigned int cpu = smp_processor_id();
  2148. - setup_timer(t, mce_timer_fn, cpu);
  2149. + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  2150. + t->function = mce_timer_fn;
  2151. mce_start_timer(cpu, t);
  2152. }
  2153. @@ -2339,6 +2389,8 @@
  2154. if (!mce_available(raw_cpu_ptr(&cpu_info)))
  2155. return;
  2156. + hrtimer_cancel(this_cpu_ptr(&mce_timer));
  2157. +
  2158. if (!(action & CPU_TASKS_FROZEN))
  2159. cmci_clear();
  2160. for (i = 0; i < mca_cfg.banks; i++) {
  2161. @@ -2365,6 +2417,7 @@
  2162. if (b->init)
  2163. wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
  2164. }
  2165. + __mcheck_cpu_init_timer();
  2166. }
  2167. /* Get notified when a cpu comes on/off. Be hotplug friendly. */
  2168. @@ -2372,7 +2425,6 @@
  2169. mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
  2170. {
  2171. unsigned int cpu = (unsigned long)hcpu;
  2172. - struct timer_list *t = &per_cpu(mce_timer, cpu);
  2173. switch (action & ~CPU_TASKS_FROZEN) {
  2174. case CPU_ONLINE:
  2175. @@ -2392,11 +2444,9 @@
  2176. break;
  2177. case CPU_DOWN_PREPARE:
  2178. smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
  2179. - del_timer_sync(t);
  2180. break;
  2181. case CPU_DOWN_FAILED:
  2182. smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
  2183. - mce_start_timer(cpu, t);
  2184. break;
  2185. }
  2186. @@ -2435,6 +2485,10 @@
  2187. goto err_out;
  2188. }
  2189. + err = mce_notify_work_init();
  2190. + if (err)
  2191. + goto err_out;
  2192. +
  2193. if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
  2194. err = -ENOMEM;
  2195. goto err_out;
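The mce.c changes replace the per-CPU timer_list poller with a self-rearming hrtimer and, on RT, push the user-notification work out of mce_notify_irq() into the simple-work (swork) helper, since on RT the wakeup path can involve sleeping locks that must not be taken from that context. A minimal sketch of the self-rearming hrtimer pattern the new mce_timer_fn() follows; poll_fn and the one-second period are illustrative:

        static enum hrtimer_restart poll_fn(struct hrtimer *t)
        {
                /* ... do the periodic check ... */
                hrtimer_forward_now(t, ns_to_ktime(NSEC_PER_SEC));
                return HRTIMER_RESTART;         /* keep the timer running */
        }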
  2196. diff -Nur linux-3.18.12.orig/arch/x86/kernel/entry_32.S linux-3.18.12/arch/x86/kernel/entry_32.S
  2197. --- linux-3.18.12.orig/arch/x86/kernel/entry_32.S 2015-04-20 14:48:02.000000000 -0500
  2198. +++ linux-3.18.12/arch/x86/kernel/entry_32.S 2015-04-26 13:32:22.379684003 -0500
  2199. @@ -359,8 +359,24 @@
  2200. ENTRY(resume_kernel)
  2201. DISABLE_INTERRUPTS(CLBR_ANY)
  2202. need_resched:
2203. + # preempt count == 0 and NEED_RESCHED set?
  2204. cmpl $0,PER_CPU_VAR(__preempt_count)
  2205. +#ifndef CONFIG_PREEMPT_LAZY
  2206. jnz restore_all
  2207. +#else
  2208. + jz test_int_off
  2209. +
2210. + # at least preempt count == 0 ?
  2211. + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
  2212. + jne restore_all
  2213. +
  2214. + cmpl $0,TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ?
  2215. + jnz restore_all
  2216. +
  2217. + testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
  2218. + jz restore_all
  2219. +test_int_off:
  2220. +#endif
  2221. testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
  2222. jz restore_all
  2223. call preempt_schedule_irq
  2224. @@ -591,7 +607,7 @@
  2225. ALIGN
  2226. RING0_PTREGS_FRAME # can't unwind into user space anyway
  2227. work_pending:
  2228. - testb $_TIF_NEED_RESCHED, %cl
  2229. + testl $_TIF_NEED_RESCHED_MASK, %ecx
  2230. jz work_notifysig
  2231. work_resched:
  2232. call schedule
  2233. @@ -604,7 +620,7 @@
  2234. andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
  2235. # than syscall tracing?
  2236. jz restore_all
  2237. - testb $_TIF_NEED_RESCHED, %cl
  2238. + testl $_TIF_NEED_RESCHED_MASK, %ecx
  2239. jnz work_resched
  2240. work_notifysig: # deal with pending signals and
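For readers who do not want to parse the assembly, roughly the decision the lazy branch above adds to resume_kernel before calling preempt_schedule_irq(); this is a hand-written C paraphrase, not code from the patch, and irqs_were_enabled_in_interrupted_context is an informal placeholder for the EFLAGS test:

        if (preempt_count() == 0 &&
            (need_resched() ||
             (current_thread_info()->preempt_lazy_count == 0 &&
              test_thread_flag(TIF_NEED_RESCHED_LAZY))) &&
            irqs_were_enabled_in_interrupted_context)
                preempt_schedule_irq();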
  2241. diff -Nur linux-3.18.12.orig/arch/x86/kernel/entry_64.S linux-3.18.12/arch/x86/kernel/entry_64.S
  2242. --- linux-3.18.12.orig/arch/x86/kernel/entry_64.S 2015-04-20 14:48:02.000000000 -0500
  2243. +++ linux-3.18.12/arch/x86/kernel/entry_64.S 2015-04-26 13:32:22.379684003 -0500
  2244. @@ -454,8 +454,8 @@
  2245. /* Handle reschedules */
  2246. /* edx: work, edi: workmask */
  2247. sysret_careful:
  2248. - bt $TIF_NEED_RESCHED,%edx
  2249. - jnc sysret_signal
  2250. + testl $_TIF_NEED_RESCHED_MASK,%edx
  2251. + jz sysret_signal
  2252. TRACE_IRQS_ON
  2253. ENABLE_INTERRUPTS(CLBR_NONE)
  2254. pushq_cfi %rdi
  2255. @@ -554,8 +554,8 @@
  2256. /* First do a reschedule test. */
  2257. /* edx: work, edi: workmask */
  2258. int_careful:
  2259. - bt $TIF_NEED_RESCHED,%edx
  2260. - jnc int_very_careful
  2261. + testl $_TIF_NEED_RESCHED_MASK,%edx
  2262. + jz int_very_careful
  2263. TRACE_IRQS_ON
  2264. ENABLE_INTERRUPTS(CLBR_NONE)
  2265. pushq_cfi %rdi
  2266. @@ -870,8 +870,8 @@
  2267. /* edi: workmask, edx: work */
  2268. retint_careful:
  2269. CFI_RESTORE_STATE
  2270. - bt $TIF_NEED_RESCHED,%edx
  2271. - jnc retint_signal
  2272. + testl $_TIF_NEED_RESCHED_MASK,%edx
  2273. + jz retint_signal
  2274. TRACE_IRQS_ON
  2275. ENABLE_INTERRUPTS(CLBR_NONE)
  2276. pushq_cfi %rdi
  2277. @@ -903,7 +903,22 @@
  2278. /* rcx: threadinfo. interrupts off. */
  2279. ENTRY(retint_kernel)
  2280. cmpl $0,PER_CPU_VAR(__preempt_count)
  2281. +#ifndef CONFIG_PREEMPT_LAZY
  2282. jnz retint_restore_args
  2283. +#else
  2284. + jz check_int_off
  2285. +
2286. + # at least preempt count == 0 ?
  2287. + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
  2288. + jnz retint_restore_args
  2289. +
  2290. + cmpl $0, TI_preempt_lazy_count(%rcx)
  2291. + jnz retint_restore_args
  2292. +
  2293. + bt $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
  2294. + jnc retint_restore_args
  2295. +check_int_off:
  2296. +#endif
  2297. bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
  2298. jnc retint_restore_args
  2299. call preempt_schedule_irq
  2300. @@ -1119,6 +1134,7 @@
  2301. jmp 2b
  2302. .previous
  2303. +#ifndef CONFIG_PREEMPT_RT_FULL
  2304. /* Call softirq on interrupt stack. Interrupts are off. */
  2305. ENTRY(do_softirq_own_stack)
  2306. CFI_STARTPROC
  2307. @@ -1138,6 +1154,7 @@
  2308. ret
  2309. CFI_ENDPROC
  2310. END(do_softirq_own_stack)
  2311. +#endif
  2312. #ifdef CONFIG_XEN
  2313. idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  2314. @@ -1302,7 +1319,7 @@
  2315. movq %rsp,%rdi /* &pt_regs */
  2316. call sync_regs
  2317. movq %rax,%rsp /* switch stack for scheduling */
  2318. - testl $_TIF_NEED_RESCHED,%ebx
  2319. + testl $_TIF_NEED_RESCHED_MASK,%ebx
  2320. jnz paranoid_schedule
  2321. movl %ebx,%edx /* arg3: thread flags */
  2322. TRACE_IRQS_ON
  2323. diff -Nur linux-3.18.12.orig/arch/x86/kernel/irq_32.c linux-3.18.12/arch/x86/kernel/irq_32.c
  2324. --- linux-3.18.12.orig/arch/x86/kernel/irq_32.c 2015-04-20 14:48:02.000000000 -0500
  2325. +++ linux-3.18.12/arch/x86/kernel/irq_32.c 2015-04-26 13:32:22.379684003 -0500
  2326. @@ -142,6 +142,7 @@
  2327. cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
  2328. }
  2329. +#ifndef CONFIG_PREEMPT_RT_FULL
  2330. void do_softirq_own_stack(void)
  2331. {
  2332. struct thread_info *curstk;
  2333. @@ -160,6 +161,7 @@
  2334. call_on_stack(__do_softirq, isp);
  2335. }
  2336. +#endif
  2337. bool handle_irq(unsigned irq, struct pt_regs *regs)
  2338. {
  2339. diff -Nur linux-3.18.12.orig/arch/x86/kernel/process_32.c linux-3.18.12/arch/x86/kernel/process_32.c
  2340. --- linux-3.18.12.orig/arch/x86/kernel/process_32.c 2015-04-20 14:48:02.000000000 -0500
  2341. +++ linux-3.18.12/arch/x86/kernel/process_32.c 2015-04-26 13:32:22.379684003 -0500
  2342. @@ -35,6 +35,7 @@
  2343. #include <linux/uaccess.h>
  2344. #include <linux/io.h>
  2345. #include <linux/kdebug.h>
  2346. +#include <linux/highmem.h>
  2347. #include <asm/pgtable.h>
  2348. #include <asm/ldt.h>
  2349. @@ -214,6 +215,35 @@
  2350. }
  2351. EXPORT_SYMBOL_GPL(start_thread);
  2352. +#ifdef CONFIG_PREEMPT_RT_FULL
  2353. +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
  2354. +{
  2355. + int i;
  2356. +
  2357. + /*
  2358. + * Clear @prev's kmap_atomic mappings
  2359. + */
  2360. + for (i = 0; i < prev_p->kmap_idx; i++) {
  2361. + int idx = i + KM_TYPE_NR * smp_processor_id();
  2362. + pte_t *ptep = kmap_pte - idx;
  2363. +
  2364. + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
  2365. + }
  2366. + /*
  2367. + * Restore @next_p's kmap_atomic mappings
  2368. + */
  2369. + for (i = 0; i < next_p->kmap_idx; i++) {
  2370. + int idx = i + KM_TYPE_NR * smp_processor_id();
  2371. +
  2372. + if (!pte_none(next_p->kmap_pte[i]))
  2373. + set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
  2374. + }
  2375. +}
  2376. +#else
  2377. +static inline void
  2378. +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
  2379. +#endif
  2380. +
  2381. /*
  2382. * switch_to(x,y) should switch tasks from x to y.
  2383. @@ -301,6 +331,8 @@
  2384. task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
  2385. __switch_to_xtra(prev_p, next_p, tss);
  2386. + switch_kmaps(prev_p, next_p);
  2387. +
  2388. /*
  2389. * Leave lazy mode, flushing any hypercalls made here.
  2390. * This must be done before restoring TLS segments so
  2391. diff -Nur linux-3.18.12.orig/arch/x86/kernel/signal.c linux-3.18.12/arch/x86/kernel/signal.c
  2392. --- linux-3.18.12.orig/arch/x86/kernel/signal.c 2015-04-20 14:48:02.000000000 -0500
  2393. +++ linux-3.18.12/arch/x86/kernel/signal.c 2015-04-26 13:32:22.379684003 -0500
  2394. @@ -746,6 +746,14 @@
  2395. mce_notify_process();
  2396. #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
  2397. +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
  2398. + if (unlikely(current->forced_info.si_signo)) {
  2399. + struct task_struct *t = current;
  2400. + force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
  2401. + t->forced_info.si_signo = 0;
  2402. + }
  2403. +#endif
  2404. +
  2405. if (thread_info_flags & _TIF_UPROBE)
  2406. uprobe_notify_resume(regs);
  2407. diff -Nur linux-3.18.12.orig/arch/x86/kernel/traps.c linux-3.18.12/arch/x86/kernel/traps.c
  2408. --- linux-3.18.12.orig/arch/x86/kernel/traps.c 2015-04-20 14:48:02.000000000 -0500
  2409. +++ linux-3.18.12/arch/x86/kernel/traps.c 2015-04-26 13:32:22.379684003 -0500
  2410. @@ -87,9 +87,21 @@
  2411. local_irq_enable();
  2412. }
  2413. -static inline void preempt_conditional_sti(struct pt_regs *regs)
  2414. +static inline void conditional_sti_ist(struct pt_regs *regs)
  2415. {
  2416. +#ifdef CONFIG_X86_64
  2417. + /*
  2418. + * X86_64 uses a per CPU stack on the IST for certain traps
2419. + * like int3. The task must not be preempted while it is using one
2420. + * of these stacks, so preemption has to stay disabled; otherwise
2421. + * the stack can be corrupted if the task is scheduled out
2422. + * and another task comes in and uses the same stack.
  2423. + *
  2424. + * On x86_32 the task keeps its own stack and it is OK if the
  2425. + * task schedules out.
  2426. + */
  2427. preempt_count_inc();
  2428. +#endif
  2429. if (regs->flags & X86_EFLAGS_IF)
  2430. local_irq_enable();
  2431. }
  2432. @@ -100,11 +112,13 @@
  2433. local_irq_disable();
  2434. }
  2435. -static inline void preempt_conditional_cli(struct pt_regs *regs)
  2436. +static inline void conditional_cli_ist(struct pt_regs *regs)
  2437. {
  2438. if (regs->flags & X86_EFLAGS_IF)
  2439. local_irq_disable();
  2440. +#ifdef CONFIG_X86_64
  2441. preempt_count_dec();
  2442. +#endif
  2443. }
  2444. static nokprobe_inline int
  2445. @@ -372,9 +386,9 @@
  2446. * as we may switch to the interrupt stack.
  2447. */
  2448. debug_stack_usage_inc();
  2449. - preempt_conditional_sti(regs);
  2450. + conditional_sti_ist(regs);
  2451. do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
  2452. - preempt_conditional_cli(regs);
  2453. + conditional_cli_ist(regs);
  2454. debug_stack_usage_dec();
  2455. exit:
  2456. exception_exit(prev_state);
  2457. @@ -517,12 +531,12 @@
  2458. debug_stack_usage_inc();
  2459. /* It's safe to allow irq's after DR6 has been saved */
  2460. - preempt_conditional_sti(regs);
  2461. + conditional_sti_ist(regs);
  2462. if (regs->flags & X86_VM_MASK) {
  2463. handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
  2464. X86_TRAP_DB);
  2465. - preempt_conditional_cli(regs);
  2466. + conditional_cli_ist(regs);
  2467. debug_stack_usage_dec();
  2468. goto exit;
  2469. }
  2470. @@ -542,7 +556,7 @@
  2471. si_code = get_si_code(tsk->thread.debugreg6);
  2472. if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
  2473. send_sigtrap(tsk, regs, error_code, si_code);
  2474. - preempt_conditional_cli(regs);
  2475. + conditional_cli_ist(regs);
  2476. debug_stack_usage_dec();
  2477. exit:
  2478. diff -Nur linux-3.18.12.orig/arch/x86/kvm/lapic.c linux-3.18.12/arch/x86/kvm/lapic.c
  2479. --- linux-3.18.12.orig/arch/x86/kvm/lapic.c 2015-04-20 14:48:02.000000000 -0500
  2480. +++ linux-3.18.12/arch/x86/kvm/lapic.c 2015-04-26 13:32:22.379684003 -0500
  2481. @@ -1034,8 +1034,38 @@
  2482. apic->divide_count);
  2483. }
  2484. +
  2485. +static enum hrtimer_restart apic_timer_fn(struct hrtimer *data);
  2486. +
  2487. +static void apic_timer_expired(struct hrtimer *data)
  2488. +{
  2489. + int ret, i = 0;
  2490. + enum hrtimer_restart r;
  2491. + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
  2492. +
  2493. + r = apic_timer_fn(data);
  2494. +
  2495. + if (r == HRTIMER_RESTART) {
  2496. + do {
  2497. + ret = hrtimer_start_expires(data, HRTIMER_MODE_ABS);
  2498. + if (ret == -ETIME)
  2499. + hrtimer_add_expires_ns(&ktimer->timer,
  2500. + ktimer->period);
  2501. + i++;
  2502. + } while (ret == -ETIME && i < 10);
  2503. +
  2504. + if (ret == -ETIME) {
  2505. + printk_once(KERN_ERR "%s: failed to reprogram timer\n",
  2506. + __func__);
  2507. + WARN_ON_ONCE(1);
  2508. + }
  2509. + }
  2510. +}
  2511. +
  2512. +
  2513. static void start_apic_timer(struct kvm_lapic *apic)
  2514. {
  2515. + int ret;
  2516. ktime_t now;
  2517. atomic_set(&apic->lapic_timer.pending, 0);
  2518. @@ -1065,9 +1095,11 @@
  2519. }
  2520. }
  2521. - hrtimer_start(&apic->lapic_timer.timer,
  2522. + ret = hrtimer_start(&apic->lapic_timer.timer,
  2523. ktime_add_ns(now, apic->lapic_timer.period),
  2524. HRTIMER_MODE_ABS);
  2525. + if (ret == -ETIME)
  2526. + apic_timer_expired(&apic->lapic_timer.timer);
  2527. apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
  2528. PRIx64 ", "
  2529. @@ -1097,8 +1129,10 @@
  2530. ns = (tscdeadline - guest_tsc) * 1000000ULL;
  2531. do_div(ns, this_tsc_khz);
  2532. }
  2533. - hrtimer_start(&apic->lapic_timer.timer,
  2534. + ret = hrtimer_start(&apic->lapic_timer.timer,
  2535. ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
  2536. + if (ret == -ETIME)
  2537. + apic_timer_expired(&apic->lapic_timer.timer);
  2538. local_irq_restore(flags);
  2539. }
  2540. @@ -1539,7 +1573,7 @@
  2541. struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
  2542. struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
  2543. struct kvm_vcpu *vcpu = apic->vcpu;
  2544. - wait_queue_head_t *q = &vcpu->wq;
  2545. + struct swait_head *q = &vcpu->wq;
  2546. /*
  2547. * There is a race window between reading and incrementing, but we do
  2548. @@ -1553,8 +1587,8 @@
  2549. kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
  2550. }
  2551. - if (waitqueue_active(q))
  2552. - wake_up_interruptible(q);
  2553. + if (swaitqueue_active(q))
  2554. + swait_wake_interruptible(q);
  2555. if (lapic_is_periodic(apic)) {
  2556. hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
  2557. @@ -1587,6 +1621,7 @@
  2558. hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
  2559. HRTIMER_MODE_ABS);
  2560. apic->lapic_timer.timer.function = apic_timer_fn;
  2561. + apic->lapic_timer.timer.irqsafe = 1;
  2562. /*
  2563. * APIC is created enabled. This will prevent kvm_lapic_set_base from
  2564. @@ -1707,7 +1742,8 @@
  2565. timer = &vcpu->arch.apic->lapic_timer.timer;
  2566. if (hrtimer_cancel(timer))
  2567. - hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  2568. + if (hrtimer_start_expires(timer, HRTIMER_MODE_ABS) == -ETIME)
  2569. + apic_timer_expired(timer);
  2570. }
  2571. /*
  2572. diff -Nur linux-3.18.12.orig/arch/x86/kvm/x86.c linux-3.18.12/arch/x86/kvm/x86.c
  2573. --- linux-3.18.12.orig/arch/x86/kvm/x86.c 2015-04-20 14:48:02.000000000 -0500
  2574. +++ linux-3.18.12/arch/x86/kvm/x86.c 2015-04-26 13:32:22.383684003 -0500
  2575. @@ -5772,6 +5772,13 @@
  2576. goto out;
  2577. }
  2578. +#ifdef CONFIG_PREEMPT_RT_FULL
  2579. + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
  2580. + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
  2581. + return -EOPNOTSUPP;
  2582. + }
  2583. +#endif
  2584. +
  2585. r = kvm_mmu_module_init();
  2586. if (r)
  2587. goto out_free_percpu;
  2588. diff -Nur linux-3.18.12.orig/arch/x86/mm/fault.c linux-3.18.12/arch/x86/mm/fault.c
  2589. --- linux-3.18.12.orig/arch/x86/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  2590. +++ linux-3.18.12/arch/x86/mm/fault.c 2015-04-26 13:32:22.383684003 -0500
  2591. @@ -1128,7 +1128,7 @@
  2592. * If we're in an interrupt, have no user context or are running
  2593. * in an atomic region then we must not take the fault:
  2594. */
  2595. - if (unlikely(in_atomic() || !mm)) {
  2596. + if (unlikely(!mm || pagefault_disabled())) {
  2597. bad_area_nosemaphore(regs, error_code, address);
  2598. return;
  2599. }
  2600. diff -Nur linux-3.18.12.orig/arch/x86/mm/highmem_32.c linux-3.18.12/arch/x86/mm/highmem_32.c
  2601. --- linux-3.18.12.orig/arch/x86/mm/highmem_32.c 2015-04-20 14:48:02.000000000 -0500
  2602. +++ linux-3.18.12/arch/x86/mm/highmem_32.c 2015-04-26 13:32:22.383684003 -0500
  2603. @@ -32,6 +32,7 @@
  2604. */
  2605. void *kmap_atomic_prot(struct page *page, pgprot_t prot)
  2606. {
  2607. + pte_t pte = mk_pte(page, prot);
  2608. unsigned long vaddr;
  2609. int idx, type;
  2610. @@ -45,7 +46,10 @@
  2611. idx = type + KM_TYPE_NR*smp_processor_id();
  2612. vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
  2613. BUG_ON(!pte_none(*(kmap_pte-idx)));
  2614. - set_pte(kmap_pte-idx, mk_pte(page, prot));
  2615. +#ifdef CONFIG_PREEMPT_RT_FULL
  2616. + current->kmap_pte[type] = pte;
  2617. +#endif
  2618. + set_pte(kmap_pte-idx, pte);
  2619. arch_flush_lazy_mmu_mode();
  2620. return (void *)vaddr;
  2621. @@ -88,6 +92,9 @@
  2622. * is a bad idea also, in case the page changes cacheability
  2623. * attributes or becomes a protected page in a hypervisor.
  2624. */
  2625. +#ifdef CONFIG_PREEMPT_RT_FULL
  2626. + current->kmap_pte[type] = __pte(0);
  2627. +#endif
  2628. kpte_clear_flush(kmap_pte-idx, vaddr);
  2629. kmap_atomic_idx_pop();
  2630. arch_flush_lazy_mmu_mode();
  2631. diff -Nur linux-3.18.12.orig/arch/x86/mm/iomap_32.c linux-3.18.12/arch/x86/mm/iomap_32.c
  2632. --- linux-3.18.12.orig/arch/x86/mm/iomap_32.c 2015-04-20 14:48:02.000000000 -0500
  2633. +++ linux-3.18.12/arch/x86/mm/iomap_32.c 2015-04-26 13:32:22.383684003 -0500
  2634. @@ -56,6 +56,7 @@
  2635. void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
  2636. {
  2637. + pte_t pte = pfn_pte(pfn, prot);
  2638. unsigned long vaddr;
  2639. int idx, type;
  2640. @@ -64,7 +65,12 @@
  2641. type = kmap_atomic_idx_push();
  2642. idx = type + KM_TYPE_NR * smp_processor_id();
  2643. vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
  2644. - set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
  2645. + WARN_ON(!pte_none(*(kmap_pte - idx)));
  2646. +
  2647. +#ifdef CONFIG_PREEMPT_RT_FULL
  2648. + current->kmap_pte[type] = pte;
  2649. +#endif
  2650. + set_pte(kmap_pte - idx, pte);
  2651. arch_flush_lazy_mmu_mode();
  2652. return (void *)vaddr;
  2653. @@ -110,6 +116,9 @@
  2654. * is a bad idea also, in case the page changes cacheability
  2655. * attributes or becomes a protected page in a hypervisor.
  2656. */
  2657. +#ifdef CONFIG_PREEMPT_RT_FULL
  2658. + current->kmap_pte[type] = __pte(0);
  2659. +#endif
  2660. kpte_clear_flush(kmap_pte-idx, vaddr);
  2661. kmap_atomic_idx_pop();
  2662. }
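The highmem_32.c and iomap_32.c hunks record every atomic kmap pte in current->kmap_pte[] (and clear the slot on unmap) so that the switch_kmaps() helper added in process_32.c above can tear the mappings down and re-establish them across a context switch, which becomes possible once kmap_atomic sections are preemptible on RT. Callers are unchanged; a minimal illustrative use, where page, src and len are assumed to exist:

        void *va = kmap_atomic(page);   /* short-lived highmem mapping */
        memcpy(va, src, len);
        kunmap_atomic(va);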
  2663. diff -Nur linux-3.18.12.orig/arch/x86/platform/uv/tlb_uv.c linux-3.18.12/arch/x86/platform/uv/tlb_uv.c
  2664. --- linux-3.18.12.orig/arch/x86/platform/uv/tlb_uv.c 2015-04-20 14:48:02.000000000 -0500
  2665. +++ linux-3.18.12/arch/x86/platform/uv/tlb_uv.c 2015-04-26 13:32:22.383684003 -0500
  2666. @@ -714,9 +714,9 @@
  2667. quiesce_local_uvhub(hmaster);
  2668. - spin_lock(&hmaster->queue_lock);
  2669. + raw_spin_lock(&hmaster->queue_lock);
  2670. reset_with_ipi(&bau_desc->distribution, bcp);
  2671. - spin_unlock(&hmaster->queue_lock);
  2672. + raw_spin_unlock(&hmaster->queue_lock);
  2673. end_uvhub_quiesce(hmaster);
  2674. @@ -736,9 +736,9 @@
  2675. quiesce_local_uvhub(hmaster);
  2676. - spin_lock(&hmaster->queue_lock);
  2677. + raw_spin_lock(&hmaster->queue_lock);
  2678. reset_with_ipi(&bau_desc->distribution, bcp);
  2679. - spin_unlock(&hmaster->queue_lock);
  2680. + raw_spin_unlock(&hmaster->queue_lock);
  2681. end_uvhub_quiesce(hmaster);
  2682. @@ -759,7 +759,7 @@
  2683. cycles_t tm1;
  2684. hmaster = bcp->uvhub_master;
  2685. - spin_lock(&hmaster->disable_lock);
  2686. + raw_spin_lock(&hmaster->disable_lock);
  2687. if (!bcp->baudisabled) {
  2688. stat->s_bau_disabled++;
  2689. tm1 = get_cycles();
  2690. @@ -772,7 +772,7 @@
  2691. }
  2692. }
  2693. }
  2694. - spin_unlock(&hmaster->disable_lock);
  2695. + raw_spin_unlock(&hmaster->disable_lock);
  2696. }
  2697. static void count_max_concurr(int stat, struct bau_control *bcp,
  2698. @@ -835,7 +835,7 @@
  2699. */
  2700. static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
  2701. {
  2702. - spinlock_t *lock = &hmaster->uvhub_lock;
  2703. + raw_spinlock_t *lock = &hmaster->uvhub_lock;
  2704. atomic_t *v;
  2705. v = &hmaster->active_descriptor_count;
  2706. @@ -968,7 +968,7 @@
  2707. struct bau_control *hmaster;
  2708. hmaster = bcp->uvhub_master;
  2709. - spin_lock(&hmaster->disable_lock);
  2710. + raw_spin_lock(&hmaster->disable_lock);
  2711. if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
  2712. stat->s_bau_reenabled++;
  2713. for_each_present_cpu(tcpu) {
  2714. @@ -980,10 +980,10 @@
  2715. tbcp->period_giveups = 0;
  2716. }
  2717. }
  2718. - spin_unlock(&hmaster->disable_lock);
  2719. + raw_spin_unlock(&hmaster->disable_lock);
  2720. return 0;
  2721. }
  2722. - spin_unlock(&hmaster->disable_lock);
  2723. + raw_spin_unlock(&hmaster->disable_lock);
  2724. return -1;
  2725. }
  2726. @@ -1899,9 +1899,9 @@
  2727. bcp->cong_reps = congested_reps;
  2728. bcp->disabled_period = sec_2_cycles(disabled_period);
  2729. bcp->giveup_limit = giveup_limit;
  2730. - spin_lock_init(&bcp->queue_lock);
  2731. - spin_lock_init(&bcp->uvhub_lock);
  2732. - spin_lock_init(&bcp->disable_lock);
  2733. + raw_spin_lock_init(&bcp->queue_lock);
  2734. + raw_spin_lock_init(&bcp->uvhub_lock);
  2735. + raw_spin_lock_init(&bcp->disable_lock);
  2736. }
  2737. }
  2738. diff -Nur linux-3.18.12.orig/arch/x86/platform/uv/uv_time.c linux-3.18.12/arch/x86/platform/uv/uv_time.c
  2739. --- linux-3.18.12.orig/arch/x86/platform/uv/uv_time.c 2015-04-20 14:48:02.000000000 -0500
  2740. +++ linux-3.18.12/arch/x86/platform/uv/uv_time.c 2015-04-26 13:32:22.383684003 -0500
  2741. @@ -58,7 +58,7 @@
  2742. /* There is one of these allocated per node */
  2743. struct uv_rtc_timer_head {
  2744. - spinlock_t lock;
  2745. + raw_spinlock_t lock;
  2746. /* next cpu waiting for timer, local node relative: */
  2747. int next_cpu;
  2748. /* number of cpus on this node: */
  2749. @@ -178,7 +178,7 @@
  2750. uv_rtc_deallocate_timers();
  2751. return -ENOMEM;
  2752. }
  2753. - spin_lock_init(&head->lock);
  2754. + raw_spin_lock_init(&head->lock);
  2755. head->ncpus = uv_blade_nr_possible_cpus(bid);
  2756. head->next_cpu = -1;
  2757. blade_info[bid] = head;
  2758. @@ -232,7 +232,7 @@
  2759. unsigned long flags;
  2760. int next_cpu;
  2761. - spin_lock_irqsave(&head->lock, flags);
  2762. + raw_spin_lock_irqsave(&head->lock, flags);
  2763. next_cpu = head->next_cpu;
  2764. *t = expires;
  2765. @@ -244,12 +244,12 @@
  2766. if (uv_setup_intr(cpu, expires)) {
  2767. *t = ULLONG_MAX;
  2768. uv_rtc_find_next_timer(head, pnode);
  2769. - spin_unlock_irqrestore(&head->lock, flags);
  2770. + raw_spin_unlock_irqrestore(&head->lock, flags);
  2771. return -ETIME;
  2772. }
  2773. }
  2774. - spin_unlock_irqrestore(&head->lock, flags);
  2775. + raw_spin_unlock_irqrestore(&head->lock, flags);
  2776. return 0;
  2777. }
  2778. @@ -268,7 +268,7 @@
  2779. unsigned long flags;
  2780. int rc = 0;
  2781. - spin_lock_irqsave(&head->lock, flags);
  2782. + raw_spin_lock_irqsave(&head->lock, flags);
  2783. if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
  2784. rc = 1;
  2785. @@ -280,7 +280,7 @@
  2786. uv_rtc_find_next_timer(head, pnode);
  2787. }
  2788. - spin_unlock_irqrestore(&head->lock, flags);
  2789. + raw_spin_unlock_irqrestore(&head->lock, flags);
  2790. return rc;
  2791. }
  2792. @@ -300,13 +300,18 @@
  2793. static cycle_t uv_read_rtc(struct clocksource *cs)
  2794. {
  2795. unsigned long offset;
  2796. + cycle_t cycles;
  2797. + preempt_disable();
  2798. if (uv_get_min_hub_revision_id() == 1)
  2799. offset = 0;
  2800. else
  2801. offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
  2802. - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
  2803. + cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
  2804. + preempt_enable();
  2805. +
  2806. + return cycles;
  2807. }
  2808. /*
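The uv_read_rtc() hunk above pins preemption because the MMR offset is derived from the current CPU. A generic sketch of that pattern (the helper name is hypothetical; uv_read_local_mmr() is the accessor used above):

    static unsigned long read_local_mmr_stable(unsigned long reg)
    {
            unsigned long val;

            preempt_disable();              /* the CPU may not change under us */
            val = uv_read_local_mmr(reg);   /* offset depends on this CPU */
            preempt_enable();

            return val;
    }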
  2809. diff -Nur linux-3.18.12.orig/arch/xtensa/mm/fault.c linux-3.18.12/arch/xtensa/mm/fault.c
  2810. --- linux-3.18.12.orig/arch/xtensa/mm/fault.c 2015-04-20 14:48:02.000000000 -0500
  2811. +++ linux-3.18.12/arch/xtensa/mm/fault.c 2015-04-26 13:32:22.383684003 -0500
  2812. @@ -57,7 +57,7 @@
  2813. /* If we're in an interrupt or have no user
  2814. * context, we must not take the fault..
  2815. */
  2816. - if (in_atomic() || !mm) {
  2817. + if (!mm || pagefault_disabled()) {
  2818. bad_page_fault(regs, address, SIGSEGV);
  2819. return;
  2820. }
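The in_atomic() check is replaced because, with preemptible fault paths on RT, the preempt count no longer tells whether page faults are actually forbidden; the patch tracks that with an explicit per-task counter instead. Roughly how the helper used above is defined elsewhere in the patch (treat the exact form as an assumption):

    static inline bool pagefault_disabled(void)
    {
            /* set/cleared by pagefault_disable()/pagefault_enable() */
            return current->pagefault_disabled != 0;
    }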
  2821. diff -Nur linux-3.18.12.orig/block/blk-core.c linux-3.18.12/block/blk-core.c
  2822. --- linux-3.18.12.orig/block/blk-core.c 2015-04-20 14:48:02.000000000 -0500
  2823. +++ linux-3.18.12/block/blk-core.c 2015-04-26 13:32:22.383684003 -0500
  2824. @@ -100,6 +100,9 @@
  2825. INIT_LIST_HEAD(&rq->queuelist);
  2826. INIT_LIST_HEAD(&rq->timeout_list);
2827. +#ifdef CONFIG_PREEMPT_RT_FULL
  2828. + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
  2829. +#endif
  2830. rq->cpu = -1;
  2831. rq->q = q;
  2832. rq->__sector = (sector_t) -1;
  2833. @@ -194,7 +197,7 @@
  2834. **/
  2835. void blk_start_queue(struct request_queue *q)
  2836. {
  2837. - WARN_ON(!irqs_disabled());
  2838. + WARN_ON_NONRT(!irqs_disabled());
  2839. queue_flag_clear(QUEUE_FLAG_STOPPED, q);
  2840. __blk_run_queue(q);
  2841. @@ -627,7 +630,7 @@
  2842. q->bypass_depth = 1;
  2843. __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
  2844. - init_waitqueue_head(&q->mq_freeze_wq);
  2845. + init_swait_head(&q->mq_freeze_wq);
  2846. if (blkcg_init_queue(q))
  2847. goto fail_bdi;
  2848. @@ -3037,7 +3040,7 @@
  2849. blk_run_queue_async(q);
  2850. else
  2851. __blk_run_queue(q);
  2852. - spin_unlock(q->queue_lock);
  2853. + spin_unlock_irq(q->queue_lock);
  2854. }
  2855. static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
  2856. @@ -3085,7 +3088,6 @@
  2857. void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
  2858. {
  2859. struct request_queue *q;
  2860. - unsigned long flags;
  2861. struct request *rq;
  2862. LIST_HEAD(list);
  2863. unsigned int depth;
  2864. @@ -3105,11 +3107,6 @@
  2865. q = NULL;
  2866. depth = 0;
  2867. - /*
  2868. - * Save and disable interrupts here, to avoid doing it for every
  2869. - * queue lock we have to take.
  2870. - */
  2871. - local_irq_save(flags);
  2872. while (!list_empty(&list)) {
  2873. rq = list_entry_rq(list.next);
  2874. list_del_init(&rq->queuelist);
  2875. @@ -3122,7 +3119,7 @@
  2876. queue_unplugged(q, depth, from_schedule);
  2877. q = rq->q;
  2878. depth = 0;
  2879. - spin_lock(q->queue_lock);
  2880. + spin_lock_irq(q->queue_lock);
  2881. }
  2882. /*
  2883. @@ -3149,8 +3146,6 @@
  2884. */
  2885. if (q)
  2886. queue_unplugged(q, depth, from_schedule);
  2887. -
  2888. - local_irq_restore(flags);
  2889. }
  2890. void blk_finish_plug(struct blk_plug *plug)
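The blk_flush_plug_list() hunks drop the blanket local_irq_save() because on PREEMPT_RT q->queue_lock is a sleeping lock, which must not be taken with interrupts disabled; interrupt protection therefore travels with each lock acquisition. A hypothetical helper showing the resulting pattern (not part of the patch):

    #include <linux/blkdev.h>

    static void run_one_queue(struct request_queue *q)
    {
            spin_lock_irq(q->queue_lock);   /* disables IRQs only on !RT */
            __blk_run_queue(q);             /* requires queue_lock held */
            spin_unlock_irq(q->queue_lock);
    }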
  2891. diff -Nur linux-3.18.12.orig/block/blk-ioc.c linux-3.18.12/block/blk-ioc.c
  2892. --- linux-3.18.12.orig/block/blk-ioc.c 2015-04-20 14:48:02.000000000 -0500
  2893. +++ linux-3.18.12/block/blk-ioc.c 2015-04-26 13:32:22.383684003 -0500
  2894. @@ -7,6 +7,7 @@
  2895. #include <linux/bio.h>
  2896. #include <linux/blkdev.h>
  2897. #include <linux/slab.h>
  2898. +#include <linux/delay.h>
  2899. #include "blk.h"
  2900. @@ -109,7 +110,7 @@
  2901. spin_unlock(q->queue_lock);
  2902. } else {
  2903. spin_unlock_irqrestore(&ioc->lock, flags);
  2904. - cpu_relax();
  2905. + cpu_chill();
  2906. spin_lock_irqsave_nested(&ioc->lock, flags, 1);
  2907. }
  2908. }
  2909. @@ -187,7 +188,7 @@
  2910. spin_unlock(icq->q->queue_lock);
  2911. } else {
  2912. spin_unlock_irqrestore(&ioc->lock, flags);
  2913. - cpu_relax();
  2914. + cpu_chill();
  2915. goto retry;
  2916. }
  2917. }
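cpu_chill() replaces cpu_relax() in these retry loops so that, on RT, the retrying task sleeps briefly instead of busy-waiting and lets the (possibly preempted) lock holder make progress; that is also why the hunk adds <linux/delay.h>. Approximate shape of the helper, defined elsewhere in the patch (the msleep(1) body is an assumption about this patch generation):

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define cpu_chill()    msleep(1)       /* yield so the owner can run */
    #else
    # define cpu_chill()    cpu_relax()
    #endif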
  2918. diff -Nur linux-3.18.12.orig/block/blk-iopoll.c linux-3.18.12/block/blk-iopoll.c
  2919. --- linux-3.18.12.orig/block/blk-iopoll.c 2015-04-20 14:48:02.000000000 -0500
  2920. +++ linux-3.18.12/block/blk-iopoll.c 2015-04-26 13:32:22.383684003 -0500
  2921. @@ -35,6 +35,7 @@
  2922. list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
  2923. __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
  2924. local_irq_restore(flags);
  2925. + preempt_check_resched_rt();
  2926. }
  2927. EXPORT_SYMBOL(blk_iopoll_sched);
  2928. @@ -132,6 +133,7 @@
  2929. __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
  2930. local_irq_enable();
  2931. + preempt_check_resched_rt();
  2932. }
  2933. /**
  2934. @@ -201,6 +203,7 @@
  2935. this_cpu_ptr(&blk_cpu_iopoll));
  2936. __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
  2937. local_irq_enable();
  2938. + preempt_check_resched_rt();
  2939. }
  2940. return NOTIFY_OK;
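preempt_check_resched_rt() is added after the points where interrupts are re-enabled because, on RT, raising a softirq wakes the softirq thread and the reschedule should not be delayed; on mainline preemption models it compiles away. Approximate definition from elsewhere in the patch (assumption):

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define preempt_check_resched_rt()     preempt_check_resched()
    #else
    # define preempt_check_resched_rt()     barrier()
    #endif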
  2941. diff -Nur linux-3.18.12.orig/block/blk-mq.c linux-3.18.12/block/blk-mq.c
  2942. --- linux-3.18.12.orig/block/blk-mq.c 2015-04-20 14:48:02.000000000 -0500
  2943. +++ linux-3.18.12/block/blk-mq.c 2015-04-26 13:32:22.383684003 -0500
  2944. @@ -85,7 +85,7 @@
  2945. if (percpu_ref_tryget_live(&q->mq_usage_counter))
  2946. return 0;
  2947. - ret = wait_event_interruptible(q->mq_freeze_wq,
  2948. + ret = swait_event_interruptible(q->mq_freeze_wq,
  2949. !q->mq_freeze_depth || blk_queue_dying(q));
  2950. if (blk_queue_dying(q))
  2951. return -ENODEV;
  2952. @@ -104,7 +104,7 @@
  2953. struct request_queue *q =
  2954. container_of(ref, struct request_queue, mq_usage_counter);
  2955. - wake_up_all(&q->mq_freeze_wq);
  2956. + swait_wake_all(&q->mq_freeze_wq);
  2957. }
  2958. static void blk_mq_freeze_queue_start(struct request_queue *q)
  2959. @@ -123,7 +123,7 @@
  2960. static void blk_mq_freeze_queue_wait(struct request_queue *q)
  2961. {
  2962. - wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
  2963. + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
  2964. }
  2965. /*
  2966. @@ -146,7 +146,7 @@
  2967. spin_unlock_irq(q->queue_lock);
  2968. if (wake) {
  2969. percpu_ref_reinit(&q->mq_usage_counter);
  2970. - wake_up_all(&q->mq_freeze_wq);
  2971. + swait_wake_all(&q->mq_freeze_wq);
  2972. }
  2973. }
  2974. @@ -194,6 +194,9 @@
  2975. rq->resid_len = 0;
  2976. rq->sense = NULL;
  2977. +#ifdef CONFIG_PREEMPT_RT_FULL
  2978. + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
  2979. +#endif
  2980. INIT_LIST_HEAD(&rq->timeout_list);
  2981. rq->timeout = 0;
  2982. @@ -313,6 +316,17 @@
  2983. }
  2984. EXPORT_SYMBOL(blk_mq_end_request);
  2985. +#ifdef CONFIG_PREEMPT_RT_FULL
  2986. +
  2987. +void __blk_mq_complete_request_remote_work(struct work_struct *work)
  2988. +{
  2989. + struct request *rq = container_of(work, struct request, work);
  2990. +
  2991. + rq->q->softirq_done_fn(rq);
  2992. +}
  2993. +
  2994. +#else
  2995. +
  2996. static void __blk_mq_complete_request_remote(void *data)
  2997. {
  2998. struct request *rq = data;
  2999. @@ -320,6 +334,8 @@
  3000. rq->q->softirq_done_fn(rq);
  3001. }
  3002. +#endif
  3003. +
  3004. static void blk_mq_ipi_complete_request(struct request *rq)
  3005. {
  3006. struct blk_mq_ctx *ctx = rq->mq_ctx;
  3007. @@ -331,19 +347,23 @@
  3008. return;
  3009. }
  3010. - cpu = get_cpu();
  3011. + cpu = get_cpu_light();
  3012. if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
  3013. shared = cpus_share_cache(cpu, ctx->cpu);
  3014. if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
  3015. +#ifdef CONFIG_PREEMPT_RT_FULL
  3016. + schedule_work_on(ctx->cpu, &rq->work);
  3017. +#else
  3018. rq->csd.func = __blk_mq_complete_request_remote;
  3019. rq->csd.info = rq;
  3020. rq->csd.flags = 0;
  3021. smp_call_function_single_async(ctx->cpu, &rq->csd);
  3022. +#endif
  3023. } else {
  3024. rq->q->softirq_done_fn(rq);
  3025. }
  3026. - put_cpu();
  3027. + put_cpu_light();
  3028. }
  3029. void __blk_mq_complete_request(struct request *rq)
  3030. @@ -814,9 +834,9 @@
  3031. test_bit(BLK_MQ_S_STOPPED, &hctx->state))
  3032. continue;
  3033. - preempt_disable();
  3034. + migrate_disable();
  3035. blk_mq_run_hw_queue(hctx, async);
  3036. - preempt_enable();
  3037. + migrate_enable();
  3038. }
  3039. }
  3040. EXPORT_SYMBOL(blk_mq_run_queues);
  3041. @@ -843,9 +863,9 @@
  3042. {
  3043. clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  3044. - preempt_disable();
  3045. + migrate_disable();
  3046. blk_mq_run_hw_queue(hctx, false);
  3047. - preempt_enable();
  3048. + migrate_enable();
  3049. }
  3050. EXPORT_SYMBOL(blk_mq_start_hw_queue);
  3051. @@ -870,9 +890,9 @@
  3052. continue;
  3053. clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
  3054. - preempt_disable();
  3055. + migrate_disable();
  3056. blk_mq_run_hw_queue(hctx, async);
  3057. - preempt_enable();
  3058. + migrate_enable();
  3059. }
  3060. }
  3061. EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
  3062. @@ -1494,7 +1514,7 @@
  3063. {
  3064. struct blk_mq_hw_ctx *hctx = data;
  3065. - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
  3066. + if (action == CPU_POST_DEAD)
  3067. return blk_mq_hctx_cpu_offline(hctx, cpu);
  3068. else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
  3069. return blk_mq_hctx_cpu_online(hctx, cpu);
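The mq_freeze_wq conversions swap the regular waitqueue calls for the patch's simple-wait API (init_swait_head/swait_event/swait_wake_all), whose internals use a raw lock so a wakeup can be issued from contexts that must not sleep on RT. A minimal usage sketch with a hypothetical flag and wait head (only the swait calls themselves are taken from the hunks above):

    static DEFINE_SWAIT_HEAD(demo_wq);
    static bool demo_done;

    static void demo_wait(void)
    {
            swait_event(demo_wq, demo_done);        /* sleep until the condition holds */
    }

    static void demo_complete(void)
    {
            demo_done = true;
            swait_wake_all(&demo_wq);               /* wake every waiter */
    }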
  3070. diff -Nur linux-3.18.12.orig/block/blk-mq-cpu.c linux-3.18.12/block/blk-mq-cpu.c
  3071. --- linux-3.18.12.orig/block/blk-mq-cpu.c 2015-04-20 14:48:02.000000000 -0500
  3072. +++ linux-3.18.12/block/blk-mq-cpu.c 2015-04-26 13:32:22.383684003 -0500
  3073. @@ -16,7 +16,7 @@
  3074. #include "blk-mq.h"
  3075. static LIST_HEAD(blk_mq_cpu_notify_list);
  3076. -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
  3077. +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
  3078. static int blk_mq_main_cpu_notify(struct notifier_block *self,
  3079. unsigned long action, void *hcpu)
  3080. @@ -25,7 +25,10 @@
  3081. struct blk_mq_cpu_notifier *notify;
  3082. int ret = NOTIFY_OK;
  3083. - raw_spin_lock(&blk_mq_cpu_notify_lock);
  3084. + if (action != CPU_POST_DEAD)
  3085. + return NOTIFY_OK;
  3086. +
  3087. + spin_lock(&blk_mq_cpu_notify_lock);
  3088. list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
  3089. ret = notify->notify(notify->data, action, cpu);
  3090. @@ -33,7 +36,7 @@
  3091. break;
  3092. }
  3093. - raw_spin_unlock(&blk_mq_cpu_notify_lock);
  3094. + spin_unlock(&blk_mq_cpu_notify_lock);
  3095. return ret;
  3096. }
  3097. @@ -41,16 +44,16 @@
  3098. {
  3099. BUG_ON(!notifier->notify);
  3100. - raw_spin_lock(&blk_mq_cpu_notify_lock);
  3101. + spin_lock(&blk_mq_cpu_notify_lock);
  3102. list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
  3103. - raw_spin_unlock(&blk_mq_cpu_notify_lock);
  3104. + spin_unlock(&blk_mq_cpu_notify_lock);
  3105. }
  3106. void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
  3107. {
  3108. - raw_spin_lock(&blk_mq_cpu_notify_lock);
  3109. + spin_lock(&blk_mq_cpu_notify_lock);
  3110. list_del(&notifier->list);
  3111. - raw_spin_unlock(&blk_mq_cpu_notify_lock);
  3112. + spin_unlock(&blk_mq_cpu_notify_lock);
  3113. }
  3114. void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
  3115. diff -Nur linux-3.18.12.orig/block/blk-mq.h linux-3.18.12/block/blk-mq.h
  3116. --- linux-3.18.12.orig/block/blk-mq.h 2015-04-20 14:48:02.000000000 -0500
  3117. +++ linux-3.18.12/block/blk-mq.h 2015-04-26 13:32:22.383684003 -0500
  3118. @@ -73,7 +73,10 @@
  3119. static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
  3120. unsigned int cpu)
  3121. {
  3122. - return per_cpu_ptr(q->queue_ctx, cpu);
  3123. + struct blk_mq_ctx *ctx;
  3124. +
  3125. + ctx = per_cpu_ptr(q->queue_ctx, cpu);
  3126. + return ctx;
  3127. }
  3128. /*
  3129. @@ -84,12 +87,12 @@
  3130. */
  3131. static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
  3132. {
  3133. - return __blk_mq_get_ctx(q, get_cpu());
  3134. + return __blk_mq_get_ctx(q, get_cpu_light());
  3135. }
  3136. static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
  3137. {
  3138. - put_cpu();
  3139. + put_cpu_light();
  3140. }
  3141. struct blk_mq_alloc_data {
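get_cpu_light()/put_cpu_light() keep the caller on a stable CPU without making the section non-preemptible on RT. Approximate definitions from elsewhere in the patch (exact form is an assumption):

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
    # define put_cpu_light()        migrate_enable()
    #else
    # define get_cpu_light()        get_cpu()
    # define put_cpu_light()        put_cpu()
    #endif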
  3142. diff -Nur linux-3.18.12.orig/block/blk-softirq.c linux-3.18.12/block/blk-softirq.c
  3143. --- linux-3.18.12.orig/block/blk-softirq.c 2015-04-20 14:48:02.000000000 -0500
  3144. +++ linux-3.18.12/block/blk-softirq.c 2015-04-26 13:32:22.387684003 -0500
  3145. @@ -51,6 +51,7 @@
  3146. raise_softirq_irqoff(BLOCK_SOFTIRQ);
  3147. local_irq_restore(flags);
  3148. + preempt_check_resched_rt();
  3149. }
  3150. /*
  3151. @@ -93,6 +94,7 @@
  3152. this_cpu_ptr(&blk_cpu_done));
  3153. raise_softirq_irqoff(BLOCK_SOFTIRQ);
  3154. local_irq_enable();
  3155. + preempt_check_resched_rt();
  3156. }
  3157. return NOTIFY_OK;
  3158. @@ -150,6 +152,7 @@
  3159. goto do_local;
  3160. local_irq_restore(flags);
  3161. + preempt_check_resched_rt();
  3162. }
  3163. /**
  3164. diff -Nur linux-3.18.12.orig/block/bounce.c linux-3.18.12/block/bounce.c
  3165. --- linux-3.18.12.orig/block/bounce.c 2015-04-20 14:48:02.000000000 -0500
  3166. +++ linux-3.18.12/block/bounce.c 2015-04-26 13:32:22.387684003 -0500
  3167. @@ -54,11 +54,11 @@
  3168. unsigned long flags;
  3169. unsigned char *vto;
  3170. - local_irq_save(flags);
  3171. + local_irq_save_nort(flags);
  3172. vto = kmap_atomic(to->bv_page);
  3173. memcpy(vto + to->bv_offset, vfrom, to->bv_len);
  3174. kunmap_atomic(vto);
  3175. - local_irq_restore(flags);
  3176. + local_irq_restore_nort(flags);
  3177. }
  3178. #else /* CONFIG_HIGHMEM */
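The _nort ("not on RT") interrupt helpers used in bounce.c and in the ATA/IDE hunks further down really disable interrupts on mainline but degrade to (near) no-ops on PREEMPT_RT, where kmap_atomic() and the surrounding code are preemptible and protected by other means. Approximate definitions from elsewhere in the patch (assumption):

    #ifdef CONFIG_PREEMPT_RT_FULL
    # define local_irq_save_nort(flags)     local_save_flags(flags)
    # define local_irq_restore_nort(flags)  (void)(flags)
    #else
    # define local_irq_save_nort(flags)     local_irq_save(flags)
    # define local_irq_restore_nort(flags)  local_irq_restore(flags)
    #endif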
  3179. diff -Nur linux-3.18.12.orig/crypto/algapi.c linux-3.18.12/crypto/algapi.c
  3180. --- linux-3.18.12.orig/crypto/algapi.c 2015-04-20 14:48:02.000000000 -0500
  3181. +++ linux-3.18.12/crypto/algapi.c 2015-04-26 13:32:22.387684003 -0500
  3182. @@ -698,13 +698,13 @@
  3183. int crypto_register_notifier(struct notifier_block *nb)
  3184. {
  3185. - return blocking_notifier_chain_register(&crypto_chain, nb);
  3186. + return srcu_notifier_chain_register(&crypto_chain, nb);
  3187. }
  3188. EXPORT_SYMBOL_GPL(crypto_register_notifier);
  3189. int crypto_unregister_notifier(struct notifier_block *nb)
  3190. {
  3191. - return blocking_notifier_chain_unregister(&crypto_chain, nb);
  3192. + return srcu_notifier_chain_unregister(&crypto_chain, nb);
  3193. }
  3194. EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
  3195. diff -Nur linux-3.18.12.orig/crypto/api.c linux-3.18.12/crypto/api.c
  3196. --- linux-3.18.12.orig/crypto/api.c 2015-04-20 14:48:02.000000000 -0500
  3197. +++ linux-3.18.12/crypto/api.c 2015-04-26 13:32:22.387684003 -0500
  3198. @@ -31,7 +31,7 @@
  3199. DECLARE_RWSEM(crypto_alg_sem);
  3200. EXPORT_SYMBOL_GPL(crypto_alg_sem);
  3201. -BLOCKING_NOTIFIER_HEAD(crypto_chain);
  3202. +SRCU_NOTIFIER_HEAD(crypto_chain);
  3203. EXPORT_SYMBOL_GPL(crypto_chain);
  3204. static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
  3205. @@ -236,10 +236,10 @@
  3206. {
  3207. int ok;
  3208. - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
  3209. + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
  3210. if (ok == NOTIFY_DONE) {
  3211. request_module("cryptomgr");
  3212. - ok = blocking_notifier_call_chain(&crypto_chain, val, v);
  3213. + ok = srcu_notifier_call_chain(&crypto_chain, val, v);
  3214. }
  3215. return ok;
  3216. diff -Nur linux-3.18.12.orig/crypto/internal.h linux-3.18.12/crypto/internal.h
  3217. --- linux-3.18.12.orig/crypto/internal.h 2015-04-20 14:48:02.000000000 -0500
  3218. +++ linux-3.18.12/crypto/internal.h 2015-04-26 13:32:22.387684003 -0500
  3219. @@ -48,7 +48,7 @@
  3220. extern struct list_head crypto_alg_list;
  3221. extern struct rw_semaphore crypto_alg_sem;
  3222. -extern struct blocking_notifier_head crypto_chain;
  3223. +extern struct srcu_notifier_head crypto_chain;
  3224. #ifdef CONFIG_PROC_FS
  3225. void __init crypto_init_proc(void);
  3226. @@ -142,7 +142,7 @@
  3227. static inline void crypto_notify(unsigned long val, void *v)
  3228. {
  3229. - blocking_notifier_call_chain(&crypto_chain, val, v);
  3230. + srcu_notifier_call_chain(&crypto_chain, val, v);
  3231. }
  3232. #endif /* _CRYPTO_INTERNAL_H */
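The crypto notifier chain is switched from a blocking (rwsem-backed) chain to an SRCU chain, sidestepping an rwsem-related deadlock seen with RT's locking semantics. A minimal sketch of the SRCU notifier pattern, with a hypothetical chain and callback (SRCU_NOTIFIER_HEAD and the srcu_notifier_* calls are the ones used in the hunks above):

    static SRCU_NOTIFIER_HEAD(demo_chain);

    static int demo_callback(struct notifier_block *nb, unsigned long val, void *v)
    {
            return NOTIFY_OK;               /* inspect val/v as needed */
    }

    static struct notifier_block demo_nb = {
            .notifier_call = demo_callback,
    };

    static void demo_notify(void)
    {
            srcu_notifier_chain_register(&demo_chain, &demo_nb);
            srcu_notifier_call_chain(&demo_chain, 0, NULL);
    }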
  3233. diff -Nur linux-3.18.12.orig/Documentation/hwlat_detector.txt linux-3.18.12/Documentation/hwlat_detector.txt
  3234. --- linux-3.18.12.orig/Documentation/hwlat_detector.txt 1969-12-31 18:00:00.000000000 -0600
  3235. +++ linux-3.18.12/Documentation/hwlat_detector.txt 2015-04-26 13:32:22.347684003 -0500
  3236. @@ -0,0 +1,64 @@
  3237. +Introduction:
  3238. +-------------
  3239. +
  3240. +The module hwlat_detector is a special purpose kernel module that is used to
  3241. +detect large system latencies induced by the behavior of certain underlying
  3242. +hardware or firmware, independent of Linux itself. The code was developed
  3243. +originally to detect SMIs (System Management Interrupts) on x86 systems,
  3244. +however there is nothing x86 specific about this patchset. It was
  3245. +originally written for use by the "RT" patch since the Real Time
  3246. +kernel is highly latency sensitive.
  3247. +
  3248. +SMIs are usually not serviced by the Linux kernel, which typically does not
3249. +even know that they are occurring. SMIs are instead set up by BIOS code
  3250. +and are serviced by BIOS code, usually for "critical" events such as
  3251. +management of thermal sensors and fans. Sometimes though, SMIs are used for
  3252. +other tasks and those tasks can spend an inordinate amount of time in the
  3253. +handler (sometimes measured in milliseconds). Obviously this is a problem if
  3254. +you are trying to keep event service latencies down in the microsecond range.
  3255. +
  3256. +The hardware latency detector works by hogging all of the cpus for configurable
  3257. +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
  3258. +for some period, then looking for gaps in the TSC data. Any gap indicates a
3259. +time when the polling was interrupted, and since the machine is stopped and
3260. +interrupts are turned off, the only thing that could do that would be an SMI.
  3261. +
  3262. +Note that the SMI detector should *NEVER* be used in a production environment.
  3263. +It is intended to be run manually to determine if the hardware platform has a
  3264. +problem with long system firmware service routines.
  3265. +
  3266. +Usage:
  3267. +------
  3268. +
  3269. +Loading the module hwlat_detector passing the parameter "enabled=1" (or by
  3270. +setting the "enable" entry in "hwlat_detector" debugfs toggled on) is the only
  3271. +step required to start the hwlat_detector. It is possible to redefine the
  3272. +threshold in microseconds (us) above which latency spikes will be taken
  3273. +into account (parameter "threshold=").
  3274. +
  3275. +Example:
  3276. +
  3277. + # modprobe hwlat_detector enabled=1 threshold=100
  3278. +
  3279. +After the module is loaded, it creates a directory named "hwlat_detector" under
3280. +the debugfs mountpoint ("/debug/hwlat_detector" in this text). It is necessary
  3281. +to have debugfs mounted, which might be on /sys/debug on your system.
  3282. +
  3283. +The /debug/hwlat_detector interface contains the following files:
  3284. +
  3285. +count - number of latency spikes observed since last reset
  3286. +enable - a global enable/disable toggle (0/1), resets count
  3287. +max - maximum hardware latency actually observed (usecs)
  3288. +sample - a pipe from which to read current raw sample data
  3289. + in the format <timestamp> <latency observed usecs>
  3290. + (can be opened O_NONBLOCK for a single sample)
  3291. +threshold - minimum latency value to be considered (usecs)
  3292. +width - time period to sample with CPUs held (usecs)
  3293. + must be less than the total window size (enforced)
  3294. +window - total period of sampling, width being inside (usecs)
  3295. +
  3296. +By default we will set width to 500,000 and window to 1,000,000, meaning that
  3297. +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
  3298. +observe any latencies that exceed the threshold (initially 100 usecs),
  3299. +then we write to a global sample ring buffer of 8K samples, which is
  3300. +consumed by reading from the "sample" (pipe) debugfs file interface.
  3301. diff -Nur linux-3.18.12.orig/Documentation/sysrq.txt linux-3.18.12/Documentation/sysrq.txt
  3302. --- linux-3.18.12.orig/Documentation/sysrq.txt 2015-04-20 14:48:02.000000000 -0500
  3303. +++ linux-3.18.12/Documentation/sysrq.txt 2015-04-26 13:32:22.347684003 -0500
  3304. @@ -59,10 +59,17 @@
  3305. On other - If you know of the key combos for other architectures, please
  3306. let me know so I can add them to this section.
  3307. -On all - write a character to /proc/sysrq-trigger. e.g.:
  3308. -
  3309. +On all - write a character to /proc/sysrq-trigger, e.g.:
  3310. echo t > /proc/sysrq-trigger
  3311. +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
  3312. + echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
  3313. + Send an ICMP echo request with this pattern plus the particular
  3314. + SysRq command key. Example:
  3315. + # ping -c1 -s57 -p0102030468
  3316. + will trigger the SysRq-H (help) command.
  3317. +
  3318. +
  3319. * What are the 'command' keys?
  3320. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  3321. 'b' - Will immediately reboot the system without syncing or unmounting
  3322. diff -Nur linux-3.18.12.orig/Documentation/trace/histograms.txt linux-3.18.12/Documentation/trace/histograms.txt
  3323. --- linux-3.18.12.orig/Documentation/trace/histograms.txt 1969-12-31 18:00:00.000000000 -0600
  3324. +++ linux-3.18.12/Documentation/trace/histograms.txt 2015-04-26 13:32:22.351684003 -0500
  3325. @@ -0,0 +1,186 @@
  3326. + Using the Linux Kernel Latency Histograms
  3327. +
  3328. +
  3329. +This document gives a short explanation how to enable, configure and use
  3330. +latency histograms. Latency histograms are primarily relevant in the
  3331. +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
  3332. +and are used in the quality management of the Linux real-time
  3333. +capabilities.
  3334. +
  3335. +
  3336. +* Purpose of latency histograms
  3337. +
  3338. +A latency histogram continuously accumulates the frequencies of latency
3339. +data. There are two types of histograms:
  3340. +- potential sources of latencies
  3341. +- effective latencies
  3342. +
  3343. +
  3344. +* Potential sources of latencies
  3345. +
  3346. +Potential sources of latencies are code segments where interrupts,
  3347. +preemption or both are disabled (aka critical sections). To create
  3348. +histograms of potential sources of latency, the kernel stores the time
  3349. +stamp at the start of a critical section, determines the time elapsed
  3350. +when the end of the section is reached, and increments the frequency
  3351. +counter of that latency value - irrespective of whether any concurrently
  3352. +running process is affected by latency or not.
  3353. +- Configuration items (in the Kernel hacking/Tracers submenu)
  3354. + CONFIG_INTERRUPT_OFF_LATENCY
  3355. + CONFIG_PREEMPT_OFF_LATENCY
  3356. +
  3357. +
  3358. +* Effective latencies
  3359. +
3360. +Effective latencies are those actually occurring during the wakeup of a process. To
  3361. +determine effective latencies, the kernel stores the time stamp when a
  3362. +process is scheduled to be woken up, and determines the duration of the
  3363. +wakeup time shortly before control is passed over to this process. Note
  3364. +that the apparent latency in user space may be somewhat longer, since the
  3365. +process may be interrupted after control is passed over to it but before
  3366. +the execution in user space takes place. Simply measuring the interval
3367. +between enqueuing and wakeup may also not be appropriate in cases when a
  3368. +process is scheduled as a result of a timer expiration. The timer may have
  3369. +missed its deadline, e.g. due to disabled interrupts, but this latency
  3370. +would not be registered. Therefore, the offsets of missed timers are
  3371. +recorded in a separate histogram. If both wakeup latency and missed timer
  3372. +offsets are configured and enabled, a third histogram may be enabled that
  3373. +records the overall latency as a sum of the timer latency, if any, and the
  3374. +wakeup latency. This histogram is called "timerandwakeup".
  3375. +- Configuration items (in the Kernel hacking/Tracers submenu)
  3376. + CONFIG_WAKEUP_LATENCY
3377. + CONFIG_MISSED_TIMER_OFFSETS
  3378. +
  3379. +
  3380. +* Usage
  3381. +
  3382. +The interface to the administration of the latency histograms is located
  3383. +in the debugfs file system. To mount it, either enter
  3384. +
  3385. +mount -t sysfs nodev /sys
  3386. +mount -t debugfs nodev /sys/kernel/debug
  3387. +
  3388. +from shell command line level, or add
  3389. +
  3390. +nodev /sys sysfs defaults 0 0
  3391. +nodev /sys/kernel/debug debugfs defaults 0 0
  3392. +
  3393. +to the file /etc/fstab. All latency histogram related files are then
  3394. +available in the directory /sys/kernel/debug/tracing/latency_hist. A
  3395. +particular histogram type is enabled by writing non-zero to the related
  3396. +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
  3397. +Select "preemptirqsoff" for the histograms of potential sources of
  3398. +latencies and "wakeup" for histograms of effective latencies etc. The
  3399. +histogram data - one per CPU - are available in the files
  3400. +
  3401. +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
  3402. +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
  3403. +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
  3404. +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
  3405. +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
  3406. +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
  3407. +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
  3408. +
  3409. +The histograms are reset by writing non-zero to the file "reset" in a
  3410. +particular latency directory. To reset all latency data, use
  3411. +
  3412. +#!/bin/sh
  3413. +
  3414. +TRACINGDIR=/sys/kernel/debug/tracing
  3415. +HISTDIR=$TRACINGDIR/latency_hist
  3416. +
  3417. +if test -d $HISTDIR
  3418. +then
  3419. + cd $HISTDIR
  3420. + for i in `find . | grep /reset$`
  3421. + do
  3422. + echo 1 >$i
  3423. + done
  3424. +fi
  3425. +
  3426. +
  3427. +* Data format
  3428. +
  3429. +Latency data are stored with a resolution of one microsecond. The
  3430. +maximum latency is 10,240 microseconds. The data are only valid, if the
  3431. +overflow register is empty. Every output line contains the latency in
  3432. +microseconds in the first row and the number of samples in the second
  3433. +row. To display only lines with a positive latency count, use, for
  3434. +example,
  3435. +
  3436. +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
  3437. +
  3438. +#Minimum latency: 0 microseconds.
  3439. +#Average latency: 0 microseconds.
  3440. +#Maximum latency: 25 microseconds.
  3441. +#Total samples: 3104770694
  3442. +#There are 0 samples greater or equal than 10240 microseconds
  3443. +#usecs samples
  3444. + 0 2984486876
  3445. + 1 49843506
  3446. + 2 58219047
  3447. + 3 5348126
  3448. + 4 2187960
  3449. + 5 3388262
  3450. + 6 959289
  3451. + 7 208294
  3452. + 8 40420
  3453. + 9 4485
  3454. + 10 14918
  3455. + 11 18340
  3456. + 12 25052
  3457. + 13 19455
  3458. + 14 5602
  3459. + 15 969
  3460. + 16 47
  3461. + 17 18
  3462. + 18 14
  3463. + 19 1
  3464. + 20 3
  3465. + 21 2
  3466. + 22 5
  3467. + 23 2
  3468. + 25 1
  3469. +
  3470. +
  3471. +* Wakeup latency of a selected process
  3472. +
  3473. +To only collect wakeup latency data of a particular process, write the
  3474. +PID of the requested process to
  3475. +
  3476. +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
  3477. +
  3478. +PIDs are not considered, if this variable is set to 0.
  3479. +
  3480. +
  3481. +* Details of the process with the highest wakeup latency so far
  3482. +
  3483. +Selected data of the process that suffered from the highest wakeup
  3484. +latency that occurred in a particular CPU are available in the file
  3485. +
  3486. +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
  3487. +
  3488. +In addition, other relevant system data at the time when the
  3489. +latency occurred are given.
  3490. +
  3491. +The format of the data is (all in one line):
  3492. +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
  3493. +<- <PID> <Priority> <Command> <Timestamp>
  3494. +
  3495. +The value of <Timeroffset> is only relevant in the combined timer
  3496. +and wakeup latency recording. In the wakeup recording, it is
3497. +always 0; in the missed_timer_offsets recording, it is the same
  3498. +as <Latency>.
  3499. +
3500. +When retrospectively searching for the origin of a latency while
3501. +tracing was not enabled, it may be helpful to know the name and
3502. +some basic data of the task that (finally) switched to the
3503. +late real-time task. In addition to the victim's data, the
3504. +data of the possible culprit are therefore also displayed after
3505. +the "<-" symbol.
  3506. +
  3507. +Finally, the timestamp of the time when the latency occurred
  3508. +in <seconds>.<microseconds> after the most recent system boot
  3509. +is provided.
  3510. +
  3511. +These data are also reset when the wakeup histogram is reset.
  3512. diff -Nur linux-3.18.12.orig/drivers/acpi/acpica/acglobal.h linux-3.18.12/drivers/acpi/acpica/acglobal.h
  3513. --- linux-3.18.12.orig/drivers/acpi/acpica/acglobal.h 2015-04-20 14:48:02.000000000 -0500
  3514. +++ linux-3.18.12/drivers/acpi/acpica/acglobal.h 2015-04-26 13:32:22.387684003 -0500
  3515. @@ -112,7 +112,7 @@
  3516. * interrupt level
  3517. */
  3518. ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
  3519. -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
  3520. +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */
  3521. ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
  3522. /* Mutex for _OSI support */
  3523. diff -Nur linux-3.18.12.orig/drivers/acpi/acpica/hwregs.c linux-3.18.12/drivers/acpi/acpica/hwregs.c
  3524. --- linux-3.18.12.orig/drivers/acpi/acpica/hwregs.c 2015-04-20 14:48:02.000000000 -0500
  3525. +++ linux-3.18.12/drivers/acpi/acpica/hwregs.c 2015-04-26 13:32:22.387684003 -0500
  3526. @@ -269,14 +269,14 @@
  3527. ACPI_BITMASK_ALL_FIXED_STATUS,
  3528. ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
  3529. - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
  3530. + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
  3531. /* Clear the fixed events in PM1 A/B */
  3532. status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
  3533. ACPI_BITMASK_ALL_FIXED_STATUS);
  3534. - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
  3535. + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
  3536. if (ACPI_FAILURE(status)) {
  3537. goto exit;
  3538. diff -Nur linux-3.18.12.orig/drivers/acpi/acpica/hwxface.c linux-3.18.12/drivers/acpi/acpica/hwxface.c
  3539. --- linux-3.18.12.orig/drivers/acpi/acpica/hwxface.c 2015-04-20 14:48:02.000000000 -0500
  3540. +++ linux-3.18.12/drivers/acpi/acpica/hwxface.c 2015-04-26 13:32:22.387684003 -0500
  3541. @@ -374,7 +374,7 @@
  3542. return_ACPI_STATUS(AE_BAD_PARAMETER);
  3543. }
  3544. - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
  3545. + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
  3546. /*
  3547. * At this point, we know that the parent register is one of the
  3548. @@ -435,7 +435,7 @@
  3549. unlock_and_exit:
  3550. - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
  3551. + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
  3552. return_ACPI_STATUS(status);
  3553. }
  3554. diff -Nur linux-3.18.12.orig/drivers/acpi/acpica/utmutex.c linux-3.18.12/drivers/acpi/acpica/utmutex.c
  3555. --- linux-3.18.12.orig/drivers/acpi/acpica/utmutex.c 2015-04-20 14:48:02.000000000 -0500
  3556. +++ linux-3.18.12/drivers/acpi/acpica/utmutex.c 2015-04-26 13:32:22.387684003 -0500
  3557. @@ -88,7 +88,7 @@
  3558. return_ACPI_STATUS (status);
  3559. }
  3560. - status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
  3561. + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
  3562. if (ACPI_FAILURE (status)) {
  3563. return_ACPI_STATUS (status);
  3564. }
  3565. @@ -141,7 +141,7 @@
  3566. /* Delete the spinlocks */
  3567. acpi_os_delete_lock(acpi_gbl_gpe_lock);
  3568. - acpi_os_delete_lock(acpi_gbl_hardware_lock);
  3569. + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
  3570. acpi_os_delete_lock(acpi_gbl_reference_count_lock);
  3571. /* Delete the reader/writer lock */
  3572. diff -Nur linux-3.18.12.orig/drivers/ata/libata-sff.c linux-3.18.12/drivers/ata/libata-sff.c
  3573. --- linux-3.18.12.orig/drivers/ata/libata-sff.c 2015-04-20 14:48:02.000000000 -0500
  3574. +++ linux-3.18.12/drivers/ata/libata-sff.c 2015-04-26 13:32:22.387684003 -0500
  3575. @@ -678,9 +678,9 @@
  3576. unsigned long flags;
  3577. unsigned int consumed;
  3578. - local_irq_save(flags);
  3579. + local_irq_save_nort(flags);
  3580. consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
  3581. - local_irq_restore(flags);
  3582. + local_irq_restore_nort(flags);
  3583. return consumed;
  3584. }
  3585. @@ -719,7 +719,7 @@
  3586. unsigned long flags;
  3587. /* FIXME: use a bounce buffer */
  3588. - local_irq_save(flags);
  3589. + local_irq_save_nort(flags);
  3590. buf = kmap_atomic(page);
  3591. /* do the actual data transfer */
  3592. @@ -727,7 +727,7 @@
  3593. do_write);
  3594. kunmap_atomic(buf);
  3595. - local_irq_restore(flags);
  3596. + local_irq_restore_nort(flags);
  3597. } else {
  3598. buf = page_address(page);
  3599. ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
  3600. @@ -864,7 +864,7 @@
  3601. unsigned long flags;
  3602. /* FIXME: use bounce buffer */
  3603. - local_irq_save(flags);
  3604. + local_irq_save_nort(flags);
  3605. buf = kmap_atomic(page);
  3606. /* do the actual data transfer */
  3607. @@ -872,7 +872,7 @@
  3608. count, rw);
  3609. kunmap_atomic(buf);
  3610. - local_irq_restore(flags);
  3611. + local_irq_restore_nort(flags);
  3612. } else {
  3613. buf = page_address(page);
  3614. consumed = ap->ops->sff_data_xfer(dev, buf + offset,
  3615. diff -Nur linux-3.18.12.orig/drivers/char/random.c linux-3.18.12/drivers/char/random.c
  3616. --- linux-3.18.12.orig/drivers/char/random.c 2015-04-20 14:48:02.000000000 -0500
  3617. +++ linux-3.18.12/drivers/char/random.c 2015-04-26 13:32:22.387684003 -0500
  3618. @@ -776,8 +776,6 @@
  3619. } sample;
  3620. long delta, delta2, delta3;
  3621. - preempt_disable();
  3622. -
  3623. sample.jiffies = jiffies;
  3624. sample.cycles = random_get_entropy();
  3625. sample.num = num;
  3626. @@ -818,7 +816,6 @@
  3627. */
  3628. credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
  3629. }
  3630. - preempt_enable();
  3631. }
  3632. void add_input_randomness(unsigned int type, unsigned int code,
  3633. @@ -871,28 +868,27 @@
  3634. return *(ptr + f->reg_idx++);
  3635. }
  3636. -void add_interrupt_randomness(int irq, int irq_flags)
  3637. +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
  3638. {
  3639. struct entropy_store *r;
  3640. struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
  3641. - struct pt_regs *regs = get_irq_regs();
  3642. unsigned long now = jiffies;
  3643. cycles_t cycles = random_get_entropy();
  3644. __u32 c_high, j_high;
  3645. - __u64 ip;
  3646. unsigned long seed;
  3647. int credit = 0;
  3648. if (cycles == 0)
  3649. - cycles = get_reg(fast_pool, regs);
  3650. + cycles = get_reg(fast_pool, NULL);
  3651. c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
  3652. j_high = (sizeof(now) > 4) ? now >> 32 : 0;
  3653. fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
  3654. fast_pool->pool[1] ^= now ^ c_high;
  3655. - ip = regs ? instruction_pointer(regs) : _RET_IP_;
  3656. + if (!ip)
  3657. + ip = _RET_IP_;
  3658. fast_pool->pool[2] ^= ip;
  3659. fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
  3660. - get_reg(fast_pool, regs);
  3661. + get_reg(fast_pool, NULL);
  3662. fast_mix(fast_pool);
  3663. add_interrupt_bench(cycles);
  3664. diff -Nur linux-3.18.12.orig/drivers/clocksource/tcb_clksrc.c linux-3.18.12/drivers/clocksource/tcb_clksrc.c
  3665. --- linux-3.18.12.orig/drivers/clocksource/tcb_clksrc.c 2015-04-20 14:48:02.000000000 -0500
  3666. +++ linux-3.18.12/drivers/clocksource/tcb_clksrc.c 2015-04-26 13:32:22.387684003 -0500
  3667. @@ -23,8 +23,7 @@
  3668. * this 32 bit free-running counter. the second channel is not used.
  3669. *
  3670. * - The third channel may be used to provide a 16-bit clockevent
  3671. - * source, used in either periodic or oneshot mode. This runs
  3672. - * at 32 KiHZ, and can handle delays of up to two seconds.
  3673. + * source, used in either periodic or oneshot mode.
  3674. *
  3675. * A boot clocksource and clockevent source are also currently needed,
  3676. * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
  3677. @@ -74,6 +73,7 @@
  3678. struct tc_clkevt_device {
  3679. struct clock_event_device clkevt;
  3680. struct clk *clk;
  3681. + u32 freq;
  3682. void __iomem *regs;
  3683. };
  3684. @@ -82,13 +82,6 @@
  3685. return container_of(clkevt, struct tc_clkevt_device, clkevt);
  3686. }
  3687. -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
  3688. - * because using one of the divided clocks would usually mean the
  3689. - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
  3690. - *
  3691. - * A divided clock could be good for high resolution timers, since
  3692. - * 30.5 usec resolution can seem "low".
  3693. - */
  3694. static u32 timer_clock;
  3695. static void tc_mode(enum clock_event_mode m, struct clock_event_device *d)
  3696. @@ -111,11 +104,12 @@
  3697. case CLOCK_EVT_MODE_PERIODIC:
  3698. clk_enable(tcd->clk);
  3699. - /* slow clock, count up to RC, then irq and restart */
  3700. + /* count up to RC, then irq and restart */
  3701. __raw_writel(timer_clock
  3702. | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
  3703. regs + ATMEL_TC_REG(2, CMR));
  3704. - __raw_writel((32768 + HZ/2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
  3705. + __raw_writel((tcd->freq + HZ / 2) / HZ,
  3706. + tcaddr + ATMEL_TC_REG(2, RC));
  3707. /* Enable clock and interrupts on RC compare */
  3708. __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
  3709. @@ -128,7 +122,7 @@
  3710. case CLOCK_EVT_MODE_ONESHOT:
  3711. clk_enable(tcd->clk);
  3712. - /* slow clock, count up to RC, then irq and stop */
  3713. + /* count up to RC, then irq and stop */
  3714. __raw_writel(timer_clock | ATMEL_TC_CPCSTOP
  3715. | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
  3716. regs + ATMEL_TC_REG(2, CMR));
  3717. @@ -157,8 +151,12 @@
  3718. .name = "tc_clkevt",
  3719. .features = CLOCK_EVT_FEAT_PERIODIC
  3720. | CLOCK_EVT_FEAT_ONESHOT,
  3721. +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
  3722. /* Should be lower than at91rm9200's system timer */
  3723. .rating = 125,
  3724. +#else
  3725. + .rating = 200,
  3726. +#endif
  3727. .set_next_event = tc_next_event,
  3728. .set_mode = tc_mode,
  3729. },
  3730. @@ -178,8 +176,9 @@
  3731. return IRQ_NONE;
  3732. }
  3733. -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
  3734. +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
  3735. {
  3736. + unsigned divisor = atmel_tc_divisors[divisor_idx];
  3737. int ret;
  3738. struct clk *t2_clk = tc->clk[2];
  3739. int irq = tc->irq[2];
  3740. @@ -193,7 +192,11 @@
  3741. clkevt.regs = tc->regs;
  3742. clkevt.clk = t2_clk;
  3743. - timer_clock = clk32k_divisor_idx;
  3744. + timer_clock = divisor_idx;
  3745. + if (!divisor)
  3746. + clkevt.freq = 32768;
  3747. + else
  3748. + clkevt.freq = clk_get_rate(t2_clk) / divisor;
  3749. clkevt.clkevt.cpumask = cpumask_of(0);
  3750. @@ -203,7 +206,7 @@
  3751. return ret;
  3752. }
  3753. - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
  3754. + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
  3755. return ret;
  3756. }
  3757. @@ -340,7 +343,11 @@
  3758. goto err_disable_t1;
  3759. /* channel 2: periodic and oneshot timer support */
  3760. +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
  3761. ret = setup_clkevents(tc, clk32k_divisor_idx);
  3762. +#else
  3763. + ret = setup_clkevents(tc, best_divisor_idx);
  3764. +#endif
  3765. if (ret)
  3766. goto err_unregister_clksrc;
  3767. diff -Nur linux-3.18.12.orig/drivers/clocksource/timer-atmel-pit.c linux-3.18.12/drivers/clocksource/timer-atmel-pit.c
  3768. --- linux-3.18.12.orig/drivers/clocksource/timer-atmel-pit.c 2015-04-20 14:48:02.000000000 -0500
  3769. +++ linux-3.18.12/drivers/clocksource/timer-atmel-pit.c 2015-04-26 13:32:22.387684003 -0500
  3770. @@ -90,6 +90,7 @@
  3771. return elapsed;
  3772. }
  3773. +static struct irqaction at91sam926x_pit_irq;
  3774. /*
  3775. * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16)
  3776. */
  3777. @@ -100,6 +101,8 @@
  3778. switch (mode) {
  3779. case CLOCK_EVT_MODE_PERIODIC:
  3780. + /* Set up irq handler */
  3781. + setup_irq(at91sam926x_pit_irq.irq, &at91sam926x_pit_irq);
  3782. /* update clocksource counter */
  3783. data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
  3784. pit_write(data->base, AT91_PIT_MR,
  3785. @@ -113,6 +116,7 @@
  3786. /* disable irq, leaving the clocksource active */
  3787. pit_write(data->base, AT91_PIT_MR,
  3788. (data->cycle - 1) | AT91_PIT_PITEN);
  3789. + remove_irq(at91sam926x_pit_irq.irq, &at91sam926x_pit_irq);
  3790. break;
  3791. case CLOCK_EVT_MODE_RESUME:
  3792. break;
  3793. diff -Nur linux-3.18.12.orig/drivers/cpufreq/Kconfig.x86 linux-3.18.12/drivers/cpufreq/Kconfig.x86
  3794. --- linux-3.18.12.orig/drivers/cpufreq/Kconfig.x86 2015-04-20 14:48:02.000000000 -0500
  3795. +++ linux-3.18.12/drivers/cpufreq/Kconfig.x86 2015-04-26 13:32:22.387684003 -0500
  3796. @@ -113,7 +113,7 @@
  3797. config X86_POWERNOW_K8
  3798. tristate "AMD Opteron/Athlon64 PowerNow!"
  3799. - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
  3800. + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
  3801. help
  3802. This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
  3803. Support for K10 and newer processors is now in acpi-cpufreq.
  3804. diff -Nur linux-3.18.12.orig/drivers/gpio/gpio-omap.c linux-3.18.12/drivers/gpio/gpio-omap.c
  3805. --- linux-3.18.12.orig/drivers/gpio/gpio-omap.c 2015-04-20 14:48:02.000000000 -0500
  3806. +++ linux-3.18.12/drivers/gpio/gpio-omap.c 2015-04-26 13:32:22.387684003 -0500
  3807. @@ -57,7 +57,7 @@
  3808. u32 saved_datain;
  3809. u32 level_mask;
  3810. u32 toggle_mask;
  3811. - spinlock_t lock;
  3812. + raw_spinlock_t lock;
  3813. struct gpio_chip chip;
  3814. struct clk *dbck;
  3815. u32 mod_usage;
  3816. @@ -503,19 +503,19 @@
  3817. (type & (IRQ_TYPE_LEVEL_LOW|IRQ_TYPE_LEVEL_HIGH)))
  3818. return -EINVAL;
  3819. - spin_lock_irqsave(&bank->lock, flags);
  3820. + raw_spin_lock_irqsave(&bank->lock, flags);
  3821. offset = GPIO_INDEX(bank, gpio);
  3822. retval = omap_set_gpio_triggering(bank, offset, type);
  3823. if (!LINE_USED(bank->mod_usage, offset)) {
  3824. omap_enable_gpio_module(bank, offset);
  3825. omap_set_gpio_direction(bank, offset, 1);
  3826. } else if (!omap_gpio_is_input(bank, BIT(offset))) {
  3827. - spin_unlock_irqrestore(&bank->lock, flags);
  3828. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3829. return -EINVAL;
  3830. }
  3831. bank->irq_usage |= BIT(GPIO_INDEX(bank, gpio));
  3832. - spin_unlock_irqrestore(&bank->lock, flags);
  3833. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3834. if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
  3835. __irq_set_handler_locked(d->irq, handle_level_irq);
  3836. @@ -633,14 +633,14 @@
  3837. return -EINVAL;
  3838. }
  3839. - spin_lock_irqsave(&bank->lock, flags);
  3840. + raw_spin_lock_irqsave(&bank->lock, flags);
  3841. if (enable)
  3842. bank->context.wake_en |= gpio_bit;
  3843. else
  3844. bank->context.wake_en &= ~gpio_bit;
  3845. writel_relaxed(bank->context.wake_en, bank->base + bank->regs->wkup_en);
  3846. - spin_unlock_irqrestore(&bank->lock, flags);
  3847. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3848. return 0;
  3849. }
  3850. @@ -675,7 +675,7 @@
  3851. if (!BANK_USED(bank))
  3852. pm_runtime_get_sync(bank->dev);
  3853. - spin_lock_irqsave(&bank->lock, flags);
  3854. + raw_spin_lock_irqsave(&bank->lock, flags);
  3855. /* Set trigger to none. You need to enable the desired trigger with
  3856. * request_irq() or set_irq_type(). Only do this if the IRQ line has
  3857. * not already been requested.
  3858. @@ -685,7 +685,7 @@
  3859. omap_enable_gpio_module(bank, offset);
  3860. }
  3861. bank->mod_usage |= BIT(offset);
  3862. - spin_unlock_irqrestore(&bank->lock, flags);
  3863. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3864. return 0;
  3865. }
  3866. @@ -695,11 +695,11 @@
  3867. struct gpio_bank *bank = container_of(chip, struct gpio_bank, chip);
  3868. unsigned long flags;
  3869. - spin_lock_irqsave(&bank->lock, flags);
  3870. + raw_spin_lock_irqsave(&bank->lock, flags);
  3871. bank->mod_usage &= ~(BIT(offset));
  3872. omap_disable_gpio_module(bank, offset);
  3873. omap_reset_gpio(bank, bank->chip.base + offset);
  3874. - spin_unlock_irqrestore(&bank->lock, flags);
  3875. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3876. /*
  3877. * If this is the last gpio to be freed in the bank,
  3878. @@ -799,12 +799,12 @@
  3879. unsigned long flags;
  3880. unsigned offset = GPIO_INDEX(bank, gpio);
  3881. - spin_lock_irqsave(&bank->lock, flags);
  3882. + raw_spin_lock_irqsave(&bank->lock, flags);
  3883. gpio_unlock_as_irq(&bank->chip, offset);
  3884. bank->irq_usage &= ~(BIT(offset));
  3885. omap_disable_gpio_module(bank, offset);
  3886. omap_reset_gpio(bank, gpio);
  3887. - spin_unlock_irqrestore(&bank->lock, flags);
  3888. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3889. /*
  3890. * If this is the last IRQ to be freed in the bank,
  3891. @@ -828,10 +828,10 @@
  3892. unsigned int gpio = omap_irq_to_gpio(bank, d->hwirq);
  3893. unsigned long flags;
  3894. - spin_lock_irqsave(&bank->lock, flags);
  3895. + raw_spin_lock_irqsave(&bank->lock, flags);
  3896. omap_set_gpio_irqenable(bank, gpio, 0);
  3897. omap_set_gpio_triggering(bank, GPIO_INDEX(bank, gpio), IRQ_TYPE_NONE);
  3898. - spin_unlock_irqrestore(&bank->lock, flags);
  3899. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3900. }
  3901. static void omap_gpio_unmask_irq(struct irq_data *d)
  3902. @@ -842,7 +842,7 @@
  3903. u32 trigger = irqd_get_trigger_type(d);
  3904. unsigned long flags;
  3905. - spin_lock_irqsave(&bank->lock, flags);
  3906. + raw_spin_lock_irqsave(&bank->lock, flags);
  3907. if (trigger)
  3908. omap_set_gpio_triggering(bank, GPIO_INDEX(bank, gpio), trigger);
  3909. @@ -854,7 +854,7 @@
  3910. }
  3911. omap_set_gpio_irqenable(bank, gpio, 1);
  3912. - spin_unlock_irqrestore(&bank->lock, flags);
  3913. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3914. }
  3915. /*---------------------------------------------------------------------*/
  3916. @@ -867,9 +867,9 @@
  3917. OMAP_MPUIO_GPIO_MASKIT / bank->stride;
  3918. unsigned long flags;
  3919. - spin_lock_irqsave(&bank->lock, flags);
  3920. + raw_spin_lock_irqsave(&bank->lock, flags);
  3921. writel_relaxed(0xffff & ~bank->context.wake_en, mask_reg);
  3922. - spin_unlock_irqrestore(&bank->lock, flags);
  3923. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3924. return 0;
  3925. }
  3926. @@ -882,9 +882,9 @@
  3927. OMAP_MPUIO_GPIO_MASKIT / bank->stride;
  3928. unsigned long flags;
  3929. - spin_lock_irqsave(&bank->lock, flags);
  3930. + raw_spin_lock_irqsave(&bank->lock, flags);
  3931. writel_relaxed(bank->context.wake_en, mask_reg);
  3932. - spin_unlock_irqrestore(&bank->lock, flags);
  3933. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3934. return 0;
  3935. }
  3936. @@ -930,9 +930,9 @@
  3937. bank = container_of(chip, struct gpio_bank, chip);
  3938. reg = bank->base + bank->regs->direction;
  3939. - spin_lock_irqsave(&bank->lock, flags);
  3940. + raw_spin_lock_irqsave(&bank->lock, flags);
  3941. dir = !!(readl_relaxed(reg) & BIT(offset));
  3942. - spin_unlock_irqrestore(&bank->lock, flags);
  3943. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3944. return dir;
  3945. }
  3946. @@ -942,9 +942,9 @@
  3947. unsigned long flags;
  3948. bank = container_of(chip, struct gpio_bank, chip);
  3949. - spin_lock_irqsave(&bank->lock, flags);
  3950. + raw_spin_lock_irqsave(&bank->lock, flags);
  3951. omap_set_gpio_direction(bank, offset, 1);
  3952. - spin_unlock_irqrestore(&bank->lock, flags);
  3953. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3954. return 0;
  3955. }
  3956. @@ -968,10 +968,10 @@
  3957. unsigned long flags;
  3958. bank = container_of(chip, struct gpio_bank, chip);
  3959. - spin_lock_irqsave(&bank->lock, flags);
  3960. + raw_spin_lock_irqsave(&bank->lock, flags);
  3961. bank->set_dataout(bank, offset, value);
  3962. omap_set_gpio_direction(bank, offset, 0);
  3963. - spin_unlock_irqrestore(&bank->lock, flags);
  3964. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3965. return 0;
  3966. }
  3967. @@ -983,9 +983,9 @@
  3968. bank = container_of(chip, struct gpio_bank, chip);
  3969. - spin_lock_irqsave(&bank->lock, flags);
  3970. + raw_spin_lock_irqsave(&bank->lock, flags);
  3971. omap2_set_gpio_debounce(bank, offset, debounce);
  3972. - spin_unlock_irqrestore(&bank->lock, flags);
  3973. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3974. return 0;
  3975. }
  3976. @@ -996,9 +996,9 @@
  3977. unsigned long flags;
  3978. bank = container_of(chip, struct gpio_bank, chip);
  3979. - spin_lock_irqsave(&bank->lock, flags);
  3980. + raw_spin_lock_irqsave(&bank->lock, flags);
  3981. bank->set_dataout(bank, offset, value);
  3982. - spin_unlock_irqrestore(&bank->lock, flags);
  3983. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  3984. }
  3985. /*---------------------------------------------------------------------*/
  3986. @@ -1223,7 +1223,7 @@
  3987. else
  3988. bank->set_dataout = omap_set_gpio_dataout_mask;
  3989. - spin_lock_init(&bank->lock);
  3990. + raw_spin_lock_init(&bank->lock);
  3991. /* Static mapping, never released */
  3992. res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
  3993. @@ -1270,7 +1270,7 @@
  3994. unsigned long flags;
  3995. u32 wake_low, wake_hi;
  3996. - spin_lock_irqsave(&bank->lock, flags);
  3997. + raw_spin_lock_irqsave(&bank->lock, flags);
  3998. /*
  3999. * Only edges can generate a wakeup event to the PRCM.
  4000. @@ -1323,7 +1323,7 @@
  4001. bank->get_context_loss_count(bank->dev);
  4002. omap_gpio_dbck_disable(bank);
  4003. - spin_unlock_irqrestore(&bank->lock, flags);
  4004. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  4005. return 0;
  4006. }
  4007. @@ -1338,7 +1338,7 @@
  4008. unsigned long flags;
  4009. int c;
  4010. - spin_lock_irqsave(&bank->lock, flags);
  4011. + raw_spin_lock_irqsave(&bank->lock, flags);
  4012. /*
  4013. * On the first resume during the probe, the context has not
  4014. @@ -1374,14 +1374,14 @@
  4015. if (c != bank->context_loss_count) {
  4016. omap_gpio_restore_context(bank);
  4017. } else {
  4018. - spin_unlock_irqrestore(&bank->lock, flags);
  4019. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  4020. return 0;
  4021. }
  4022. }
  4023. }
  4024. if (!bank->workaround_enabled) {
  4025. - spin_unlock_irqrestore(&bank->lock, flags);
  4026. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  4027. return 0;
  4028. }
  4029. @@ -1436,7 +1436,7 @@
  4030. }
  4031. bank->workaround_enabled = false;
  4032. - spin_unlock_irqrestore(&bank->lock, flags);
  4033. + raw_spin_unlock_irqrestore(&bank->lock, flags);
  4034. return 0;
  4035. }
  4036. diff -Nur linux-3.18.12.orig/drivers/gpu/drm/i915/i915_gem.c linux-3.18.12/drivers/gpu/drm/i915/i915_gem.c
  4037. --- linux-3.18.12.orig/drivers/gpu/drm/i915/i915_gem.c 2015-04-20 14:48:02.000000000 -0500
  4038. +++ linux-3.18.12/drivers/gpu/drm/i915/i915_gem.c 2015-04-26 13:32:22.391684003 -0500
  4039. @@ -5144,7 +5144,7 @@
  4040. if (!mutex_is_locked(mutex))
  4041. return false;
  4042. -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
  4043. +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && !defined(CONFIG_PREEMPT_RT_BASE)
  4044. return mutex->owner == task;
  4045. #else
  4046. /* Since UP may be pre-empted, we cannot assume that we own the lock */
  4047. diff -Nur linux-3.18.12.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c linux-3.18.12/drivers/gpu/drm/i915/i915_gem_execbuffer.c
  4048. --- linux-3.18.12.orig/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2015-04-20 14:48:02.000000000 -0500
  4049. +++ linux-3.18.12/drivers/gpu/drm/i915/i915_gem_execbuffer.c 2015-04-26 13:32:22.391684003 -0500
  4050. @@ -1170,7 +1170,9 @@
  4051. return ret;
  4052. }
  4053. +#ifndef CONFIG_PREEMPT_RT_BASE
  4054. trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
  4055. +#endif
  4056. i915_gem_execbuffer_move_to_active(vmas, ring);
  4057. i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
  4058. diff -Nur linux-3.18.12.orig/drivers/i2c/busses/i2c-omap.c linux-3.18.12/drivers/i2c/busses/i2c-omap.c
  4059. --- linux-3.18.12.orig/drivers/i2c/busses/i2c-omap.c 2015-04-20 14:48:02.000000000 -0500
  4060. +++ linux-3.18.12/drivers/i2c/busses/i2c-omap.c 2015-04-26 13:32:22.391684003 -0500
  4061. @@ -875,15 +875,12 @@
  4062. u16 mask;
  4063. u16 stat;
  4064. - spin_lock(&dev->lock);
  4065. - mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG);
  4066. stat = omap_i2c_read_reg(dev, OMAP_I2C_STAT_REG);
  4067. + mask = omap_i2c_read_reg(dev, OMAP_I2C_IE_REG);
  4068. if (stat & mask)
  4069. ret = IRQ_WAKE_THREAD;
  4070. - spin_unlock(&dev->lock);
  4071. -
  4072. return ret;
  4073. }
  4074. diff -Nur linux-3.18.12.orig/drivers/ide/alim15x3.c linux-3.18.12/drivers/ide/alim15x3.c
  4075. --- linux-3.18.12.orig/drivers/ide/alim15x3.c 2015-04-20 14:48:02.000000000 -0500
  4076. +++ linux-3.18.12/drivers/ide/alim15x3.c 2015-04-26 13:32:22.391684003 -0500
  4077. @@ -234,7 +234,7 @@
  4078. isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
  4079. - local_irq_save(flags);
  4080. + local_irq_save_nort(flags);
  4081. if (m5229_revision < 0xC2) {
  4082. /*
  4083. @@ -325,7 +325,7 @@
  4084. }
  4085. pci_dev_put(north);
  4086. pci_dev_put(isa_dev);
  4087. - local_irq_restore(flags);
  4088. + local_irq_restore_nort(flags);
  4089. return 0;
  4090. }
  4091. diff -Nur linux-3.18.12.orig/drivers/ide/hpt366.c linux-3.18.12/drivers/ide/hpt366.c
  4092. --- linux-3.18.12.orig/drivers/ide/hpt366.c 2015-04-20 14:48:02.000000000 -0500
  4093. +++ linux-3.18.12/drivers/ide/hpt366.c 2015-04-26 13:32:22.391684003 -0500
  4094. @@ -1241,7 +1241,7 @@
  4095. dma_old = inb(base + 2);
  4096. - local_irq_save(flags);
  4097. + local_irq_save_nort(flags);
  4098. dma_new = dma_old;
  4099. pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
  4100. @@ -1252,7 +1252,7 @@
  4101. if (dma_new != dma_old)
  4102. outb(dma_new, base + 2);
  4103. - local_irq_restore(flags);
  4104. + local_irq_restore_nort(flags);
  4105. printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n",
  4106. hwif->name, base, base + 7);
  4107. diff -Nur linux-3.18.12.orig/drivers/ide/ide-io.c linux-3.18.12/drivers/ide/ide-io.c
  4108. --- linux-3.18.12.orig/drivers/ide/ide-io.c 2015-04-20 14:48:02.000000000 -0500
  4109. +++ linux-3.18.12/drivers/ide/ide-io.c 2015-04-26 13:32:22.391684003 -0500
  4110. @@ -659,7 +659,7 @@
  4111. /* disable_irq_nosync ?? */
  4112. disable_irq(hwif->irq);
  4113. /* local CPU only, as if we were handling an interrupt */
  4114. - local_irq_disable();
  4115. + local_irq_disable_nort();
  4116. if (hwif->polling) {
  4117. startstop = handler(drive);
  4118. } else if (drive_is_ready(drive)) {
  4119. diff -Nur linux-3.18.12.orig/drivers/ide/ide-iops.c linux-3.18.12/drivers/ide/ide-iops.c
  4120. --- linux-3.18.12.orig/drivers/ide/ide-iops.c 2015-04-20 14:48:02.000000000 -0500
  4121. +++ linux-3.18.12/drivers/ide/ide-iops.c 2015-04-26 13:32:22.391684003 -0500
  4122. @@ -129,12 +129,12 @@
  4123. if ((stat & ATA_BUSY) == 0)
  4124. break;
  4125. - local_irq_restore(flags);
  4126. + local_irq_restore_nort(flags);
  4127. *rstat = stat;
  4128. return -EBUSY;
  4129. }
  4130. }
  4131. - local_irq_restore(flags);
  4132. + local_irq_restore_nort(flags);
  4133. }
  4134. /*
  4135. * Allow status to settle, then read it again.
  4136. diff -Nur linux-3.18.12.orig/drivers/ide/ide-io-std.c linux-3.18.12/drivers/ide/ide-io-std.c
  4137. --- linux-3.18.12.orig/drivers/ide/ide-io-std.c 2015-04-20 14:48:02.000000000 -0500
  4138. +++ linux-3.18.12/drivers/ide/ide-io-std.c 2015-04-26 13:32:22.391684003 -0500
  4139. @@ -175,7 +175,7 @@
  4140. unsigned long uninitialized_var(flags);
  4141. if ((io_32bit & 2) && !mmio) {
  4142. - local_irq_save(flags);
  4143. + local_irq_save_nort(flags);
  4144. ata_vlb_sync(io_ports->nsect_addr);
  4145. }
  4146. @@ -186,7 +186,7 @@
  4147. insl(data_addr, buf, words);
  4148. if ((io_32bit & 2) && !mmio)
  4149. - local_irq_restore(flags);
  4150. + local_irq_restore_nort(flags);
  4151. if (((len + 1) & 3) < 2)
  4152. return;
  4153. @@ -219,7 +219,7 @@
  4154. unsigned long uninitialized_var(flags);
  4155. if ((io_32bit & 2) && !mmio) {
  4156. - local_irq_save(flags);
  4157. + local_irq_save_nort(flags);
  4158. ata_vlb_sync(io_ports->nsect_addr);
  4159. }
  4160. @@ -230,7 +230,7 @@
  4161. outsl(data_addr, buf, words);
  4162. if ((io_32bit & 2) && !mmio)
  4163. - local_irq_restore(flags);
  4164. + local_irq_restore_nort(flags);
  4165. if (((len + 1) & 3) < 2)
  4166. return;
  4167. diff -Nur linux-3.18.12.orig/drivers/ide/ide-probe.c linux-3.18.12/drivers/ide/ide-probe.c
  4168. --- linux-3.18.12.orig/drivers/ide/ide-probe.c 2015-04-20 14:48:02.000000000 -0500
  4169. +++ linux-3.18.12/drivers/ide/ide-probe.c 2015-04-26 13:32:22.391684003 -0500
  4170. @@ -196,10 +196,10 @@
  4171. int bswap = 1;
  4172. /* local CPU only; some systems need this */
  4173. - local_irq_save(flags);
  4174. + local_irq_save_nort(flags);
  4175. /* read 512 bytes of id info */
  4176. hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
  4177. - local_irq_restore(flags);
  4178. + local_irq_restore_nort(flags);
  4179. drive->dev_flags |= IDE_DFLAG_ID_READ;
  4180. #ifdef DEBUG
  4181. diff -Nur linux-3.18.12.orig/drivers/ide/ide-taskfile.c linux-3.18.12/drivers/ide/ide-taskfile.c
  4182. --- linux-3.18.12.orig/drivers/ide/ide-taskfile.c 2015-04-20 14:48:02.000000000 -0500
  4183. +++ linux-3.18.12/drivers/ide/ide-taskfile.c 2015-04-26 13:32:22.391684003 -0500
  4184. @@ -250,7 +250,7 @@
  4185. page_is_high = PageHighMem(page);
  4186. if (page_is_high)
  4187. - local_irq_save(flags);
  4188. + local_irq_save_nort(flags);
  4189. buf = kmap_atomic(page) + offset;
  4190. @@ -271,7 +271,7 @@
  4191. kunmap_atomic(buf);
  4192. if (page_is_high)
  4193. - local_irq_restore(flags);
  4194. + local_irq_restore_nort(flags);
  4195. len -= nr_bytes;
  4196. }
  4197. @@ -414,7 +414,7 @@
  4198. }
  4199. if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
  4200. - local_irq_disable();
  4201. + local_irq_disable_nort();
  4202. ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
  4203. diff -Nur linux-3.18.12.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c linux-3.18.12/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
  4204. --- linux-3.18.12.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2015-04-20 14:48:02.000000000 -0500
  4205. +++ linux-3.18.12/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 2015-04-26 13:32:22.391684003 -0500
  4206. @@ -796,7 +796,7 @@
  4207. ipoib_mcast_stop_thread(dev, 0);
  4208. - local_irq_save(flags);
  4209. + local_irq_save_nort(flags);
  4210. netif_addr_lock(dev);
  4211. spin_lock(&priv->lock);
  4212. @@ -878,7 +878,7 @@
  4213. spin_unlock(&priv->lock);
  4214. netif_addr_unlock(dev);
  4215. - local_irq_restore(flags);
  4216. + local_irq_restore_nort(flags);
  4217. /* We have to cancel outside of the spinlock */
  4218. list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
  4219. diff -Nur linux-3.18.12.orig/drivers/input/gameport/gameport.c linux-3.18.12/drivers/input/gameport/gameport.c
  4220. --- linux-3.18.12.orig/drivers/input/gameport/gameport.c 2015-04-20 14:48:02.000000000 -0500
  4221. +++ linux-3.18.12/drivers/input/gameport/gameport.c 2015-04-26 13:32:22.391684003 -0500
  4222. @@ -124,12 +124,12 @@
  4223. tx = 1 << 30;
  4224. for(i = 0; i < 50; i++) {
  4225. - local_irq_save(flags);
  4226. + local_irq_save_nort(flags);
  4227. GET_TIME(t1);
  4228. for (t = 0; t < 50; t++) gameport_read(gameport);
  4229. GET_TIME(t2);
  4230. GET_TIME(t3);
  4231. - local_irq_restore(flags);
  4232. + local_irq_restore_nort(flags);
  4233. udelay(i * 10);
  4234. if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
  4235. }
  4236. @@ -148,11 +148,11 @@
  4237. tx = 1 << 30;
  4238. for(i = 0; i < 50; i++) {
  4239. - local_irq_save(flags);
  4240. + local_irq_save_nort(flags);
  4241. rdtscl(t1);
  4242. for (t = 0; t < 50; t++) gameport_read(gameport);
  4243. rdtscl(t2);
  4244. - local_irq_restore(flags);
  4245. + local_irq_restore_nort(flags);
  4246. udelay(i * 10);
  4247. if (t2 - t1 < tx) tx = t2 - t1;
  4248. }
  4249. diff -Nur linux-3.18.12.orig/drivers/leds/trigger/Kconfig linux-3.18.12/drivers/leds/trigger/Kconfig
  4250. --- linux-3.18.12.orig/drivers/leds/trigger/Kconfig 2015-04-20 14:48:02.000000000 -0500
  4251. +++ linux-3.18.12/drivers/leds/trigger/Kconfig 2015-04-26 13:32:22.391684003 -0500
  4252. @@ -61,7 +61,7 @@
  4253. config LEDS_TRIGGER_CPU
  4254. bool "LED CPU Trigger"
  4255. - depends on LEDS_TRIGGERS
  4256. + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
  4257. help
  4258. This allows LEDs to be controlled by active CPUs. This shows
  4259. the active CPUs across an array of LEDs so you can see which
  4260. diff -Nur linux-3.18.12.orig/drivers/md/bcache/Kconfig linux-3.18.12/drivers/md/bcache/Kconfig
  4261. --- linux-3.18.12.orig/drivers/md/bcache/Kconfig 2015-04-20 14:48:02.000000000 -0500
  4262. +++ linux-3.18.12/drivers/md/bcache/Kconfig 2015-04-26 13:32:22.391684003 -0500
  4263. @@ -1,6 +1,7 @@
  4264. config BCACHE
  4265. tristate "Block device as cache"
  4266. + depends on !PREEMPT_RT_FULL
  4267. ---help---
  4268. Allows a block device to be used as cache for other devices; uses
  4269. a btree for indexing and the layout is optimized for SSDs.
  4270. diff -Nur linux-3.18.12.orig/drivers/md/dm.c linux-3.18.12/drivers/md/dm.c
  4271. --- linux-3.18.12.orig/drivers/md/dm.c 2015-04-20 14:48:02.000000000 -0500
  4272. +++ linux-3.18.12/drivers/md/dm.c 2015-04-26 13:32:22.395684003 -0500
  4273. @@ -1898,14 +1898,14 @@
  4274. if (map_request(ti, clone, md))
  4275. goto requeued;
  4276. - BUG_ON(!irqs_disabled());
  4277. + BUG_ON_NONRT(!irqs_disabled());
  4278. spin_lock(q->queue_lock);
  4279. }
  4280. goto out;
  4281. requeued:
  4282. - BUG_ON(!irqs_disabled());
  4283. + BUG_ON_NONRT(!irqs_disabled());
  4284. spin_lock(q->queue_lock);
  4285. delay_and_out:
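BUG_ON_NONRT() is another helper added by this series; a conceptual sketch of its intent (the real definition sits in the bug.h changes of the patch): the assertion stays active on non-RT builds but is compiled out on PREEMPT_RT, where this path can legitimately run with interrupts enabled.

/* Conceptual sketch only -- see the asm-generic/bug.h changes in this series. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define BUG_ON_NONRT(c)	do { } while (0)
#else
# define BUG_ON_NONRT(c)	BUG_ON(c)
#endif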
  4286. diff -Nur linux-3.18.12.orig/drivers/md/raid5.c linux-3.18.12/drivers/md/raid5.c
  4287. --- linux-3.18.12.orig/drivers/md/raid5.c 2015-04-20 14:48:02.000000000 -0500
  4288. +++ linux-3.18.12/drivers/md/raid5.c 2015-04-26 13:32:22.395684003 -0500
  4289. @@ -1649,8 +1649,9 @@
  4290. struct raid5_percpu *percpu;
  4291. unsigned long cpu;
  4292. - cpu = get_cpu();
  4293. + cpu = get_cpu_light();
  4294. percpu = per_cpu_ptr(conf->percpu, cpu);
  4295. + spin_lock(&percpu->lock);
  4296. if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
  4297. ops_run_biofill(sh);
  4298. overlap_clear++;
  4299. @@ -1702,7 +1703,8 @@
  4300. if (test_and_clear_bit(R5_Overlap, &dev->flags))
  4301. wake_up(&sh->raid_conf->wait_for_overlap);
  4302. }
  4303. - put_cpu();
  4304. + spin_unlock(&percpu->lock);
  4305. + put_cpu_light();
  4306. }
  4307. static int grow_one_stripe(struct r5conf *conf, int hash)
  4308. @@ -5708,6 +5710,7 @@
  4309. __func__, cpu);
  4310. break;
  4311. }
  4312. + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
  4313. }
  4314. put_online_cpus();
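The raid5 hunks above replace get_cpu(), which disables preemption, with get_cpu_light() plus an explicit per-CPU spinlock (added to struct raid5_percpu just below). A generic sketch of that pattern; my_percpu and my_use_percpu are hypothetical, while get_cpu_light()/put_cpu_light() are helpers provided by this series:

/* Per-CPU data serialized by a lock instead of by disabling preemption. */
struct my_percpu {
	spinlock_t lock;
	/* ... per-CPU scratch state ... */
};

static void my_use_percpu(struct my_percpu __percpu *pcpu)
{
	int cpu = get_cpu_light();	/* on RT: stays preemptible, only pins the CPU */
	struct my_percpu *p = per_cpu_ptr(pcpu, cpu);

	spin_lock(&p->lock);		/* serialize with tasks that preempted us */
	/* ... operate on p ... */
	spin_unlock(&p->lock);
	put_cpu_light();
}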
  4315. diff -Nur linux-3.18.12.orig/drivers/md/raid5.h linux-3.18.12/drivers/md/raid5.h
  4316. --- linux-3.18.12.orig/drivers/md/raid5.h 2015-04-20 14:48:02.000000000 -0500
  4317. +++ linux-3.18.12/drivers/md/raid5.h 2015-04-26 13:32:22.395684003 -0500
  4318. @@ -457,6 +457,7 @@
  4319. int recovery_disabled;
  4320. /* per cpu variables */
  4321. struct raid5_percpu {
  4322. + spinlock_t lock; /* Protection for -RT */
  4323. struct page *spare_page; /* Used when checking P/Q in raid6 */
  4324. void *scribble; /* space for constructing buffer
  4325. * lists and performing address
  4326. diff -Nur linux-3.18.12.orig/drivers/misc/hwlat_detector.c linux-3.18.12/drivers/misc/hwlat_detector.c
  4327. --- linux-3.18.12.orig/drivers/misc/hwlat_detector.c 1969-12-31 18:00:00.000000000 -0600
  4328. +++ linux-3.18.12/drivers/misc/hwlat_detector.c 2015-04-26 13:32:22.395684003 -0500
  4329. @@ -0,0 +1,1240 @@
  4330. +/*
  4331. + * hwlat_detector.c - A simple Hardware Latency detector.
  4332. + *
  4333. + * Use this module to detect large system latencies induced by the behavior of
  4334. + * certain underlying system hardware or firmware, independent of Linux itself.
  4335. + * The code was developed originally to detect the presence of SMIs on Intel
  4336. + * and AMD systems, although there is no dependency upon x86 herein.
  4337. + *
  4338. + * The classical example usage of this module is in detecting the presence of
  4339. + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
  4340. + * somewhat special form of hardware interrupt spawned from earlier CPU debug
  4341. + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
  4342. + * LPC (or other device) to generate a special interrupt under certain
  4343. + * circumstances, for example, upon expiration of a special SMI timer device,
  4344. + * due to certain external thermal readings, on certain I/O address accesses,
  4345. + * and other situations. An SMI hits a special CPU pin, triggers a special
  4346. + * SMI mode (complete with special memory map), and the OS is unaware.
  4347. + *
  4348. + * Although certain hardware-inducing latencies are necessary (for example,
  4349. + * a modern system often requires an SMI handler for correct thermal control
  4350. + * and remote management) they can wreak havoc upon any OS-level performance
  4351. + * guarantees toward low-latency, especially when the OS is not even made
  4352. + * aware of the presence of these interrupts. For this reason, we need a
  4353. + * somewhat brute force mechanism to detect these interrupts. In this case,
  4354. + * we do it by hogging all of the CPU(s) for configurable timer intervals,
  4355. + * sampling the built-in CPU timer, looking for discontiguous readings.
  4356. + *
  4357. + * WARNING: This implementation necessarily introduces latencies. Therefore,
  4358. + * you should NEVER use this module in a production environment
  4359. + * requiring any kind of low-latency performance guarantee(s).
  4360. + *
  4361. + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
  4362. + *
  4363. + * Includes useful feedback from Clark Williams <clark@redhat.com>
  4364. + *
  4365. + * This file is licensed under the terms of the GNU General Public
  4366. + * License version 2. This program is licensed "as is" without any
  4367. + * warranty of any kind, whether express or implied.
  4368. + */
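The measurement idea described above can also be demonstrated from userspace. A standalone, illustrative C program (not part of this module, and without the interrupt disabling that makes the in-kernel version meaningful) that takes back-to-back timestamps and reports any unexplained gap:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000u + (uint64_t)(ts.tv_nsec / 1000);
}

int main(void)
{
	const uint64_t threshold_us = 10;	/* mirrors DEFAULT_LAT_THRESHOLD */
	long i;

	for (i = 0; i < 10000000L; i++) {
		uint64_t t1 = now_us();
		uint64_t t2 = now_us();

		if (t2 - t1 > threshold_us)
			printf("gap of %llu us between back-to-back reads\n",
			       (unsigned long long)(t2 - t1));
	}
	return 0;
}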
  4369. +
  4370. +#include <linux/module.h>
  4371. +#include <linux/init.h>
  4372. +#include <linux/ring_buffer.h>
  4373. +#include <linux/time.h>
  4374. +#include <linux/hrtimer.h>
  4375. +#include <linux/kthread.h>
  4376. +#include <linux/debugfs.h>
  4377. +#include <linux/seq_file.h>
  4378. +#include <linux/uaccess.h>
  4379. +#include <linux/version.h>
  4380. +#include <linux/delay.h>
  4381. +#include <linux/slab.h>
  4382. +#include <linux/trace_clock.h>
  4383. +
  4384. +#define BUF_SIZE_DEFAULT 262144UL /* 8K*(sizeof(entry)) */
  4385. +#define BUF_FLAGS (RB_FL_OVERWRITE) /* no block on full */
  4386. +#define U64STR_SIZE 22 /* 20 digits max */
  4387. +
  4388. +#define VERSION "1.0.0"
  4389. +#define BANNER "hwlat_detector: "
  4390. +#define DRVNAME "hwlat_detector"
  4391. +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */
  4392. +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */
  4393. +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */
  4394. +
  4395. +/* Module metadata */
  4396. +
  4397. +MODULE_LICENSE("GPL");
  4398. +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
  4399. +MODULE_DESCRIPTION("A simple hardware latency detector");
  4400. +MODULE_VERSION(VERSION);
  4401. +
  4402. +/* Module parameters */
  4403. +
  4404. +static int debug;
  4405. +static int enabled;
  4406. +static int threshold;
  4407. +
  4408. +module_param(debug, int, 0); /* enable debug */
  4409. +module_param(enabled, int, 0); /* enable detector */
  4410. +module_param(threshold, int, 0); /* latency threshold */
  4411. +
  4412. +/* Buffering and sampling */
  4413. +
  4414. +static struct ring_buffer *ring_buffer; /* sample buffer */
  4415. +static DEFINE_MUTEX(ring_buffer_mutex); /* lock changes */
  4416. +static unsigned long buf_size = BUF_SIZE_DEFAULT;
  4417. +static struct task_struct *kthread; /* sampling thread */
  4418. +
  4419. +/* DebugFS filesystem entries */
  4420. +
  4421. +static struct dentry *debug_dir; /* debugfs directory */
  4422. +static struct dentry *debug_max; /* maximum TSC delta */
  4423. +static struct dentry *debug_count; /* total detect count */
  4424. +static struct dentry *debug_sample_width; /* sample width us */
  4425. +static struct dentry *debug_sample_window; /* sample window us */
  4426. +static struct dentry *debug_sample; /* raw samples us */
  4427. +static struct dentry *debug_threshold; /* threshold us */
  4428. +static struct dentry *debug_enable; /* enable/disable */
  4429. +
  4430. +/* Individual samples and global state */
  4431. +
  4432. +struct sample; /* latency sample */
  4433. +struct data; /* Global state */
  4434. +
  4435. +/* Sampling functions */
  4436. +static int __buffer_add_sample(struct sample *sample);
  4437. +static struct sample *buffer_get_sample(struct sample *sample);
  4438. +
  4439. +/* Threading and state */
  4440. +static int kthread_fn(void *unused);
  4441. +static int start_kthread(void);
  4442. +static int stop_kthread(void);
  4443. +static void __reset_stats(void);
  4444. +static int init_stats(void);
  4445. +
  4446. +/* Debugfs interface */
  4447. +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
  4448. + size_t cnt, loff_t *ppos, const u64 *entry);
  4449. +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
  4450. + size_t cnt, loff_t *ppos, u64 *entry);
  4451. +static int debug_sample_fopen(struct inode *inode, struct file *filp);
  4452. +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
  4453. + size_t cnt, loff_t *ppos);
  4454. +static int debug_sample_release(struct inode *inode, struct file *filp);
  4455. +static int debug_enable_fopen(struct inode *inode, struct file *filp);
  4456. +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
  4457. + size_t cnt, loff_t *ppos);
  4458. +static ssize_t debug_enable_fwrite(struct file *file,
  4459. + const char __user *user_buffer,
  4460. + size_t user_size, loff_t *offset);
  4461. +
  4462. +/* Initialization functions */
  4463. +static int init_debugfs(void);
  4464. +static void free_debugfs(void);
  4465. +static int detector_init(void);
  4466. +static void detector_exit(void);
  4467. +
  4468. +/* Individual latency samples are stored here when detected and packed into
  4469. + * the ring_buffer circular buffer, where they are overwritten when
  4470. + * more than buf_size/sizeof(sample) samples are received. */
  4471. +struct sample {
  4472. + u64 seqnum; /* unique sequence */
  4473. + u64 duration; /* ktime delta */
  4474. + u64 outer_duration; /* ktime delta (outer loop) */
  4475. + struct timespec timestamp; /* wall time */
  4476. + unsigned long lost;
  4477. +};
  4478. +
  4479. +/* keep the global state somewhere. */
  4480. +static struct data {
  4481. +
  4482. + struct mutex lock; /* protect changes */
  4483. +
  4484. + u64 count; /* total since reset */
  4485. + u64 max_sample; /* max hardware latency */
  4486. + u64 threshold; /* sample threshold level */
  4487. +
  4488. + u64 sample_window; /* total sampling window (on+off) */
  4489. + u64 sample_width; /* active sampling portion of window */
  4490. +
  4491. + atomic_t sample_open; /* whether the sample file is open */
  4492. +
4493. + wait_queue_head_t wq; /* waitqueue for new sample values */
  4494. +
  4495. +} data;
  4496. +
  4497. +/**
  4498. + * __buffer_add_sample - add a new latency sample recording to the ring buffer
  4499. + * @sample: The new latency sample value
  4500. + *
  4501. + * This receives a new latency sample and records it in a global ring buffer.
  4502. + * No additional locking is used in this case.
  4503. + */
  4504. +static int __buffer_add_sample(struct sample *sample)
  4505. +{
  4506. + return ring_buffer_write(ring_buffer,
  4507. + sizeof(struct sample), sample);
  4508. +}
  4509. +
  4510. +/**
  4511. + * buffer_get_sample - remove a hardware latency sample from the ring buffer
  4512. + * @sample: Pre-allocated storage for the sample
  4513. + *
  4514. + * This retrieves a hardware latency sample from the global circular buffer
  4515. + */
  4516. +static struct sample *buffer_get_sample(struct sample *sample)
  4517. +{
  4518. + struct ring_buffer_event *e = NULL;
  4519. + struct sample *s = NULL;
  4520. + unsigned int cpu = 0;
  4521. +
  4522. + if (!sample)
  4523. + return NULL;
  4524. +
  4525. + mutex_lock(&ring_buffer_mutex);
  4526. + for_each_online_cpu(cpu) {
  4527. + e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
  4528. + if (e)
  4529. + break;
  4530. + }
  4531. +
  4532. + if (e) {
  4533. + s = ring_buffer_event_data(e);
  4534. + memcpy(sample, s, sizeof(struct sample));
  4535. + } else
  4536. + sample = NULL;
  4537. + mutex_unlock(&ring_buffer_mutex);
  4538. +
  4539. + return sample;
  4540. +}
  4541. +
  4542. +#ifndef CONFIG_TRACING
  4543. +#define time_type ktime_t
  4544. +#define time_get() ktime_get()
  4545. +#define time_to_us(x) ktime_to_us(x)
  4546. +#define time_sub(a, b) ktime_sub(a, b)
  4547. +#define init_time(a, b) (a).tv64 = b
  4548. +#define time_u64(a) ((a).tv64)
  4549. +#else
  4550. +#define time_type u64
  4551. +#define time_get() trace_clock_local()
  4552. +#define time_to_us(x) div_u64(x, 1000)
  4553. +#define time_sub(a, b) ((a) - (b))
  4554. +#define init_time(a, b) (a = b)
  4555. +#define time_u64(a) a
  4556. +#endif
  4557. +/**
  4558. + * get_sample - sample the CPU TSC and look for likely hardware latencies
  4559. + *
  4560. + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
  4561. + * hardware-induced latency. Called with interrupts disabled and with
  4562. + * data.lock held.
  4563. + */
  4564. +static int get_sample(void)
  4565. +{
  4566. + time_type start, t1, t2, last_t2;
  4567. + s64 diff, total = 0;
  4568. + u64 sample = 0;
  4569. + u64 outer_sample = 0;
  4570. + int ret = -1;
  4571. +
  4572. + init_time(last_t2, 0);
  4573. + start = time_get(); /* start timestamp */
  4574. +
  4575. + do {
  4576. +
  4577. + t1 = time_get(); /* we'll look for a discontinuity */
  4578. + t2 = time_get();
  4579. +
  4580. + if (time_u64(last_t2)) {
  4581. + /* Check the delta from outer loop (t2 to next t1) */
  4582. + diff = time_to_us(time_sub(t1, last_t2));
  4583. + /* This shouldn't happen */
  4584. + if (diff < 0) {
  4585. + pr_err(BANNER "time running backwards\n");
  4586. + goto out;
  4587. + }
  4588. + if (diff > outer_sample)
  4589. + outer_sample = diff;
  4590. + }
  4591. + last_t2 = t2;
  4592. +
  4593. + total = time_to_us(time_sub(t2, start)); /* sample width */
  4594. +
  4595. + /* This checks the inner loop (t1 to t2) */
  4596. + diff = time_to_us(time_sub(t2, t1)); /* current diff */
  4597. +
  4598. + /* This shouldn't happen */
  4599. + if (diff < 0) {
  4600. + pr_err(BANNER "time running backwards\n");
  4601. + goto out;
  4602. + }
  4603. +
  4604. + if (diff > sample)
  4605. + sample = diff; /* only want highest value */
  4606. +
  4607. + } while (total <= data.sample_width);
  4608. +
  4609. + ret = 0;
  4610. +
  4611. + /* If we exceed the threshold value, we have found a hardware latency */
  4612. + if (sample > data.threshold || outer_sample > data.threshold) {
  4613. + struct sample s;
  4614. +
  4615. + ret = 1;
  4616. +
  4617. + data.count++;
  4618. + s.seqnum = data.count;
  4619. + s.duration = sample;
  4620. + s.outer_duration = outer_sample;
  4621. + s.timestamp = CURRENT_TIME;
  4622. + __buffer_add_sample(&s);
  4623. +
  4624. + /* Keep a running maximum ever recorded hardware latency */
  4625. + if (sample > data.max_sample)
  4626. + data.max_sample = sample;
  4627. + }
  4628. +
  4629. +out:
  4630. + return ret;
  4631. +}
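A short, hypothetical walk-through of the loop above with the default 10 us threshold, showing how the inner and outer checks differ:

	iteration N:   t1 = 1000 us, t2 = 1003 us   -> inner diff 3 us (below threshold)
	iteration N+1: t1 = 1021 us                 -> outer diff t1 - last_t2 = 18 us
	18 us > 10 us, so outer_sample becomes 18 and the window counts as a hit:
	data.count is incremented and s.outer_duration = 18, while max_sample is
	only updated when the inner sample exceeds the previous maximum.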
  4632. +
  4633. +/*
  4634. + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
  4635. + * @unused: A required part of the kthread API.
  4636. + *
  4637. + * Used to periodically sample the CPU TSC via a call to get_sample. We
  4638. + * disable interrupts, which does (intentionally) introduce latency since we
  4639. + * need to ensure nothing else might be running (and thus pre-empting).
  4640. + * Obviously this should never be used in production environments.
  4641. + *
4642. + * Currently this runs on whichever CPU it was scheduled on, although most
4643. + * real-world hardware latency situations occur across several CPUs; we
4644. + * might later generalize this if we find there are actual systems with
4645. + * alternate SMI delivery or other hardware latencies.
  4646. + */
  4647. +static int kthread_fn(void *unused)
  4648. +{
  4649. + int ret;
  4650. + u64 interval;
  4651. +
  4652. + while (!kthread_should_stop()) {
  4653. +
  4654. + mutex_lock(&data.lock);
  4655. +
  4656. + local_irq_disable();
  4657. + ret = get_sample();
  4658. + local_irq_enable();
  4659. +
  4660. + if (ret > 0)
  4661. + wake_up(&data.wq); /* wake up reader(s) */
  4662. +
  4663. + interval = data.sample_window - data.sample_width;
  4664. + do_div(interval, USEC_PER_MSEC); /* modifies interval value */
  4665. +
  4666. + mutex_unlock(&data.lock);
  4667. +
  4668. + if (msleep_interruptible(interval))
  4669. + break;
  4670. + }
  4671. +
  4672. + return 0;
  4673. +}
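With the module defaults defined at the top of the file, the arithmetic in the loop above works out to a 50% duty cycle:

	interval = sample_window - sample_width = 1000000 us - 500000 us = 500000 us
	do_div(interval, USEC_PER_MSEC)         = 500000 / 1000          = 500 ms
	msleep_interruptible(500)               -> sample for 0.5 s, sleep 0.5 s, repeat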
  4674. +
  4675. +/**
  4676. + * start_kthread - Kick off the hardware latency sampling/detector kthread
  4677. + *
  4678. + * This starts a kernel thread that will sit and sample the CPU timestamp
  4679. + * counter (TSC or similar) and look for potential hardware latencies.
  4680. + */
  4681. +static int start_kthread(void)
  4682. +{
  4683. + kthread = kthread_run(kthread_fn, NULL,
  4684. + DRVNAME);
  4685. + if (IS_ERR(kthread)) {
  4686. + pr_err(BANNER "could not start sampling thread\n");
  4687. + enabled = 0;
  4688. + return -ENOMEM;
  4689. + }
  4690. +
  4691. + return 0;
  4692. +}
  4693. +
  4694. +/**
4695. + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
  4696. + *
  4697. + * This kicks the running hardware latency sampling/detector kernel thread and
  4698. + * tells it to stop sampling now. Use this on unload and at system shutdown.
  4699. + */
  4700. +static int stop_kthread(void)
  4701. +{
  4702. + int ret;
  4703. +
  4704. + ret = kthread_stop(kthread);
  4705. +
  4706. + return ret;
  4707. +}
  4708. +
  4709. +/**
  4710. + * __reset_stats - Reset statistics for the hardware latency detector
  4711. + *
  4712. + * We use data to store various statistics and global state. We call this
  4713. + * function in order to reset those when "enable" is toggled on or off, and
  4714. + * also at initialization. Should be called with data.lock held.
  4715. + */
  4716. +static void __reset_stats(void)
  4717. +{
  4718. + data.count = 0;
  4719. + data.max_sample = 0;
  4720. + ring_buffer_reset(ring_buffer); /* flush out old sample entries */
  4721. +}
  4722. +
  4723. +/**
  4724. + * init_stats - Setup global state statistics for the hardware latency detector
  4725. + *
  4726. + * We use data to store various statistics and global state. We also use
  4727. + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
  4728. + * induced system latencies. This function initializes these structures and
  4729. + * allocates the global ring buffer also.
  4730. + */
  4731. +static int init_stats(void)
  4732. +{
  4733. + int ret = -ENOMEM;
  4734. +
  4735. + mutex_init(&data.lock);
  4736. + init_waitqueue_head(&data.wq);
  4737. + atomic_set(&data.sample_open, 0);
  4738. +
  4739. + ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
  4740. +
  4741. + if (WARN(!ring_buffer, KERN_ERR BANNER
  4742. + "failed to allocate ring buffer!\n"))
  4743. + goto out;
  4744. +
  4745. + __reset_stats();
  4746. + data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
  4747. + data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
  4748. + data.sample_width = DEFAULT_SAMPLE_WIDTH; /* width us */
  4749. +
  4750. + ret = 0;
  4751. +
  4752. +out:
  4753. + return ret;
  4754. +
  4755. +}
  4756. +
  4757. +/*
  4758. + * simple_data_read - Wrapper read function for global state debugfs entries
  4759. + * @filp: The active open file structure for the debugfs "file"
  4760. + * @ubuf: The userspace provided buffer to read value into
  4761. + * @cnt: The maximum number of bytes to read
  4762. + * @ppos: The current "file" position
  4763. + * @entry: The entry to read from
  4764. + *
  4765. + * This function provides a generic read implementation for the global state
  4766. + * "data" structure debugfs filesystem entries. It would be nice to use
  4767. + * simple_attr_read directly, but we need to make sure that the data.lock
  4768. + * is held during the actual read.
  4769. + */
  4770. +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
  4771. + size_t cnt, loff_t *ppos, const u64 *entry)
  4772. +{
  4773. + char buf[U64STR_SIZE];
  4774. + u64 val = 0;
  4775. + int len = 0;
  4776. +
  4777. + memset(buf, 0, sizeof(buf));
  4778. +
  4779. + if (!entry)
  4780. + return -EFAULT;
  4781. +
  4782. + mutex_lock(&data.lock);
  4783. + val = *entry;
  4784. + mutex_unlock(&data.lock);
  4785. +
  4786. + len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
  4787. +
  4788. + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
  4789. +
  4790. +}
  4791. +
  4792. +/*
  4793. + * simple_data_write - Wrapper write function for global state debugfs entries
  4794. + * @filp: The active open file structure for the debugfs "file"
  4795. + * @ubuf: The userspace provided buffer to write value from
  4796. + * @cnt: The maximum number of bytes to write
  4797. + * @ppos: The current "file" position
  4798. + * @entry: The entry to write to
  4799. + *
  4800. + * This function provides a generic write implementation for the global state
  4801. + * "data" structure debugfs filesystem entries. It would be nice to use
  4802. + * simple_attr_write directly, but we need to make sure that the data.lock
  4803. + * is held during the actual write.
  4804. + */
  4805. +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
  4806. + size_t cnt, loff_t *ppos, u64 *entry)
  4807. +{
  4808. + char buf[U64STR_SIZE];
  4809. + int csize = min(cnt, sizeof(buf));
  4810. + u64 val = 0;
  4811. + int err = 0;
  4812. +
  4813. + memset(buf, '\0', sizeof(buf));
  4814. + if (copy_from_user(buf, ubuf, csize))
  4815. + return -EFAULT;
  4816. +
  4817. + buf[U64STR_SIZE-1] = '\0'; /* just in case */
  4818. + err = kstrtoull(buf, 10, &val);
  4819. + if (err)
  4820. + return -EINVAL;
  4821. +
  4822. + mutex_lock(&data.lock);
  4823. + *entry = val;
  4824. + mutex_unlock(&data.lock);
  4825. +
  4826. + return csize;
  4827. +}
  4828. +
  4829. +/**
  4830. + * debug_count_fopen - Open function for "count" debugfs entry
  4831. + * @inode: The in-kernel inode representation of the debugfs "file"
  4832. + * @filp: The active open file structure for the debugfs "file"
  4833. + *
  4834. + * This function provides an open implementation for the "count" debugfs
  4835. + * interface to the hardware latency detector.
  4836. + */
  4837. +static int debug_count_fopen(struct inode *inode, struct file *filp)
  4838. +{
  4839. + return 0;
  4840. +}
  4841. +
  4842. +/**
  4843. + * debug_count_fread - Read function for "count" debugfs entry
  4844. + * @filp: The active open file structure for the debugfs "file"
  4845. + * @ubuf: The userspace provided buffer to read value into
  4846. + * @cnt: The maximum number of bytes to read
  4847. + * @ppos: The current "file" position
  4848. + *
  4849. + * This function provides a read implementation for the "count" debugfs
  4850. + * interface to the hardware latency detector. Can be used to read the
  4851. + * number of latency readings exceeding the configured threshold since
  4852. + * the detector was last reset (e.g. by writing a zero into "count").
  4853. + */
  4854. +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
  4855. + size_t cnt, loff_t *ppos)
  4856. +{
  4857. + return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
  4858. +}
  4859. +
  4860. +/**
  4861. + * debug_count_fwrite - Write function for "count" debugfs entry
  4862. + * @filp: The active open file structure for the debugfs "file"
  4863. + * @ubuf: The user buffer that contains the value to write
  4864. + * @cnt: The maximum number of bytes to write to "file"
  4865. + * @ppos: The current position in the debugfs "file"
  4866. + *
  4867. + * This function provides a write implementation for the "count" debugfs
  4868. + * interface to the hardware latency detector. Can be used to write a
  4869. + * desired value, especially to zero the total count.
  4870. + */
  4871. +static ssize_t debug_count_fwrite(struct file *filp,
  4872. + const char __user *ubuf,
  4873. + size_t cnt,
  4874. + loff_t *ppos)
  4875. +{
  4876. + return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
  4877. +}
  4878. +
  4879. +/**
  4880. + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
  4881. + * @inode: The in-kernel inode representation of the debugfs "file"
  4882. + * @filp: The active open file structure for the debugfs "file"
  4883. + *
  4884. + * This function provides an open implementation for the "enable" debugfs
  4885. + * interface to the hardware latency detector.
  4886. + */
  4887. +static int debug_enable_fopen(struct inode *inode, struct file *filp)
  4888. +{
  4889. + return 0;
  4890. +}
  4891. +
  4892. +/**
  4893. + * debug_enable_fread - Read function for "enable" debugfs interface
  4894. + * @filp: The active open file structure for the debugfs "file"
  4895. + * @ubuf: The userspace provided buffer to read value into
  4896. + * @cnt: The maximum number of bytes to read
  4897. + * @ppos: The current "file" position
  4898. + *
  4899. + * This function provides a read implementation for the "enable" debugfs
  4900. + * interface to the hardware latency detector. Can be used to determine
  4901. + * whether the detector is currently enabled ("0\n" or "1\n" returned).
  4902. + */
  4903. +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
  4904. + size_t cnt, loff_t *ppos)
  4905. +{
  4906. + char buf[4];
  4907. +
  4908. + if ((cnt < sizeof(buf)) || (*ppos))
  4909. + return 0;
  4910. +
  4911. + buf[0] = enabled ? '1' : '0';
  4912. + buf[1] = '\n';
  4913. + buf[2] = '\0';
  4914. + if (copy_to_user(ubuf, buf, strlen(buf)))
  4915. + return -EFAULT;
  4916. + return *ppos = strlen(buf);
  4917. +}
  4918. +
  4919. +/**
  4920. + * debug_enable_fwrite - Write function for "enable" debugfs interface
  4921. + * @filp: The active open file structure for the debugfs "file"
  4922. + * @ubuf: The user buffer that contains the value to write
  4923. + * @cnt: The maximum number of bytes to write to "file"
  4924. + * @ppos: The current position in the debugfs "file"
  4925. + *
  4926. + * This function provides a write implementation for the "enable" debugfs
  4927. + * interface to the hardware latency detector. Can be used to enable or
  4928. + * disable the detector, which will have the side-effect of possibly
  4929. + * also resetting the global stats and kicking off the measuring
  4930. + * kthread (on an enable) or the converse (upon a disable).
  4931. + */
  4932. +static ssize_t debug_enable_fwrite(struct file *filp,
  4933. + const char __user *ubuf,
  4934. + size_t cnt,
  4935. + loff_t *ppos)
  4936. +{
  4937. + char buf[4];
  4938. + int csize = min(cnt, sizeof(buf));
  4939. + long val = 0;
  4940. + int err = 0;
  4941. +
  4942. + memset(buf, '\0', sizeof(buf));
  4943. + if (copy_from_user(buf, ubuf, csize))
  4944. + return -EFAULT;
  4945. +
  4946. + buf[sizeof(buf)-1] = '\0'; /* just in case */
  4947. + err = kstrtoul(buf, 10, &val);
  4948. + if (0 != err)
  4949. + return -EINVAL;
  4950. +
  4951. + if (val) {
  4952. + if (enabled)
  4953. + goto unlock;
  4954. + enabled = 1;
  4955. + __reset_stats();
  4956. + if (start_kthread())
  4957. + return -EFAULT;
  4958. + } else {
  4959. + if (!enabled)
  4960. + goto unlock;
  4961. + enabled = 0;
  4962. + err = stop_kthread();
  4963. + if (err) {
  4964. + pr_err(BANNER "cannot stop kthread\n");
  4965. + return -EFAULT;
  4966. + }
  4967. + wake_up(&data.wq); /* reader(s) should return */
  4968. + }
  4969. +unlock:
  4970. + return csize;
  4971. +}
  4972. +
  4973. +/**
  4974. + * debug_max_fopen - Open function for "max" debugfs entry
  4975. + * @inode: The in-kernel inode representation of the debugfs "file"
  4976. + * @filp: The active open file structure for the debugfs "file"
  4977. + *
  4978. + * This function provides an open implementation for the "max" debugfs
  4979. + * interface to the hardware latency detector.
  4980. + */
  4981. +static int debug_max_fopen(struct inode *inode, struct file *filp)
  4982. +{
  4983. + return 0;
  4984. +}
  4985. +
  4986. +/**
  4987. + * debug_max_fread - Read function for "max" debugfs entry
  4988. + * @filp: The active open file structure for the debugfs "file"
  4989. + * @ubuf: The userspace provided buffer to read value into
  4990. + * @cnt: The maximum number of bytes to read
  4991. + * @ppos: The current "file" position
  4992. + *
  4993. + * This function provides a read implementation for the "max" debugfs
  4994. + * interface to the hardware latency detector. Can be used to determine
  4995. + * the maximum latency value observed since it was last reset.
  4996. + */
  4997. +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
  4998. + size_t cnt, loff_t *ppos)
  4999. +{
  5000. + return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
  5001. +}
  5002. +
  5003. +/**
  5004. + * debug_max_fwrite - Write function for "max" debugfs entry
  5005. + * @filp: The active open file structure for the debugfs "file"
  5006. + * @ubuf: The user buffer that contains the value to write
  5007. + * @cnt: The maximum number of bytes to write to "file"
  5008. + * @ppos: The current position in the debugfs "file"
  5009. + *
  5010. + * This function provides a write implementation for the "max" debugfs
  5011. + * interface to the hardware latency detector. Can be used to reset the
  5012. + * maximum or set it to some other desired value - if, then, subsequent
  5013. + * measurements exceed this value, the maximum will be updated.
  5014. + */
  5015. +static ssize_t debug_max_fwrite(struct file *filp,
  5016. + const char __user *ubuf,
  5017. + size_t cnt,
  5018. + loff_t *ppos)
  5019. +{
  5020. + return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
  5021. +}
  5022. +
  5023. +
  5024. +/**
  5025. + * debug_sample_fopen - An open function for "sample" debugfs interface
  5026. + * @inode: The in-kernel inode representation of this debugfs "file"
  5027. + * @filp: The active open file structure for the debugfs "file"
  5028. + *
  5029. + * This function handles opening the "sample" file within the hardware
  5030. + * latency detector debugfs directory interface. This file is used to read
  5031. + * raw samples from the global ring_buffer and allows the user to see a
  5032. + * running latency history. Can be opened blocking or non-blocking,
  5033. + * affecting whether it behaves as a buffer read pipe, or does not.
  5034. + * Implements simple locking to prevent multiple simultaneous use.
  5035. + */
  5036. +static int debug_sample_fopen(struct inode *inode, struct file *filp)
  5037. +{
  5038. + if (!atomic_add_unless(&data.sample_open, 1, 1))
  5039. + return -EBUSY;
  5040. + else
  5041. + return 0;
  5042. +}
  5043. +
  5044. +/**
  5045. + * debug_sample_fread - A read function for "sample" debugfs interface
  5046. + * @filp: The active open file structure for the debugfs "file"
  5047. + * @ubuf: The user buffer that will contain the samples read
  5048. + * @cnt: The maximum bytes to read from the debugfs "file"
  5049. + * @ppos: The current position in the debugfs "file"
  5050. + *
  5051. + * This function handles reading from the "sample" file within the hardware
  5052. + * latency detector debugfs directory interface. This file is used to read
  5053. + * raw samples from the global ring_buffer and allows the user to see a
  5054. + * running latency history. By default this will block pending a new
  5055. + * value written into the sample buffer, unless there are already a
  5056. + * number of value(s) waiting in the buffer, or the sample file was
  5057. + * previously opened in a non-blocking mode of operation.
  5058. + */
  5059. +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
  5060. + size_t cnt, loff_t *ppos)
  5061. +{
  5062. + int len = 0;
  5063. + char buf[64];
  5064. + struct sample *sample = NULL;
  5065. +
  5066. + if (!enabled)
  5067. + return 0;
  5068. +
  5069. + sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
  5070. + if (!sample)
  5071. + return -ENOMEM;
  5072. +
  5073. + while (!buffer_get_sample(sample)) {
  5074. +
  5075. + DEFINE_WAIT(wait);
  5076. +
  5077. + if (filp->f_flags & O_NONBLOCK) {
  5078. + len = -EAGAIN;
  5079. + goto out;
  5080. + }
  5081. +
  5082. + prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
  5083. + schedule();
  5084. + finish_wait(&data.wq, &wait);
  5085. +
  5086. + if (signal_pending(current)) {
  5087. + len = -EINTR;
  5088. + goto out;
  5089. + }
  5090. +
  5091. + if (!enabled) { /* enable was toggled */
  5092. + len = 0;
  5093. + goto out;
  5094. + }
  5095. + }
  5096. +
  5097. + len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
  5098. + sample->timestamp.tv_sec,
  5099. + sample->timestamp.tv_nsec,
  5100. + sample->duration,
  5101. + sample->outer_duration);
  5102. +
  5103. +
  5104. + /* handling partial reads is more trouble than it's worth */
  5105. + if (len > cnt)
  5106. + goto out;
  5107. +
  5108. + if (copy_to_user(ubuf, buf, len))
  5109. + len = -EFAULT;
  5110. +
  5111. +out:
  5112. + kfree(sample);
  5113. + return len;
  5114. +}
  5115. +
  5116. +/**
  5117. + * debug_sample_release - Release function for "sample" debugfs interface
5118. + * @inode: The in-kernel inode representation of the debugfs "file"
  5119. + * @filp: The active open file structure for the debugfs "file"
  5120. + *
  5121. + * This function completes the close of the debugfs interface "sample" file.
  5122. + * Frees the sample_open "lock" so that other users may open the interface.
  5123. + */
  5124. +static int debug_sample_release(struct inode *inode, struct file *filp)
  5125. +{
  5126. + atomic_dec(&data.sample_open);
  5127. +
  5128. + return 0;
  5129. +}
  5130. +
  5131. +/**
  5132. + * debug_threshold_fopen - Open function for "threshold" debugfs entry
  5133. + * @inode: The in-kernel inode representation of the debugfs "file"
  5134. + * @filp: The active open file structure for the debugfs "file"
  5135. + *
  5136. + * This function provides an open implementation for the "threshold" debugfs
  5137. + * interface to the hardware latency detector.
  5138. + */
  5139. +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
  5140. +{
  5141. + return 0;
  5142. +}
  5143. +
  5144. +/**
  5145. + * debug_threshold_fread - Read function for "threshold" debugfs entry
  5146. + * @filp: The active open file structure for the debugfs "file"
  5147. + * @ubuf: The userspace provided buffer to read value into
  5148. + * @cnt: The maximum number of bytes to read
  5149. + * @ppos: The current "file" position
  5150. + *
  5151. + * This function provides a read implementation for the "threshold" debugfs
  5152. + * interface to the hardware latency detector. It can be used to determine
  5153. + * the current threshold level at which a latency will be recorded in the
  5154. + * global ring buffer, typically on the order of 10us.
  5155. + */
  5156. +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
  5157. + size_t cnt, loff_t *ppos)
  5158. +{
  5159. + return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
  5160. +}
  5161. +
  5162. +/**
  5163. + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
  5164. + * @filp: The active open file structure for the debugfs "file"
  5165. + * @ubuf: The user buffer that contains the value to write
  5166. + * @cnt: The maximum number of bytes to write to "file"
  5167. + * @ppos: The current position in the debugfs "file"
  5168. + *
  5169. + * This function provides a write implementation for the "threshold" debugfs
  5170. + * interface to the hardware latency detector. It can be used to configure
  5171. + * the threshold level at which any subsequently detected latencies will
  5172. + * be recorded into the global ring buffer.
  5173. + */
  5174. +static ssize_t debug_threshold_fwrite(struct file *filp,
  5175. + const char __user *ubuf,
  5176. + size_t cnt,
  5177. + loff_t *ppos)
  5178. +{
  5179. + int ret;
  5180. +
  5181. + ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
  5182. +
  5183. + if (enabled)
  5184. + wake_up_process(kthread);
  5185. +
  5186. + return ret;
  5187. +}
  5188. +
  5189. +/**
  5190. + * debug_width_fopen - Open function for "width" debugfs entry
  5191. + * @inode: The in-kernel inode representation of the debugfs "file"
  5192. + * @filp: The active open file structure for the debugfs "file"
  5193. + *
  5194. + * This function provides an open implementation for the "width" debugfs
  5195. + * interface to the hardware latency detector.
  5196. + */
  5197. +static int debug_width_fopen(struct inode *inode, struct file *filp)
  5198. +{
  5199. + return 0;
  5200. +}
  5201. +
  5202. +/**
  5203. + * debug_width_fread - Read function for "width" debugfs entry
  5204. + * @filp: The active open file structure for the debugfs "file"
  5205. + * @ubuf: The userspace provided buffer to read value into
  5206. + * @cnt: The maximum number of bytes to read
  5207. + * @ppos: The current "file" position
  5208. + *
  5209. + * This function provides a read implementation for the "width" debugfs
  5210. + * interface to the hardware latency detector. It can be used to determine
5211. + * for how many us of the total window we will actively sample for any
5212. + * hardware-induced latency periods. Obviously, it is not possible to
5213. + * sample constantly and still have the system respond to a sample reader,
5214. + * or, worse, appear to have gone out to lunch.
  5215. + */
  5216. +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
  5217. + size_t cnt, loff_t *ppos)
  5218. +{
  5219. + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
  5220. +}
  5221. +
  5222. +/**
  5223. + * debug_width_fwrite - Write function for "width" debugfs entry
  5224. + * @filp: The active open file structure for the debugfs "file"
  5225. + * @ubuf: The user buffer that contains the value to write
  5226. + * @cnt: The maximum number of bytes to write to "file"
  5227. + * @ppos: The current position in the debugfs "file"
  5228. + *
  5229. + * This function provides a write implementation for the "width" debugfs
  5230. + * interface to the hardware latency detector. It can be used to configure
5231. + * for how many us of the total window we will actively sample for any
5232. + * hardware-induced latency periods. Obviously, it is not possible to
5233. + * sample constantly and still have the system respond to a sample reader,
5234. + * or, worse, appear to have gone out to lunch. It is enforced that the
5235. + * width is less than the total window size.
  5236. + */
  5237. +static ssize_t debug_width_fwrite(struct file *filp,
  5238. + const char __user *ubuf,
  5239. + size_t cnt,
  5240. + loff_t *ppos)
  5241. +{
  5242. + char buf[U64STR_SIZE];
  5243. + int csize = min(cnt, sizeof(buf));
  5244. + u64 val = 0;
  5245. + int err = 0;
  5246. +
  5247. + memset(buf, '\0', sizeof(buf));
  5248. + if (copy_from_user(buf, ubuf, csize))
  5249. + return -EFAULT;
  5250. +
  5251. + buf[U64STR_SIZE-1] = '\0'; /* just in case */
  5252. + err = kstrtoull(buf, 10, &val);
  5253. + if (0 != err)
  5254. + return -EINVAL;
  5255. +
  5256. + mutex_lock(&data.lock);
  5257. + if (val < data.sample_window)
  5258. + data.sample_width = val;
  5259. + else {
  5260. + mutex_unlock(&data.lock);
  5261. + return -EINVAL;
  5262. + }
  5263. + mutex_unlock(&data.lock);
  5264. +
  5265. + if (enabled)
  5266. + wake_up_process(kthread);
  5267. +
  5268. + return csize;
  5269. +}
  5270. +
  5271. +/**
  5272. + * debug_window_fopen - Open function for "window" debugfs entry
  5273. + * @inode: The in-kernel inode representation of the debugfs "file"
  5274. + * @filp: The active open file structure for the debugfs "file"
  5275. + *
  5276. + * This function provides an open implementation for the "window" debugfs
  5277. + * interface to the hardware latency detector. The window is the total time
  5278. + * in us that will be considered one sample period. Conceptually, windows
  5279. + * occur back-to-back and contain a sample width period during which
  5280. + * actual sampling occurs.
  5281. + */
  5282. +static int debug_window_fopen(struct inode *inode, struct file *filp)
  5283. +{
  5284. + return 0;
  5285. +}
  5286. +
  5287. +/**
  5288. + * debug_window_fread - Read function for "window" debugfs entry
  5289. + * @filp: The active open file structure for the debugfs "file"
  5290. + * @ubuf: The userspace provided buffer to read value into
  5291. + * @cnt: The maximum number of bytes to read
  5292. + * @ppos: The current "file" position
  5293. + *
  5294. + * This function provides a read implementation for the "window" debugfs
  5295. + * interface to the hardware latency detector. The window is the total time
  5296. + * in us that will be considered one sample period. Conceptually, windows
  5297. + * occur back-to-back and contain a sample width period during which
  5298. + * actual sampling occurs. Can be used to read the total window size.
  5299. + */
  5300. +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
  5301. + size_t cnt, loff_t *ppos)
  5302. +{
  5303. + return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
  5304. +}
  5305. +
  5306. +/**
  5307. + * debug_window_fwrite - Write function for "window" debugfs entry
  5308. + * @filp: The active open file structure for the debugfs "file"
  5309. + * @ubuf: The user buffer that contains the value to write
  5310. + * @cnt: The maximum number of bytes to write to "file"
  5311. + * @ppos: The current position in the debugfs "file"
  5312. + *
5313. + * This function provides a write implementation for the "window" debugfs
5314. + * interface to the hardware latency detector. The window is the total time
  5315. + * in us that will be considered one sample period. Conceptually, windows
  5316. + * occur back-to-back and contain a sample width period during which
  5317. + * actual sampling occurs. Can be used to write a new total window size. It
5318. + * is enforced that any value written must be greater than the sample width
  5319. + * size, or an error results.
  5320. + */
  5321. +static ssize_t debug_window_fwrite(struct file *filp,
  5322. + const char __user *ubuf,
  5323. + size_t cnt,
  5324. + loff_t *ppos)
  5325. +{
  5326. + char buf[U64STR_SIZE];
  5327. + int csize = min(cnt, sizeof(buf));
  5328. + u64 val = 0;
  5329. + int err = 0;
  5330. +
  5331. + memset(buf, '\0', sizeof(buf));
  5332. + if (copy_from_user(buf, ubuf, csize))
  5333. + return -EFAULT;
  5334. +
  5335. + buf[U64STR_SIZE-1] = '\0'; /* just in case */
  5336. + err = kstrtoull(buf, 10, &val);
  5337. + if (0 != err)
  5338. + return -EINVAL;
  5339. +
  5340. + mutex_lock(&data.lock);
  5341. + if (data.sample_width < val)
  5342. + data.sample_window = val;
  5343. + else {
  5344. + mutex_unlock(&data.lock);
  5345. + return -EINVAL;
  5346. + }
  5347. + mutex_unlock(&data.lock);
  5348. +
  5349. + return csize;
  5350. +}
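A worked example of the constraint enforced above, using the default 500000 us sample width:

	write "2000000" to window  ->  500000 < 2000000, new window size accepted
	write  "400000" to window  ->  500000 < 400000 is false, write fails with -EINVAL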
  5351. +
  5352. +/*
  5353. + * Function pointers for the "count" debugfs file operations
  5354. + */
  5355. +static const struct file_operations count_fops = {
  5356. + .open = debug_count_fopen,
  5357. + .read = debug_count_fread,
  5358. + .write = debug_count_fwrite,
  5359. + .owner = THIS_MODULE,
  5360. +};
  5361. +
  5362. +/*
  5363. + * Function pointers for the "enable" debugfs file operations
  5364. + */
  5365. +static const struct file_operations enable_fops = {
  5366. + .open = debug_enable_fopen,
  5367. + .read = debug_enable_fread,
  5368. + .write = debug_enable_fwrite,
  5369. + .owner = THIS_MODULE,
  5370. +};
  5371. +
  5372. +/*
  5373. + * Function pointers for the "max" debugfs file operations
  5374. + */
  5375. +static const struct file_operations max_fops = {
  5376. + .open = debug_max_fopen,
  5377. + .read = debug_max_fread,
  5378. + .write = debug_max_fwrite,
  5379. + .owner = THIS_MODULE,
  5380. +};
  5381. +
  5382. +/*
  5383. + * Function pointers for the "sample" debugfs file operations
  5384. + */
  5385. +static const struct file_operations sample_fops = {
  5386. + .open = debug_sample_fopen,
  5387. + .read = debug_sample_fread,
  5388. + .release = debug_sample_release,
  5389. + .owner = THIS_MODULE,
  5390. +};
  5391. +
  5392. +/*
  5393. + * Function pointers for the "threshold" debugfs file operations
  5394. + */
  5395. +static const struct file_operations threshold_fops = {
  5396. + .open = debug_threshold_fopen,
  5397. + .read = debug_threshold_fread,
  5398. + .write = debug_threshold_fwrite,
  5399. + .owner = THIS_MODULE,
  5400. +};
  5401. +
  5402. +/*
  5403. + * Function pointers for the "width" debugfs file operations
  5404. + */
  5405. +static const struct file_operations width_fops = {
  5406. + .open = debug_width_fopen,
  5407. + .read = debug_width_fread,
  5408. + .write = debug_width_fwrite,
  5409. + .owner = THIS_MODULE,
  5410. +};
  5411. +
  5412. +/*
  5413. + * Function pointers for the "window" debugfs file operations
  5414. + */
  5415. +static const struct file_operations window_fops = {
  5416. + .open = debug_window_fopen,
  5417. + .read = debug_window_fread,
  5418. + .write = debug_window_fwrite,
  5419. + .owner = THIS_MODULE,
  5420. +};
  5421. +
  5422. +/**
  5423. + * init_debugfs - A function to initialize the debugfs interface files
  5424. + *
  5425. + * This function creates entries in debugfs for "hwlat_detector", including
  5426. + * files to read values from the detector, current samples, and the
  5427. + * maximum sample that has been captured since the hardware latency
5428. + * detector was started.
  5429. + */
  5430. +static int init_debugfs(void)
  5431. +{
  5432. + int ret = -ENOMEM;
  5433. +
  5434. + debug_dir = debugfs_create_dir(DRVNAME, NULL);
  5435. + if (!debug_dir)
  5436. + goto err_debug_dir;
  5437. +
  5438. + debug_sample = debugfs_create_file("sample", 0444,
  5439. + debug_dir, NULL,
  5440. + &sample_fops);
  5441. + if (!debug_sample)
  5442. + goto err_sample;
  5443. +
  5444. + debug_count = debugfs_create_file("count", 0444,
  5445. + debug_dir, NULL,
  5446. + &count_fops);
  5447. + if (!debug_count)
  5448. + goto err_count;
  5449. +
  5450. + debug_max = debugfs_create_file("max", 0444,
  5451. + debug_dir, NULL,
  5452. + &max_fops);
  5453. + if (!debug_max)
  5454. + goto err_max;
  5455. +
  5456. + debug_sample_window = debugfs_create_file("window", 0644,
  5457. + debug_dir, NULL,
  5458. + &window_fops);
  5459. + if (!debug_sample_window)
  5460. + goto err_window;
  5461. +
  5462. + debug_sample_width = debugfs_create_file("width", 0644,
  5463. + debug_dir, NULL,
  5464. + &width_fops);
  5465. + if (!debug_sample_width)
  5466. + goto err_width;
  5467. +
  5468. + debug_threshold = debugfs_create_file("threshold", 0644,
  5469. + debug_dir, NULL,
  5470. + &threshold_fops);
  5471. + if (!debug_threshold)
  5472. + goto err_threshold;
  5473. +
  5474. + debug_enable = debugfs_create_file("enable", 0644,
  5475. + debug_dir, &enabled,
  5476. + &enable_fops);
  5477. + if (!debug_enable)
  5478. + goto err_enable;
  5479. +
  5480. + else {
  5481. + ret = 0;
  5482. + goto out;
  5483. + }
  5484. +
  5485. +err_enable:
  5486. + debugfs_remove(debug_threshold);
  5487. +err_threshold:
  5488. + debugfs_remove(debug_sample_width);
  5489. +err_width:
  5490. + debugfs_remove(debug_sample_window);
  5491. +err_window:
  5492. + debugfs_remove(debug_max);
  5493. +err_max:
  5494. + debugfs_remove(debug_count);
  5495. +err_count:
  5496. + debugfs_remove(debug_sample);
  5497. +err_sample:
  5498. + debugfs_remove(debug_dir);
  5499. +err_debug_dir:
  5500. +out:
  5501. + return ret;
  5502. +}
  5503. +
  5504. +/**
  5505. + * free_debugfs - A function to cleanup the debugfs file interface
  5506. + */
  5507. +static void free_debugfs(void)
  5508. +{
  5509. + /* could also use a debugfs_remove_recursive */
  5510. + debugfs_remove(debug_enable);
  5511. + debugfs_remove(debug_threshold);
  5512. + debugfs_remove(debug_sample_width);
  5513. + debugfs_remove(debug_sample_window);
  5514. + debugfs_remove(debug_max);
  5515. + debugfs_remove(debug_count);
  5516. + debugfs_remove(debug_sample);
  5517. + debugfs_remove(debug_dir);
  5518. +}
  5519. +
  5520. +/**
  5521. + * detector_init - Standard module initialization code
  5522. + */
  5523. +static int detector_init(void)
  5524. +{
  5525. + int ret = -ENOMEM;
  5526. +
  5527. + pr_info(BANNER "version %s\n", VERSION);
  5528. +
  5529. + ret = init_stats();
  5530. + if (0 != ret)
  5531. + goto out;
  5532. +
  5533. + ret = init_debugfs();
  5534. + if (0 != ret)
  5535. + goto err_stats;
  5536. +
  5537. + if (enabled)
  5538. + ret = start_kthread();
  5539. +
  5540. + goto out;
  5541. +
  5542. +err_stats:
  5543. + ring_buffer_free(ring_buffer);
  5544. +out:
  5545. + return ret;
  5546. +
  5547. +}
  5548. +
  5549. +/**
  5550. + * detector_exit - Standard module cleanup code
  5551. + */
  5552. +static void detector_exit(void)
  5553. +{
  5554. + int err;
  5555. +
  5556. + if (enabled) {
  5557. + enabled = 0;
  5558. + err = stop_kthread();
  5559. + if (err)
  5560. + pr_err(BANNER "cannot stop kthread\n");
  5561. + }
  5562. +
  5563. + free_debugfs();
  5564. + ring_buffer_free(ring_buffer); /* free up the ring buffer */
  5565. +
  5566. +}
  5567. +
  5568. +module_init(detector_init);
  5569. +module_exit(detector_exit);
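The debugfs files created by init_debugfs() above are the whole user interface of the detector. As an illustration only (it is not part of the patch), a minimal user-space sketch in C that drives those files might look as follows. It assumes debugfs is mounted at /sys/kernel/debug, that the module is loaded, and that window/width/threshold take decimal microsecond values as described in the Kconfig help text further down; the exact output format of the value files may differ.

/* Illustrative user-space sketch (not part of the patch): configure the
 * hwlat_detector through its debugfs files and read back the worst latency.
 * Assumes debugfs is mounted at /sys/kernel/debug.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define HWLAT_DIR "/sys/kernel/debug/hwlat_detector/"

static int write_str(const char *file, const char *val)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path), HWLAT_DIR "%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	write_str("threshold", "10");   /* report gaps of 10 us or more     */
	write_str("window", "1000000"); /* 1 s sampling window, in us       */
	write_str("width", "500000");   /* sample for 0.5 s of each window  */
	write_str("enable", "1");       /* start the sampling kthread       */

	sleep(10);                      /* let the detector run for a while */

	fd = open(HWLAT_DIR "max", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("worst observed latency (us): %s", buf);
		}
		close(fd);
	}

	return write_str("enable", "0"); /* stop sampling again */
}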
  5570. diff -Nur linux-3.18.12.orig/drivers/misc/Kconfig linux-3.18.12/drivers/misc/Kconfig
  5571. --- linux-3.18.12.orig/drivers/misc/Kconfig 2015-04-20 14:48:02.000000000 -0500
  5572. +++ linux-3.18.12/drivers/misc/Kconfig 2015-04-26 13:32:22.395684003 -0500
  5573. @@ -54,6 +54,7 @@
  5574. config ATMEL_TCLIB
  5575. bool "Atmel AT32/AT91 Timer/Counter Library"
  5576. depends on (AVR32 || ARCH_AT91)
  5577. + default y if PREEMPT_RT_FULL
  5578. help
  5579. Select this if you want a library to allocate the Timer/Counter
  5580. blocks found on many Atmel processors. This facilitates using
  5581. @@ -69,8 +70,7 @@
  5582. are combined to make a single 32-bit timer.
  5583. When GENERIC_CLOCKEVENTS is defined, the third timer channel
  5584. - may be used as a clock event device supporting oneshot mode
  5585. - (delays of up to two seconds) based on the 32 KiHz clock.
  5586. + may be used as a clock event device supporting oneshot mode.
  5587. config ATMEL_TCB_CLKSRC_BLOCK
  5588. int
  5589. @@ -84,6 +84,15 @@
  5590. TC can be used for other purposes, such as PWM generation and
  5591. interval timing.
  5592. +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
  5593. + bool "TC Block use 32 KiHz clock"
  5594. + depends on ATMEL_TCB_CLKSRC
  5595. + default y if !PREEMPT_RT_FULL
  5596. + help
  5597. + Select this to use 32 KiHz base clock rate as TC block clock
  5598. + source for clock events.
  5599. +
  5600. +
  5601. config DUMMY_IRQ
  5602. tristate "Dummy IRQ handler"
  5603. default n
  5604. @@ -113,6 +122,35 @@
  5605. for information on the specific driver level and support statement
  5606. for your IBM server.
  5607. +config HWLAT_DETECTOR
  5608. + tristate "Testing module to detect hardware-induced latencies"
  5609. + depends on DEBUG_FS
  5610. + depends on RING_BUFFER
  5611. + default m
  5612. + ---help---
  5613. + A simple hardware latency detector. Use this module to detect
  5614. + large latencies introduced by the behavior of the underlying
  5615. + system firmware external to Linux. We do this using periodic
  5616. + use of stop_machine to grab all available CPUs and measure
  5617. + for unexplainable gaps in the CPU timestamp counter(s). By
  5618. + default, the module is not enabled until the "enable" file
  5619. + within the "hwlat_detector" debugfs directory is toggled.
  5620. +
  5621. + This module is often used to detect SMI (System Management
  5622. + Interrupts) on x86 systems, though it is not x86 specific. To
  5623. + this end, we default to using a sample window of 1 second,
  5624. + during which we will sample for 0.5 seconds. If an SMI or
  5625. + similar event occurs during that time, it is recorded
  5626. + into an 8K-sample global ring buffer until retrieved.
  5627. +
  5628. + WARNING: This software should never be enabled (it can be built
  5629. + but should not be turned on after it is loaded) in a production
  5630. + environment where high latencies are a concern since the
  5631. + sampling mechanism actually introduces latencies for
  5632. + regular tasks while the CPU(s) are being held.
  5633. +
  5634. + If unsure, say N
  5635. +
  5636. config PHANTOM
  5637. tristate "Sensable PHANToM (PCI)"
  5638. depends on PCI
  5639. diff -Nur linux-3.18.12.orig/drivers/misc/Makefile linux-3.18.12/drivers/misc/Makefile
  5640. --- linux-3.18.12.orig/drivers/misc/Makefile 2015-04-20 14:48:02.000000000 -0500
  5641. +++ linux-3.18.12/drivers/misc/Makefile 2015-04-26 13:32:22.395684003 -0500
  5642. @@ -38,6 +38,7 @@
  5643. obj-$(CONFIG_HMC6352) += hmc6352.o
  5644. obj-y += eeprom/
  5645. obj-y += cb710/
  5646. +obj-$(CONFIG_HWLAT_DETECTOR) += hwlat_detector.o
  5647. obj-$(CONFIG_SPEAR13XX_PCIE_GADGET) += spear13xx_pcie_gadget.o
  5648. obj-$(CONFIG_VMWARE_BALLOON) += vmw_balloon.o
  5649. obj-$(CONFIG_ARM_CHARLCD) += arm-charlcd.o
  5650. diff -Nur linux-3.18.12.orig/drivers/mmc/host/mmci.c linux-3.18.12/drivers/mmc/host/mmci.c
  5651. --- linux-3.18.12.orig/drivers/mmc/host/mmci.c 2015-04-20 14:48:02.000000000 -0500
  5652. +++ linux-3.18.12/drivers/mmc/host/mmci.c 2015-04-26 13:32:22.395684003 -0500
  5653. @@ -1153,15 +1153,12 @@
  5654. struct sg_mapping_iter *sg_miter = &host->sg_miter;
  5655. struct variant_data *variant = host->variant;
  5656. void __iomem *base = host->base;
  5657. - unsigned long flags;
  5658. u32 status;
  5659. status = readl(base + MMCISTATUS);
  5660. dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
  5661. - local_irq_save(flags);
  5662. -
  5663. do {
  5664. unsigned int remain, len;
  5665. char *buffer;
  5666. @@ -1201,8 +1198,6 @@
  5667. sg_miter_stop(sg_miter);
  5668. - local_irq_restore(flags);
  5669. -
  5670. /*
  5671. * If we have less than the fifo 'half-full' threshold to transfer,
  5672. * trigger a PIO interrupt as soon as any data is available.
  5673. diff -Nur linux-3.18.12.orig/drivers/mmc/host/sdhci.c linux-3.18.12/drivers/mmc/host/sdhci.c
  5674. --- linux-3.18.12.orig/drivers/mmc/host/sdhci.c 2015-04-20 14:48:02.000000000 -0500
  5675. +++ linux-3.18.12/drivers/mmc/host/sdhci.c 2015-04-26 13:32:22.399684003 -0500
  5676. @@ -2565,6 +2565,31 @@
  5677. return isr ? IRQ_HANDLED : IRQ_NONE;
  5678. }
  5679. +#ifdef CONFIG_PREEMPT_RT_BASE
  5680. +static irqreturn_t sdhci_rt_irq(int irq, void *dev_id)
  5681. +{
  5682. + irqreturn_t ret;
  5683. +
  5684. + local_bh_disable();
  5685. + ret = sdhci_irq(irq, dev_id);
  5686. + local_bh_enable();
  5687. + if (ret == IRQ_WAKE_THREAD)
  5688. + ret = sdhci_thread_irq(irq, dev_id);
  5689. + return ret;
  5690. +}
  5691. +#endif
  5692. +
  5693. +static int sdhci_req_irq(struct sdhci_host *host)
  5694. +{
  5695. +#ifdef CONFIG_PREEMPT_RT_BASE
  5696. + return request_threaded_irq(host->irq, NULL, sdhci_rt_irq,
  5697. + IRQF_SHARED, mmc_hostname(host->mmc), host);
  5698. +#else
  5699. + return request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq,
  5700. + IRQF_SHARED, mmc_hostname(host->mmc), host);
  5701. +#endif
  5702. +}
  5703. +
  5704. /*****************************************************************************\
  5705. * *
  5706. * Suspend/resume *
  5707. @@ -2632,9 +2657,7 @@
  5708. }
  5709. if (!device_may_wakeup(mmc_dev(host->mmc))) {
  5710. - ret = request_threaded_irq(host->irq, sdhci_irq,
  5711. - sdhci_thread_irq, IRQF_SHARED,
  5712. - mmc_hostname(host->mmc), host);
  5713. + ret = sdhci_req_irq(host);
  5714. if (ret)
  5715. return ret;
  5716. } else {
  5717. @@ -3253,8 +3276,7 @@
  5718. sdhci_init(host, 0);
  5719. - ret = request_threaded_irq(host->irq, sdhci_irq, sdhci_thread_irq,
  5720. - IRQF_SHARED, mmc_hostname(mmc), host);
  5721. + ret = sdhci_req_irq(host);
  5722. if (ret) {
  5723. pr_err("%s: Failed to request IRQ %d: %d\n",
  5724. mmc_hostname(mmc), host->irq, ret);
  5725. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/3com/3c59x.c linux-3.18.12/drivers/net/ethernet/3com/3c59x.c
  5726. --- linux-3.18.12.orig/drivers/net/ethernet/3com/3c59x.c 2015-04-20 14:48:02.000000000 -0500
  5727. +++ linux-3.18.12/drivers/net/ethernet/3com/3c59x.c 2015-04-26 13:32:22.399684003 -0500
  5728. @@ -842,9 +842,9 @@
  5729. {
  5730. struct vortex_private *vp = netdev_priv(dev);
  5731. unsigned long flags;
  5732. - local_irq_save(flags);
  5733. + local_irq_save_nort(flags);
  5734. (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
  5735. - local_irq_restore(flags);
  5736. + local_irq_restore_nort(flags);
  5737. }
  5738. #endif
  5739. @@ -1916,12 +1916,12 @@
  5740. * Block interrupts because vortex_interrupt does a bare spin_lock()
  5741. */
  5742. unsigned long flags;
  5743. - local_irq_save(flags);
  5744. + local_irq_save_nort(flags);
  5745. if (vp->full_bus_master_tx)
  5746. boomerang_interrupt(dev->irq, dev);
  5747. else
  5748. vortex_interrupt(dev->irq, dev);
  5749. - local_irq_restore(flags);
  5750. + local_irq_restore_nort(flags);
  5751. }
  5752. }
  5753. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/atheros/atl1c/atl1c_main.c linux-3.18.12/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
  5754. --- linux-3.18.12.orig/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 2015-04-20 14:48:02.000000000 -0500
  5755. +++ linux-3.18.12/drivers/net/ethernet/atheros/atl1c/atl1c_main.c 2015-04-26 13:32:22.399684003 -0500
  5756. @@ -2213,11 +2213,7 @@
  5757. }
  5758. tpd_req = atl1c_cal_tpd_req(skb);
  5759. - if (!spin_trylock_irqsave(&adapter->tx_lock, flags)) {
  5760. - if (netif_msg_pktdata(adapter))
  5761. - dev_info(&adapter->pdev->dev, "tx locked\n");
  5762. - return NETDEV_TX_LOCKED;
  5763. - }
  5764. + spin_lock_irqsave(&adapter->tx_lock, flags);
  5765. if (atl1c_tpd_avail(adapter, type) < tpd_req) {
  5766. /* no enough descriptor, just stop queue */
  5767. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/atheros/atl1e/atl1e_main.c linux-3.18.12/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
  5768. --- linux-3.18.12.orig/drivers/net/ethernet/atheros/atl1e/atl1e_main.c 2015-04-20 14:48:02.000000000 -0500
  5769. +++ linux-3.18.12/drivers/net/ethernet/atheros/atl1e/atl1e_main.c 2015-04-26 13:32:22.399684003 -0500
  5770. @@ -1880,8 +1880,7 @@
  5771. return NETDEV_TX_OK;
  5772. }
  5773. tpd_req = atl1e_cal_tdp_req(skb);
  5774. - if (!spin_trylock_irqsave(&adapter->tx_lock, flags))
  5775. - return NETDEV_TX_LOCKED;
  5776. + spin_lock_irqsave(&adapter->tx_lock, flags);
  5777. if (atl1e_tpd_avail(adapter) < tpd_req) {
  5778. /* no enough descriptor, just stop queue */
  5779. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/chelsio/cxgb/sge.c linux-3.18.12/drivers/net/ethernet/chelsio/cxgb/sge.c
  5780. --- linux-3.18.12.orig/drivers/net/ethernet/chelsio/cxgb/sge.c 2015-04-20 14:48:02.000000000 -0500
  5781. +++ linux-3.18.12/drivers/net/ethernet/chelsio/cxgb/sge.c 2015-04-26 13:32:22.399684003 -0500
  5782. @@ -1663,8 +1663,7 @@
  5783. struct cmdQ *q = &sge->cmdQ[qid];
  5784. unsigned int credits, pidx, genbit, count, use_sched_skb = 0;
  5785. - if (!spin_trylock(&q->lock))
  5786. - return NETDEV_TX_LOCKED;
  5787. + spin_lock(&q->lock);
  5788. reclaim_completed_tx(sge, q);
  5789. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/freescale/gianfar.c linux-3.18.12/drivers/net/ethernet/freescale/gianfar.c
  5790. --- linux-3.18.12.orig/drivers/net/ethernet/freescale/gianfar.c 2015-04-20 14:48:02.000000000 -0500
  5791. +++ linux-3.18.12/drivers/net/ethernet/freescale/gianfar.c 2015-04-26 13:32:22.399684003 -0500
  5792. @@ -1483,7 +1483,7 @@
  5793. if (netif_running(ndev)) {
  5794. - local_irq_save(flags);
  5795. + local_irq_save_nort(flags);
  5796. lock_tx_qs(priv);
  5797. gfar_halt_nodisable(priv);
  5798. @@ -1499,7 +1499,7 @@
  5799. gfar_write(&regs->maccfg1, tempval);
  5800. unlock_tx_qs(priv);
  5801. - local_irq_restore(flags);
  5802. + local_irq_restore_nort(flags);
  5803. disable_napi(priv);
  5804. @@ -1541,7 +1541,7 @@
  5805. /* Disable Magic Packet mode, in case something
  5806. * else woke us up.
  5807. */
  5808. - local_irq_save(flags);
  5809. + local_irq_save_nort(flags);
  5810. lock_tx_qs(priv);
  5811. tempval = gfar_read(&regs->maccfg2);
  5812. @@ -1551,7 +1551,7 @@
  5813. gfar_start(priv);
  5814. unlock_tx_qs(priv);
  5815. - local_irq_restore(flags);
  5816. + local_irq_restore_nort(flags);
  5817. netif_device_attach(ndev);
  5818. @@ -3307,14 +3307,14 @@
  5819. dev->stats.tx_dropped++;
  5820. atomic64_inc(&priv->extra_stats.tx_underrun);
  5821. - local_irq_save(flags);
  5822. + local_irq_save_nort(flags);
  5823. lock_tx_qs(priv);
  5824. /* Reactivate the Tx Queues */
  5825. gfar_write(&regs->tstat, gfargrp->tstat);
  5826. unlock_tx_qs(priv);
  5827. - local_irq_restore(flags);
  5828. + local_irq_restore_nort(flags);
  5829. }
  5830. netif_dbg(priv, tx_err, dev, "Transmit Error\n");
  5831. }
  5832. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/neterion/s2io.c linux-3.18.12/drivers/net/ethernet/neterion/s2io.c
  5833. --- linux-3.18.12.orig/drivers/net/ethernet/neterion/s2io.c 2015-04-20 14:48:02.000000000 -0500
  5834. +++ linux-3.18.12/drivers/net/ethernet/neterion/s2io.c 2015-04-26 13:32:22.403684003 -0500
  5835. @@ -4084,12 +4084,7 @@
  5836. [skb->priority & (MAX_TX_FIFOS - 1)];
  5837. fifo = &mac_control->fifos[queue];
  5838. - if (do_spin_lock)
  5839. - spin_lock_irqsave(&fifo->tx_lock, flags);
  5840. - else {
  5841. - if (unlikely(!spin_trylock_irqsave(&fifo->tx_lock, flags)))
  5842. - return NETDEV_TX_LOCKED;
  5843. - }
  5844. + spin_lock_irqsave(&fifo->tx_lock, flags);
  5845. if (sp->config.multiq) {
  5846. if (__netif_subqueue_stopped(dev, fifo->fifo_no)) {
  5847. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c linux-3.18.12/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
  5848. --- linux-3.18.12.orig/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c 2015-04-20 14:48:02.000000000 -0500
  5849. +++ linux-3.18.12/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c 2015-04-26 13:32:22.403684003 -0500
  5850. @@ -2137,10 +2137,8 @@
  5851. struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
  5852. unsigned long flags;
  5853. - if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) {
  5854. - /* Collision - tell upper layer to requeue */
  5855. - return NETDEV_TX_LOCKED;
  5856. - }
  5857. + spin_lock_irqsave(&tx_ring->tx_lock, flags);
  5858. +
  5859. if (unlikely(!PCH_GBE_DESC_UNUSED(tx_ring))) {
  5860. netif_stop_queue(netdev);
  5861. spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
  5862. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/realtek/8139too.c linux-3.18.12/drivers/net/ethernet/realtek/8139too.c
  5863. --- linux-3.18.12.orig/drivers/net/ethernet/realtek/8139too.c 2015-04-20 14:48:02.000000000 -0500
  5864. +++ linux-3.18.12/drivers/net/ethernet/realtek/8139too.c 2015-04-26 13:32:22.403684003 -0500
  5865. @@ -2215,7 +2215,7 @@
  5866. struct rtl8139_private *tp = netdev_priv(dev);
  5867. const int irq = tp->pci_dev->irq;
  5868. - disable_irq(irq);
  5869. + disable_irq_nosync(irq);
  5870. rtl8139_interrupt(irq, dev);
  5871. enable_irq(irq);
  5872. }
  5873. diff -Nur linux-3.18.12.orig/drivers/net/ethernet/tehuti/tehuti.c linux-3.18.12/drivers/net/ethernet/tehuti/tehuti.c
  5874. --- linux-3.18.12.orig/drivers/net/ethernet/tehuti/tehuti.c 2015-04-20 14:48:02.000000000 -0500
  5875. +++ linux-3.18.12/drivers/net/ethernet/tehuti/tehuti.c 2015-04-26 13:32:22.403684003 -0500
  5876. @@ -1629,13 +1629,8 @@
  5877. unsigned long flags;
  5878. ENTER;
  5879. - local_irq_save(flags);
  5880. - if (!spin_trylock(&priv->tx_lock)) {
  5881. - local_irq_restore(flags);
  5882. - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n",
  5883. - BDX_DRV_NAME, ndev->name);
  5884. - return NETDEV_TX_LOCKED;
  5885. - }
  5886. +
  5887. + spin_lock_irqsave(&priv->tx_lock, flags);
  5888. /* build tx descriptor */
  5889. BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */
  5890. diff -Nur linux-3.18.12.orig/drivers/net/rionet.c linux-3.18.12/drivers/net/rionet.c
  5891. --- linux-3.18.12.orig/drivers/net/rionet.c 2015-04-20 14:48:02.000000000 -0500
  5892. +++ linux-3.18.12/drivers/net/rionet.c 2015-04-26 13:32:22.403684003 -0500
  5893. @@ -174,11 +174,7 @@
  5894. unsigned long flags;
  5895. int add_num = 1;
  5896. - local_irq_save(flags);
  5897. - if (!spin_trylock(&rnet->tx_lock)) {
  5898. - local_irq_restore(flags);
  5899. - return NETDEV_TX_LOCKED;
  5900. - }
  5901. + spin_lock_irqsave(&rnet->tx_lock, flags);
  5902. if (is_multicast_ether_addr(eth->h_dest))
  5903. add_num = nets[rnet->mport->id].nact;
  5904. diff -Nur linux-3.18.12.orig/drivers/net/wireless/orinoco/orinoco_usb.c linux-3.18.12/drivers/net/wireless/orinoco/orinoco_usb.c
  5905. --- linux-3.18.12.orig/drivers/net/wireless/orinoco/orinoco_usb.c 2015-04-20 14:48:02.000000000 -0500
  5906. +++ linux-3.18.12/drivers/net/wireless/orinoco/orinoco_usb.c 2015-04-26 13:32:22.403684003 -0500
  5907. @@ -699,7 +699,7 @@
  5908. while (!ctx->done.done && msecs--)
  5909. udelay(1000);
  5910. } else {
  5911. - wait_event_interruptible(ctx->done.wait,
  5912. + swait_event_interruptible(ctx->done.wait,
  5913. ctx->done.done);
  5914. }
  5915. break;
  5916. diff -Nur linux-3.18.12.orig/drivers/pci/access.c linux-3.18.12/drivers/pci/access.c
  5917. --- linux-3.18.12.orig/drivers/pci/access.c 2015-04-20 14:48:02.000000000 -0500
  5918. +++ linux-3.18.12/drivers/pci/access.c 2015-04-26 13:32:22.403684003 -0500
  5919. @@ -434,7 +434,7 @@
  5920. WARN_ON(!dev->block_cfg_access);
  5921. dev->block_cfg_access = 0;
  5922. - wake_up_all(&pci_cfg_wait);
  5923. + wake_up_all_locked(&pci_cfg_wait);
  5924. raw_spin_unlock_irqrestore(&pci_lock, flags);
  5925. }
  5926. EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
  5927. diff -Nur linux-3.18.12.orig/drivers/scsi/fcoe/fcoe.c linux-3.18.12/drivers/scsi/fcoe/fcoe.c
  5928. --- linux-3.18.12.orig/drivers/scsi/fcoe/fcoe.c 2015-04-20 14:48:02.000000000 -0500
  5929. +++ linux-3.18.12/drivers/scsi/fcoe/fcoe.c 2015-04-26 13:32:22.403684003 -0500
  5930. @@ -1286,7 +1286,7 @@
  5931. struct sk_buff *skb;
  5932. #ifdef CONFIG_SMP
  5933. struct fcoe_percpu_s *p0;
  5934. - unsigned targ_cpu = get_cpu();
  5935. + unsigned targ_cpu = get_cpu_light();
  5936. #endif /* CONFIG_SMP */
  5937. FCOE_DBG("Destroying receive thread for CPU %d\n", cpu);
  5938. @@ -1342,7 +1342,7 @@
  5939. kfree_skb(skb);
  5940. spin_unlock_bh(&p->fcoe_rx_list.lock);
  5941. }
  5942. - put_cpu();
  5943. + put_cpu_light();
  5944. #else
  5945. /*
  5946. * This a non-SMP scenario where the singular Rx thread is
  5947. @@ -1566,11 +1566,11 @@
  5948. static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
  5949. {
  5950. struct fcoe_percpu_s *fps;
  5951. - int rc;
  5952. + int rc, cpu = get_cpu_light();
  5953. - fps = &get_cpu_var(fcoe_percpu);
  5954. + fps = &per_cpu(fcoe_percpu, cpu);
  5955. rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
  5956. - put_cpu_var(fcoe_percpu);
  5957. + put_cpu_light();
  5958. return rc;
  5959. }
  5960. @@ -1768,11 +1768,11 @@
  5961. return 0;
  5962. }
  5963. - stats = per_cpu_ptr(lport->stats, get_cpu());
  5964. + stats = per_cpu_ptr(lport->stats, get_cpu_light());
  5965. stats->InvalidCRCCount++;
  5966. if (stats->InvalidCRCCount < 5)
  5967. printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
  5968. - put_cpu();
  5969. + put_cpu_light();
  5970. return -EINVAL;
  5971. }
  5972. @@ -1848,13 +1848,13 @@
  5973. goto drop;
  5974. if (!fcoe_filter_frames(lport, fp)) {
  5975. - put_cpu();
  5976. + put_cpu_light();
  5977. fc_exch_recv(lport, fp);
  5978. return;
  5979. }
  5980. drop:
  5981. stats->ErrorFrames++;
  5982. - put_cpu();
  5983. + put_cpu_light();
  5984. kfree_skb(skb);
  5985. }
  5986. diff -Nur linux-3.18.12.orig/drivers/scsi/fcoe/fcoe_ctlr.c linux-3.18.12/drivers/scsi/fcoe/fcoe_ctlr.c
  5987. --- linux-3.18.12.orig/drivers/scsi/fcoe/fcoe_ctlr.c 2015-04-20 14:48:02.000000000 -0500
  5988. +++ linux-3.18.12/drivers/scsi/fcoe/fcoe_ctlr.c 2015-04-26 13:32:22.403684003 -0500
  5989. @@ -831,7 +831,7 @@
  5990. INIT_LIST_HEAD(&del_list);
  5991. - stats = per_cpu_ptr(fip->lp->stats, get_cpu());
  5992. + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
  5993. list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
  5994. deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
  5995. @@ -867,7 +867,7 @@
  5996. sel_time = fcf->time;
  5997. }
  5998. }
  5999. - put_cpu();
  6000. + put_cpu_light();
  6001. list_for_each_entry_safe(fcf, next, &del_list, list) {
  6002. /* Removes fcf from current list */
  6003. diff -Nur linux-3.18.12.orig/drivers/scsi/libfc/fc_exch.c linux-3.18.12/drivers/scsi/libfc/fc_exch.c
  6004. --- linux-3.18.12.orig/drivers/scsi/libfc/fc_exch.c 2015-04-20 14:48:02.000000000 -0500
  6005. +++ linux-3.18.12/drivers/scsi/libfc/fc_exch.c 2015-04-26 13:32:22.403684003 -0500
  6006. @@ -816,10 +816,10 @@
  6007. }
  6008. memset(ep, 0, sizeof(*ep));
  6009. - cpu = get_cpu();
  6010. + cpu = get_cpu_light();
  6011. pool = per_cpu_ptr(mp->pool, cpu);
  6012. spin_lock_bh(&pool->lock);
  6013. - put_cpu();
  6014. + put_cpu_light();
  6015. /* peek cache of free slot */
  6016. if (pool->left != FC_XID_UNKNOWN) {
  6017. diff -Nur linux-3.18.12.orig/drivers/scsi/libsas/sas_ata.c linux-3.18.12/drivers/scsi/libsas/sas_ata.c
  6018. --- linux-3.18.12.orig/drivers/scsi/libsas/sas_ata.c 2015-04-20 14:48:02.000000000 -0500
  6019. +++ linux-3.18.12/drivers/scsi/libsas/sas_ata.c 2015-04-26 13:32:22.407684003 -0500
  6020. @@ -191,7 +191,7 @@
  6021. /* TODO: audit callers to ensure they are ready for qc_issue to
  6022. * unconditionally re-enable interrupts
  6023. */
  6024. - local_irq_save(flags);
  6025. + local_irq_save_nort(flags);
  6026. spin_unlock(ap->lock);
  6027. /* If the device fell off, no sense in issuing commands */
  6028. @@ -261,7 +261,7 @@
  6029. out:
  6030. spin_lock(ap->lock);
  6031. - local_irq_restore(flags);
  6032. + local_irq_restore_nort(flags);
  6033. return ret;
  6034. }
  6035. diff -Nur linux-3.18.12.orig/drivers/scsi/qla2xxx/qla_inline.h linux-3.18.12/drivers/scsi/qla2xxx/qla_inline.h
  6036. --- linux-3.18.12.orig/drivers/scsi/qla2xxx/qla_inline.h 2015-04-20 14:48:02.000000000 -0500
  6037. +++ linux-3.18.12/drivers/scsi/qla2xxx/qla_inline.h 2015-04-26 13:32:22.407684003 -0500
  6038. @@ -59,12 +59,12 @@
  6039. {
  6040. unsigned long flags;
  6041. struct qla_hw_data *ha = rsp->hw;
  6042. - local_irq_save(flags);
  6043. + local_irq_save_nort(flags);
  6044. if (IS_P3P_TYPE(ha))
  6045. qla82xx_poll(0, rsp);
  6046. else
  6047. ha->isp_ops->intr_handler(0, rsp);
  6048. - local_irq_restore(flags);
  6049. + local_irq_restore_nort(flags);
  6050. }
  6051. static inline uint8_t *
  6052. diff -Nur linux-3.18.12.orig/drivers/thermal/x86_pkg_temp_thermal.c linux-3.18.12/drivers/thermal/x86_pkg_temp_thermal.c
  6053. --- linux-3.18.12.orig/drivers/thermal/x86_pkg_temp_thermal.c 2015-04-20 14:48:02.000000000 -0500
  6054. +++ linux-3.18.12/drivers/thermal/x86_pkg_temp_thermal.c 2015-04-26 13:32:22.407684003 -0500
  6055. @@ -29,6 +29,7 @@
  6056. #include <linux/pm.h>
  6057. #include <linux/thermal.h>
  6058. #include <linux/debugfs.h>
  6059. +#include <linux/work-simple.h>
  6060. #include <asm/cpu_device_id.h>
  6061. #include <asm/mce.h>
  6062. @@ -352,7 +353,7 @@
  6063. }
  6064. }
  6065. -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
  6066. +static void platform_thermal_notify_work(struct swork_event *event)
  6067. {
  6068. unsigned long flags;
  6069. int cpu = smp_processor_id();
  6070. @@ -369,7 +370,7 @@
  6071. pkg_work_scheduled[phy_id]) {
  6072. disable_pkg_thres_interrupt();
  6073. spin_unlock_irqrestore(&pkg_work_lock, flags);
  6074. - return -EINVAL;
  6075. + return;
  6076. }
  6077. pkg_work_scheduled[phy_id] = 1;
  6078. spin_unlock_irqrestore(&pkg_work_lock, flags);
  6079. @@ -378,9 +379,48 @@
  6080. schedule_delayed_work_on(cpu,
  6081. &per_cpu(pkg_temp_thermal_threshold_work, cpu),
  6082. msecs_to_jiffies(notify_delay_ms));
  6083. +}
  6084. +
  6085. +#ifdef CONFIG_PREEMPT_RT_FULL
  6086. +static struct swork_event notify_work;
  6087. +
  6088. +static int thermal_notify_work_init(void)
  6089. +{
  6090. + int err;
  6091. +
  6092. + err = swork_get();
  6093. + if (err)
  6094. + return err;
  6095. +
  6096. + INIT_SWORK(&notify_work, platform_thermal_notify_work);
  6097. return 0;
  6098. }
  6099. +static void thermal_notify_work_cleanup(void)
  6100. +{
  6101. + swork_put();
  6102. +}
  6103. +
  6104. +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
  6105. +{
  6106. + swork_queue(&notify_work);
  6107. + return 0;
  6108. +}
  6109. +
  6110. +#else /* !CONFIG_PREEMPT_RT_FULL */
  6111. +
  6112. +static int thermal_notify_work_init(void) { return 0; }
  6113. +
  6114. +static void thermal_notify_work_cleanup(void) { }
  6115. +
  6116. +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
  6117. +{
  6118. + platform_thermal_notify_work(NULL);
  6119. +
  6120. + return 0;
  6121. +}
  6122. +#endif /* CONFIG_PREEMPT_RT_FULL */
  6123. +
  6124. static int find_siblings_cpu(int cpu)
  6125. {
  6126. int i;
  6127. @@ -584,6 +624,9 @@
  6128. if (!x86_match_cpu(pkg_temp_thermal_ids))
  6129. return -ENODEV;
  6130. + if (thermal_notify_work_init())
  6131. + return -ENODEV;
  6132. +
  6133. spin_lock_init(&pkg_work_lock);
  6134. platform_thermal_package_notify =
  6135. pkg_temp_thermal_platform_thermal_notify;
  6136. @@ -608,7 +651,7 @@
  6137. kfree(pkg_work_scheduled);
  6138. platform_thermal_package_notify = NULL;
  6139. platform_thermal_package_rate_control = NULL;
  6140. -
  6141. + thermal_notify_work_cleanup();
  6142. return -ENODEV;
  6143. }
  6144. @@ -633,6 +676,7 @@
  6145. mutex_unlock(&phy_dev_list_mutex);
  6146. platform_thermal_package_notify = NULL;
  6147. platform_thermal_package_rate_control = NULL;
  6148. + thermal_notify_work_cleanup();
  6149. for_each_online_cpu(i)
  6150. cancel_delayed_work_sync(
  6151. &per_cpu(pkg_temp_thermal_threshold_work, i));
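The x86_pkg_temp_thermal hunk above is one instance of a pattern this patch applies in several places (see also the fs/aio.c hunks below): work that used to be kicked off directly from an interrupt or notifier is handed to the simple-work ("swork") infrastructure so that, under PREEMPT_RT_FULL, it runs in a schedulable kthread instead. Purely as a hedged sketch of that API (not part of the patch, and assuming only the work-simple.h interface used in these hunks: swork_get()/swork_put(), INIT_SWORK() and swork_queue()), a self-contained module exercising it could look like:

/* Minimal illustrative sketch of the swork deferral pattern used above.
 * Not part of the patch; relies on <linux/work-simple.h> from this series.
 */
#include <linux/module.h>
#include <linux/work-simple.h>

static struct swork_event demo_event;

/* Runs in the swork kthread, i.e. in fully preemptible context on -rt. */
static void demo_work_fn(struct swork_event *sev)
{
	pr_info("swork demo: deferred work executed\n");
}

static int __init swork_demo_init(void)
{
	int err;

	err = swork_get();		/* take a reference on the swork thread */
	if (err)
		return err;

	INIT_SWORK(&demo_event, demo_work_fn);
	swork_queue(&demo_event);	/* safe to call from atomic context */
	return 0;
}

static void __exit swork_demo_exit(void)
{
	swork_put();			/* drop the reference taken in init */
}

module_init(swork_demo_init);
module_exit(swork_demo_exit);
MODULE_LICENSE("GPL");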
  6152. diff -Nur linux-3.18.12.orig/drivers/tty/serial/8250/8250_core.c linux-3.18.12/drivers/tty/serial/8250/8250_core.c
  6153. --- linux-3.18.12.orig/drivers/tty/serial/8250/8250_core.c 2015-04-20 14:48:02.000000000 -0500
  6154. +++ linux-3.18.12/drivers/tty/serial/8250/8250_core.c 2015-04-26 13:32:22.407684003 -0500
  6155. @@ -37,6 +37,7 @@
  6156. #include <linux/nmi.h>
  6157. #include <linux/mutex.h>
  6158. #include <linux/slab.h>
  6159. +#include <linux/kdb.h>
  6160. #include <linux/uaccess.h>
  6161. #include <linux/pm_runtime.h>
  6162. #ifdef CONFIG_SPARC
  6163. @@ -81,7 +82,16 @@
  6164. #define DEBUG_INTR(fmt...) do { } while (0)
  6165. #endif
  6166. -#define PASS_LIMIT 512
  6167. +/*
  6168. + * On -rt we can have more delays, and legitimately
  6169. + * so - so don't drop work spuriously and spam the
  6170. + * syslog:
  6171. + */
  6172. +#ifdef CONFIG_PREEMPT_RT_FULL
  6173. +# define PASS_LIMIT 1000000
  6174. +#else
  6175. +# define PASS_LIMIT 512
  6176. +#endif
  6177. #define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
  6178. @@ -3197,7 +3207,7 @@
  6179. serial8250_rpm_get(up);
  6180. - if (port->sysrq || oops_in_progress)
  6181. + if (port->sysrq || oops_in_progress || in_kdb_printk())
  6182. locked = spin_trylock_irqsave(&port->lock, flags);
  6183. else
  6184. spin_lock_irqsave(&port->lock, flags);
  6185. diff -Nur linux-3.18.12.orig/drivers/tty/serial/amba-pl011.c linux-3.18.12/drivers/tty/serial/amba-pl011.c
  6186. --- linux-3.18.12.orig/drivers/tty/serial/amba-pl011.c 2015-04-20 14:48:02.000000000 -0500
  6187. +++ linux-3.18.12/drivers/tty/serial/amba-pl011.c 2015-04-26 13:32:22.407684003 -0500
  6188. @@ -1935,13 +1935,19 @@
  6189. clk_enable(uap->clk);
  6190. - local_irq_save(flags);
  6191. + /*
  6192. + * local_irq_save(flags);
  6193. + *
  6194. + * This local_irq_save() is nonsense. If we come in via sysrq
  6195. + * handling then interrupts are already disabled. Aside from
  6196. + * that the port.sysrq check is racy on SMP regardless.
  6197. + */
  6198. if (uap->port.sysrq)
  6199. locked = 0;
  6200. else if (oops_in_progress)
  6201. - locked = spin_trylock(&uap->port.lock);
  6202. + locked = spin_trylock_irqsave(&uap->port.lock, flags);
  6203. else
  6204. - spin_lock(&uap->port.lock);
  6205. + spin_lock_irqsave(&uap->port.lock, flags);
  6206. /*
  6207. * First save the CR then disable the interrupts
  6208. @@ -1963,8 +1969,7 @@
  6209. writew(old_cr, uap->port.membase + UART011_CR);
  6210. if (locked)
  6211. - spin_unlock(&uap->port.lock);
  6212. - local_irq_restore(flags);
  6213. + spin_unlock_irqrestore(&uap->port.lock, flags);
  6214. clk_disable(uap->clk);
  6215. }
  6216. diff -Nur linux-3.18.12.orig/drivers/tty/serial/omap-serial.c linux-3.18.12/drivers/tty/serial/omap-serial.c
  6217. --- linux-3.18.12.orig/drivers/tty/serial/omap-serial.c 2015-04-20 14:48:02.000000000 -0500
  6218. +++ linux-3.18.12/drivers/tty/serial/omap-serial.c 2015-04-26 13:32:22.407684003 -0500
  6219. @@ -1270,13 +1270,10 @@
  6220. pm_runtime_get_sync(up->dev);
  6221. - local_irq_save(flags);
  6222. - if (up->port.sysrq)
  6223. - locked = 0;
  6224. - else if (oops_in_progress)
  6225. - locked = spin_trylock(&up->port.lock);
  6226. + if (up->port.sysrq || oops_in_progress)
  6227. + locked = spin_trylock_irqsave(&up->port.lock, flags);
  6228. else
  6229. - spin_lock(&up->port.lock);
  6230. + spin_lock_irqsave(&up->port.lock, flags);
  6231. /*
  6232. * First save the IER then disable the interrupts
  6233. @@ -1305,8 +1302,7 @@
  6234. pm_runtime_mark_last_busy(up->dev);
  6235. pm_runtime_put_autosuspend(up->dev);
  6236. if (locked)
  6237. - spin_unlock(&up->port.lock);
  6238. - local_irq_restore(flags);
  6239. + spin_unlock_irqrestore(&up->port.lock, flags);
  6240. }
  6241. static int __init
  6242. diff -Nur linux-3.18.12.orig/drivers/usb/core/hcd.c linux-3.18.12/drivers/usb/core/hcd.c
  6243. --- linux-3.18.12.orig/drivers/usb/core/hcd.c 2015-04-20 14:48:02.000000000 -0500
  6244. +++ linux-3.18.12/drivers/usb/core/hcd.c 2015-04-26 13:32:22.407684003 -0500
  6245. @@ -1681,9 +1681,9 @@
  6246. * and no one may trigger the above deadlock situation when
  6247. * running complete() in tasklet.
  6248. */
  6249. - local_irq_save(flags);
  6250. + local_irq_save_nort(flags);
  6251. urb->complete(urb);
  6252. - local_irq_restore(flags);
  6253. + local_irq_restore_nort(flags);
  6254. usb_anchor_resume_wakeups(anchor);
  6255. atomic_dec(&urb->use_count);
  6256. diff -Nur linux-3.18.12.orig/drivers/usb/gadget/function/f_fs.c linux-3.18.12/drivers/usb/gadget/function/f_fs.c
  6257. --- linux-3.18.12.orig/drivers/usb/gadget/function/f_fs.c 2015-04-20 14:48:02.000000000 -0500
  6258. +++ linux-3.18.12/drivers/usb/gadget/function/f_fs.c 2015-04-26 13:32:22.407684003 -0500
  6259. @@ -1428,7 +1428,7 @@
  6260. pr_info("%s(): freeing\n", __func__);
  6261. ffs_data_clear(ffs);
  6262. BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
  6263. - waitqueue_active(&ffs->ep0req_completion.wait));
  6264. + swaitqueue_active(&ffs->ep0req_completion.wait));
  6265. kfree(ffs->dev_name);
  6266. kfree(ffs);
  6267. }
  6268. diff -Nur linux-3.18.12.orig/drivers/usb/gadget/legacy/inode.c linux-3.18.12/drivers/usb/gadget/legacy/inode.c
  6269. --- linux-3.18.12.orig/drivers/usb/gadget/legacy/inode.c 2015-04-20 14:48:02.000000000 -0500
  6270. +++ linux-3.18.12/drivers/usb/gadget/legacy/inode.c 2015-04-26 13:32:22.407684003 -0500
  6271. @@ -339,7 +339,7 @@
  6272. spin_unlock_irq (&epdata->dev->lock);
  6273. if (likely (value == 0)) {
  6274. - value = wait_event_interruptible (done.wait, done.done);
  6275. + value = swait_event_interruptible (done.wait, done.done);
  6276. if (value != 0) {
  6277. spin_lock_irq (&epdata->dev->lock);
  6278. if (likely (epdata->ep != NULL)) {
  6279. @@ -348,7 +348,7 @@
  6280. usb_ep_dequeue (epdata->ep, epdata->req);
  6281. spin_unlock_irq (&epdata->dev->lock);
  6282. - wait_event (done.wait, done.done);
  6283. + swait_event (done.wait, done.done);
  6284. if (epdata->status == -ECONNRESET)
  6285. epdata->status = -EINTR;
  6286. } else {
  6287. diff -Nur linux-3.18.12.orig/fs/aio.c linux-3.18.12/fs/aio.c
  6288. --- linux-3.18.12.orig/fs/aio.c 2015-04-20 14:48:02.000000000 -0500
  6289. +++ linux-3.18.12/fs/aio.c 2015-04-26 13:32:22.407684003 -0500
  6290. @@ -40,6 +40,7 @@
  6291. #include <linux/ramfs.h>
  6292. #include <linux/percpu-refcount.h>
  6293. #include <linux/mount.h>
  6294. +#include <linux/work-simple.h>
  6295. #include <asm/kmap_types.h>
  6296. #include <asm/uaccess.h>
  6297. @@ -110,7 +111,7 @@
  6298. struct page **ring_pages;
  6299. long nr_pages;
  6300. - struct work_struct free_work;
  6301. + struct swork_event free_work;
  6302. /*
  6303. * signals when all in-flight requests are done
  6304. @@ -226,6 +227,7 @@
  6305. .mount = aio_mount,
  6306. .kill_sb = kill_anon_super,
  6307. };
  6308. + BUG_ON(swork_get());
  6309. aio_mnt = kern_mount(&aio_fs);
  6310. if (IS_ERR(aio_mnt))
  6311. panic("Failed to create aio fs mount.");
  6312. @@ -505,9 +507,9 @@
  6313. return cancel(kiocb);
  6314. }
  6315. -static void free_ioctx(struct work_struct *work)
  6316. +static void free_ioctx(struct swork_event *sev)
  6317. {
  6318. - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
  6319. + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
  6320. pr_debug("freeing %p\n", ctx);
  6321. @@ -526,8 +528,8 @@
  6322. if (ctx->requests_done)
  6323. complete(ctx->requests_done);
  6324. - INIT_WORK(&ctx->free_work, free_ioctx);
  6325. - schedule_work(&ctx->free_work);
  6326. + INIT_SWORK(&ctx->free_work, free_ioctx);
  6327. + swork_queue(&ctx->free_work);
  6328. }
  6329. /*
  6330. @@ -535,9 +537,9 @@
  6331. * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  6332. * now it's safe to cancel any that need to be.
  6333. */
  6334. -static void free_ioctx_users(struct percpu_ref *ref)
  6335. +static void free_ioctx_users_work(struct swork_event *sev)
  6336. {
  6337. - struct kioctx *ctx = container_of(ref, struct kioctx, users);
  6338. + struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
  6339. struct kiocb *req;
  6340. spin_lock_irq(&ctx->ctx_lock);
  6341. @@ -556,6 +558,14 @@
  6342. percpu_ref_put(&ctx->reqs);
  6343. }
  6344. +static void free_ioctx_users(struct percpu_ref *ref)
  6345. +{
  6346. + struct kioctx *ctx = container_of(ref, struct kioctx, users);
  6347. +
  6348. + INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
  6349. + swork_queue(&ctx->free_work);
  6350. +}
  6351. +
  6352. static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
  6353. {
  6354. unsigned i, new_nr;
  6355. diff -Nur linux-3.18.12.orig/fs/autofs4/autofs_i.h linux-3.18.12/fs/autofs4/autofs_i.h
  6356. --- linux-3.18.12.orig/fs/autofs4/autofs_i.h 2015-04-20 14:48:02.000000000 -0500
  6357. +++ linux-3.18.12/fs/autofs4/autofs_i.h 2015-04-26 13:32:22.411684003 -0500
  6358. @@ -34,6 +34,7 @@
  6359. #include <linux/sched.h>
  6360. #include <linux/mount.h>
  6361. #include <linux/namei.h>
  6362. +#include <linux/delay.h>
  6363. #include <asm/current.h>
  6364. #include <asm/uaccess.h>
  6365. diff -Nur linux-3.18.12.orig/fs/autofs4/expire.c linux-3.18.12/fs/autofs4/expire.c
  6366. --- linux-3.18.12.orig/fs/autofs4/expire.c 2015-04-20 14:48:02.000000000 -0500
  6367. +++ linux-3.18.12/fs/autofs4/expire.c 2015-04-26 13:32:22.411684003 -0500
  6368. @@ -151,7 +151,7 @@
  6369. parent = p->d_parent;
  6370. if (!spin_trylock(&parent->d_lock)) {
  6371. spin_unlock(&p->d_lock);
  6372. - cpu_relax();
  6373. + cpu_chill();
  6374. goto relock;
  6375. }
  6376. spin_unlock(&p->d_lock);
  6377. diff -Nur linux-3.18.12.orig/fs/buffer.c linux-3.18.12/fs/buffer.c
  6378. --- linux-3.18.12.orig/fs/buffer.c 2015-04-20 14:48:02.000000000 -0500
  6379. +++ linux-3.18.12/fs/buffer.c 2015-04-26 13:32:22.411684003 -0500
  6380. @@ -301,8 +301,7 @@
  6381. * decide that the page is now completely done.
  6382. */
  6383. first = page_buffers(page);
  6384. - local_irq_save(flags);
  6385. - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
  6386. + flags = bh_uptodate_lock_irqsave(first);
  6387. clear_buffer_async_read(bh);
  6388. unlock_buffer(bh);
  6389. tmp = bh;
  6390. @@ -315,8 +314,7 @@
  6391. }
  6392. tmp = tmp->b_this_page;
  6393. } while (tmp != bh);
  6394. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  6395. - local_irq_restore(flags);
  6396. + bh_uptodate_unlock_irqrestore(first, flags);
  6397. /*
  6398. * If none of the buffers had errors and they are all
  6399. @@ -328,9 +326,7 @@
  6400. return;
  6401. still_busy:
  6402. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  6403. - local_irq_restore(flags);
  6404. - return;
  6405. + bh_uptodate_unlock_irqrestore(first, flags);
  6406. }
  6407. /*
  6408. @@ -358,8 +354,7 @@
  6409. }
  6410. first = page_buffers(page);
  6411. - local_irq_save(flags);
  6412. - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
  6413. + flags = bh_uptodate_lock_irqsave(first);
  6414. clear_buffer_async_write(bh);
  6415. unlock_buffer(bh);
  6416. @@ -371,15 +366,12 @@
  6417. }
  6418. tmp = tmp->b_this_page;
  6419. }
  6420. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  6421. - local_irq_restore(flags);
  6422. + bh_uptodate_unlock_irqrestore(first, flags);
  6423. end_page_writeback(page);
  6424. return;
  6425. still_busy:
  6426. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  6427. - local_irq_restore(flags);
  6428. - return;
  6429. + bh_uptodate_unlock_irqrestore(first, flags);
  6430. }
  6431. EXPORT_SYMBOL(end_buffer_async_write);
  6432. @@ -3325,6 +3317,7 @@
  6433. struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
  6434. if (ret) {
  6435. INIT_LIST_HEAD(&ret->b_assoc_buffers);
  6436. + buffer_head_init_locks(ret);
  6437. preempt_disable();
  6438. __this_cpu_inc(bh_accounting.nr);
  6439. recalc_bh_state();
  6440. diff -Nur linux-3.18.12.orig/fs/dcache.c linux-3.18.12/fs/dcache.c
  6441. --- linux-3.18.12.orig/fs/dcache.c 2015-04-20 14:48:02.000000000 -0500
  6442. +++ linux-3.18.12/fs/dcache.c 2015-04-26 13:32:22.411684003 -0500
  6443. @@ -19,6 +19,7 @@
  6444. #include <linux/mm.h>
  6445. #include <linux/fs.h>
  6446. #include <linux/fsnotify.h>
  6447. +#include <linux/delay.h>
  6448. #include <linux/slab.h>
  6449. #include <linux/init.h>
  6450. #include <linux/hash.h>
  6451. @@ -552,7 +553,7 @@
  6452. failed:
  6453. spin_unlock(&dentry->d_lock);
  6454. - cpu_relax();
  6455. + cpu_chill();
  6456. return dentry; /* try again with same dentry */
  6457. }
  6458. @@ -2285,7 +2286,7 @@
  6459. if (dentry->d_lockref.count == 1) {
  6460. if (!spin_trylock(&inode->i_lock)) {
  6461. spin_unlock(&dentry->d_lock);
  6462. - cpu_relax();
  6463. + cpu_chill();
  6464. goto again;
  6465. }
  6466. dentry->d_flags &= ~DCACHE_CANT_MOUNT;
  6467. diff -Nur linux-3.18.12.orig/fs/eventpoll.c linux-3.18.12/fs/eventpoll.c
  6468. --- linux-3.18.12.orig/fs/eventpoll.c 2015-04-20 14:48:02.000000000 -0500
  6469. +++ linux-3.18.12/fs/eventpoll.c 2015-04-26 13:32:22.411684003 -0500
  6470. @@ -505,12 +505,12 @@
  6471. */
  6472. static void ep_poll_safewake(wait_queue_head_t *wq)
  6473. {
  6474. - int this_cpu = get_cpu();
  6475. + int this_cpu = get_cpu_light();
  6476. ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
  6477. ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
  6478. - put_cpu();
  6479. + put_cpu_light();
  6480. }
  6481. static void ep_remove_wait_queue(struct eppoll_entry *pwq)
  6482. diff -Nur linux-3.18.12.orig/fs/exec.c linux-3.18.12/fs/exec.c
  6483. --- linux-3.18.12.orig/fs/exec.c 2015-04-20 14:48:02.000000000 -0500
  6484. +++ linux-3.18.12/fs/exec.c 2015-04-26 13:32:22.411684003 -0500
  6485. @@ -841,12 +841,14 @@
  6486. }
  6487. }
  6488. task_lock(tsk);
  6489. + preempt_disable_rt();
  6490. active_mm = tsk->active_mm;
  6491. tsk->mm = mm;
  6492. tsk->active_mm = mm;
  6493. activate_mm(active_mm, mm);
  6494. tsk->mm->vmacache_seqnum = 0;
  6495. vmacache_flush(tsk);
  6496. + preempt_enable_rt();
  6497. task_unlock(tsk);
  6498. if (old_mm) {
  6499. up_read(&old_mm->mmap_sem);
  6500. diff -Nur linux-3.18.12.orig/fs/jbd/checkpoint.c linux-3.18.12/fs/jbd/checkpoint.c
  6501. --- linux-3.18.12.orig/fs/jbd/checkpoint.c 2015-04-20 14:48:02.000000000 -0500
  6502. +++ linux-3.18.12/fs/jbd/checkpoint.c 2015-04-26 13:32:22.411684003 -0500
  6503. @@ -129,6 +129,8 @@
  6504. if (journal->j_flags & JFS_ABORT)
  6505. return;
  6506. spin_unlock(&journal->j_state_lock);
  6507. + if (current->plug)
  6508. + io_schedule();
  6509. mutex_lock(&journal->j_checkpoint_mutex);
  6510. /*
  6511. diff -Nur linux-3.18.12.orig/fs/jbd2/checkpoint.c linux-3.18.12/fs/jbd2/checkpoint.c
  6512. --- linux-3.18.12.orig/fs/jbd2/checkpoint.c 2015-04-20 14:48:02.000000000 -0500
  6513. +++ linux-3.18.12/fs/jbd2/checkpoint.c 2015-04-26 13:32:22.411684003 -0500
  6514. @@ -116,6 +116,8 @@
  6515. nblocks = jbd2_space_needed(journal);
  6516. while (jbd2_log_space_left(journal) < nblocks) {
  6517. write_unlock(&journal->j_state_lock);
  6518. + if (current->plug)
  6519. + io_schedule();
  6520. mutex_lock(&journal->j_checkpoint_mutex);
  6521. /*
  6522. diff -Nur linux-3.18.12.orig/fs/namespace.c linux-3.18.12/fs/namespace.c
  6523. --- linux-3.18.12.orig/fs/namespace.c 2015-04-20 14:48:02.000000000 -0500
  6524. +++ linux-3.18.12/fs/namespace.c 2015-04-26 13:32:22.411684003 -0500
  6525. @@ -14,6 +14,7 @@
  6526. #include <linux/mnt_namespace.h>
  6527. #include <linux/user_namespace.h>
  6528. #include <linux/namei.h>
  6529. +#include <linux/delay.h>
  6530. #include <linux/security.h>
  6531. #include <linux/idr.h>
  6532. #include <linux/init.h> /* init_rootfs */
  6533. @@ -344,8 +345,11 @@
  6534. * incremented count after it has set MNT_WRITE_HOLD.
  6535. */
  6536. smp_mb();
  6537. - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
  6538. - cpu_relax();
  6539. + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
  6540. + preempt_enable();
  6541. + cpu_chill();
  6542. + preempt_disable();
  6543. + }
  6544. /*
  6545. * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
  6546. * be set to match its requirements. So we must not load that until
  6547. diff -Nur linux-3.18.12.orig/fs/ntfs/aops.c linux-3.18.12/fs/ntfs/aops.c
  6548. --- linux-3.18.12.orig/fs/ntfs/aops.c 2015-04-20 14:48:02.000000000 -0500
  6549. +++ linux-3.18.12/fs/ntfs/aops.c 2015-04-26 13:32:22.411684003 -0500
  6550. @@ -107,8 +107,7 @@
  6551. "0x%llx.", (unsigned long long)bh->b_blocknr);
  6552. }
  6553. first = page_buffers(page);
  6554. - local_irq_save(flags);
  6555. - bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
  6556. + flags = bh_uptodate_lock_irqsave(first);
  6557. clear_buffer_async_read(bh);
  6558. unlock_buffer(bh);
  6559. tmp = bh;
  6560. @@ -123,8 +122,7 @@
  6561. }
  6562. tmp = tmp->b_this_page;
  6563. } while (tmp != bh);
  6564. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  6565. - local_irq_restore(flags);
  6566. + bh_uptodate_unlock_irqrestore(first, flags);
  6567. /*
  6568. * If none of the buffers had errors then we can set the page uptodate,
  6569. * but we first have to perform the post read mst fixups, if the
  6570. @@ -145,13 +143,13 @@
  6571. recs = PAGE_CACHE_SIZE / rec_size;
  6572. /* Should have been verified before we got here... */
  6573. BUG_ON(!recs);
  6574. - local_irq_save(flags);
  6575. + local_irq_save_nort(flags);
  6576. kaddr = kmap_atomic(page);
  6577. for (i = 0; i < recs; i++)
  6578. post_read_mst_fixup((NTFS_RECORD*)(kaddr +
  6579. i * rec_size), rec_size);
  6580. kunmap_atomic(kaddr);
  6581. - local_irq_restore(flags);
  6582. + local_irq_restore_nort(flags);
  6583. flush_dcache_page(page);
  6584. if (likely(page_uptodate && !PageError(page)))
  6585. SetPageUptodate(page);
  6586. @@ -159,9 +157,7 @@
  6587. unlock_page(page);
  6588. return;
  6589. still_busy:
  6590. - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
  6591. - local_irq_restore(flags);
  6592. - return;
  6593. + bh_uptodate_unlock_irqrestore(first, flags);
  6594. }
  6595. /**
  6596. diff -Nur linux-3.18.12.orig/fs/timerfd.c linux-3.18.12/fs/timerfd.c
  6597. --- linux-3.18.12.orig/fs/timerfd.c 2015-04-20 14:48:02.000000000 -0500
  6598. +++ linux-3.18.12/fs/timerfd.c 2015-04-26 13:32:22.411684003 -0500
  6599. @@ -449,7 +449,10 @@
  6600. break;
  6601. }
  6602. spin_unlock_irq(&ctx->wqh.lock);
  6603. - cpu_relax();
  6604. + if (isalarm(ctx))
  6605. + hrtimer_wait_for_timer(&ctx->t.alarm.timer);
  6606. + else
  6607. + hrtimer_wait_for_timer(&ctx->t.tmr);
  6608. }
  6609. /*
  6610. diff -Nur linux-3.18.12.orig/include/acpi/platform/aclinux.h linux-3.18.12/include/acpi/platform/aclinux.h
  6611. --- linux-3.18.12.orig/include/acpi/platform/aclinux.h 2015-04-20 14:48:02.000000000 -0500
  6612. +++ linux-3.18.12/include/acpi/platform/aclinux.h 2015-04-26 13:32:22.415684003 -0500
  6613. @@ -123,6 +123,7 @@
  6614. #define acpi_cache_t struct kmem_cache
  6615. #define acpi_spinlock spinlock_t *
  6616. +#define acpi_raw_spinlock raw_spinlock_t *
  6617. #define acpi_cpu_flags unsigned long
  6618. /* Use native linux version of acpi_os_allocate_zeroed */
  6619. @@ -141,6 +142,20 @@
  6620. #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
  6621. #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
  6622. +#define acpi_os_create_raw_lock(__handle) \
  6623. +({ \
  6624. + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \
  6625. + \
  6626. + if (lock) { \
  6627. + *(__handle) = lock; \
  6628. + raw_spin_lock_init(*(__handle)); \
  6629. + } \
  6630. + lock ? AE_OK : AE_NO_MEMORY; \
  6631. + })
  6632. +
  6633. +#define acpi_os_delete_raw_lock(__handle) kfree(__handle)
  6634. +
  6635. +
  6636. /*
  6637. * OSL interfaces used by debugger/disassembler
  6638. */
  6639. diff -Nur linux-3.18.12.orig/include/asm-generic/bug.h linux-3.18.12/include/asm-generic/bug.h
  6640. --- linux-3.18.12.orig/include/asm-generic/bug.h 2015-04-20 14:48:02.000000000 -0500
  6641. +++ linux-3.18.12/include/asm-generic/bug.h 2015-04-26 13:32:22.415684003 -0500
  6642. @@ -206,6 +206,20 @@
  6643. # define WARN_ON_SMP(x) ({0;})
  6644. #endif
  6645. +#ifdef CONFIG_PREEMPT_RT_BASE
  6646. +# define BUG_ON_RT(c) BUG_ON(c)
  6647. +# define BUG_ON_NONRT(c) do { } while (0)
  6648. +# define WARN_ON_RT(condition) WARN_ON(condition)
  6649. +# define WARN_ON_NONRT(condition) do { } while (0)
  6650. +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
  6651. +#else
  6652. +# define BUG_ON_RT(c) do { } while (0)
  6653. +# define BUG_ON_NONRT(c) BUG_ON(c)
  6654. +# define WARN_ON_RT(condition) do { } while (0)
  6655. +# define WARN_ON_NONRT(condition) WARN_ON(condition)
  6656. +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
  6657. +#endif
  6658. +
  6659. #endif /* __ASSEMBLY__ */
  6660. #endif
  6661. diff -Nur linux-3.18.12.orig/include/linux/blkdev.h linux-3.18.12/include/linux/blkdev.h
  6662. --- linux-3.18.12.orig/include/linux/blkdev.h 2015-04-20 14:48:02.000000000 -0500
  6663. +++ linux-3.18.12/include/linux/blkdev.h 2015-04-26 13:32:22.415684003 -0500
  6664. @@ -101,6 +101,7 @@
  6665. struct list_head queuelist;
  6666. union {
  6667. struct call_single_data csd;
  6668. + struct work_struct work;
  6669. unsigned long fifo_time;
  6670. };
  6671. @@ -478,7 +479,7 @@
  6672. struct throtl_data *td;
  6673. #endif
  6674. struct rcu_head rcu_head;
  6675. - wait_queue_head_t mq_freeze_wq;
  6676. + struct swait_head mq_freeze_wq;
  6677. struct percpu_ref mq_usage_counter;
  6678. struct list_head all_q_node;
  6679. diff -Nur linux-3.18.12.orig/include/linux/blk-mq.h linux-3.18.12/include/linux/blk-mq.h
  6680. --- linux-3.18.12.orig/include/linux/blk-mq.h 2015-04-20 14:48:02.000000000 -0500
  6681. +++ linux-3.18.12/include/linux/blk-mq.h 2015-04-26 13:32:22.415684003 -0500
  6682. @@ -169,6 +169,7 @@
  6683. struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
  6684. struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
  6685. +void __blk_mq_complete_request_remote_work(struct work_struct *work);
  6686. void blk_mq_start_request(struct request *rq);
  6687. void blk_mq_end_request(struct request *rq, int error);
  6688. diff -Nur linux-3.18.12.orig/include/linux/bottom_half.h linux-3.18.12/include/linux/bottom_half.h
  6689. --- linux-3.18.12.orig/include/linux/bottom_half.h 2015-04-20 14:48:02.000000000 -0500
  6690. +++ linux-3.18.12/include/linux/bottom_half.h 2015-04-26 13:32:22.415684003 -0500
  6691. @@ -4,6 +4,17 @@
  6692. #include <linux/preempt.h>
  6693. #include <linux/preempt_mask.h>
  6694. +#ifdef CONFIG_PREEMPT_RT_FULL
  6695. +
  6696. +extern void local_bh_disable(void);
  6697. +extern void _local_bh_enable(void);
  6698. +extern void local_bh_enable(void);
  6699. +extern void local_bh_enable_ip(unsigned long ip);
  6700. +extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
  6701. +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);
  6702. +
  6703. +#else
  6704. +
  6705. #ifdef CONFIG_TRACE_IRQFLAGS
  6706. extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
  6707. #else
  6708. @@ -31,5 +42,6 @@
  6709. {
  6710. __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
  6711. }
  6712. +#endif
  6713. #endif /* _LINUX_BH_H */
  6714. diff -Nur linux-3.18.12.orig/include/linux/buffer_head.h linux-3.18.12/include/linux/buffer_head.h
  6715. --- linux-3.18.12.orig/include/linux/buffer_head.h 2015-04-20 14:48:02.000000000 -0500
  6716. +++ linux-3.18.12/include/linux/buffer_head.h 2015-04-26 13:32:22.415684003 -0500
  6717. @@ -75,8 +75,52 @@
  6718. struct address_space *b_assoc_map; /* mapping this buffer is
  6719. associated with */
  6720. atomic_t b_count; /* users using this buffer_head */
  6721. +#ifdef CONFIG_PREEMPT_RT_BASE
  6722. + spinlock_t b_uptodate_lock;
  6723. +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \
  6724. + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE)
  6725. + spinlock_t b_state_lock;
  6726. + spinlock_t b_journal_head_lock;
  6727. +#endif
  6728. +#endif
  6729. };
  6730. +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
  6731. +{
  6732. + unsigned long flags;
  6733. +
  6734. +#ifndef CONFIG_PREEMPT_RT_BASE
  6735. + local_irq_save(flags);
  6736. + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
  6737. +#else
  6738. + spin_lock_irqsave(&bh->b_uptodate_lock, flags);
  6739. +#endif
  6740. + return flags;
  6741. +}
  6742. +
  6743. +static inline void
  6744. +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
  6745. +{
  6746. +#ifndef CONFIG_PREEMPT_RT_BASE
  6747. + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
  6748. + local_irq_restore(flags);
  6749. +#else
  6750. + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
  6751. +#endif
  6752. +}
  6753. +
  6754. +static inline void buffer_head_init_locks(struct buffer_head *bh)
  6755. +{
  6756. +#ifdef CONFIG_PREEMPT_RT_BASE
  6757. + spin_lock_init(&bh->b_uptodate_lock);
  6758. +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || \
  6759. + defined(CONFIG_JBD2) || defined(CONFIG_JBD2_MODULE)
  6760. + spin_lock_init(&bh->b_state_lock);
  6761. + spin_lock_init(&bh->b_journal_head_lock);
  6762. +#endif
  6763. +#endif
  6764. +}
  6765. +
  6766. /*
  6767. * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
  6768. * and buffer_foo() functions.
  6769. diff -Nur linux-3.18.12.orig/include/linux/cgroup.h linux-3.18.12/include/linux/cgroup.h
  6770. --- linux-3.18.12.orig/include/linux/cgroup.h 2015-04-20 14:48:02.000000000 -0500
  6771. +++ linux-3.18.12/include/linux/cgroup.h 2015-04-26 13:32:22.415684003 -0500
  6772. @@ -22,6 +22,7 @@
  6773. #include <linux/seq_file.h>
  6774. #include <linux/kernfs.h>
  6775. #include <linux/wait.h>
  6776. +#include <linux/work-simple.h>
  6777. #ifdef CONFIG_CGROUPS
  6778. @@ -91,6 +92,7 @@
  6779. /* percpu_ref killing and RCU release */
  6780. struct rcu_head rcu_head;
  6781. struct work_struct destroy_work;
  6782. + struct swork_event destroy_swork;
  6783. };
  6784. /* bits in struct cgroup_subsys_state flags field */
  6785. diff -Nur linux-3.18.12.orig/include/linux/completion.h linux-3.18.12/include/linux/completion.h
  6786. --- linux-3.18.12.orig/include/linux/completion.h 2015-04-20 14:48:02.000000000 -0500
  6787. +++ linux-3.18.12/include/linux/completion.h 2015-04-26 13:32:22.415684003 -0500
  6788. @@ -7,8 +7,7 @@
  6789. * Atomic wait-for-completion handler data structures.
  6790. * See kernel/sched/completion.c for details.
  6791. */
  6792. -
  6793. -#include <linux/wait.h>
  6794. +#include <linux/wait-simple.h>
  6795. /*
  6796. * struct completion - structure used to maintain state for a "completion"
  6797. @@ -24,11 +23,11 @@
  6798. */
  6799. struct completion {
  6800. unsigned int done;
  6801. - wait_queue_head_t wait;
  6802. + struct swait_head wait;
  6803. };
  6804. #define COMPLETION_INITIALIZER(work) \
  6805. - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
  6806. + { 0, SWAIT_HEAD_INITIALIZER((work).wait) }
  6807. #define COMPLETION_INITIALIZER_ONSTACK(work) \
  6808. ({ init_completion(&work); work; })
  6809. @@ -73,7 +72,7 @@
  6810. static inline void init_completion(struct completion *x)
  6811. {
  6812. x->done = 0;
  6813. - init_waitqueue_head(&x->wait);
  6814. + init_swait_head(&x->wait);
  6815. }
  6816. /**
  6817. diff -Nur linux-3.18.12.orig/include/linux/cpu.h linux-3.18.12/include/linux/cpu.h
  6818. --- linux-3.18.12.orig/include/linux/cpu.h 2015-04-20 14:48:02.000000000 -0500
  6819. +++ linux-3.18.12/include/linux/cpu.h 2015-04-26 13:32:22.415684003 -0500
  6820. @@ -217,6 +217,8 @@
  6821. extern void put_online_cpus(void);
  6822. extern void cpu_hotplug_disable(void);
  6823. extern void cpu_hotplug_enable(void);
  6824. +extern void pin_current_cpu(void);
  6825. +extern void unpin_current_cpu(void);
  6826. #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
  6827. #define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri)
  6828. #define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
  6829. @@ -235,6 +237,8 @@
  6830. #define put_online_cpus() do { } while (0)
  6831. #define cpu_hotplug_disable() do { } while (0)
  6832. #define cpu_hotplug_enable() do { } while (0)
  6833. +static inline void pin_current_cpu(void) { }
  6834. +static inline void unpin_current_cpu(void) { }
  6835. #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
  6836. #define __hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
  6837. /* These aren't inline functions due to a GCC bug. */
  6838. diff -Nur linux-3.18.12.orig/include/linux/delay.h linux-3.18.12/include/linux/delay.h
  6839. --- linux-3.18.12.orig/include/linux/delay.h 2015-04-20 14:48:02.000000000 -0500
  6840. +++ linux-3.18.12/include/linux/delay.h 2015-04-26 13:32:22.415684003 -0500
  6841. @@ -52,4 +52,10 @@
  6842. msleep(seconds * 1000);
  6843. }
  6844. +#ifdef CONFIG_PREEMPT_RT_FULL
  6845. +extern void cpu_chill(void);
  6846. +#else
  6847. +# define cpu_chill() cpu_relax()
  6848. +#endif
  6849. +
  6850. #endif /* defined(_LINUX_DELAY_H) */
  6851. diff -Nur linux-3.18.12.orig/include/linux/ftrace_event.h linux-3.18.12/include/linux/ftrace_event.h
  6852. --- linux-3.18.12.orig/include/linux/ftrace_event.h 2015-04-20 14:48:02.000000000 -0500
  6853. +++ linux-3.18.12/include/linux/ftrace_event.h 2015-04-26 13:32:22.415684003 -0500
  6854. @@ -61,6 +61,9 @@
  6855. unsigned char flags;
  6856. unsigned char preempt_count;
  6857. int pid;
  6858. + unsigned short migrate_disable;
  6859. + unsigned short padding;
  6860. + unsigned char preempt_lazy_count;
  6861. };
  6862. #define FTRACE_MAX_EVENT \
  6863. diff -Nur linux-3.18.12.orig/include/linux/highmem.h linux-3.18.12/include/linux/highmem.h
  6864. --- linux-3.18.12.orig/include/linux/highmem.h 2015-04-20 14:48:02.000000000 -0500
  6865. +++ linux-3.18.12/include/linux/highmem.h 2015-04-26 13:32:22.415684003 -0500
  6866. @@ -7,6 +7,7 @@
  6867. #include <linux/mm.h>
  6868. #include <linux/uaccess.h>
  6869. #include <linux/hardirq.h>
  6870. +#include <linux/sched.h>
  6871. #include <asm/cacheflush.h>
  6872. @@ -85,32 +86,51 @@
  6873. #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
  6874. +#ifndef CONFIG_PREEMPT_RT_FULL
  6875. DECLARE_PER_CPU(int, __kmap_atomic_idx);
  6876. +#endif
  6877. static inline int kmap_atomic_idx_push(void)
  6878. {
  6879. +#ifndef CONFIG_PREEMPT_RT_FULL
  6880. int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
  6881. -#ifdef CONFIG_DEBUG_HIGHMEM
  6882. +# ifdef CONFIG_DEBUG_HIGHMEM
  6883. WARN_ON_ONCE(in_irq() && !irqs_disabled());
  6884. BUG_ON(idx >= KM_TYPE_NR);
  6885. -#endif
  6886. +# endif
  6887. return idx;
  6888. +#else
  6889. + current->kmap_idx++;
  6890. + BUG_ON(current->kmap_idx > KM_TYPE_NR);
  6891. + return current->kmap_idx - 1;
  6892. +#endif
  6893. }
  6894. static inline int kmap_atomic_idx(void)
  6895. {
  6896. +#ifndef CONFIG_PREEMPT_RT_FULL
  6897. return __this_cpu_read(__kmap_atomic_idx) - 1;
  6898. +#else
  6899. + return current->kmap_idx - 1;
  6900. +#endif
  6901. }
  6902. static inline void kmap_atomic_idx_pop(void)
  6903. {
  6904. -#ifdef CONFIG_DEBUG_HIGHMEM
  6905. +#ifndef CONFIG_PREEMPT_RT_FULL
  6906. +# ifdef CONFIG_DEBUG_HIGHMEM
  6907. int idx = __this_cpu_dec_return(__kmap_atomic_idx);
  6908. BUG_ON(idx < 0);
  6909. -#else
  6910. +# else
  6911. __this_cpu_dec(__kmap_atomic_idx);
  6912. +# endif
  6913. +#else
  6914. + current->kmap_idx--;
  6915. +# ifdef CONFIG_DEBUG_HIGHMEM
  6916. + BUG_ON(current->kmap_idx < 0);
  6917. +# endif
  6918. #endif
  6919. }
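
With kmap_atomic sections preemptible on RT, the atomic-map stack index has to follow the task rather than the CPU, which is why it moves into current->kmap_idx above. Callers keep the usual pairing; a minimal sketch (the copy helper is illustrative only):

	#include <linux/highmem.h>
	#include <linux/string.h>

	static void copy_from_page(struct page *page, void *dst, size_t len)
	{
		void *src = kmap_atomic(page);	/* index bookkeeping is per-task on RT */

		memcpy(dst, src, len);
		kunmap_atomic(src);
	}
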
  6920. diff -Nur linux-3.18.12.orig/include/linux/hrtimer.h linux-3.18.12/include/linux/hrtimer.h
  6921. --- linux-3.18.12.orig/include/linux/hrtimer.h 2015-04-20 14:48:02.000000000 -0500
  6922. +++ linux-3.18.12/include/linux/hrtimer.h 2015-04-26 13:32:22.415684003 -0500
  6923. @@ -111,6 +111,11 @@
  6924. enum hrtimer_restart (*function)(struct hrtimer *);
  6925. struct hrtimer_clock_base *base;
  6926. unsigned long state;
  6927. + struct list_head cb_entry;
  6928. + int irqsafe;
  6929. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  6930. + ktime_t praecox;
  6931. +#endif
  6932. #ifdef CONFIG_TIMER_STATS
  6933. int start_pid;
  6934. void *start_site;
  6935. @@ -147,6 +152,7 @@
  6936. int index;
  6937. clockid_t clockid;
  6938. struct timerqueue_head active;
  6939. + struct list_head expired;
  6940. ktime_t resolution;
  6941. ktime_t (*get_time)(void);
  6942. ktime_t softirq_time;
  6943. @@ -192,6 +198,9 @@
  6944. unsigned long nr_hangs;
  6945. ktime_t max_hang_time;
  6946. #endif
  6947. +#ifdef CONFIG_PREEMPT_RT_BASE
  6948. + wait_queue_head_t wait;
  6949. +#endif
  6950. struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
  6951. };
  6952. @@ -379,6 +388,13 @@
  6953. return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
  6954. }
  6955. +/* Softirq preemption could deadlock timer removal */
  6956. +#ifdef CONFIG_PREEMPT_RT_BASE
  6957. + extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
  6958. +#else
  6959. +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
  6960. +#endif
  6961. +
  6962. /* Query timers: */
  6963. extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
  6964. extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
  6965. diff -Nur linux-3.18.12.orig/include/linux/idr.h linux-3.18.12/include/linux/idr.h
  6966. --- linux-3.18.12.orig/include/linux/idr.h 2015-04-20 14:48:02.000000000 -0500
  6967. +++ linux-3.18.12/include/linux/idr.h 2015-04-26 13:32:22.415684003 -0500
  6968. @@ -95,10 +95,14 @@
  6969. * Each idr_preload() should be matched with an invocation of this
  6970. * function. See idr_preload() for details.
  6971. */
  6972. +#ifdef CONFIG_PREEMPT_RT_FULL
  6973. +void idr_preload_end(void);
  6974. +#else
  6975. static inline void idr_preload_end(void)
  6976. {
  6977. preempt_enable();
  6978. }
  6979. +#endif
  6980. /**
  6981. * idr_find - return pointer for given id
  6982. diff -Nur linux-3.18.12.orig/include/linux/init_task.h linux-3.18.12/include/linux/init_task.h
  6983. --- linux-3.18.12.orig/include/linux/init_task.h 2015-04-20 14:48:02.000000000 -0500
  6984. +++ linux-3.18.12/include/linux/init_task.h 2015-04-26 13:32:22.415684003 -0500
  6985. @@ -147,9 +147,16 @@
  6986. # define INIT_PERF_EVENTS(tsk)
  6987. #endif
  6988. +#ifdef CONFIG_PREEMPT_RT_BASE
  6989. +# define INIT_TIMER_LIST .posix_timer_list = NULL,
  6990. +#else
  6991. +# define INIT_TIMER_LIST
  6992. +#endif
  6993. +
  6994. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  6995. # define INIT_VTIME(tsk) \
  6996. - .vtime_seqlock = __SEQLOCK_UNLOCKED(tsk.vtime_seqlock), \
  6997. + .vtime_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.vtime_lock), \
  6998. + .vtime_seq = SEQCNT_ZERO(tsk.vtime_seq), \
  6999. .vtime_snap = 0, \
  7000. .vtime_snap_whence = VTIME_SYS,
  7001. #else
  7002. @@ -219,6 +226,7 @@
  7003. .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
  7004. .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
  7005. .timer_slack_ns = 50000, /* 50 usec default slack */ \
  7006. + INIT_TIMER_LIST \
  7007. .pids = { \
  7008. [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
  7009. [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
  7010. diff -Nur linux-3.18.12.orig/include/linux/interrupt.h linux-3.18.12/include/linux/interrupt.h
  7011. --- linux-3.18.12.orig/include/linux/interrupt.h 2015-04-20 14:48:02.000000000 -0500
  7012. +++ linux-3.18.12/include/linux/interrupt.h 2015-04-26 13:32:22.415684003 -0500
  7013. @@ -57,6 +57,7 @@
  7014. * IRQF_NO_THREAD - Interrupt cannot be threaded
  7015. * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
  7016. * resume time.
  7017. + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
  7018. */
  7019. #define IRQF_DISABLED 0x00000020
  7020. #define IRQF_SHARED 0x00000080
  7021. @@ -70,6 +71,7 @@
  7022. #define IRQF_FORCE_RESUME 0x00008000
  7023. #define IRQF_NO_THREAD 0x00010000
  7024. #define IRQF_EARLY_RESUME 0x00020000
  7025. +#define IRQF_NO_SOFTIRQ_CALL 0x00080000
  7026. #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
  7027. @@ -180,7 +182,7 @@
  7028. #ifdef CONFIG_LOCKDEP
  7029. # define local_irq_enable_in_hardirq() do { } while (0)
  7030. #else
  7031. -# define local_irq_enable_in_hardirq() local_irq_enable()
  7032. +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
  7033. #endif
  7034. extern void disable_irq_nosync(unsigned int irq);
  7035. @@ -210,6 +212,7 @@
  7036. unsigned int irq;
  7037. struct kref kref;
  7038. struct work_struct work;
  7039. + struct list_head list;
  7040. void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
  7041. void (*release)(struct kref *ref);
  7042. };
  7043. @@ -358,9 +361,13 @@
  7044. #ifdef CONFIG_IRQ_FORCED_THREADING
  7045. +# ifndef CONFIG_PREEMPT_RT_BASE
  7046. extern bool force_irqthreads;
  7047. +# else
  7048. +# define force_irqthreads (true)
  7049. +# endif
  7050. #else
  7051. -#define force_irqthreads (0)
  7052. +#define force_irqthreads (false)
  7053. #endif
  7054. #ifndef __ARCH_SET_SOFTIRQ_PENDING
  7055. @@ -416,9 +423,10 @@
  7056. void (*action)(struct softirq_action *);
  7057. };
  7058. +#ifndef CONFIG_PREEMPT_RT_FULL
  7059. asmlinkage void do_softirq(void);
  7060. asmlinkage void __do_softirq(void);
  7061. -
  7062. +static inline void thread_do_softirq(void) { do_softirq(); }
  7063. #ifdef __ARCH_HAS_DO_SOFTIRQ
  7064. void do_softirq_own_stack(void);
  7065. #else
  7066. @@ -427,6 +435,9 @@
  7067. __do_softirq();
  7068. }
  7069. #endif
  7070. +#else
  7071. +extern void thread_do_softirq(void);
  7072. +#endif
  7073. extern void open_softirq(int nr, void (*action)(struct softirq_action *));
  7074. extern void softirq_init(void);
  7075. @@ -434,6 +445,7 @@
  7076. extern void raise_softirq_irqoff(unsigned int nr);
  7077. extern void raise_softirq(unsigned int nr);
  7078. +extern void softirq_check_pending_idle(void);
  7079. DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
  7080. @@ -455,8 +467,9 @@
  7081. to be executed on some cpu at least once after this.
  7082. * If the tasklet is already scheduled, but its execution is still not
  7083. started, it will be executed only once.
  7084. - * If this tasklet is already running on another CPU (or schedule is called
  7085. - from tasklet itself), it is rescheduled for later.
  7086. + * If this tasklet is already running on another CPU, it is rescheduled
  7087. + for later.
  7088. + * Schedule must not be called from the tasklet itself (a lockup occurs)
  7089. * Tasklet is strictly serialized wrt itself, but not
  7090. wrt another tasklets. If client needs some intertask synchronization,
  7091. he makes it with spinlocks.
  7092. @@ -481,27 +494,36 @@
  7093. enum
  7094. {
  7095. TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
  7096. - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
  7097. + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */
  7098. + TASKLET_STATE_PENDING /* Tasklet is pending */
  7099. };
  7100. -#ifdef CONFIG_SMP
  7101. +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED)
  7102. +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN)
  7103. +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
  7104. +
  7105. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  7106. static inline int tasklet_trylock(struct tasklet_struct *t)
  7107. {
  7108. return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
  7109. }
  7110. +static inline int tasklet_tryunlock(struct tasklet_struct *t)
  7111. +{
  7112. + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
  7113. +}
  7114. +
  7115. static inline void tasklet_unlock(struct tasklet_struct *t)
  7116. {
  7117. smp_mb__before_atomic();
  7118. clear_bit(TASKLET_STATE_RUN, &(t)->state);
  7119. }
  7120. -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
  7121. -{
  7122. - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
  7123. -}
  7124. +extern void tasklet_unlock_wait(struct tasklet_struct *t);
  7125. +
  7126. #else
  7127. #define tasklet_trylock(t) 1
  7128. +#define tasklet_tryunlock(t) 1
  7129. #define tasklet_unlock_wait(t) do { } while (0)
  7130. #define tasklet_unlock(t) do { } while (0)
  7131. #endif
  7132. @@ -550,17 +572,8 @@
  7133. smp_mb();
  7134. }
  7135. -static inline void tasklet_enable(struct tasklet_struct *t)
  7136. -{
  7137. - smp_mb__before_atomic();
  7138. - atomic_dec(&t->count);
  7139. -}
  7140. -
  7141. -static inline void tasklet_hi_enable(struct tasklet_struct *t)
  7142. -{
  7143. - smp_mb__before_atomic();
  7144. - atomic_dec(&t->count);
  7145. -}
  7146. +extern void tasklet_enable(struct tasklet_struct *t);
  7147. +extern void tasklet_hi_enable(struct tasklet_struct *t);
  7148. extern void tasklet_kill(struct tasklet_struct *t);
  7149. extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
  7150. @@ -592,6 +605,12 @@
  7151. tasklet_kill(&ttimer->tasklet);
  7152. }
  7153. +#ifdef CONFIG_PREEMPT_RT_FULL
  7154. +extern void softirq_early_init(void);
  7155. +#else
  7156. +static inline void softirq_early_init(void) { }
  7157. +#endif
  7158. +
  7159. /*
  7160. * Autoprobing for irqs:
  7161. *
  7162. diff -Nur linux-3.18.12.orig/include/linux/irqdesc.h linux-3.18.12/include/linux/irqdesc.h
  7163. --- linux-3.18.12.orig/include/linux/irqdesc.h 2015-04-20 14:48:02.000000000 -0500
  7164. +++ linux-3.18.12/include/linux/irqdesc.h 2015-04-26 13:32:22.415684003 -0500
  7165. @@ -63,6 +63,7 @@
  7166. unsigned int irqs_unhandled;
  7167. atomic_t threads_handled;
  7168. int threads_handled_last;
  7169. + u64 random_ip;
  7170. raw_spinlock_t lock;
  7171. struct cpumask *percpu_enabled;
  7172. #ifdef CONFIG_SMP
  7173. diff -Nur linux-3.18.12.orig/include/linux/irqflags.h linux-3.18.12/include/linux/irqflags.h
  7174. --- linux-3.18.12.orig/include/linux/irqflags.h 2015-04-20 14:48:02.000000000 -0500
  7175. +++ linux-3.18.12/include/linux/irqflags.h 2015-04-26 13:32:22.415684003 -0500
  7176. @@ -25,8 +25,6 @@
  7177. # define trace_softirqs_enabled(p) ((p)->softirqs_enabled)
  7178. # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
  7179. # define trace_hardirq_exit() do { current->hardirq_context--; } while (0)
  7180. -# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
  7181. -# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
  7182. # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1,
  7183. #else
  7184. # define trace_hardirqs_on() do { } while (0)
  7185. @@ -39,9 +37,15 @@
  7186. # define trace_softirqs_enabled(p) 0
  7187. # define trace_hardirq_enter() do { } while (0)
  7188. # define trace_hardirq_exit() do { } while (0)
  7189. +# define INIT_TRACE_IRQFLAGS
  7190. +#endif
  7191. +
  7192. +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
  7193. +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
  7194. +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0)
  7195. +#else
  7196. # define lockdep_softirq_enter() do { } while (0)
  7197. # define lockdep_softirq_exit() do { } while (0)
  7198. -# define INIT_TRACE_IRQFLAGS
  7199. #endif
  7200. #if defined(CONFIG_IRQSOFF_TRACER) || \
  7201. @@ -147,4 +151,23 @@
  7202. #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */
  7203. +/*
  7204. + * local_irq* variants depending on RT/!RT
  7205. + */
  7206. +#ifdef CONFIG_PREEMPT_RT_FULL
  7207. +# define local_irq_disable_nort() do { } while (0)
  7208. +# define local_irq_enable_nort() do { } while (0)
  7209. +# define local_irq_save_nort(flags) local_save_flags(flags)
  7210. +# define local_irq_restore_nort(flags) (void)(flags)
  7211. +# define local_irq_disable_rt() local_irq_disable()
  7212. +# define local_irq_enable_rt() local_irq_enable()
  7213. +#else
  7214. +# define local_irq_disable_nort() local_irq_disable()
  7215. +# define local_irq_enable_nort() local_irq_enable()
  7216. +# define local_irq_save_nort(flags) local_irq_save(flags)
  7217. +# define local_irq_restore_nort(flags) local_irq_restore(flags)
  7218. +# define local_irq_disable_rt() do { } while (0)
  7219. +# define local_irq_enable_rt() do { } while (0)
  7220. +#endif
  7221. +
  7222. #endif
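
The *_nort() helpers introduced here let shared code keep hard interrupts disabled on !RT while becoming no-ops (or flags-only) on RT, where the data is protected by sleeping locks instead. A sketch of the intended use, with my_update_stats() standing in for whatever the section protects:

	#include <linux/irqflags.h>

	static void my_update_stats(void);	/* illustrative only */

	static void update_stats(void)
	{
		unsigned long flags;

		local_irq_save_nort(flags);	/* IRQs really off on !RT, flags only on RT */
		my_update_stats();
		local_irq_restore_nort(flags);
	}
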
  7223. diff -Nur linux-3.18.12.orig/include/linux/irq.h linux-3.18.12/include/linux/irq.h
  7224. --- linux-3.18.12.orig/include/linux/irq.h 2015-04-20 14:48:02.000000000 -0500
  7225. +++ linux-3.18.12/include/linux/irq.h 2015-04-26 13:32:22.415684003 -0500
  7226. @@ -73,6 +73,7 @@
  7227. * IRQ_IS_POLLED - Always polled by another interrupt. Exclude
  7228. * it from the spurious interrupt detection
  7229. * mechanism and from core side polling.
  7230. + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT)
  7231. */
  7232. enum {
  7233. IRQ_TYPE_NONE = 0x00000000,
  7234. @@ -98,13 +99,14 @@
  7235. IRQ_NOTHREAD = (1 << 16),
  7236. IRQ_PER_CPU_DEVID = (1 << 17),
  7237. IRQ_IS_POLLED = (1 << 18),
  7238. + IRQ_NO_SOFTIRQ_CALL = (1 << 19),
  7239. };
  7240. #define IRQF_MODIFY_MASK \
  7241. (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
  7242. IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
  7243. IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
  7244. - IRQ_IS_POLLED)
  7245. + IRQ_IS_POLLED | IRQ_NO_SOFTIRQ_CALL)
  7246. #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
  7247. diff -Nur linux-3.18.12.orig/include/linux/irq_work.h linux-3.18.12/include/linux/irq_work.h
  7248. --- linux-3.18.12.orig/include/linux/irq_work.h 2015-04-20 14:48:02.000000000 -0500
  7249. +++ linux-3.18.12/include/linux/irq_work.h 2015-04-26 13:32:22.415684003 -0500
  7250. @@ -16,6 +16,7 @@
  7251. #define IRQ_WORK_BUSY 2UL
  7252. #define IRQ_WORK_FLAGS 3UL
  7253. #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */
  7254. +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */
  7255. struct irq_work {
  7256. unsigned long flags;
  7257. diff -Nur linux-3.18.12.orig/include/linux/jbd_common.h linux-3.18.12/include/linux/jbd_common.h
  7258. --- linux-3.18.12.orig/include/linux/jbd_common.h 2015-04-20 14:48:02.000000000 -0500
  7259. +++ linux-3.18.12/include/linux/jbd_common.h 2015-04-26 13:32:22.415684003 -0500
  7260. @@ -15,32 +15,56 @@
  7261. static inline void jbd_lock_bh_state(struct buffer_head *bh)
  7262. {
  7263. +#ifndef CONFIG_PREEMPT_RT_BASE
  7264. bit_spin_lock(BH_State, &bh->b_state);
  7265. +#else
  7266. + spin_lock(&bh->b_state_lock);
  7267. +#endif
  7268. }
  7269. static inline int jbd_trylock_bh_state(struct buffer_head *bh)
  7270. {
  7271. +#ifndef CONFIG_PREEMPT_RT_BASE
  7272. return bit_spin_trylock(BH_State, &bh->b_state);
  7273. +#else
  7274. + return spin_trylock(&bh->b_state_lock);
  7275. +#endif
  7276. }
  7277. static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
  7278. {
  7279. +#ifndef CONFIG_PREEMPT_RT_BASE
  7280. return bit_spin_is_locked(BH_State, &bh->b_state);
  7281. +#else
  7282. + return spin_is_locked(&bh->b_state_lock);
  7283. +#endif
  7284. }
  7285. static inline void jbd_unlock_bh_state(struct buffer_head *bh)
  7286. {
  7287. +#ifndef CONFIG_PREEMPT_RT_BASE
  7288. bit_spin_unlock(BH_State, &bh->b_state);
  7289. +#else
  7290. + spin_unlock(&bh->b_state_lock);
  7291. +#endif
  7292. }
  7293. static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
  7294. {
  7295. +#ifndef CONFIG_PREEMPT_RT_BASE
  7296. bit_spin_lock(BH_JournalHead, &bh->b_state);
  7297. +#else
  7298. + spin_lock(&bh->b_journal_head_lock);
  7299. +#endif
  7300. }
  7301. static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
  7302. {
  7303. +#ifndef CONFIG_PREEMPT_RT_BASE
  7304. bit_spin_unlock(BH_JournalHead, &bh->b_state);
  7305. +#else
  7306. + spin_unlock(&bh->b_journal_head_lock);
  7307. +#endif
  7308. }
  7309. #endif
  7310. diff -Nur linux-3.18.12.orig/include/linux/jump_label.h linux-3.18.12/include/linux/jump_label.h
  7311. --- linux-3.18.12.orig/include/linux/jump_label.h 2015-04-20 14:48:02.000000000 -0500
  7312. +++ linux-3.18.12/include/linux/jump_label.h 2015-04-26 13:32:22.419684003 -0500
  7313. @@ -55,7 +55,8 @@
  7314. "%s used before call to jump_label_init", \
  7315. __func__)
  7316. -#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
  7317. +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) && \
  7318. + !defined(CONFIG_PREEMPT_BASE)
  7319. struct static_key {
  7320. atomic_t enabled;
  7321. diff -Nur linux-3.18.12.orig/include/linux/kdb.h linux-3.18.12/include/linux/kdb.h
  7322. --- linux-3.18.12.orig/include/linux/kdb.h 2015-04-20 14:48:02.000000000 -0500
  7323. +++ linux-3.18.12/include/linux/kdb.h 2015-04-26 13:32:22.419684003 -0500
  7324. @@ -116,7 +116,7 @@
  7325. extern __printf(1, 0) int vkdb_printf(const char *fmt, va_list args);
  7326. extern __printf(1, 2) int kdb_printf(const char *, ...);
  7327. typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
  7328. -
  7329. +#define in_kdb_printk() (kdb_trap_printk)
  7330. extern void kdb_init(int level);
  7331. /* Access to kdb specific polling devices */
  7332. @@ -151,6 +151,7 @@
  7333. extern int kdb_unregister(char *);
  7334. #else /* ! CONFIG_KGDB_KDB */
  7335. static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
  7336. +#define in_kdb_printk() (0)
  7337. static inline void kdb_init(int level) {}
  7338. static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
  7339. char *help, short minlen) { return 0; }
  7340. diff -Nur linux-3.18.12.orig/include/linux/kernel.h linux-3.18.12/include/linux/kernel.h
  7341. --- linux-3.18.12.orig/include/linux/kernel.h 2015-04-20 14:48:02.000000000 -0500
  7342. +++ linux-3.18.12/include/linux/kernel.h 2015-04-26 13:32:22.419684003 -0500
  7343. @@ -451,6 +451,7 @@
  7344. SYSTEM_HALT,
  7345. SYSTEM_POWER_OFF,
  7346. SYSTEM_RESTART,
  7347. + SYSTEM_SUSPEND,
  7348. } system_state;
  7349. #define TAINT_PROPRIETARY_MODULE 0
  7350. diff -Nur linux-3.18.12.orig/include/linux/kvm_host.h linux-3.18.12/include/linux/kvm_host.h
  7351. --- linux-3.18.12.orig/include/linux/kvm_host.h 2015-04-20 14:48:02.000000000 -0500
  7352. +++ linux-3.18.12/include/linux/kvm_host.h 2015-04-26 13:32:22.419684003 -0500
  7353. @@ -244,7 +244,7 @@
  7354. int fpu_active;
  7355. int guest_fpu_loaded, guest_xcr0_loaded;
  7356. - wait_queue_head_t wq;
  7357. + struct swait_head wq;
  7358. struct pid *pid;
  7359. int sigset_active;
  7360. sigset_t sigset;
  7361. @@ -687,7 +687,7 @@
  7362. }
  7363. #endif
  7364. -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
  7365. +static inline struct swait_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
  7366. {
  7367. #ifdef __KVM_HAVE_ARCH_WQP
  7368. return vcpu->arch.wqp;
  7369. diff -Nur linux-3.18.12.orig/include/linux/lglock.h linux-3.18.12/include/linux/lglock.h
  7370. --- linux-3.18.12.orig/include/linux/lglock.h 2015-04-20 14:48:02.000000000 -0500
  7371. +++ linux-3.18.12/include/linux/lglock.h 2015-04-26 13:32:22.419684003 -0500
  7372. @@ -34,22 +34,39 @@
  7373. #endif
  7374. struct lglock {
  7375. +#ifndef CONFIG_PREEMPT_RT_FULL
  7376. arch_spinlock_t __percpu *lock;
  7377. +#else
  7378. + struct rt_mutex __percpu *lock;
  7379. +#endif
  7380. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  7381. struct lock_class_key lock_key;
  7382. struct lockdep_map lock_dep_map;
  7383. #endif
  7384. };
  7385. -#define DEFINE_LGLOCK(name) \
  7386. +#ifndef CONFIG_PREEMPT_RT_FULL
  7387. +# define DEFINE_LGLOCK(name) \
  7388. static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
  7389. = __ARCH_SPIN_LOCK_UNLOCKED; \
  7390. struct lglock name = { .lock = &name ## _lock }
  7391. -#define DEFINE_STATIC_LGLOCK(name) \
  7392. +# define DEFINE_STATIC_LGLOCK(name) \
  7393. static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
  7394. = __ARCH_SPIN_LOCK_UNLOCKED; \
  7395. static struct lglock name = { .lock = &name ## _lock }
  7396. +#else
  7397. +
  7398. +# define DEFINE_LGLOCK(name) \
  7399. + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock) \
  7400. + = __RT_MUTEX_INITIALIZER( name ## _lock); \
  7401. + struct lglock name = { .lock = &name ## _lock }
  7402. +
  7403. +# define DEFINE_STATIC_LGLOCK(name) \
  7404. + static DEFINE_PER_CPU(struct rt_mutex, name ## _lock) \
  7405. + = __RT_MUTEX_INITIALIZER( name ## _lock); \
  7406. + static struct lglock name = { .lock = &name ## _lock }
  7407. +#endif
  7408. void lg_lock_init(struct lglock *lg, char *name);
  7409. void lg_local_lock(struct lglock *lg);
  7410. @@ -59,6 +76,12 @@
  7411. void lg_global_lock(struct lglock *lg);
  7412. void lg_global_unlock(struct lglock *lg);
  7413. +#ifndef CONFIG_PREEMPT_RT_FULL
  7414. +#define lg_global_trylock_relax(name) lg_global_lock(name)
  7415. +#else
  7416. +void lg_global_trylock_relax(struct lglock *lg);
  7417. +#endif
  7418. +
  7419. #else
  7420. /* When !CONFIG_SMP, map lglock to spinlock */
  7421. #define lglock spinlock
  7422. diff -Nur linux-3.18.12.orig/include/linux/list_bl.h linux-3.18.12/include/linux/list_bl.h
  7423. --- linux-3.18.12.orig/include/linux/list_bl.h 2015-04-20 14:48:02.000000000 -0500
  7424. +++ linux-3.18.12/include/linux/list_bl.h 2015-04-26 13:32:22.419684003 -0500
  7425. @@ -2,6 +2,7 @@
  7426. #define _LINUX_LIST_BL_H
  7427. #include <linux/list.h>
  7428. +#include <linux/spinlock.h>
  7429. #include <linux/bit_spinlock.h>
  7430. /*
  7431. @@ -32,13 +33,22 @@
  7432. struct hlist_bl_head {
  7433. struct hlist_bl_node *first;
  7434. +#ifdef CONFIG_PREEMPT_RT_BASE
  7435. + raw_spinlock_t lock;
  7436. +#endif
  7437. };
  7438. struct hlist_bl_node {
  7439. struct hlist_bl_node *next, **pprev;
  7440. };
  7441. -#define INIT_HLIST_BL_HEAD(ptr) \
  7442. - ((ptr)->first = NULL)
  7443. +
  7444. +static inline void INIT_HLIST_BL_HEAD(struct hlist_bl_head *h)
  7445. +{
  7446. + h->first = NULL;
  7447. +#ifdef CONFIG_PREEMPT_RT_BASE
  7448. + raw_spin_lock_init(&h->lock);
  7449. +#endif
  7450. +}
  7451. static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
  7452. {
  7453. @@ -117,12 +127,26 @@
  7454. static inline void hlist_bl_lock(struct hlist_bl_head *b)
  7455. {
  7456. +#ifndef CONFIG_PREEMPT_RT_BASE
  7457. bit_spin_lock(0, (unsigned long *)b);
  7458. +#else
  7459. + raw_spin_lock(&b->lock);
  7460. +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
  7461. + __set_bit(0, (unsigned long *)b);
  7462. +#endif
  7463. +#endif
  7464. }
  7465. static inline void hlist_bl_unlock(struct hlist_bl_head *b)
  7466. {
  7467. +#ifndef CONFIG_PREEMPT_RT_BASE
  7468. __bit_spin_unlock(0, (unsigned long *)b);
  7469. +#else
  7470. +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
  7471. + __clear_bit(0, (unsigned long *)b);
  7472. +#endif
  7473. + raw_spin_unlock(&b->lock);
  7474. +#endif
  7475. }
  7476. static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
  7477. diff -Nur linux-3.18.12.orig/include/linux/locallock.h linux-3.18.12/include/linux/locallock.h
  7478. --- linux-3.18.12.orig/include/linux/locallock.h 1969-12-31 18:00:00.000000000 -0600
  7479. +++ linux-3.18.12/include/linux/locallock.h 2015-04-26 13:32:22.419684003 -0500
  7480. @@ -0,0 +1,270 @@
  7481. +#ifndef _LINUX_LOCALLOCK_H
  7482. +#define _LINUX_LOCALLOCK_H
  7483. +
  7484. +#include <linux/percpu.h>
  7485. +#include <linux/spinlock.h>
  7486. +
  7487. +#ifdef CONFIG_PREEMPT_RT_BASE
  7488. +
  7489. +#ifdef CONFIG_DEBUG_SPINLOCK
  7490. +# define LL_WARN(cond) WARN_ON(cond)
  7491. +#else
  7492. +# define LL_WARN(cond) do { } while (0)
  7493. +#endif
  7494. +
  7495. +/*
  7496. + * per cpu lock based substitute for local_irq_*()
  7497. + */
  7498. +struct local_irq_lock {
  7499. + spinlock_t lock;
  7500. + struct task_struct *owner;
  7501. + int nestcnt;
  7502. + unsigned long flags;
  7503. +};
  7504. +
  7505. +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \
  7506. + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \
  7507. + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
  7508. +
  7509. +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \
  7510. + DECLARE_PER_CPU(struct local_irq_lock, lvar)
  7511. +
  7512. +#define local_irq_lock_init(lvar) \
  7513. + do { \
  7514. + int __cpu; \
  7515. + for_each_possible_cpu(__cpu) \
  7516. + spin_lock_init(&per_cpu(lvar, __cpu).lock); \
  7517. + } while (0)
  7518. +
  7519. +/*
  7520. + * spin_lock|trylock|unlock_local flavour that does not migrate disable
  7521. + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
  7522. + * already takes care of the migrate_disable/enable
  7523. + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
  7524. + */
  7525. +#ifdef CONFIG_PREEMPT_RT_FULL
  7526. +# define spin_lock_local(lock) rt_spin_lock(lock)
  7527. +# define spin_trylock_local(lock) rt_spin_trylock(lock)
  7528. +# define spin_unlock_local(lock) rt_spin_unlock(lock)
  7529. +#else
  7530. +# define spin_lock_local(lock) spin_lock(lock)
  7531. +# define spin_trylock_local(lock) spin_trylock(lock)
  7532. +# define spin_unlock_local(lock) spin_unlock(lock)
  7533. +#endif
  7534. +
  7535. +static inline void __local_lock(struct local_irq_lock *lv)
  7536. +{
  7537. + if (lv->owner != current) {
  7538. + spin_lock_local(&lv->lock);
  7539. + LL_WARN(lv->owner);
  7540. + LL_WARN(lv->nestcnt);
  7541. + lv->owner = current;
  7542. + }
  7543. + lv->nestcnt++;
  7544. +}
  7545. +
  7546. +#define local_lock(lvar) \
  7547. + do { __local_lock(&get_local_var(lvar)); } while (0)
  7548. +
  7549. +static inline int __local_trylock(struct local_irq_lock *lv)
  7550. +{
  7551. + if (lv->owner != current && spin_trylock_local(&lv->lock)) {
  7552. + LL_WARN(lv->owner);
  7553. + LL_WARN(lv->nestcnt);
  7554. + lv->owner = current;
  7555. + lv->nestcnt = 1;
  7556. + return 1;
  7557. + }
  7558. + return 0;
  7559. +}
  7560. +
  7561. +#define local_trylock(lvar) \
  7562. + ({ \
  7563. + int __locked; \
  7564. + __locked = __local_trylock(&get_local_var(lvar)); \
  7565. + if (!__locked) \
  7566. + put_local_var(lvar); \
  7567. + __locked; \
  7568. + })
  7569. +
  7570. +static inline void __local_unlock(struct local_irq_lock *lv)
  7571. +{
  7572. + LL_WARN(lv->nestcnt == 0);
  7573. + LL_WARN(lv->owner != current);
  7574. + if (--lv->nestcnt)
  7575. + return;
  7576. +
  7577. + lv->owner = NULL;
  7578. + spin_unlock_local(&lv->lock);
  7579. +}
  7580. +
  7581. +#define local_unlock(lvar) \
  7582. + do { \
  7583. + __local_unlock(&__get_cpu_var(lvar)); \
  7584. + put_local_var(lvar); \
  7585. + } while (0)
  7586. +
  7587. +static inline void __local_lock_irq(struct local_irq_lock *lv)
  7588. +{
  7589. + spin_lock_irqsave(&lv->lock, lv->flags);
  7590. + LL_WARN(lv->owner);
  7591. + LL_WARN(lv->nestcnt);
  7592. + lv->owner = current;
  7593. + lv->nestcnt = 1;
  7594. +}
  7595. +
  7596. +#define local_lock_irq(lvar) \
  7597. + do { __local_lock_irq(&get_local_var(lvar)); } while (0)
  7598. +
  7599. +#define local_lock_irq_on(lvar, cpu) \
  7600. + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
  7601. +
  7602. +static inline void __local_unlock_irq(struct local_irq_lock *lv)
  7603. +{
  7604. + LL_WARN(!lv->nestcnt);
  7605. + LL_WARN(lv->owner != current);
  7606. + lv->owner = NULL;
  7607. + lv->nestcnt = 0;
  7608. + spin_unlock_irq(&lv->lock);
  7609. +}
  7610. +
  7611. +#define local_unlock_irq(lvar) \
  7612. + do { \
  7613. + __local_unlock_irq(&__get_cpu_var(lvar)); \
  7614. + put_local_var(lvar); \
  7615. + } while (0)
  7616. +
  7617. +#define local_unlock_irq_on(lvar, cpu) \
  7618. + do { \
  7619. + __local_unlock_irq(&per_cpu(lvar, cpu)); \
  7620. + } while (0)
  7621. +
  7622. +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
  7623. +{
  7624. + if (lv->owner != current) {
  7625. + __local_lock_irq(lv);
  7626. + return 0;
  7627. + } else {
  7628. + lv->nestcnt++;
  7629. + return 1;
  7630. + }
  7631. +}
  7632. +
  7633. +#define local_lock_irqsave(lvar, _flags) \
  7634. + do { \
  7635. + if (__local_lock_irqsave(&get_local_var(lvar))) \
  7636. + put_local_var(lvar); \
  7637. + _flags = __get_cpu_var(lvar).flags; \
  7638. + } while (0)
  7639. +
  7640. +#define local_lock_irqsave_on(lvar, _flags, cpu) \
  7641. + do { \
  7642. + __local_lock_irqsave(&per_cpu(lvar, cpu)); \
  7643. + _flags = per_cpu(lvar, cpu).flags; \
  7644. + } while (0)
  7645. +
  7646. +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
  7647. + unsigned long flags)
  7648. +{
  7649. + LL_WARN(!lv->nestcnt);
  7650. + LL_WARN(lv->owner != current);
  7651. + if (--lv->nestcnt)
  7652. + return 0;
  7653. +
  7654. + lv->owner = NULL;
  7655. + spin_unlock_irqrestore(&lv->lock, lv->flags);
  7656. + return 1;
  7657. +}
  7658. +
  7659. +#define local_unlock_irqrestore(lvar, flags) \
  7660. + do { \
  7661. + if (__local_unlock_irqrestore(&__get_cpu_var(lvar), flags)) \
  7662. + put_local_var(lvar); \
  7663. + } while (0)
  7664. +
  7665. +#define local_unlock_irqrestore_on(lvar, flags, cpu) \
  7666. + do { \
  7667. + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \
  7668. + } while (0)
  7669. +
  7670. +#define local_spin_trylock_irq(lvar, lock) \
  7671. + ({ \
  7672. + int __locked; \
  7673. + local_lock_irq(lvar); \
  7674. + __locked = spin_trylock(lock); \
  7675. + if (!__locked) \
  7676. + local_unlock_irq(lvar); \
  7677. + __locked; \
  7678. + })
  7679. +
  7680. +#define local_spin_lock_irq(lvar, lock) \
  7681. + do { \
  7682. + local_lock_irq(lvar); \
  7683. + spin_lock(lock); \
  7684. + } while (0)
  7685. +
  7686. +#define local_spin_unlock_irq(lvar, lock) \
  7687. + do { \
  7688. + spin_unlock(lock); \
  7689. + local_unlock_irq(lvar); \
  7690. + } while (0)
  7691. +
  7692. +#define local_spin_lock_irqsave(lvar, lock, flags) \
  7693. + do { \
  7694. + local_lock_irqsave(lvar, flags); \
  7695. + spin_lock(lock); \
  7696. + } while (0)
  7697. +
  7698. +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
  7699. + do { \
  7700. + spin_unlock(lock); \
  7701. + local_unlock_irqrestore(lvar, flags); \
  7702. + } while (0)
  7703. +
  7704. +#define get_locked_var(lvar, var) \
  7705. + (*({ \
  7706. + local_lock(lvar); \
  7707. + &__get_cpu_var(var); \
  7708. + }))
  7709. +
  7710. +#define put_locked_var(lvar, var) local_unlock(lvar);
  7711. +
  7712. +#define local_lock_cpu(lvar) \
  7713. + ({ \
  7714. + local_lock(lvar); \
  7715. + smp_processor_id(); \
  7716. + })
  7717. +
  7718. +#define local_unlock_cpu(lvar) local_unlock(lvar)
  7719. +
  7720. +#else /* PREEMPT_RT_BASE */
  7721. +
  7722. +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar
  7723. +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar
  7724. +
  7725. +static inline void local_irq_lock_init(int lvar) { }
  7726. +
  7727. +#define local_lock(lvar) preempt_disable()
  7728. +#define local_unlock(lvar) preempt_enable()
  7729. +#define local_lock_irq(lvar) local_irq_disable()
  7730. +#define local_unlock_irq(lvar) local_irq_enable()
  7731. +#define local_lock_irqsave(lvar, flags) local_irq_save(flags)
  7732. +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags)
  7733. +
  7734. +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock)
  7735. +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock)
  7736. +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock)
  7737. +#define local_spin_lock_irqsave(lvar, lock, flags) \
  7738. + spin_lock_irqsave(lock, flags)
  7739. +#define local_spin_unlock_irqrestore(lvar, lock, flags) \
  7740. + spin_unlock_irqrestore(lock, flags)
  7741. +
  7742. +#define get_locked_var(lvar, var) get_cpu_var(var)
  7743. +#define put_locked_var(lvar, var) put_cpu_var(var)
  7744. +
  7745. +#define local_lock_cpu(lvar) get_cpu()
  7746. +#define local_unlock_cpu(lvar) put_cpu()
  7747. +
  7748. +#endif
  7749. +
  7750. +#endif
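
locallock.h gives per-CPU sections a named lock: on RT it is a per-CPU sleeping spinlock plus migrate_disable(), while on !RT the whole construct collapses back to preempt/irq disabling. A hedged usage sketch with an illustrative per-CPU counter:

	#include <linux/locallock.h>
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, my_counter);	/* illustrative only */
	static DEFINE_LOCAL_IRQ_LOCK(my_counter_lock);

	static void bump_counter(void)
	{
		local_lock(my_counter_lock);		/* pins the task to this CPU on RT */
		__get_cpu_var(my_counter)++;
		local_unlock(my_counter_lock);
	}
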
  7751. diff -Nur linux-3.18.12.orig/include/linux/mm_types.h linux-3.18.12/include/linux/mm_types.h
  7752. --- linux-3.18.12.orig/include/linux/mm_types.h 2015-04-20 14:48:02.000000000 -0500
  7753. +++ linux-3.18.12/include/linux/mm_types.h 2015-04-26 13:32:22.419684003 -0500
  7754. @@ -11,6 +11,7 @@
  7755. #include <linux/completion.h>
  7756. #include <linux/cpumask.h>
  7757. #include <linux/page-debug-flags.h>
  7758. +#include <linux/rcupdate.h>
  7759. #include <linux/uprobes.h>
  7760. #include <linux/page-flags-layout.h>
  7761. #include <asm/page.h>
  7762. @@ -454,6 +455,9 @@
  7763. bool tlb_flush_pending;
  7764. #endif
  7765. struct uprobes_state uprobes_state;
  7766. +#ifdef CONFIG_PREEMPT_RT_BASE
  7767. + struct rcu_head delayed_drop;
  7768. +#endif
  7769. };
  7770. static inline void mm_init_cpumask(struct mm_struct *mm)
  7771. diff -Nur linux-3.18.12.orig/include/linux/mutex.h linux-3.18.12/include/linux/mutex.h
  7772. --- linux-3.18.12.orig/include/linux/mutex.h 2015-04-20 14:48:02.000000000 -0500
  7773. +++ linux-3.18.12/include/linux/mutex.h 2015-04-26 13:32:22.419684003 -0500
  7774. @@ -19,6 +19,17 @@
  7775. #include <asm/processor.h>
  7776. #include <linux/osq_lock.h>
  7777. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  7778. +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
  7779. + , .dep_map = { .name = #lockname }
  7780. +#else
  7781. +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
  7782. +#endif
  7783. +
  7784. +#ifdef CONFIG_PREEMPT_RT_FULL
  7785. +# include <linux/mutex_rt.h>
  7786. +#else
  7787. +
  7788. /*
  7789. * Simple, straightforward mutexes with strict semantics:
  7790. *
  7791. @@ -100,13 +111,6 @@
  7792. static inline void mutex_destroy(struct mutex *lock) {}
  7793. #endif
  7794. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  7795. -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
  7796. - , .dep_map = { .name = #lockname }
  7797. -#else
  7798. -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
  7799. -#endif
  7800. -
  7801. #define __MUTEX_INITIALIZER(lockname) \
  7802. { .count = ATOMIC_INIT(1) \
  7803. , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
  7804. @@ -174,6 +178,8 @@
  7805. extern int mutex_trylock(struct mutex *lock);
  7806. extern void mutex_unlock(struct mutex *lock);
  7807. +#endif /* !PREEMPT_RT_FULL */
  7808. +
  7809. extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
  7810. #endif /* __LINUX_MUTEX_H */
  7811. diff -Nur linux-3.18.12.orig/include/linux/mutex_rt.h linux-3.18.12/include/linux/mutex_rt.h
  7812. --- linux-3.18.12.orig/include/linux/mutex_rt.h 1969-12-31 18:00:00.000000000 -0600
  7813. +++ linux-3.18.12/include/linux/mutex_rt.h 2015-04-26 13:32:22.419684003 -0500
  7814. @@ -0,0 +1,84 @@
  7815. +#ifndef __LINUX_MUTEX_RT_H
  7816. +#define __LINUX_MUTEX_RT_H
  7817. +
  7818. +#ifndef __LINUX_MUTEX_H
  7819. +#error "Please include mutex.h"
  7820. +#endif
  7821. +
  7822. +#include <linux/rtmutex.h>
  7823. +
  7824. +/* FIXME: Just for __lockfunc */
  7825. +#include <linux/spinlock.h>
  7826. +
  7827. +struct mutex {
  7828. + struct rt_mutex lock;
  7829. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  7830. + struct lockdep_map dep_map;
  7831. +#endif
  7832. +};
  7833. +
  7834. +#define __MUTEX_INITIALIZER(mutexname) \
  7835. + { \
  7836. + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \
  7837. + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \
  7838. + }
  7839. +
  7840. +#define DEFINE_MUTEX(mutexname) \
  7841. + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
  7842. +
  7843. +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
  7844. +extern void __lockfunc _mutex_lock(struct mutex *lock);
  7845. +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
  7846. +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
  7847. +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
  7848. +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
  7849. +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
  7850. +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
  7851. +extern int __lockfunc _mutex_trylock(struct mutex *lock);
  7852. +extern void __lockfunc _mutex_unlock(struct mutex *lock);
  7853. +
  7854. +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock)
  7855. +#define mutex_lock(l) _mutex_lock(l)
  7856. +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l)
  7857. +#define mutex_lock_killable(l) _mutex_lock_killable(l)
  7858. +#define mutex_trylock(l) _mutex_trylock(l)
  7859. +#define mutex_unlock(l) _mutex_unlock(l)
  7860. +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock)
  7861. +
  7862. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  7863. +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s)
  7864. +# define mutex_lock_interruptible_nested(l, s) \
  7865. + _mutex_lock_interruptible_nested(l, s)
  7866. +# define mutex_lock_killable_nested(l, s) \
  7867. + _mutex_lock_killable_nested(l, s)
  7868. +
  7869. +# define mutex_lock_nest_lock(lock, nest_lock) \
  7870. +do { \
  7871. + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
  7872. + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \
  7873. +} while (0)
  7874. +
  7875. +#else
  7876. +# define mutex_lock_nested(l, s) _mutex_lock(l)
  7877. +# define mutex_lock_interruptible_nested(l, s) \
  7878. + _mutex_lock_interruptible(l)
  7879. +# define mutex_lock_killable_nested(l, s) \
  7880. + _mutex_lock_killable(l)
  7881. +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
  7882. +#endif
  7883. +
  7884. +# define mutex_init(mutex) \
  7885. +do { \
  7886. + static struct lock_class_key __key; \
  7887. + \
  7888. + rt_mutex_init(&(mutex)->lock); \
  7889. + __mutex_do_init((mutex), #mutex, &__key); \
  7890. +} while (0)
  7891. +
  7892. +# define __mutex_init(mutex, name, key) \
  7893. +do { \
  7894. + rt_mutex_init(&(mutex)->lock); \
  7895. + __mutex_do_init((mutex), name, key); \
  7896. +} while (0)
  7897. +
  7898. +#endif
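
mutex_rt.h re-implements struct mutex on top of rt_mutex so blocked waiters get priority inheritance; the external API is unchanged and existing users compile as-is. A trivial sketch (the lock and the state it guards are illustrative):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(my_lock);		/* rt_mutex-backed on RT */
	static int my_shared_state;

	static void update_state(int v)
	{
		mutex_lock(&my_lock);		/* may boost the owner's priority on RT */
		my_shared_state = v;
		mutex_unlock(&my_lock);
	}
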
  7899. diff -Nur linux-3.18.12.orig/include/linux/netdevice.h linux-3.18.12/include/linux/netdevice.h
  7900. --- linux-3.18.12.orig/include/linux/netdevice.h 2015-04-20 14:48:02.000000000 -0500
  7901. +++ linux-3.18.12/include/linux/netdevice.h 2015-04-26 13:32:22.419684003 -0500
  7902. @@ -2345,6 +2345,7 @@
  7903. unsigned int dropped;
  7904. struct sk_buff_head input_pkt_queue;
  7905. struct napi_struct backlog;
  7906. + struct sk_buff_head tofree_queue;
  7907. #ifdef CONFIG_NET_FLOW_LIMIT
  7908. struct sd_flow_limit __rcu *flow_limit;
  7909. diff -Nur linux-3.18.12.orig/include/linux/netfilter/x_tables.h linux-3.18.12/include/linux/netfilter/x_tables.h
  7910. --- linux-3.18.12.orig/include/linux/netfilter/x_tables.h 2015-04-20 14:48:02.000000000 -0500
  7911. +++ linux-3.18.12/include/linux/netfilter/x_tables.h 2015-04-26 13:32:22.419684003 -0500
  7912. @@ -3,6 +3,7 @@
  7913. #include <linux/netdevice.h>
  7914. +#include <linux/locallock.h>
  7915. #include <uapi/linux/netfilter/x_tables.h>
  7916. /**
  7917. @@ -282,6 +283,8 @@
  7918. */
  7919. DECLARE_PER_CPU(seqcount_t, xt_recseq);
  7920. +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
  7921. +
  7922. /**
  7923. * xt_write_recseq_begin - start of a write section
  7924. *
  7925. @@ -296,6 +299,9 @@
  7926. {
  7927. unsigned int addend;
  7928. + /* RT protection */
  7929. + local_lock(xt_write_lock);
  7930. +
  7931. /*
  7932. * Low order bit of sequence is set if we already
  7933. * called xt_write_recseq_begin().
  7934. @@ -326,6 +332,7 @@
  7935. /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
  7936. smp_wmb();
  7937. __this_cpu_add(xt_recseq.sequence, addend);
  7938. + local_unlock(xt_write_lock);
  7939. }
  7940. /*
  7941. diff -Nur linux-3.18.12.orig/include/linux/notifier.h linux-3.18.12/include/linux/notifier.h
  7942. --- linux-3.18.12.orig/include/linux/notifier.h 2015-04-20 14:48:02.000000000 -0500
  7943. +++ linux-3.18.12/include/linux/notifier.h 2015-04-26 13:32:22.419684003 -0500
  7944. @@ -6,7 +6,7 @@
  7945. *
  7946. * Alan Cox <Alan.Cox@linux.org>
  7947. */
  7948. -
  7949. +
  7950. #ifndef _LINUX_NOTIFIER_H
  7951. #define _LINUX_NOTIFIER_H
  7952. #include <linux/errno.h>
  7953. @@ -42,9 +42,7 @@
  7954. * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
  7955. * As compensation, srcu_notifier_chain_unregister() is rather expensive.
  7956. * SRCU notifier chains should be used when the chain will be called very
  7957. - * often but notifier_blocks will seldom be removed. Also, SRCU notifier
  7958. - * chains are slightly more difficult to use because they require special
  7959. - * runtime initialization.
  7960. + * often but notifier_blocks will seldom be removed.
  7961. */
  7962. typedef int (*notifier_fn_t)(struct notifier_block *nb,
  7963. @@ -88,7 +86,7 @@
  7964. (name)->head = NULL; \
  7965. } while (0)
  7966. -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
  7967. +/* srcu_notifier_heads must be cleaned up dynamically */
  7968. extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
  7969. #define srcu_cleanup_notifier_head(name) \
  7970. cleanup_srcu_struct(&(name)->srcu);
  7971. @@ -101,7 +99,13 @@
  7972. .head = NULL }
  7973. #define RAW_NOTIFIER_INIT(name) { \
  7974. .head = NULL }
  7975. -/* srcu_notifier_heads cannot be initialized statically */
  7976. +
  7977. +#define SRCU_NOTIFIER_INIT(name, pcpu) \
  7978. + { \
  7979. + .mutex = __MUTEX_INITIALIZER(name.mutex), \
  7980. + .head = NULL, \
  7981. + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \
  7982. + }
  7983. #define ATOMIC_NOTIFIER_HEAD(name) \
  7984. struct atomic_notifier_head name = \
  7985. @@ -113,6 +117,18 @@
  7986. struct raw_notifier_head name = \
  7987. RAW_NOTIFIER_INIT(name)
  7988. +#define _SRCU_NOTIFIER_HEAD(name, mod) \
  7989. + static DEFINE_PER_CPU(struct srcu_struct_array, \
  7990. + name##_head_srcu_array); \
  7991. + mod struct srcu_notifier_head name = \
  7992. + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
  7993. +
  7994. +#define SRCU_NOTIFIER_HEAD(name) \
  7995. + _SRCU_NOTIFIER_HEAD(name, )
  7996. +
  7997. +#define SRCU_NOTIFIER_HEAD_STATIC(name) \
  7998. + _SRCU_NOTIFIER_HEAD(name, static)
  7999. +
  8000. #ifdef __KERNEL__
  8001. extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
  8002. @@ -182,12 +198,12 @@
  8003. /*
  8004. * Declared notifiers so far. I can imagine quite a few more chains
  8005. - * over time (eg laptop power reset chains, reboot chain (to clean
  8006. + * over time (eg laptop power reset chains, reboot chain (to clean
  8007. * device units up), device [un]mount chain, module load/unload chain,
  8008. - * low memory chain, screenblank chain (for plug in modular screenblankers)
  8009. + * low memory chain, screenblank chain (for plug in modular screenblankers)
  8010. * VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
  8011. */
  8012. -
  8013. +
  8014. /* CPU notfiers are defined in include/linux/cpu.h. */
  8015. /* netdevice notifiers are defined in include/linux/netdevice.h */
  8016. diff -Nur linux-3.18.12.orig/include/linux/percpu.h linux-3.18.12/include/linux/percpu.h
  8017. --- linux-3.18.12.orig/include/linux/percpu.h 2015-04-20 14:48:02.000000000 -0500
  8018. +++ linux-3.18.12/include/linux/percpu.h 2015-04-26 13:32:22.419684003 -0500
  8019. @@ -23,6 +23,35 @@
  8020. PERCPU_MODULE_RESERVE)
  8021. #endif
  8022. +#ifdef CONFIG_PREEMPT_RT_FULL
  8023. +
  8024. +#define get_local_var(var) (*({ \
  8025. + migrate_disable(); \
  8026. + &__get_cpu_var(var); }))
  8027. +
  8028. +#define put_local_var(var) do { \
  8029. + (void)&(var); \
  8030. + migrate_enable(); \
  8031. +} while (0)
  8032. +
  8033. +# define get_local_ptr(var) ({ \
  8034. + migrate_disable(); \
  8035. + this_cpu_ptr(var); })
  8036. +
  8037. +# define put_local_ptr(var) do { \
  8038. + (void)(var); \
  8039. + migrate_enable(); \
  8040. +} while (0)
  8041. +
  8042. +#else
  8043. +
  8044. +#define get_local_var(var) get_cpu_var(var)
  8045. +#define put_local_var(var) put_cpu_var(var)
  8046. +#define get_local_ptr(var) get_cpu_ptr(var)
  8047. +#define put_local_ptr(var) put_cpu_ptr(var)
  8048. +
  8049. +#endif
  8050. +
  8051. /* minimum unit size, also is the maximum supported allocation size */
  8052. #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
  8053. diff -Nur linux-3.18.12.orig/include/linux/pid.h linux-3.18.12/include/linux/pid.h
  8054. --- linux-3.18.12.orig/include/linux/pid.h 2015-04-20 14:48:02.000000000 -0500
  8055. +++ linux-3.18.12/include/linux/pid.h 2015-04-26 13:32:22.419684003 -0500
  8056. @@ -2,6 +2,7 @@
  8057. #define _LINUX_PID_H
  8058. #include <linux/rcupdate.h>
  8059. +#include <linux/atomic.h>
  8060. enum pid_type
  8061. {
  8062. diff -Nur linux-3.18.12.orig/include/linux/preempt.h linux-3.18.12/include/linux/preempt.h
  8063. --- linux-3.18.12.orig/include/linux/preempt.h 2015-04-20 14:48:02.000000000 -0500
  8064. +++ linux-3.18.12/include/linux/preempt.h 2015-04-26 13:32:22.419684003 -0500
  8065. @@ -33,6 +33,20 @@
  8066. #define preempt_count_inc() preempt_count_add(1)
  8067. #define preempt_count_dec() preempt_count_sub(1)
  8068. +#ifdef CONFIG_PREEMPT_LAZY
  8069. +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0)
  8070. +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0)
  8071. +#define inc_preempt_lazy_count() add_preempt_lazy_count(1)
  8072. +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1)
  8073. +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count)
  8074. +#else
  8075. +#define add_preempt_lazy_count(val) do { } while (0)
  8076. +#define sub_preempt_lazy_count(val) do { } while (0)
  8077. +#define inc_preempt_lazy_count() do { } while (0)
  8078. +#define dec_preempt_lazy_count() do { } while (0)
  8079. +#define preempt_lazy_count() (0)
  8080. +#endif
  8081. +
  8082. #ifdef CONFIG_PREEMPT_COUNT
  8083. #define preempt_disable() \
  8084. @@ -41,13 +55,25 @@
  8085. barrier(); \
  8086. } while (0)
  8087. +#define preempt_lazy_disable() \
  8088. +do { \
  8089. + inc_preempt_lazy_count(); \
  8090. + barrier(); \
  8091. +} while (0)
  8092. +
  8093. #define sched_preempt_enable_no_resched() \
  8094. do { \
  8095. barrier(); \
  8096. preempt_count_dec(); \
  8097. } while (0)
  8098. -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
  8099. +#ifdef CONFIG_PREEMPT_RT_BASE
  8100. +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
  8101. +# define preempt_check_resched_rt() preempt_check_resched()
  8102. +#else
  8103. +# define preempt_enable_no_resched() preempt_enable()
  8104. +# define preempt_check_resched_rt() barrier();
  8105. +#endif
  8106. #ifdef CONFIG_PREEMPT
  8107. #define preempt_enable() \
  8108. @@ -63,6 +89,13 @@
  8109. __preempt_schedule(); \
  8110. } while (0)
  8111. +#define preempt_lazy_enable() \
  8112. +do { \
  8113. + dec_preempt_lazy_count(); \
  8114. + barrier(); \
  8115. + preempt_check_resched(); \
  8116. +} while (0)
  8117. +
  8118. #else
  8119. #define preempt_enable() \
  8120. do { \
  8121. @@ -121,6 +154,7 @@
  8122. #define preempt_disable_notrace() barrier()
  8123. #define preempt_enable_no_resched_notrace() barrier()
  8124. #define preempt_enable_notrace() barrier()
  8125. +#define preempt_check_resched_rt() barrier()
  8126. #endif /* CONFIG_PREEMPT_COUNT */
  8127. @@ -140,10 +174,31 @@
  8128. } while (0)
  8129. #define preempt_fold_need_resched() \
  8130. do { \
  8131. - if (tif_need_resched()) \
  8132. + if (tif_need_resched_now()) \
  8133. set_preempt_need_resched(); \
  8134. } while (0)
  8135. +#ifdef CONFIG_PREEMPT_RT_FULL
  8136. +# define preempt_disable_rt() preempt_disable()
  8137. +# define preempt_enable_rt() preempt_enable()
  8138. +# define preempt_disable_nort() barrier()
  8139. +# define preempt_enable_nort() barrier()
  8140. +# ifdef CONFIG_SMP
  8141. + extern void migrate_disable(void);
  8142. + extern void migrate_enable(void);
  8143. +# else /* CONFIG_SMP */
  8144. +# define migrate_disable() barrier()
  8145. +# define migrate_enable() barrier()
  8146. +# endif /* CONFIG_SMP */
  8147. +#else
  8148. +# define preempt_disable_rt() barrier()
  8149. +# define preempt_enable_rt() barrier()
  8150. +# define preempt_disable_nort() preempt_disable()
  8151. +# define preempt_enable_nort() preempt_enable()
  8152. +# define migrate_disable() preempt_disable()
  8153. +# define migrate_enable() preempt_enable()
  8154. +#endif
  8155. +
  8156. #ifdef CONFIG_PREEMPT_NOTIFIERS
  8157. struct preempt_notifier;
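
migrate_disable()/migrate_enable() are the RT replacement for preempt_disable() when code only needs to stay on one CPU but may still sleep; on !RT they map straight back to preempt_disable()/preempt_enable(). A sketch, with my_cpu_stats[] as an assumed per-CPU style array:

	#include <linux/preempt.h>
	#include <linux/smp.h>
	#include <linux/threads.h>

	static unsigned long my_cpu_stats[NR_CPUS];	/* illustrative only */

	static void count_event(void)
	{
		migrate_disable();			/* stay on this CPU; may still sleep on RT */
		my_cpu_stats[smp_processor_id()]++;
		migrate_enable();
	}
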
  8158. diff -Nur linux-3.18.12.orig/include/linux/preempt_mask.h linux-3.18.12/include/linux/preempt_mask.h
  8159. --- linux-3.18.12.orig/include/linux/preempt_mask.h 2015-04-20 14:48:02.000000000 -0500
  8160. +++ linux-3.18.12/include/linux/preempt_mask.h 2015-04-26 13:32:22.419684003 -0500
  8161. @@ -44,16 +44,26 @@
  8162. #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
  8163. #define NMI_OFFSET (1UL << NMI_SHIFT)
  8164. -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
  8165. +#ifndef CONFIG_PREEMPT_RT_FULL
  8166. +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
  8167. +#else
  8168. +# define SOFTIRQ_DISABLE_OFFSET (0)
  8169. +#endif
  8170. #define PREEMPT_ACTIVE_BITS 1
  8171. #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
  8172. #define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
  8173. #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
  8174. -#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
  8175. #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
  8176. | NMI_MASK))
  8177. +#ifndef CONFIG_PREEMPT_RT_FULL
  8178. +# define softirq_count() (preempt_count() & SOFTIRQ_MASK)
  8179. +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
  8180. +#else
  8181. +# define softirq_count() (0UL)
  8182. +extern int in_serving_softirq(void);
  8183. +#endif
  8184. /*
  8185. * Are we doing bottom half or hardware interrupt processing?
  8186. @@ -64,7 +74,6 @@
  8187. #define in_irq() (hardirq_count())
  8188. #define in_softirq() (softirq_count())
  8189. #define in_interrupt() (irq_count())
  8190. -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
  8191. /*
  8192. * Are we in NMI context?
  8193. diff -Nur linux-3.18.12.orig/include/linux/printk.h linux-3.18.12/include/linux/printk.h
  8194. --- linux-3.18.12.orig/include/linux/printk.h 2015-04-20 14:48:02.000000000 -0500
  8195. +++ linux-3.18.12/include/linux/printk.h 2015-04-26 13:32:22.419684003 -0500
  8196. @@ -119,9 +119,11 @@
  8197. extern asmlinkage __printf(1, 2)
  8198. void early_printk(const char *fmt, ...);
  8199. void early_vprintk(const char *fmt, va_list ap);
  8200. +extern void printk_kill(void);
  8201. #else
  8202. static inline __printf(1, 2) __cold
  8203. void early_printk(const char *s, ...) { }
  8204. +static inline void printk_kill(void) { }
  8205. #endif
  8206. #ifdef CONFIG_PRINTK
  8207. @@ -155,7 +157,6 @@
  8208. #define printk_ratelimit() __printk_ratelimit(__func__)
  8209. extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
  8210. unsigned int interval_msec);
  8211. -
  8212. extern int printk_delay_msec;
  8213. extern int dmesg_restrict;
  8214. extern int kptr_restrict;
  8215. diff -Nur linux-3.18.12.orig/include/linux/radix-tree.h linux-3.18.12/include/linux/radix-tree.h
  8216. --- linux-3.18.12.orig/include/linux/radix-tree.h 2015-04-20 14:48:02.000000000 -0500
  8217. +++ linux-3.18.12/include/linux/radix-tree.h 2015-04-26 13:32:22.419684003 -0500
  8218. @@ -277,8 +277,13 @@
  8219. unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
  8220. void ***results, unsigned long *indices,
  8221. unsigned long first_index, unsigned int max_items);
  8222. +#ifndef CONFIG_PREEMPT_RT_FULL
  8223. int radix_tree_preload(gfp_t gfp_mask);
  8224. int radix_tree_maybe_preload(gfp_t gfp_mask);
  8225. +#else
  8226. +static inline int radix_tree_preload(gfp_t gm) { return 0; }
  8227. +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
  8228. +#endif
  8229. void radix_tree_init(void);
  8230. void *radix_tree_tag_set(struct radix_tree_root *root,
  8231. unsigned long index, unsigned int tag);
  8232. @@ -303,7 +308,7 @@
  8233. static inline void radix_tree_preload_end(void)
  8234. {
  8235. - preempt_enable();
  8236. + preempt_enable_nort();
  8237. }
  8238. /**
  8239. diff -Nur linux-3.18.12.orig/include/linux/random.h linux-3.18.12/include/linux/random.h
  8240. --- linux-3.18.12.orig/include/linux/random.h 2015-04-20 14:48:02.000000000 -0500
  8241. +++ linux-3.18.12/include/linux/random.h 2015-04-26 13:32:22.423684003 -0500
  8242. @@ -11,7 +11,7 @@
  8243. extern void add_device_randomness(const void *, unsigned int);
  8244. extern void add_input_randomness(unsigned int type, unsigned int code,
  8245. unsigned int value);
  8246. -extern void add_interrupt_randomness(int irq, int irq_flags);
  8247. +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
  8248. extern void get_random_bytes(void *buf, int nbytes);
  8249. extern void get_random_bytes_arch(void *buf, int nbytes);
  8250. diff -Nur linux-3.18.12.orig/include/linux/rcupdate.h linux-3.18.12/include/linux/rcupdate.h
  8251. --- linux-3.18.12.orig/include/linux/rcupdate.h 2015-04-20 14:48:02.000000000 -0500
  8252. +++ linux-3.18.12/include/linux/rcupdate.h 2015-04-26 13:32:22.423684003 -0500
  8253. @@ -147,6 +147,9 @@
  8254. #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
  8255. +#ifdef CONFIG_PREEMPT_RT_FULL
  8256. +#define call_rcu_bh call_rcu
  8257. +#else
  8258. /**
  8259. * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
  8260. * @head: structure to be used for queueing the RCU updates.
  8261. @@ -170,6 +173,7 @@
  8262. */
  8263. void call_rcu_bh(struct rcu_head *head,
  8264. void (*func)(struct rcu_head *head));
  8265. +#endif
  8266. /**
  8267. * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
  8268. @@ -231,6 +235,11 @@
  8269. * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
  8270. */
  8271. #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
  8272. +#ifndef CONFIG_PREEMPT_RT_FULL
  8273. +#define sched_rcu_preempt_depth() rcu_preempt_depth()
  8274. +#else
  8275. +static inline int sched_rcu_preempt_depth(void) { return 0; }
  8276. +#endif
  8277. #else /* #ifdef CONFIG_PREEMPT_RCU */
  8278. @@ -254,6 +263,8 @@
  8279. return 0;
  8280. }
  8281. +#define sched_rcu_preempt_depth() rcu_preempt_depth()
  8282. +
  8283. #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
  8284. /* Internal to kernel */
  8285. @@ -430,7 +441,14 @@
  8286. int debug_lockdep_rcu_enabled(void);
  8287. int rcu_read_lock_held(void);
  8288. +#ifdef CONFIG_PREEMPT_RT_FULL
  8289. +static inline int rcu_read_lock_bh_held(void)
  8290. +{
  8291. + return rcu_read_lock_held();
  8292. +}
  8293. +#else
  8294. int rcu_read_lock_bh_held(void);
  8295. +#endif
  8296. /**
  8297. * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
  8298. @@ -955,10 +973,14 @@
  8299. static inline void rcu_read_lock_bh(void)
  8300. {
  8301. local_bh_disable();
  8302. +#ifdef CONFIG_PREEMPT_RT_FULL
  8303. + rcu_read_lock();
  8304. +#else
  8305. __acquire(RCU_BH);
  8306. rcu_lock_acquire(&rcu_bh_lock_map);
  8307. rcu_lockdep_assert(rcu_is_watching(),
  8308. "rcu_read_lock_bh() used illegally while idle");
  8309. +#endif
  8310. }
  8311. /*
  8312. @@ -968,10 +990,14 @@
  8313. */
  8314. static inline void rcu_read_unlock_bh(void)
  8315. {
  8316. +#ifdef CONFIG_PREEMPT_RT_FULL
  8317. + rcu_read_unlock();
  8318. +#else
  8319. rcu_lockdep_assert(rcu_is_watching(),
  8320. "rcu_read_unlock_bh() used illegally while idle");
  8321. rcu_lock_release(&rcu_bh_lock_map);
  8322. __release(RCU_BH);
  8323. +#endif
  8324. local_bh_enable();
  8325. }
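
[Editor's note — not part of the patch] A minimal sketch of a BH-flavoured RCU user, using hypothetical demo_* names, to show why the aliasing in the rcupdate.h hunks above keeps such code source-compatible: on PREEMPT_RT_FULL, call_rcu_bh() is plain call_rcu(), and rcu_read_lock_bh() nests an ordinary rcu_read_lock() inside local_bh_disable().

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_entry {
        int value;
        struct rcu_head rcu;
};

static struct demo_entry __rcu *demo_ptr;

static void demo_free_cb(struct rcu_head *head)
{
        kfree(container_of(head, struct demo_entry, rcu));
}

/* Writer: publish a new entry, free the old one after a BH grace period. */
static void demo_replace(struct demo_entry *new_entry)
{
        struct demo_entry *old;

        old = rcu_dereference_protected(demo_ptr, 1);
        rcu_assign_pointer(demo_ptr, new_entry);
        if (old)
                call_rcu_bh(&old->rcu, demo_free_cb); /* == call_rcu() on RT */
}

/* Reader: rcu_read_lock_bh() becomes local_bh_disable() + rcu_read_lock() on RT. */
static int demo_read(void)
{
        struct demo_entry *e;
        int val = -1;

        rcu_read_lock_bh();
        e = rcu_dereference_bh(demo_ptr);
        if (e)
                val = e->value;
        rcu_read_unlock_bh();
        return val;
}
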
  8326. diff -Nur linux-3.18.12.orig/include/linux/rcutree.h linux-3.18.12/include/linux/rcutree.h
  8327. --- linux-3.18.12.orig/include/linux/rcutree.h 2015-04-20 14:48:02.000000000 -0500
  8328. +++ linux-3.18.12/include/linux/rcutree.h 2015-04-26 13:32:22.423684003 -0500
  8329. @@ -46,7 +46,11 @@
  8330. rcu_note_context_switch(cpu);
  8331. }
  8332. +#ifdef CONFIG_PREEMPT_RT_FULL
  8333. +# define synchronize_rcu_bh synchronize_rcu
  8334. +#else
  8335. void synchronize_rcu_bh(void);
  8336. +#endif
  8337. void synchronize_sched_expedited(void);
  8338. void synchronize_rcu_expedited(void);
  8339. @@ -74,7 +78,11 @@
  8340. }
  8341. void rcu_barrier(void);
  8342. +#ifdef CONFIG_PREEMPT_RT_FULL
  8343. +# define rcu_barrier_bh rcu_barrier
  8344. +#else
  8345. void rcu_barrier_bh(void);
  8346. +#endif
  8347. void rcu_barrier_sched(void);
  8348. unsigned long get_state_synchronize_rcu(void);
  8349. void cond_synchronize_rcu(unsigned long oldstate);
  8350. @@ -82,12 +90,10 @@
  8351. extern unsigned long rcutorture_testseq;
  8352. extern unsigned long rcutorture_vernum;
  8353. long rcu_batches_completed(void);
  8354. -long rcu_batches_completed_bh(void);
  8355. long rcu_batches_completed_sched(void);
  8356. void show_rcu_gp_kthreads(void);
  8357. void rcu_force_quiescent_state(void);
  8358. -void rcu_bh_force_quiescent_state(void);
  8359. void rcu_sched_force_quiescent_state(void);
  8360. void exit_rcu(void);
  8361. @@ -97,4 +103,12 @@
  8362. bool rcu_is_watching(void);
  8363. +#ifndef CONFIG_PREEMPT_RT_FULL
  8364. +void rcu_bh_force_quiescent_state(void);
  8365. +long rcu_batches_completed_bh(void);
  8366. +#else
  8367. +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state
  8368. +# define rcu_batches_completed_bh rcu_batches_completed
  8369. +#endif
  8370. +
  8371. #endif /* __LINUX_RCUTREE_H */
  8372. diff -Nur linux-3.18.12.orig/include/linux/rtmutex.h linux-3.18.12/include/linux/rtmutex.h
  8373. --- linux-3.18.12.orig/include/linux/rtmutex.h 2015-04-20 14:48:02.000000000 -0500
  8374. +++ linux-3.18.12/include/linux/rtmutex.h 2015-04-26 13:32:22.423684003 -0500
  8375. @@ -14,10 +14,14 @@
  8376. #include <linux/linkage.h>
  8377. #include <linux/rbtree.h>
  8378. -#include <linux/spinlock_types.h>
  8379. +#include <linux/spinlock_types_raw.h>
  8380. extern int max_lock_depth; /* for sysctl */
  8381. +#ifdef CONFIG_DEBUG_MUTEXES
  8382. +#include <linux/debug_locks.h>
  8383. +#endif
  8384. +
  8385. /**
  8386. * The rt_mutex structure
  8387. *
  8388. @@ -31,8 +35,8 @@
  8389. struct rb_root waiters;
  8390. struct rb_node *waiters_leftmost;
  8391. struct task_struct *owner;
  8392. -#ifdef CONFIG_DEBUG_RT_MUTEXES
  8393. int save_state;
  8394. +#ifdef CONFIG_DEBUG_RT_MUTEXES
  8395. const char *name, *file;
  8396. int line;
  8397. void *magic;
  8398. @@ -55,22 +59,33 @@
  8399. # define rt_mutex_debug_check_no_locks_held(task) do { } while (0)
  8400. #endif
  8401. +# define rt_mutex_init(mutex) \
  8402. + do { \
  8403. + raw_spin_lock_init(&(mutex)->wait_lock); \
  8404. + __rt_mutex_init(mutex, #mutex); \
  8405. + } while (0)
  8406. +
  8407. #ifdef CONFIG_DEBUG_RT_MUTEXES
  8408. # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
  8409. , .name = #mutexname, .file = __FILE__, .line = __LINE__
  8410. -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__)
  8411. extern void rt_mutex_debug_task_free(struct task_struct *tsk);
  8412. #else
  8413. # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
  8414. -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL)
  8415. # define rt_mutex_debug_task_free(t) do { } while (0)
  8416. #endif
  8417. -#define __RT_MUTEX_INITIALIZER(mutexname) \
  8418. - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
  8419. +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
  8420. + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
  8421. , .waiters = RB_ROOT \
  8422. , .owner = NULL \
  8423. - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
  8424. + __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
  8425. +
  8426. +#define __RT_MUTEX_INITIALIZER(mutexname) \
  8427. + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
  8428. +
  8429. +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
  8430. + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
  8431. + , .save_state = 1 }
  8432. #define DEFINE_RT_MUTEX(mutexname) \
  8433. struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
  8434. @@ -91,6 +106,7 @@
  8435. extern void rt_mutex_lock(struct rt_mutex *lock);
  8436. extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
  8437. +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
  8438. extern int rt_mutex_timed_lock(struct rt_mutex *lock,
  8439. struct hrtimer_sleeper *timeout);
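
[Editor's note — not part of the patch] A short sketch, with hypothetical demo_* names, of the rt_mutex_lock_killable() entry point declared above: like mutex_lock_killable(), it sleeps with priority inheritance and backs out if the caller receives a fatal signal.

#include <linux/rtmutex.h>
#include <linux/errno.h>

static DEFINE_RT_MUTEX(demo_rtm);

static int demo_do_work(void)
{
        /* Returns non-zero if the task is killed while waiting for the lock. */
        if (rt_mutex_lock_killable(&demo_rtm))
                return -EINTR;

        /* ... critical section, PI-boosted by higher-priority waiters ... */

        rt_mutex_unlock(&demo_rtm);
        return 0;
}
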
  8440. diff -Nur linux-3.18.12.orig/include/linux/rwlock_rt.h linux-3.18.12/include/linux/rwlock_rt.h
  8441. --- linux-3.18.12.orig/include/linux/rwlock_rt.h 1969-12-31 18:00:00.000000000 -0600
  8442. +++ linux-3.18.12/include/linux/rwlock_rt.h 2015-04-26 13:32:22.423684003 -0500
  8443. @@ -0,0 +1,99 @@
  8444. +#ifndef __LINUX_RWLOCK_RT_H
  8445. +#define __LINUX_RWLOCK_RT_H
  8446. +
  8447. +#ifndef __LINUX_SPINLOCK_H
  8448. +#error Do not include directly. Use spinlock.h
  8449. +#endif
  8450. +
  8451. +#define rwlock_init(rwl) \
  8452. +do { \
  8453. + static struct lock_class_key __key; \
  8454. + \
  8455. + rt_mutex_init(&(rwl)->lock); \
  8456. + __rt_rwlock_init(rwl, #rwl, &__key); \
  8457. +} while (0)
  8458. +
  8459. +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
  8460. +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
  8461. +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
  8462. +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
  8463. +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
  8464. +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
  8465. +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
  8466. +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
  8467. +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
  8468. +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
  8469. +
  8470. +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock))
  8471. +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock))
  8472. +
  8473. +#define write_trylock_irqsave(lock, flags) \
  8474. + __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
  8475. +
  8476. +#define read_lock_irqsave(lock, flags) \
  8477. + do { \
  8478. + typecheck(unsigned long, flags); \
  8479. + flags = rt_read_lock_irqsave(lock); \
  8480. + } while (0)
  8481. +
  8482. +#define write_lock_irqsave(lock, flags) \
  8483. + do { \
  8484. + typecheck(unsigned long, flags); \
  8485. + flags = rt_write_lock_irqsave(lock); \
  8486. + } while (0)
  8487. +
  8488. +#define read_lock(lock) rt_read_lock(lock)
  8489. +
  8490. +#define read_lock_bh(lock) \
  8491. + do { \
  8492. + local_bh_disable(); \
  8493. + rt_read_lock(lock); \
  8494. + } while (0)
  8495. +
  8496. +#define read_lock_irq(lock) read_lock(lock)
  8497. +
  8498. +#define write_lock(lock) rt_write_lock(lock)
  8499. +
  8500. +#define write_lock_bh(lock) \
  8501. + do { \
  8502. + local_bh_disable(); \
  8503. + rt_write_lock(lock); \
  8504. + } while (0)
  8505. +
  8506. +#define write_lock_irq(lock) write_lock(lock)
  8507. +
  8508. +#define read_unlock(lock) rt_read_unlock(lock)
  8509. +
  8510. +#define read_unlock_bh(lock) \
  8511. + do { \
  8512. + rt_read_unlock(lock); \
  8513. + local_bh_enable(); \
  8514. + } while (0)
  8515. +
  8516. +#define read_unlock_irq(lock) read_unlock(lock)
  8517. +
  8518. +#define write_unlock(lock) rt_write_unlock(lock)
  8519. +
  8520. +#define write_unlock_bh(lock) \
  8521. + do { \
  8522. + rt_write_unlock(lock); \
  8523. + local_bh_enable(); \
  8524. + } while (0)
  8525. +
  8526. +#define write_unlock_irq(lock) write_unlock(lock)
  8527. +
  8528. +#define read_unlock_irqrestore(lock, flags) \
  8529. + do { \
  8530. + typecheck(unsigned long, flags); \
  8531. + (void) flags; \
  8532. + rt_read_unlock(lock); \
  8533. + } while (0)
  8534. +
  8535. +#define write_unlock_irqrestore(lock, flags) \
  8536. + do { \
  8537. + typecheck(unsigned long, flags); \
  8538. + (void) flags; \
  8539. + rt_write_unlock(lock); \
  8540. + } while (0)
  8541. +
  8542. +#endif
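
[Editor's note — not part of the patch] A minimal sketch (hypothetical demo_* names) showing that rwlock callers keep the stock API. On PREEMPT_RT_FULL the _irqsave variants defined above accept the flags argument only for source compatibility; the lock is a sleeping rt_mutex and hard interrupts stay enabled.

#include <linux/spinlock.h>     /* pulls in rwlock_rt.h on PREEMPT_RT_FULL */

static DEFINE_RWLOCK(demo_rwlock);
static int demo_value;

static int demo_get(void)
{
        unsigned long flags;
        int v;

        read_lock_irqsave(&demo_rwlock, flags);
        v = demo_value;
        read_unlock_irqrestore(&demo_rwlock, flags);
        return v;
}

static void demo_set(int v)
{
        write_lock(&demo_rwlock);
        demo_value = v;
        write_unlock(&demo_rwlock);
}
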
  8543. diff -Nur linux-3.18.12.orig/include/linux/rwlock_types.h linux-3.18.12/include/linux/rwlock_types.h
  8544. --- linux-3.18.12.orig/include/linux/rwlock_types.h 2015-04-20 14:48:02.000000000 -0500
  8545. +++ linux-3.18.12/include/linux/rwlock_types.h 2015-04-26 13:32:22.423684003 -0500
  8546. @@ -1,6 +1,10 @@
  8547. #ifndef __LINUX_RWLOCK_TYPES_H
  8548. #define __LINUX_RWLOCK_TYPES_H
  8549. +#if !defined(__LINUX_SPINLOCK_TYPES_H)
  8550. +# error "Do not include directly, include spinlock_types.h"
  8551. +#endif
  8552. +
  8553. /*
  8554. * include/linux/rwlock_types.h - generic rwlock type definitions
  8555. * and initializers
  8556. @@ -43,6 +47,7 @@
  8557. RW_DEP_MAP_INIT(lockname) }
  8558. #endif
  8559. -#define DEFINE_RWLOCK(x) rwlock_t x = __RW_LOCK_UNLOCKED(x)
  8560. +#define DEFINE_RWLOCK(name) \
  8561. + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
  8562. #endif /* __LINUX_RWLOCK_TYPES_H */
  8563. diff -Nur linux-3.18.12.orig/include/linux/rwlock_types_rt.h linux-3.18.12/include/linux/rwlock_types_rt.h
  8564. --- linux-3.18.12.orig/include/linux/rwlock_types_rt.h 1969-12-31 18:00:00.000000000 -0600
  8565. +++ linux-3.18.12/include/linux/rwlock_types_rt.h 2015-04-26 13:32:22.423684003 -0500
  8566. @@ -0,0 +1,33 @@
  8567. +#ifndef __LINUX_RWLOCK_TYPES_RT_H
  8568. +#define __LINUX_RWLOCK_TYPES_RT_H
  8569. +
  8570. +#ifndef __LINUX_SPINLOCK_TYPES_H
  8571. +#error "Do not include directly. Include spinlock_types.h instead"
  8572. +#endif
  8573. +
  8574. +/*
  8575. + * rwlocks - rtmutex which allows single reader recursion
  8576. + */
  8577. +typedef struct {
  8578. + struct rt_mutex lock;
  8579. + int read_depth;
  8580. + unsigned int break_lock;
  8581. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  8582. + struct lockdep_map dep_map;
  8583. +#endif
  8584. +} rwlock_t;
  8585. +
  8586. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  8587. +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
  8588. +#else
  8589. +# define RW_DEP_MAP_INIT(lockname)
  8590. +#endif
  8591. +
  8592. +#define __RW_LOCK_UNLOCKED(name) \
  8593. + { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
  8594. + RW_DEP_MAP_INIT(name) }
  8595. +
  8596. +#define DEFINE_RWLOCK(name) \
  8597. + rwlock_t name __cacheline_aligned_in_smp = __RW_LOCK_UNLOCKED(name)
  8598. +
  8599. +#endif
  8600. diff -Nur linux-3.18.12.orig/include/linux/rwsem.h linux-3.18.12/include/linux/rwsem.h
  8601. --- linux-3.18.12.orig/include/linux/rwsem.h 2015-04-20 14:48:02.000000000 -0500
  8602. +++ linux-3.18.12/include/linux/rwsem.h 2015-04-26 13:32:22.423684003 -0500
  8603. @@ -18,6 +18,10 @@
  8604. #include <linux/osq_lock.h>
  8605. #endif
  8606. +#ifdef CONFIG_PREEMPT_RT_FULL
  8607. +#include <linux/rwsem_rt.h>
  8608. +#else /* PREEMPT_RT_FULL */
  8609. +
  8610. struct rw_semaphore;
  8611. #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
  8612. @@ -177,4 +181,6 @@
  8613. # define up_read_non_owner(sem) up_read(sem)
  8614. #endif
  8615. +#endif /* !PREEMPT_RT_FULL */
  8616. +
  8617. #endif /* _LINUX_RWSEM_H */
  8618. diff -Nur linux-3.18.12.orig/include/linux/rwsem_rt.h linux-3.18.12/include/linux/rwsem_rt.h
  8619. --- linux-3.18.12.orig/include/linux/rwsem_rt.h 1969-12-31 18:00:00.000000000 -0600
  8620. +++ linux-3.18.12/include/linux/rwsem_rt.h 2015-04-26 13:32:22.423684003 -0500
  8621. @@ -0,0 +1,134 @@
  8622. +#ifndef _LINUX_RWSEM_RT_H
  8623. +#define _LINUX_RWSEM_RT_H
  8624. +
  8625. +#ifndef _LINUX_RWSEM_H
  8626. +#error "Include rwsem.h"
  8627. +#endif
  8628. +
  8629. +/*
  8630. + * RW-semaphores are a spinlock plus a reader-depth count.
  8631. + *
  8632. + * Note that the semantics are different from the usual
8633. + * Linux rw-sems: in PREEMPT_RT mode we do not allow
8634. + * multiple readers to hold the lock at once; we only allow
  8635. + * a read-lock owner to read-lock recursively. This is
  8636. + * better for latency, makes the implementation inherently
  8637. + * fair and makes it simpler as well.
  8638. + */
  8639. +
  8640. +#include <linux/rtmutex.h>
  8641. +
  8642. +struct rw_semaphore {
  8643. + struct rt_mutex lock;
  8644. + int read_depth;
  8645. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  8646. + struct lockdep_map dep_map;
  8647. +#endif
  8648. +};
  8649. +
  8650. +#define __RWSEM_INITIALIZER(name) \
  8651. + { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
  8652. + RW_DEP_MAP_INIT(name) }
  8653. +
  8654. +#define DECLARE_RWSEM(lockname) \
  8655. + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
  8656. +
  8657. +extern void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
  8658. + struct lock_class_key *key);
  8659. +
  8660. +#define __rt_init_rwsem(sem, name, key) \
  8661. + do { \
  8662. + rt_mutex_init(&(sem)->lock); \
  8663. + __rt_rwsem_init((sem), (name), (key));\
  8664. + } while (0)
  8665. +
  8666. +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
  8667. +
  8668. +# define rt_init_rwsem(sem) \
  8669. +do { \
  8670. + static struct lock_class_key __key; \
  8671. + \
  8672. + __rt_init_rwsem((sem), #sem, &__key); \
  8673. +} while (0)
  8674. +
  8675. +extern void rt_down_write(struct rw_semaphore *rwsem);
  8676. +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
  8677. +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
  8678. +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
  8679. + struct lockdep_map *nest);
  8680. +extern void rt_down_read(struct rw_semaphore *rwsem);
  8681. +extern int rt_down_write_trylock(struct rw_semaphore *rwsem);
  8682. +extern int rt_down_read_trylock(struct rw_semaphore *rwsem);
  8683. +extern void rt_up_read(struct rw_semaphore *rwsem);
  8684. +extern void rt_up_write(struct rw_semaphore *rwsem);
  8685. +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
  8686. +
  8687. +#define init_rwsem(sem) rt_init_rwsem(sem)
  8688. +#define rwsem_is_locked(s) rt_mutex_is_locked(&(s)->lock)
  8689. +
  8690. +static inline int rwsem_is_contended(struct rw_semaphore *sem)
  8691. +{
  8692. + /* rt_mutex_has_waiters() */
  8693. + return !RB_EMPTY_ROOT(&sem->lock.waiters);
  8694. +}
  8695. +
  8696. +static inline void down_read(struct rw_semaphore *sem)
  8697. +{
  8698. + rt_down_read(sem);
  8699. +}
  8700. +
  8701. +static inline int down_read_trylock(struct rw_semaphore *sem)
  8702. +{
  8703. + return rt_down_read_trylock(sem);
  8704. +}
  8705. +
  8706. +static inline void down_write(struct rw_semaphore *sem)
  8707. +{
  8708. + rt_down_write(sem);
  8709. +}
  8710. +
  8711. +static inline int down_write_trylock(struct rw_semaphore *sem)
  8712. +{
  8713. + return rt_down_write_trylock(sem);
  8714. +}
  8715. +
  8716. +static inline void up_read(struct rw_semaphore *sem)
  8717. +{
  8718. + rt_up_read(sem);
  8719. +}
  8720. +
  8721. +static inline void up_write(struct rw_semaphore *sem)
  8722. +{
  8723. + rt_up_write(sem);
  8724. +}
  8725. +
  8726. +static inline void downgrade_write(struct rw_semaphore *sem)
  8727. +{
  8728. + rt_downgrade_write(sem);
  8729. +}
  8730. +
  8731. +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
  8732. +{
  8733. + return rt_down_read_nested(sem, subclass);
  8734. +}
  8735. +
  8736. +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
  8737. +{
  8738. + rt_down_write_nested(sem, subclass);
  8739. +}
  8740. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  8741. +static inline void down_write_nest_lock(struct rw_semaphore *sem,
  8742. + struct rw_semaphore *nest_lock)
  8743. +{
  8744. + rt_down_write_nested_lock(sem, &nest_lock->dep_map);
  8745. +}
  8746. +
  8747. +#else
  8748. +
  8749. +static inline void down_write_nest_lock(struct rw_semaphore *sem,
  8750. + struct rw_semaphore *nest_lock)
  8751. +{
  8752. + rt_down_write_nested_lock(sem, NULL);
  8753. +}
  8754. +#endif
  8755. +#endif
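
[Editor's note — not part of the patch] An rw-semaphore user, sketched with hypothetical demo_* names, stays unchanged on RT; what changes is the behaviour described in the comment above (a single, recursion-capable reader instead of concurrent readers), traded for priority inheritance and bounded latency.

#include <linux/rwsem.h>

static DECLARE_RWSEM(demo_sem);
static int demo_state;

static int demo_read_state(void)
{
        int v;

        down_read(&demo_sem);   /* only one reader at a time on RT */
        v = demo_state;
        up_read(&demo_sem);
        return v;
}

static void demo_write_state(int v)
{
        down_write(&demo_sem);
        demo_state = v;
        up_write(&demo_sem);
}
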
  8756. diff -Nur linux-3.18.12.orig/include/linux/sched.h linux-3.18.12/include/linux/sched.h
  8757. --- linux-3.18.12.orig/include/linux/sched.h 2015-04-20 14:48:02.000000000 -0500
  8758. +++ linux-3.18.12/include/linux/sched.h 2015-04-26 13:32:22.423684003 -0500
  8759. @@ -26,6 +26,7 @@
  8760. #include <linux/nodemask.h>
  8761. #include <linux/mm_types.h>
  8762. #include <linux/preempt_mask.h>
  8763. +#include <asm/kmap_types.h>
  8764. #include <asm/page.h>
  8765. #include <asm/ptrace.h>
  8766. @@ -56,6 +57,7 @@
  8767. #include <linux/cred.h>
  8768. #include <linux/llist.h>
  8769. #include <linux/uidgid.h>
  8770. +#include <linux/hardirq.h>
  8771. #include <linux/gfp.h>
  8772. #include <linux/magic.h>
  8773. @@ -235,10 +237,7 @@
  8774. TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
  8775. __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
  8776. -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
  8777. #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
  8778. -#define task_is_stopped_or_traced(task) \
  8779. - ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  8780. #define task_contributes_to_load(task) \
  8781. ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
  8782. (task->flags & PF_FROZEN) == 0)
  8783. @@ -1234,6 +1233,7 @@
  8784. struct task_struct {
  8785. volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
  8786. + volatile long saved_state; /* saved state for "spinlock sleepers" */
  8787. void *stack;
  8788. atomic_t usage;
  8789. unsigned int flags; /* per process flags, defined below */
  8790. @@ -1270,6 +1270,12 @@
  8791. #endif
  8792. unsigned int policy;
  8793. +#ifdef CONFIG_PREEMPT_RT_FULL
  8794. + int migrate_disable;
  8795. +# ifdef CONFIG_SCHED_DEBUG
  8796. + int migrate_disable_atomic;
  8797. +# endif
  8798. +#endif
  8799. int nr_cpus_allowed;
  8800. cpumask_t cpus_allowed;
  8801. @@ -1371,7 +1377,8 @@
  8802. struct cputime prev_cputime;
  8803. #endif
  8804. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  8805. - seqlock_t vtime_seqlock;
  8806. + raw_spinlock_t vtime_lock;
  8807. + seqcount_t vtime_seq;
  8808. unsigned long long vtime_snap;
  8809. enum {
  8810. VTIME_SLEEPING = 0,
  8811. @@ -1387,6 +1394,9 @@
  8812. struct task_cputime cputime_expires;
  8813. struct list_head cpu_timers[3];
  8814. +#ifdef CONFIG_PREEMPT_RT_BASE
  8815. + struct task_struct *posix_timer_list;
  8816. +#endif
  8817. /* process credentials */
  8818. const struct cred __rcu *real_cred; /* objective and real subjective task
  8819. @@ -1419,10 +1429,15 @@
  8820. /* signal handlers */
  8821. struct signal_struct *signal;
  8822. struct sighand_struct *sighand;
  8823. + struct sigqueue *sigqueue_cache;
  8824. sigset_t blocked, real_blocked;
  8825. sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
  8826. struct sigpending pending;
  8827. +#ifdef CONFIG_PREEMPT_RT_FULL
  8828. + /* TODO: move me into ->restart_block ? */
  8829. + struct siginfo forced_info;
  8830. +#endif
  8831. unsigned long sas_ss_sp;
  8832. size_t sas_ss_size;
  8833. @@ -1460,6 +1475,9 @@
  8834. /* mutex deadlock detection */
  8835. struct mutex_waiter *blocked_on;
  8836. #endif
  8837. +#ifdef CONFIG_PREEMPT_RT_FULL
  8838. + int pagefault_disabled;
  8839. +#endif
  8840. #ifdef CONFIG_TRACE_IRQFLAGS
  8841. unsigned int irq_events;
  8842. unsigned long hardirq_enable_ip;
  8843. @@ -1644,6 +1662,12 @@
  8844. unsigned long trace;
  8845. /* bitmask and counter of trace recursion */
  8846. unsigned long trace_recursion;
  8847. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  8848. + u64 preempt_timestamp_hist;
  8849. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  8850. + long timer_offset;
  8851. +#endif
  8852. +#endif
  8853. #endif /* CONFIG_TRACING */
  8854. #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
  8855. unsigned int memcg_kmem_skip_account;
  8856. @@ -1661,11 +1685,19 @@
  8857. unsigned int sequential_io;
  8858. unsigned int sequential_io_avg;
  8859. #endif
  8860. +#ifdef CONFIG_PREEMPT_RT_BASE
  8861. + struct rcu_head put_rcu;
  8862. + int softirq_nestcnt;
  8863. + unsigned int softirqs_raised;
  8864. +#endif
  8865. +#ifdef CONFIG_PREEMPT_RT_FULL
  8866. +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
  8867. + int kmap_idx;
  8868. + pte_t kmap_pte[KM_TYPE_NR];
  8869. +# endif
  8870. +#endif
  8871. };
  8872. -/* Future-safe accessor for struct task_struct's cpus_allowed. */
  8873. -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
  8874. -
  8875. #define TNF_MIGRATED 0x01
  8876. #define TNF_NO_GROUP 0x02
  8877. #define TNF_SHARED 0x04
  8878. @@ -1700,6 +1732,17 @@
  8879. }
  8880. #endif
  8881. +#ifdef CONFIG_PREEMPT_RT_FULL
  8882. +static inline bool cur_pf_disabled(void) { return current->pagefault_disabled; }
  8883. +#else
  8884. +static inline bool cur_pf_disabled(void) { return false; }
  8885. +#endif
  8886. +
  8887. +static inline bool pagefault_disabled(void)
  8888. +{
  8889. + return in_atomic() || cur_pf_disabled();
  8890. +}
  8891. +
  8892. static inline struct pid *task_pid(struct task_struct *task)
  8893. {
  8894. return task->pids[PIDTYPE_PID].pid;
  8895. @@ -1853,6 +1896,15 @@
  8896. extern void free_task(struct task_struct *tsk);
  8897. #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
  8898. +#ifdef CONFIG_PREEMPT_RT_BASE
  8899. +extern void __put_task_struct_cb(struct rcu_head *rhp);
  8900. +
  8901. +static inline void put_task_struct(struct task_struct *t)
  8902. +{
  8903. + if (atomic_dec_and_test(&t->usage))
  8904. + call_rcu(&t->put_rcu, __put_task_struct_cb);
  8905. +}
  8906. +#else
  8907. extern void __put_task_struct(struct task_struct *t);
  8908. static inline void put_task_struct(struct task_struct *t)
  8909. @@ -1860,6 +1912,7 @@
  8910. if (atomic_dec_and_test(&t->usage))
  8911. __put_task_struct(t);
  8912. }
  8913. +#endif
  8914. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  8915. extern void task_cputime(struct task_struct *t,
  8916. @@ -1898,6 +1951,7 @@
  8917. /*
  8918. * Per process flags
  8919. */
  8920. +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */
  8921. #define PF_EXITING 0x00000004 /* getting shut down */
  8922. #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
  8923. #define PF_VCPU 0x00000010 /* I'm a virtual CPU */
  8924. @@ -2058,6 +2112,10 @@
  8925. extern int set_cpus_allowed_ptr(struct task_struct *p,
  8926. const struct cpumask *new_mask);
  8927. +int migrate_me(void);
  8928. +void tell_sched_cpu_down_begin(int cpu);
  8929. +void tell_sched_cpu_down_done(int cpu);
  8930. +
  8931. #else
  8932. static inline void do_set_cpus_allowed(struct task_struct *p,
  8933. const struct cpumask *new_mask)
  8934. @@ -2070,6 +2128,9 @@
  8935. return -EINVAL;
  8936. return 0;
  8937. }
  8938. +static inline int migrate_me(void) { return 0; }
  8939. +static inline void tell_sched_cpu_down_begin(int cpu) { }
  8940. +static inline void tell_sched_cpu_down_done(int cpu) { }
  8941. #endif
  8942. #ifdef CONFIG_NO_HZ_COMMON
  8943. @@ -2290,6 +2351,7 @@
  8944. extern int wake_up_state(struct task_struct *tsk, unsigned int state);
  8945. extern int wake_up_process(struct task_struct *tsk);
  8946. +extern int wake_up_lock_sleeper(struct task_struct * tsk);
  8947. extern void wake_up_new_task(struct task_struct *tsk);
  8948. #ifdef CONFIG_SMP
  8949. extern void kick_process(struct task_struct *tsk);
  8950. @@ -2406,12 +2468,24 @@
  8951. /* mmdrop drops the mm and the page tables */
  8952. extern void __mmdrop(struct mm_struct *);
  8953. +
  8954. static inline void mmdrop(struct mm_struct * mm)
  8955. {
  8956. if (unlikely(atomic_dec_and_test(&mm->mm_count)))
  8957. __mmdrop(mm);
  8958. }
  8959. +#ifdef CONFIG_PREEMPT_RT_BASE
  8960. +extern void __mmdrop_delayed(struct rcu_head *rhp);
  8961. +static inline void mmdrop_delayed(struct mm_struct *mm)
  8962. +{
  8963. + if (atomic_dec_and_test(&mm->mm_count))
  8964. + call_rcu(&mm->delayed_drop, __mmdrop_delayed);
  8965. +}
  8966. +#else
  8967. +# define mmdrop_delayed(mm) mmdrop(mm)
  8968. +#endif
  8969. +
  8970. /* mmput gets rid of the mappings and all user-space */
  8971. extern void mmput(struct mm_struct *);
  8972. /* Grab a reference to a task's mm, if it is not already going away */
  8973. @@ -2719,6 +2793,43 @@
  8974. return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
  8975. }
  8976. +#ifdef CONFIG_PREEMPT_LAZY
  8977. +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
  8978. +{
  8979. + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
  8980. +}
  8981. +
  8982. +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
  8983. +{
  8984. + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
  8985. +}
  8986. +
  8987. +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
  8988. +{
  8989. + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
  8990. +}
  8991. +
  8992. +static inline int need_resched_lazy(void)
  8993. +{
  8994. + return test_thread_flag(TIF_NEED_RESCHED_LAZY);
  8995. +}
  8996. +
  8997. +static inline int need_resched_now(void)
  8998. +{
  8999. + return test_thread_flag(TIF_NEED_RESCHED);
  9000. +}
  9001. +
  9002. +#else
  9003. +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
  9004. +static inline int need_resched_lazy(void) { return 0; }
  9005. +
  9006. +static inline int need_resched_now(void)
  9007. +{
  9008. + return test_thread_flag(TIF_NEED_RESCHED);
  9009. +}
  9010. +
  9011. +#endif
  9012. +
  9013. static inline int restart_syscall(void)
  9014. {
  9015. set_tsk_thread_flag(current, TIF_SIGPENDING);
  9016. @@ -2750,6 +2861,51 @@
  9017. return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
  9018. }
  9019. +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
  9020. +{
  9021. + if (task->state & (__TASK_STOPPED | __TASK_TRACED))
  9022. + return true;
  9023. +#ifdef CONFIG_PREEMPT_RT_FULL
  9024. + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
  9025. + return true;
  9026. +#endif
  9027. + return false;
  9028. +}
  9029. +
  9030. +static inline bool task_is_stopped_or_traced(struct task_struct *task)
  9031. +{
  9032. + bool traced_stopped;
  9033. +
  9034. +#ifdef CONFIG_PREEMPT_RT_FULL
  9035. + unsigned long flags;
  9036. +
  9037. + raw_spin_lock_irqsave(&task->pi_lock, flags);
  9038. + traced_stopped = __task_is_stopped_or_traced(task);
  9039. + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  9040. +#else
  9041. + traced_stopped = __task_is_stopped_or_traced(task);
  9042. +#endif
  9043. + return traced_stopped;
  9044. +}
  9045. +
  9046. +static inline bool task_is_traced(struct task_struct *task)
  9047. +{
  9048. + bool traced = false;
  9049. +
  9050. + if (task->state & __TASK_TRACED)
  9051. + return true;
  9052. +#ifdef CONFIG_PREEMPT_RT_FULL
  9053. + /* in case the task is sleeping on tasklist_lock */
  9054. + raw_spin_lock_irq(&task->pi_lock);
  9055. + if (task->state & __TASK_TRACED)
  9056. + traced = true;
  9057. + else if (task->saved_state & __TASK_TRACED)
  9058. + traced = true;
  9059. + raw_spin_unlock_irq(&task->pi_lock);
  9060. +#endif
  9061. + return traced;
  9062. +}
  9063. +
  9064. /*
  9065. * cond_resched() and cond_resched_lock(): latency reduction via
  9066. * explicit rescheduling in places that are safe. The return
  9067. @@ -2766,7 +2922,7 @@
  9068. extern int __cond_resched_lock(spinlock_t *lock);
  9069. -#ifdef CONFIG_PREEMPT_COUNT
  9070. +#if defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT_FULL)
  9071. #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
  9072. #else
  9073. #define PREEMPT_LOCK_OFFSET 0
  9074. @@ -2777,12 +2933,16 @@
  9075. __cond_resched_lock(lock); \
  9076. })
  9077. +#ifndef CONFIG_PREEMPT_RT_FULL
  9078. extern int __cond_resched_softirq(void);
  9079. #define cond_resched_softirq() ({ \
  9080. __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
  9081. __cond_resched_softirq(); \
  9082. })
  9083. +#else
  9084. +# define cond_resched_softirq() cond_resched()
  9085. +#endif
  9086. static inline void cond_resched_rcu(void)
  9087. {
  9088. @@ -2949,6 +3109,26 @@
  9089. #endif /* CONFIG_SMP */
  9090. +static inline int __migrate_disabled(struct task_struct *p)
  9091. +{
  9092. +#ifdef CONFIG_PREEMPT_RT_FULL
  9093. + return p->migrate_disable;
  9094. +#else
  9095. + return 0;
  9096. +#endif
  9097. +}
  9098. +
  9099. +/* Future-safe accessor for struct task_struct's cpus_allowed. */
  9100. +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
  9101. +{
  9102. +#ifdef CONFIG_PREEMPT_RT_FULL
  9103. + if (p->migrate_disable)
  9104. + return cpumask_of(task_cpu(p));
  9105. +#endif
  9106. +
  9107. + return &p->cpus_allowed;
  9108. +}
  9109. +
  9110. extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
  9111. extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
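
[Editor's note — not part of the patch] A tiny sketch (hypothetical demo_* name) of a caller of the tsk_cpus_allowed() accessor introduced above: while a task is migrate-disabled on RT, the accessor reports only the CPU the task is currently pinned to, so such callers automatically see the narrowed mask.

#include <linux/sched.h>
#include <linux/cpumask.h>

/* How many CPUs may @p run on right now? Reports 1 while @p is migrate-disabled on RT. */
static unsigned int demo_allowed_cpus(struct task_struct *p)
{
        return cpumask_weight(tsk_cpus_allowed(p));
}
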
  9112. diff -Nur linux-3.18.12.orig/include/linux/seqlock.h linux-3.18.12/include/linux/seqlock.h
  9113. --- linux-3.18.12.orig/include/linux/seqlock.h 2015-04-20 14:48:02.000000000 -0500
  9114. +++ linux-3.18.12/include/linux/seqlock.h 2015-04-26 13:32:22.423684003 -0500
  9115. @@ -219,20 +219,30 @@
  9116. return __read_seqcount_retry(s, start);
  9117. }
  9118. -
  9119. -
  9120. -static inline void raw_write_seqcount_begin(seqcount_t *s)
  9121. +static inline void __raw_write_seqcount_begin(seqcount_t *s)
  9122. {
  9123. s->sequence++;
  9124. smp_wmb();
  9125. }
  9126. -static inline void raw_write_seqcount_end(seqcount_t *s)
  9127. +static inline void raw_write_seqcount_begin(seqcount_t *s)
  9128. +{
  9129. + preempt_disable_rt();
  9130. + __raw_write_seqcount_begin(s);
  9131. +}
  9132. +
  9133. +static inline void __raw_write_seqcount_end(seqcount_t *s)
  9134. {
  9135. smp_wmb();
  9136. s->sequence++;
  9137. }
  9138. +static inline void raw_write_seqcount_end(seqcount_t *s)
  9139. +{
  9140. + __raw_write_seqcount_end(s);
  9141. + preempt_enable_rt();
  9142. +}
  9143. +
  9144. /*
  9145. * raw_write_seqcount_latch - redirect readers to even/odd copy
  9146. * @s: pointer to seqcount_t
  9147. @@ -305,10 +315,32 @@
  9148. /*
  9149. * Read side functions for starting and finalizing a read side section.
  9150. */
  9151. +#ifndef CONFIG_PREEMPT_RT_FULL
  9152. static inline unsigned read_seqbegin(const seqlock_t *sl)
  9153. {
  9154. return read_seqcount_begin(&sl->seqcount);
  9155. }
  9156. +#else
  9157. +/*
  9158. + * Starvation safe read side for RT
  9159. + */
  9160. +static inline unsigned read_seqbegin(seqlock_t *sl)
  9161. +{
  9162. + unsigned ret;
  9163. +
  9164. +repeat:
  9165. + ret = ACCESS_ONCE(sl->seqcount.sequence);
  9166. + if (unlikely(ret & 1)) {
  9167. + /*
9168. + * Take the lock and let the writer proceed (i.e. possibly
  9169. + * boost it), otherwise we could loop here forever.
  9170. + */
  9171. + spin_unlock_wait(&sl->lock);
  9172. + goto repeat;
  9173. + }
  9174. + return ret;
  9175. +}
  9176. +#endif
  9177. static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
  9178. {
  9179. @@ -323,36 +355,36 @@
  9180. static inline void write_seqlock(seqlock_t *sl)
  9181. {
  9182. spin_lock(&sl->lock);
  9183. - write_seqcount_begin(&sl->seqcount);
  9184. + __raw_write_seqcount_begin(&sl->seqcount);
  9185. }
  9186. static inline void write_sequnlock(seqlock_t *sl)
  9187. {
  9188. - write_seqcount_end(&sl->seqcount);
  9189. + __raw_write_seqcount_end(&sl->seqcount);
  9190. spin_unlock(&sl->lock);
  9191. }
  9192. static inline void write_seqlock_bh(seqlock_t *sl)
  9193. {
  9194. spin_lock_bh(&sl->lock);
  9195. - write_seqcount_begin(&sl->seqcount);
  9196. + __raw_write_seqcount_begin(&sl->seqcount);
  9197. }
  9198. static inline void write_sequnlock_bh(seqlock_t *sl)
  9199. {
  9200. - write_seqcount_end(&sl->seqcount);
  9201. + __raw_write_seqcount_end(&sl->seqcount);
  9202. spin_unlock_bh(&sl->lock);
  9203. }
  9204. static inline void write_seqlock_irq(seqlock_t *sl)
  9205. {
  9206. spin_lock_irq(&sl->lock);
  9207. - write_seqcount_begin(&sl->seqcount);
  9208. + __raw_write_seqcount_begin(&sl->seqcount);
  9209. }
  9210. static inline void write_sequnlock_irq(seqlock_t *sl)
  9211. {
  9212. - write_seqcount_end(&sl->seqcount);
  9213. + __raw_write_seqcount_end(&sl->seqcount);
  9214. spin_unlock_irq(&sl->lock);
  9215. }
  9216. @@ -361,7 +393,7 @@
  9217. unsigned long flags;
  9218. spin_lock_irqsave(&sl->lock, flags);
  9219. - write_seqcount_begin(&sl->seqcount);
  9220. + __raw_write_seqcount_begin(&sl->seqcount);
  9221. return flags;
  9222. }
  9223. @@ -371,7 +403,7 @@
  9224. static inline void
  9225. write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
  9226. {
  9227. - write_seqcount_end(&sl->seqcount);
  9228. + __raw_write_seqcount_end(&sl->seqcount);
  9229. spin_unlock_irqrestore(&sl->lock, flags);
  9230. }
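
[Editor's note — not part of the patch] The standard seqlock pattern, sketched with hypothetical demo_* names. The retry loop is unchanged; what the hunks above change is that on PREEMPT_RT_FULL read_seqbegin() blocks on the writer's lock (boosting it) instead of spinning on an odd sequence count, so readers cannot livelock against a preempted writer.

#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(demo_seqlock);
static u64 demo_lo, demo_hi;

static void demo_update(u64 lo, u64 hi)
{
        write_seqlock(&demo_seqlock);
        demo_lo = lo;
        demo_hi = hi;
        write_sequnlock(&demo_seqlock);
}

static u64 demo_snapshot(void)
{
        unsigned int seq;
        u64 sum;

        do {
                seq = read_seqbegin(&demo_seqlock);
                sum = demo_lo + demo_hi;
        } while (read_seqretry(&demo_seqlock, seq));

        return sum;
}
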
  9231. diff -Nur linux-3.18.12.orig/include/linux/signal.h linux-3.18.12/include/linux/signal.h
  9232. --- linux-3.18.12.orig/include/linux/signal.h 2015-04-20 14:48:02.000000000 -0500
  9233. +++ linux-3.18.12/include/linux/signal.h 2015-04-26 13:32:22.423684003 -0500
  9234. @@ -218,6 +218,7 @@
  9235. }
  9236. extern void flush_sigqueue(struct sigpending *queue);
  9237. +extern void flush_task_sigqueue(struct task_struct *tsk);
  9238. /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
  9239. static inline int valid_signal(unsigned long sig)
  9240. diff -Nur linux-3.18.12.orig/include/linux/skbuff.h linux-3.18.12/include/linux/skbuff.h
  9241. --- linux-3.18.12.orig/include/linux/skbuff.h 2015-04-20 14:48:02.000000000 -0500
  9242. +++ linux-3.18.12/include/linux/skbuff.h 2015-04-26 13:32:22.423684003 -0500
  9243. @@ -172,6 +172,7 @@
  9244. __u32 qlen;
  9245. spinlock_t lock;
  9246. + raw_spinlock_t raw_lock;
  9247. };
  9248. struct sk_buff;
  9249. @@ -1327,6 +1328,12 @@
  9250. __skb_queue_head_init(list);
  9251. }
  9252. +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
  9253. +{
  9254. + raw_spin_lock_init(&list->raw_lock);
  9255. + __skb_queue_head_init(list);
  9256. +}
  9257. +
  9258. static inline void skb_queue_head_init_class(struct sk_buff_head *list,
  9259. struct lock_class_key *class)
  9260. {
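
[Editor's note — not part of the patch] A sketch (hypothetical demo_* names) of one way the raw_lock added above can be used: a queue that really must be manipulated from hard interrupt context on RT is initialised with skb_queue_head_init_raw() and then driven through the unlocked __skb_queue helpers under the raw spinlock.

#include <linux/skbuff.h>
#include <linux/spinlock.h>

static struct sk_buff_head demo_queue;

static void demo_queue_setup(void)
{
        skb_queue_head_init_raw(&demo_queue);
}

static void demo_enqueue(struct sk_buff *skb)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_queue.raw_lock, flags);
        __skb_queue_tail(&demo_queue, skb);
        raw_spin_unlock_irqrestore(&demo_queue.raw_lock, flags);
}

static struct sk_buff *demo_dequeue(void)
{
        struct sk_buff *skb;
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_queue.raw_lock, flags);
        skb = __skb_dequeue(&demo_queue);
        raw_spin_unlock_irqrestore(&demo_queue.raw_lock, flags);
        return skb;
}
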
  9261. diff -Nur linux-3.18.12.orig/include/linux/smp.h linux-3.18.12/include/linux/smp.h
  9262. --- linux-3.18.12.orig/include/linux/smp.h 2015-04-20 14:48:02.000000000 -0500
  9263. +++ linux-3.18.12/include/linux/smp.h 2015-04-26 13:32:22.423684003 -0500
  9264. @@ -178,6 +178,9 @@
  9265. #define get_cpu() ({ preempt_disable(); smp_processor_id(); })
  9266. #define put_cpu() preempt_enable()
  9267. +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); })
  9268. +#define put_cpu_light() migrate_enable()
  9269. +
  9270. /*
  9271. * Callback to arch code if there's nosmp or maxcpus=0 on the
  9272. * boot command line:
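
[Editor's note — not part of the patch] A sketch (hypothetical demo_* names) of the _light helpers added above: migrate_disable() keeps the task on its current CPU without disabling preemption, so the section between the calls may still block on RT "spinlocks".

#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_account_hit(void)
{
        int cpu = get_cpu_light();      /* migrate_disable() + smp_processor_id() */

        per_cpu(demo_hits, cpu)++;
        put_cpu_light();                /* migrate_enable() */
}
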
  9273. diff -Nur linux-3.18.12.orig/include/linux/spinlock_api_smp.h linux-3.18.12/include/linux/spinlock_api_smp.h
  9274. --- linux-3.18.12.orig/include/linux/spinlock_api_smp.h 2015-04-20 14:48:02.000000000 -0500
  9275. +++ linux-3.18.12/include/linux/spinlock_api_smp.h 2015-04-26 13:32:22.423684003 -0500
  9276. @@ -187,6 +187,8 @@
  9277. return 0;
  9278. }
  9279. -#include <linux/rwlock_api_smp.h>
  9280. +#ifndef CONFIG_PREEMPT_RT_FULL
  9281. +# include <linux/rwlock_api_smp.h>
  9282. +#endif
  9283. #endif /* __LINUX_SPINLOCK_API_SMP_H */
  9284. diff -Nur linux-3.18.12.orig/include/linux/spinlock.h linux-3.18.12/include/linux/spinlock.h
  9285. --- linux-3.18.12.orig/include/linux/spinlock.h 2015-04-20 14:48:02.000000000 -0500
  9286. +++ linux-3.18.12/include/linux/spinlock.h 2015-04-26 13:32:22.423684003 -0500
  9287. @@ -278,7 +278,11 @@
  9288. #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock))
  9289. /* Include rwlock functions */
  9290. -#include <linux/rwlock.h>
  9291. +#ifdef CONFIG_PREEMPT_RT_FULL
  9292. +# include <linux/rwlock_rt.h>
  9293. +#else
  9294. +# include <linux/rwlock.h>
  9295. +#endif
  9296. /*
  9297. * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
  9298. @@ -289,6 +293,10 @@
  9299. # include <linux/spinlock_api_up.h>
  9300. #endif
  9301. +#ifdef CONFIG_PREEMPT_RT_FULL
  9302. +# include <linux/spinlock_rt.h>
  9303. +#else /* PREEMPT_RT_FULL */
  9304. +
  9305. /*
  9306. * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  9307. */
  9308. @@ -418,4 +426,6 @@
  9309. #define atomic_dec_and_lock(atomic, lock) \
  9310. __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
  9311. +#endif /* !PREEMPT_RT_FULL */
  9312. +
  9313. #endif /* __LINUX_SPINLOCK_H */
  9314. diff -Nur linux-3.18.12.orig/include/linux/spinlock_rt.h linux-3.18.12/include/linux/spinlock_rt.h
  9315. --- linux-3.18.12.orig/include/linux/spinlock_rt.h 1969-12-31 18:00:00.000000000 -0600
  9316. +++ linux-3.18.12/include/linux/spinlock_rt.h 2015-04-26 13:32:22.423684003 -0500
  9317. @@ -0,0 +1,167 @@
  9318. +#ifndef __LINUX_SPINLOCK_RT_H
  9319. +#define __LINUX_SPINLOCK_RT_H
  9320. +
  9321. +#ifndef __LINUX_SPINLOCK_H
  9322. +#error Do not include directly. Use spinlock.h
  9323. +#endif
  9324. +
  9325. +#include <linux/bug.h>
  9326. +
  9327. +extern void
  9328. +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
  9329. +
  9330. +#define spin_lock_init(slock) \
  9331. +do { \
  9332. + static struct lock_class_key __key; \
  9333. + \
  9334. + rt_mutex_init(&(slock)->lock); \
  9335. + __rt_spin_lock_init(slock, #slock, &__key); \
  9336. +} while (0)
  9337. +
  9338. +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
  9339. +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
  9340. +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
  9341. +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
  9342. +extern void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock);
  9343. +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
  9344. +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
  9345. +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
  9346. +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
  9347. +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
  9348. +
  9349. +/*
  9350. + * lockdep-less calls, for derived types like rwlock:
9351. + * (for trylock they can use rt_mutex_trylock() directly).
  9352. + */
  9353. +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
  9354. +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
  9355. +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
  9356. +
  9357. +#define spin_lock(lock) \
  9358. + do { \
  9359. + migrate_disable(); \
  9360. + rt_spin_lock(lock); \
  9361. + } while (0)
  9362. +
  9363. +#define spin_lock_bh(lock) \
  9364. + do { \
  9365. + local_bh_disable(); \
  9366. + migrate_disable(); \
  9367. + rt_spin_lock(lock); \
  9368. + } while (0)
  9369. +
  9370. +#define spin_lock_irq(lock) spin_lock(lock)
  9371. +
  9372. +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock))
  9373. +
  9374. +#define spin_trylock(lock) \
  9375. +({ \
  9376. + int __locked; \
  9377. + migrate_disable(); \
  9378. + __locked = spin_do_trylock(lock); \
  9379. + if (!__locked) \
  9380. + migrate_enable(); \
  9381. + __locked; \
  9382. +})
  9383. +
  9384. +#ifdef CONFIG_LOCKDEP
  9385. +# define spin_lock_nested(lock, subclass) \
  9386. + do { \
  9387. + migrate_disable(); \
  9388. + rt_spin_lock_nested(lock, subclass); \
  9389. + } while (0)
  9390. +
  9391. +# define spin_lock_irqsave_nested(lock, flags, subclass) \
  9392. + do { \
  9393. + typecheck(unsigned long, flags); \
  9394. + flags = 0; \
  9395. + migrate_disable(); \
  9396. + rt_spin_lock_nested(lock, subclass); \
  9397. + } while (0)
  9398. +#else
  9399. +# define spin_lock_nested(lock, subclass) spin_lock(lock)
  9400. +
  9401. +# define spin_lock_irqsave_nested(lock, flags, subclass) \
  9402. + do { \
  9403. + typecheck(unsigned long, flags); \
  9404. + flags = 0; \
  9405. + spin_lock(lock); \
  9406. + } while (0)
  9407. +#endif
  9408. +
  9409. +#define spin_lock_irqsave(lock, flags) \
  9410. + do { \
  9411. + typecheck(unsigned long, flags); \
  9412. + flags = 0; \
  9413. + spin_lock(lock); \
  9414. + } while (0)
  9415. +
  9416. +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
  9417. +{
  9418. + unsigned long flags = 0;
  9419. +#ifdef CONFIG_TRACE_IRQFLAGS
  9420. + flags = rt_spin_lock_trace_flags(lock);
  9421. +#else
  9422. + spin_lock(lock); /* lock_local */
  9423. +#endif
  9424. + return flags;
  9425. +}
  9426. +
  9427. +/* FIXME: we need rt_spin_lock_nest_lock */
  9428. +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
  9429. +
  9430. +#define spin_unlock(lock) \
  9431. + do { \
  9432. + rt_spin_unlock(lock); \
  9433. + migrate_enable(); \
  9434. + } while (0)
  9435. +
  9436. +#define spin_unlock_bh(lock) \
  9437. + do { \
  9438. + rt_spin_unlock(lock); \
  9439. + migrate_enable(); \
  9440. + local_bh_enable(); \
  9441. + } while (0)
  9442. +
  9443. +#define spin_unlock_irq(lock) spin_unlock(lock)
  9444. +
  9445. +#define spin_unlock_irqrestore(lock, flags) \
  9446. + do { \
  9447. + typecheck(unsigned long, flags); \
  9448. + (void) flags; \
  9449. + spin_unlock(lock); \
  9450. + } while (0)
  9451. +
  9452. +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock))
  9453. +#define spin_trylock_irq(lock) spin_trylock(lock)
  9454. +
  9455. +#define spin_trylock_irqsave(lock, flags) \
  9456. + rt_spin_trylock_irqsave(lock, &(flags))
  9457. +
  9458. +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock)
  9459. +
  9460. +#ifdef CONFIG_GENERIC_LOCKBREAK
  9461. +# define spin_is_contended(lock) ((lock)->break_lock)
  9462. +#else
  9463. +# define spin_is_contended(lock) (((void)(lock), 0))
  9464. +#endif
  9465. +
  9466. +static inline int spin_can_lock(spinlock_t *lock)
  9467. +{
  9468. + return !rt_mutex_is_locked(&lock->lock);
  9469. +}
  9470. +
  9471. +static inline int spin_is_locked(spinlock_t *lock)
  9472. +{
  9473. + return rt_mutex_is_locked(&lock->lock);
  9474. +}
  9475. +
  9476. +static inline void assert_spin_locked(spinlock_t *lock)
  9477. +{
  9478. + BUG_ON(!spin_is_locked(lock));
  9479. +}
  9480. +
  9481. +#define atomic_dec_and_lock(atomic, lock) \
  9482. + atomic_dec_and_spin_lock(atomic, lock)
  9483. +
  9484. +#endif
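
[Editor's note — not part of the patch] Ordinary spinlock_t users stay source-compatible, as this sketch with hypothetical demo_* names shows. On PREEMPT_RT_FULL the spin_lock_irqsave() mapping above neither disables hard interrupts nor saves flags (flags is set to 0), and the lock itself is a sleeping rt_mutex, so such sections must not be entered from genuine hard-IRQ context.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static unsigned int demo_events;

static void demo_record_event(void)
{
        unsigned long flags;

        spin_lock_irqsave(&demo_lock, flags);   /* may sleep under contention on RT */
        demo_events++;
        spin_unlock_irqrestore(&demo_lock, flags);
}
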
  9485. diff -Nur linux-3.18.12.orig/include/linux/spinlock_types.h linux-3.18.12/include/linux/spinlock_types.h
  9486. --- linux-3.18.12.orig/include/linux/spinlock_types.h 2015-04-20 14:48:02.000000000 -0500
  9487. +++ linux-3.18.12/include/linux/spinlock_types.h 2015-04-26 13:32:22.423684003 -0500
  9488. @@ -9,80 +9,15 @@
  9489. * Released under the General Public License (GPL).
  9490. */
  9491. -#if defined(CONFIG_SMP)
  9492. -# include <asm/spinlock_types.h>
  9493. -#else
  9494. -# include <linux/spinlock_types_up.h>
  9495. -#endif
  9496. -
  9497. -#include <linux/lockdep.h>
  9498. -
  9499. -typedef struct raw_spinlock {
  9500. - arch_spinlock_t raw_lock;
  9501. -#ifdef CONFIG_GENERIC_LOCKBREAK
  9502. - unsigned int break_lock;
  9503. -#endif
  9504. -#ifdef CONFIG_DEBUG_SPINLOCK
  9505. - unsigned int magic, owner_cpu;
  9506. - void *owner;
  9507. -#endif
  9508. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9509. - struct lockdep_map dep_map;
  9510. -#endif
  9511. -} raw_spinlock_t;
  9512. -
  9513. -#define SPINLOCK_MAGIC 0xdead4ead
  9514. -
  9515. -#define SPINLOCK_OWNER_INIT ((void *)-1L)
  9516. -
  9517. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9518. -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
  9519. -#else
  9520. -# define SPIN_DEP_MAP_INIT(lockname)
  9521. -#endif
  9522. +#include <linux/spinlock_types_raw.h>
  9523. -#ifdef CONFIG_DEBUG_SPINLOCK
  9524. -# define SPIN_DEBUG_INIT(lockname) \
  9525. - .magic = SPINLOCK_MAGIC, \
  9526. - .owner_cpu = -1, \
  9527. - .owner = SPINLOCK_OWNER_INIT,
  9528. +#ifndef CONFIG_PREEMPT_RT_FULL
  9529. +# include <linux/spinlock_types_nort.h>
  9530. +# include <linux/rwlock_types.h>
  9531. #else
  9532. -# define SPIN_DEBUG_INIT(lockname)
  9533. +# include <linux/rtmutex.h>
  9534. +# include <linux/spinlock_types_rt.h>
  9535. +# include <linux/rwlock_types_rt.h>
  9536. #endif
  9537. -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
  9538. - { \
  9539. - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
  9540. - SPIN_DEBUG_INIT(lockname) \
  9541. - SPIN_DEP_MAP_INIT(lockname) }
  9542. -
  9543. -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
  9544. - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
  9545. -
  9546. -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
  9547. -
  9548. -typedef struct spinlock {
  9549. - union {
  9550. - struct raw_spinlock rlock;
  9551. -
  9552. -#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9553. -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
  9554. - struct {
  9555. - u8 __padding[LOCK_PADSIZE];
  9556. - struct lockdep_map dep_map;
  9557. - };
  9558. -#endif
  9559. - };
  9560. -} spinlock_t;
  9561. -
  9562. -#define __SPIN_LOCK_INITIALIZER(lockname) \
  9563. - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
  9564. -
  9565. -#define __SPIN_LOCK_UNLOCKED(lockname) \
  9566. - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
  9567. -
  9568. -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
  9569. -
  9570. -#include <linux/rwlock_types.h>
  9571. -
  9572. #endif /* __LINUX_SPINLOCK_TYPES_H */
  9573. diff -Nur linux-3.18.12.orig/include/linux/spinlock_types_nort.h linux-3.18.12/include/linux/spinlock_types_nort.h
  9574. --- linux-3.18.12.orig/include/linux/spinlock_types_nort.h 1969-12-31 18:00:00.000000000 -0600
  9575. +++ linux-3.18.12/include/linux/spinlock_types_nort.h 2015-04-26 13:32:22.423684003 -0500
  9576. @@ -0,0 +1,33 @@
  9577. +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
  9578. +#define __LINUX_SPINLOCK_TYPES_NORT_H
  9579. +
  9580. +#ifndef __LINUX_SPINLOCK_TYPES_H
  9581. +#error "Do not include directly. Include spinlock_types.h instead"
  9582. +#endif
  9583. +
  9584. +/*
  9585. + * The non RT version maps spinlocks to raw_spinlocks
  9586. + */
  9587. +typedef struct spinlock {
  9588. + union {
  9589. + struct raw_spinlock rlock;
  9590. +
  9591. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9592. +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
  9593. + struct {
  9594. + u8 __padding[LOCK_PADSIZE];
  9595. + struct lockdep_map dep_map;
  9596. + };
  9597. +#endif
  9598. + };
  9599. +} spinlock_t;
  9600. +
  9601. +#define __SPIN_LOCK_INITIALIZER(lockname) \
  9602. + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
  9603. +
  9604. +#define __SPIN_LOCK_UNLOCKED(lockname) \
  9605. + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
  9606. +
  9607. +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
  9608. +
  9609. +#endif
  9610. diff -Nur linux-3.18.12.orig/include/linux/spinlock_types_raw.h linux-3.18.12/include/linux/spinlock_types_raw.h
  9611. --- linux-3.18.12.orig/include/linux/spinlock_types_raw.h 1969-12-31 18:00:00.000000000 -0600
  9612. +++ linux-3.18.12/include/linux/spinlock_types_raw.h 2015-04-26 13:32:22.423684003 -0500
  9613. @@ -0,0 +1,56 @@
  9614. +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
  9615. +#define __LINUX_SPINLOCK_TYPES_RAW_H
  9616. +
  9617. +#if defined(CONFIG_SMP)
  9618. +# include <asm/spinlock_types.h>
  9619. +#else
  9620. +# include <linux/spinlock_types_up.h>
  9621. +#endif
  9622. +
  9623. +#include <linux/lockdep.h>
  9624. +
  9625. +typedef struct raw_spinlock {
  9626. + arch_spinlock_t raw_lock;
  9627. +#ifdef CONFIG_GENERIC_LOCKBREAK
  9628. + unsigned int break_lock;
  9629. +#endif
  9630. +#ifdef CONFIG_DEBUG_SPINLOCK
  9631. + unsigned int magic, owner_cpu;
  9632. + void *owner;
  9633. +#endif
  9634. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9635. + struct lockdep_map dep_map;
  9636. +#endif
  9637. +} raw_spinlock_t;
  9638. +
  9639. +#define SPINLOCK_MAGIC 0xdead4ead
  9640. +
  9641. +#define SPINLOCK_OWNER_INIT ((void *)-1L)
  9642. +
  9643. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9644. +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }
  9645. +#else
  9646. +# define SPIN_DEP_MAP_INIT(lockname)
  9647. +#endif
  9648. +
  9649. +#ifdef CONFIG_DEBUG_SPINLOCK
  9650. +# define SPIN_DEBUG_INIT(lockname) \
  9651. + .magic = SPINLOCK_MAGIC, \
  9652. + .owner_cpu = -1, \
  9653. + .owner = SPINLOCK_OWNER_INIT,
  9654. +#else
  9655. +# define SPIN_DEBUG_INIT(lockname)
  9656. +#endif
  9657. +
  9658. +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \
  9659. + { \
  9660. + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \
  9661. + SPIN_DEBUG_INIT(lockname) \
  9662. + SPIN_DEP_MAP_INIT(lockname) }
  9663. +
  9664. +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \
  9665. + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
  9666. +
  9667. +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
  9668. +
  9669. +#endif
  9670. diff -Nur linux-3.18.12.orig/include/linux/spinlock_types_rt.h linux-3.18.12/include/linux/spinlock_types_rt.h
  9671. --- linux-3.18.12.orig/include/linux/spinlock_types_rt.h 1969-12-31 18:00:00.000000000 -0600
  9672. +++ linux-3.18.12/include/linux/spinlock_types_rt.h 2015-04-26 13:32:22.423684003 -0500
  9673. @@ -0,0 +1,51 @@
  9674. +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
  9675. +#define __LINUX_SPINLOCK_TYPES_RT_H
  9676. +
  9677. +#ifndef __LINUX_SPINLOCK_TYPES_H
  9678. +#error "Do not include directly. Include spinlock_types.h instead"
  9679. +#endif
  9680. +
  9681. +#include <linux/cache.h>
  9682. +
  9683. +/*
  9684. + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
  9685. + */
  9686. +typedef struct spinlock {
  9687. + struct rt_mutex lock;
  9688. + unsigned int break_lock;
  9689. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  9690. + struct lockdep_map dep_map;
  9691. +#endif
  9692. +} spinlock_t;
  9693. +
  9694. +#ifdef CONFIG_DEBUG_RT_MUTEXES
  9695. +# define __RT_SPIN_INITIALIZER(name) \
  9696. + { \
  9697. + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
  9698. + .save_state = 1, \
  9699. + .file = __FILE__, \
  9700. + .line = __LINE__ , \
  9701. + }
  9702. +#else
  9703. +# define __RT_SPIN_INITIALIZER(name) \
  9704. + { \
  9705. + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
  9706. + .save_state = 1, \
  9707. + }
  9708. +#endif
  9709. +
  9710. +/*
  9711. +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
  9712. +*/
  9713. +
  9714. +#define __SPIN_LOCK_UNLOCKED(name) \
  9715. + { .lock = __RT_SPIN_INITIALIZER(name.lock), \
  9716. + SPIN_DEP_MAP_INIT(name) }
  9717. +
  9718. +#define __DEFINE_SPINLOCK(name) \
  9719. + spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
  9720. +
  9721. +#define DEFINE_SPINLOCK(name) \
  9722. + spinlock_t name __cacheline_aligned_in_smp = __SPIN_LOCK_UNLOCKED(name)
  9723. +
  9724. +#endif
  9725. diff -Nur linux-3.18.12.orig/include/linux/srcu.h linux-3.18.12/include/linux/srcu.h
  9726. --- linux-3.18.12.orig/include/linux/srcu.h 2015-04-20 14:48:02.000000000 -0500
  9727. +++ linux-3.18.12/include/linux/srcu.h 2015-04-26 13:32:22.427684003 -0500
  9728. @@ -84,10 +84,10 @@
  9729. void process_srcu(struct work_struct *work);
  9730. -#define __SRCU_STRUCT_INIT(name) \
  9731. +#define __SRCU_STRUCT_INIT(name, pcpu_name) \
  9732. { \
  9733. .completed = -300, \
  9734. - .per_cpu_ref = &name##_srcu_array, \
  9735. + .per_cpu_ref = &pcpu_name, \
  9736. .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
  9737. .running = false, \
  9738. .batch_queue = RCU_BATCH_INIT(name.batch_queue), \
  9739. @@ -104,11 +104,12 @@
  9740. */
  9741. #define DEFINE_SRCU(name) \
  9742. static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
  9743. - struct srcu_struct name = __SRCU_STRUCT_INIT(name);
  9744. + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array);
  9745. #define DEFINE_STATIC_SRCU(name) \
  9746. static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
  9747. - static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
  9748. + static struct srcu_struct name = __SRCU_STRUCT_INIT(\
  9749. + name, name##_srcu_array);
  9750. /**
  9751. * call_srcu() - Queue a callback for invocation after an SRCU grace period
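
[Editor's note — not part of the patch] A minimal SRCU user, with hypothetical demo_* names, written against the DEFINE_STATIC_SRCU() form touched above; the read side is sleepable, which is why SRCU is a common building block in the RT tree.

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(demo_srcu);

static int __rcu *demo_shared;

static int demo_read(void)
{
        int idx, v = -1;
        int *p;

        idx = srcu_read_lock(&demo_srcu);       /* read side may sleep */
        p = srcu_dereference(demo_shared, &demo_srcu);
        if (p)
                v = *p;
        srcu_read_unlock(&demo_srcu, idx);
        return v;
}

static void demo_wait_for_readers(void)
{
        synchronize_srcu(&demo_srcu);           /* waits for all current readers */
}
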
  9752. diff -Nur linux-3.18.12.orig/include/linux/swap.h linux-3.18.12/include/linux/swap.h
  9753. --- linux-3.18.12.orig/include/linux/swap.h 2015-04-20 14:48:02.000000000 -0500
  9754. +++ linux-3.18.12/include/linux/swap.h 2015-04-26 13:32:22.427684003 -0500
  9755. @@ -11,6 +11,7 @@
  9756. #include <linux/fs.h>
  9757. #include <linux/atomic.h>
  9758. #include <linux/page-flags.h>
  9759. +#include <linux/locallock.h>
  9760. #include <asm/page.h>
  9761. struct notifier_block;
  9762. @@ -260,7 +261,8 @@
  9763. void *workingset_eviction(struct address_space *mapping, struct page *page);
  9764. bool workingset_refault(void *shadow);
  9765. void workingset_activation(struct page *page);
  9766. -extern struct list_lru workingset_shadow_nodes;
  9767. +extern struct list_lru __workingset_shadow_nodes;
  9768. +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
  9769. static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
  9770. {
  9771. diff -Nur linux-3.18.12.orig/include/linux/sysctl.h linux-3.18.12/include/linux/sysctl.h
  9772. --- linux-3.18.12.orig/include/linux/sysctl.h 2015-04-20 14:48:02.000000000 -0500
  9773. +++ linux-3.18.12/include/linux/sysctl.h 2015-04-26 13:32:22.427684003 -0500
  9774. @@ -25,6 +25,7 @@
  9775. #include <linux/rcupdate.h>
  9776. #include <linux/wait.h>
  9777. #include <linux/rbtree.h>
  9778. +#include <linux/atomic.h>
  9779. #include <uapi/linux/sysctl.h>
  9780. /* For the /proc/sys support */
  9781. diff -Nur linux-3.18.12.orig/include/linux/thread_info.h linux-3.18.12/include/linux/thread_info.h
  9782. --- linux-3.18.12.orig/include/linux/thread_info.h 2015-04-20 14:48:02.000000000 -0500
  9783. +++ linux-3.18.12/include/linux/thread_info.h 2015-04-26 13:32:22.427684003 -0500
  9784. @@ -102,7 +102,17 @@
  9785. #define test_thread_flag(flag) \
  9786. test_ti_thread_flag(current_thread_info(), flag)
  9787. -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
  9788. +#ifdef CONFIG_PREEMPT_LAZY
  9789. +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \
  9790. + test_thread_flag(TIF_NEED_RESCHED_LAZY))
  9791. +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
9792. +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY))
  9793. +
  9794. +#else
  9795. +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
  9796. +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
  9797. +#define tif_need_resched_lazy() 0
  9798. +#endif
  9799. #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK
  9800. /*
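With CONFIG_PREEMPT_LAZY, tif_need_resched() now reports either TIF_NEED_RESCHED or TIF_NEED_RESCHED_LAZY, while the _now()/_lazy() variants let callers distinguish the two. Code that only asks "should I yield soon?" keeps working unmodified; a hypothetical helper as a sketch:

/* Hypothetical helper: bail out of a long-running loop when any resched request is pending. */
static bool example_should_break(void)
{
	/* true for TIF_NEED_RESCHED and, under PREEMPT_LAZY, also for TIF_NEED_RESCHED_LAZY */
	return tif_need_resched();
}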
  9801. diff -Nur linux-3.18.12.orig/include/linux/timer.h linux-3.18.12/include/linux/timer.h
  9802. --- linux-3.18.12.orig/include/linux/timer.h 2015-04-20 14:48:02.000000000 -0500
  9803. +++ linux-3.18.12/include/linux/timer.h 2015-04-26 13:32:22.427684003 -0500
  9804. @@ -241,7 +241,7 @@
  9805. extern int try_to_del_timer_sync(struct timer_list *timer);
  9806. -#ifdef CONFIG_SMP
  9807. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  9808. extern int del_timer_sync(struct timer_list *timer);
  9809. #else
  9810. # define del_timer_sync(t) del_timer(t)
  9811. diff -Nur linux-3.18.12.orig/include/linux/uaccess.h linux-3.18.12/include/linux/uaccess.h
  9812. --- linux-3.18.12.orig/include/linux/uaccess.h 2015-04-20 14:48:02.000000000 -0500
  9813. +++ linux-3.18.12/include/linux/uaccess.h 2015-04-26 13:32:22.427684003 -0500
  9814. @@ -6,14 +6,9 @@
  9815. /*
  9816. * These routines enable/disable the pagefault handler in that
  9817. - * it will not take any locks and go straight to the fixup table.
  9818. - *
  9819. - * They have great resemblance to the preempt_disable/enable calls
  9820. - * and in fact they are identical; this is because currently there is
  9821. - * no other way to make the pagefault handlers do this. So we do
  9822. - * disable preemption but we don't necessarily care about that.
  9823. + * it will not take any MM locks and go straight to the fixup table.
  9824. */
  9825. -static inline void pagefault_disable(void)
  9826. +static inline void raw_pagefault_disable(void)
  9827. {
  9828. preempt_count_inc();
  9829. /*
  9830. @@ -23,7 +18,7 @@
  9831. barrier();
  9832. }
  9833. -static inline void pagefault_enable(void)
  9834. +static inline void raw_pagefault_enable(void)
  9835. {
  9836. #ifndef CONFIG_PREEMPT
  9837. /*
  9838. @@ -37,6 +32,21 @@
  9839. #endif
  9840. }
  9841. +#ifndef CONFIG_PREEMPT_RT_FULL
  9842. +static inline void pagefault_disable(void)
  9843. +{
  9844. + raw_pagefault_disable();
  9845. +}
  9846. +
  9847. +static inline void pagefault_enable(void)
  9848. +{
  9849. + raw_pagefault_enable();
  9850. +}
  9851. +#else
  9852. +extern void pagefault_disable(void);
  9853. +extern void pagefault_enable(void);
  9854. +#endif
  9855. +
  9856. #ifndef ARCH_HAS_NOCACHE_UACCESS
  9857. static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
  9858. @@ -76,9 +86,9 @@
  9859. mm_segment_t old_fs = get_fs(); \
  9860. \
  9861. set_fs(KERNEL_DS); \
  9862. - pagefault_disable(); \
  9863. + raw_pagefault_disable(); \
  9864. ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
  9865. - pagefault_enable(); \
  9866. + raw_pagefault_enable(); \
  9867. set_fs(old_fs); \
  9868. ret; \
  9869. })
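The uaccess split keeps raw_pagefault_disable()/raw_pagefault_enable() as plain preempt-count operations for the probe_kernel_*-style helpers above, while pagefault_disable()/pagefault_enable() become out-of-line functions on PREEMPT_RT_FULL. The caller-side pattern is unchanged; a hedged sketch with hypothetical names:

#include <linux/uaccess.h>

/* Hypothetical helper: read a user word without taking mm locks; faults hit the fixup table. */
static int example_peek_user(u32 __user *uaddr, u32 *val)
{
	int ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}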
  9870. diff -Nur linux-3.18.12.orig/include/linux/uprobes.h linux-3.18.12/include/linux/uprobes.h
  9871. --- linux-3.18.12.orig/include/linux/uprobes.h 2015-04-20 14:48:02.000000000 -0500
  9872. +++ linux-3.18.12/include/linux/uprobes.h 2015-04-26 13:32:22.427684003 -0500
  9873. @@ -27,6 +27,7 @@
  9874. #include <linux/errno.h>
  9875. #include <linux/rbtree.h>
  9876. #include <linux/types.h>
  9877. +#include <linux/wait.h>
  9878. struct vm_area_struct;
  9879. struct mm_struct;
  9880. diff -Nur linux-3.18.12.orig/include/linux/vmstat.h linux-3.18.12/include/linux/vmstat.h
  9881. --- linux-3.18.12.orig/include/linux/vmstat.h 2015-04-20 14:48:02.000000000 -0500
  9882. +++ linux-3.18.12/include/linux/vmstat.h 2015-04-26 13:32:22.427684003 -0500
  9883. @@ -33,7 +33,9 @@
  9884. */
  9885. static inline void __count_vm_event(enum vm_event_item item)
  9886. {
  9887. + preempt_disable_rt();
  9888. raw_cpu_inc(vm_event_states.event[item]);
  9889. + preempt_enable_rt();
  9890. }
  9891. static inline void count_vm_event(enum vm_event_item item)
  9892. @@ -43,7 +45,9 @@
  9893. static inline void __count_vm_events(enum vm_event_item item, long delta)
  9894. {
  9895. + preempt_disable_rt();
  9896. raw_cpu_add(vm_event_states.event[item], delta);
  9897. + preempt_enable_rt();
  9898. }
  9899. static inline void count_vm_events(enum vm_event_item item, long delta)
  9900. diff -Nur linux-3.18.12.orig/include/linux/wait.h linux-3.18.12/include/linux/wait.h
  9901. --- linux-3.18.12.orig/include/linux/wait.h 2015-04-20 14:48:02.000000000 -0500
  9902. +++ linux-3.18.12/include/linux/wait.h 2015-04-26 13:32:22.427684003 -0500
  9903. @@ -8,6 +8,7 @@
  9904. #include <linux/spinlock.h>
  9905. #include <asm/current.h>
  9906. #include <uapi/linux/wait.h>
  9907. +#include <linux/atomic.h>
  9908. typedef struct __wait_queue wait_queue_t;
  9909. typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
  9910. diff -Nur linux-3.18.12.orig/include/linux/wait-simple.h linux-3.18.12/include/linux/wait-simple.h
  9911. --- linux-3.18.12.orig/include/linux/wait-simple.h 1969-12-31 18:00:00.000000000 -0600
  9912. +++ linux-3.18.12/include/linux/wait-simple.h 2015-04-26 13:32:22.427684003 -0500
  9913. @@ -0,0 +1,207 @@
  9914. +#ifndef _LINUX_WAIT_SIMPLE_H
  9915. +#define _LINUX_WAIT_SIMPLE_H
  9916. +
  9917. +#include <linux/spinlock.h>
  9918. +#include <linux/list.h>
  9919. +
  9920. +#include <asm/current.h>
  9921. +
  9922. +struct swaiter {
  9923. + struct task_struct *task;
  9924. + struct list_head node;
  9925. +};
  9926. +
  9927. +#define DEFINE_SWAITER(name) \
  9928. + struct swaiter name = { \
  9929. + .task = current, \
  9930. + .node = LIST_HEAD_INIT((name).node), \
  9931. + }
  9932. +
  9933. +struct swait_head {
  9934. + raw_spinlock_t lock;
  9935. + struct list_head list;
  9936. +};
  9937. +
  9938. +#define SWAIT_HEAD_INITIALIZER(name) { \
  9939. + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \
  9940. + .list = LIST_HEAD_INIT((name).list), \
  9941. + }
  9942. +
  9943. +#define DEFINE_SWAIT_HEAD(name) \
  9944. + struct swait_head name = SWAIT_HEAD_INITIALIZER(name)
  9945. +
  9946. +extern void __init_swait_head(struct swait_head *h, struct lock_class_key *key);
  9947. +
  9948. +#define init_swait_head(swh) \
  9949. + do { \
  9950. + static struct lock_class_key __key; \
  9951. + \
  9952. + __init_swait_head((swh), &__key); \
  9953. + } while (0)
  9954. +
  9955. +/*
  9956. + * Waiter functions
  9957. + */
  9958. +extern void swait_prepare_locked(struct swait_head *head, struct swaiter *w);
  9959. +extern void swait_prepare(struct swait_head *head, struct swaiter *w, int state);
  9960. +extern void swait_finish_locked(struct swait_head *head, struct swaiter *w);
  9961. +extern void swait_finish(struct swait_head *head, struct swaiter *w);
  9962. +
  9963. +/* Check whether a head has waiters enqueued */
  9964. +static inline bool swaitqueue_active(struct swait_head *h)
  9965. +{
  9966. + /* Make sure the condition is visible before checking list_empty() */
  9967. + smp_mb();
  9968. + return !list_empty(&h->list);
  9969. +}
  9970. +
  9971. +/*
  9972. + * Wakeup functions
  9973. + */
  9974. +extern unsigned int __swait_wake(struct swait_head *head, unsigned int state, unsigned int num);
  9975. +extern unsigned int __swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num);
  9976. +
  9977. +#define swait_wake(head) __swait_wake(head, TASK_NORMAL, 1)
  9978. +#define swait_wake_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 1)
  9979. +#define swait_wake_all(head) __swait_wake(head, TASK_NORMAL, 0)
  9980. +#define swait_wake_all_interruptible(head) __swait_wake(head, TASK_INTERRUPTIBLE, 0)
  9981. +
  9982. +/*
  9983. + * Event API
  9984. + */
  9985. +#define __swait_event(wq, condition) \
  9986. +do { \
  9987. + DEFINE_SWAITER(__wait); \
  9988. + \
  9989. + for (;;) { \
  9990. + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
  9991. + if (condition) \
  9992. + break; \
  9993. + schedule(); \
  9994. + } \
  9995. + swait_finish(&wq, &__wait); \
  9996. +} while (0)
  9997. +
  9998. +/**
  9999. + * swait_event - sleep until a condition gets true
  10000. + * @wq: the waitqueue to wait on
  10001. + * @condition: a C expression for the event to wait for
  10002. + *
  10003. + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
  10004. + * @condition evaluates to true. The @condition is checked each time
  10005. + * the waitqueue @wq is woken up.
  10006. + *
  10007. + * wake_up() has to be called after changing any variable that could
  10008. + * change the result of the wait condition.
  10009. + */
  10010. +#define swait_event(wq, condition) \
  10011. +do { \
  10012. + if (condition) \
  10013. + break; \
  10014. + __swait_event(wq, condition); \
  10015. +} while (0)
  10016. +
  10017. +#define __swait_event_interruptible(wq, condition, ret) \
  10018. +do { \
  10019. + DEFINE_SWAITER(__wait); \
  10020. + \
  10021. + for (;;) { \
  10022. + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \
  10023. + if (condition) \
  10024. + break; \
  10025. + if (signal_pending(current)) { \
  10026. + ret = -ERESTARTSYS; \
  10027. + break; \
  10028. + } \
  10029. + schedule(); \
  10030. + } \
  10031. + swait_finish(&wq, &__wait); \
  10032. +} while (0)
  10033. +
  10034. +#define __swait_event_interruptible_timeout(wq, condition, ret) \
  10035. +do { \
  10036. + DEFINE_SWAITER(__wait); \
  10037. + \
  10038. + for (;;) { \
  10039. + swait_prepare(&wq, &__wait, TASK_INTERRUPTIBLE); \
  10040. + if (condition) \
  10041. + break; \
  10042. + if (signal_pending(current)) { \
  10043. + ret = -ERESTARTSYS; \
  10044. + break; \
  10045. + } \
  10046. + ret = schedule_timeout(ret); \
  10047. + if (!ret) \
  10048. + break; \
  10049. + } \
  10050. + swait_finish(&wq, &__wait); \
  10051. +} while (0)
  10052. +
  10053. +/**
  10054. + * swait_event_interruptible - sleep until a condition gets true
  10055. + * @wq: the waitqueue to wait on
  10056. + * @condition: a C expression for the event to wait for
  10057. + *
  10058. + * The process is put to sleep (TASK_INTERRUPTIBLE) until the
  10059. + * @condition evaluates to true. The @condition is checked each time
  10060. + * the waitqueue @wq is woken up.
  10061. + *
  10062. + * wake_up() has to be called after changing any variable that could
  10063. + * change the result of the wait condition.
  10064. + */
  10065. +#define swait_event_interruptible(wq, condition) \
  10066. +({ \
  10067. + int __ret = 0; \
  10068. + if (!(condition)) \
  10069. + __swait_event_interruptible(wq, condition, __ret); \
  10070. + __ret; \
  10071. +})
  10072. +
  10073. +#define swait_event_interruptible_timeout(wq, condition, timeout) \
  10074. +({ \
  10075. + int __ret = timeout; \
  10076. + if (!(condition)) \
  10077. + __swait_event_interruptible_timeout(wq, condition, __ret); \
  10078. + __ret; \
  10079. +})
  10080. +
  10081. +#define __swait_event_timeout(wq, condition, ret) \
  10082. +do { \
  10083. + DEFINE_SWAITER(__wait); \
  10084. + \
  10085. + for (;;) { \
  10086. + swait_prepare(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
  10087. + if (condition) \
  10088. + break; \
  10089. + ret = schedule_timeout(ret); \
  10090. + if (!ret) \
  10091. + break; \
  10092. + } \
  10093. + swait_finish(&wq, &__wait); \
  10094. +} while (0)
  10095. +
  10096. +/**
  10097. + * swait_event_timeout - sleep until a condition gets true or a timeout elapses
  10098. + * @wq: the waitqueue to wait on
  10099. + * @condition: a C expression for the event to wait for
  10100. + * @timeout: timeout, in jiffies
  10101. + *
  10102. + * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
  10103. + * @condition evaluates to true. The @condition is checked each time
  10104. + * the waitqueue @wq is woken up.
  10105. + *
  10106. + * wake_up() has to be called after changing any variable that could
  10107. + * change the result of the wait condition.
  10108. + *
  10109. + * The function returns 0 if the @timeout elapsed, and the remaining
  10110. + * jiffies if the condition evaluated to true before the timeout elapsed.
  10111. + */
  10112. +#define swait_event_timeout(wq, condition, timeout) \
  10113. +({ \
  10114. + long __ret = timeout; \
  10115. + if (!(condition)) \
  10116. + __swait_event_timeout(wq, condition, __ret); \
  10117. + __ret; \
  10118. +})
  10119. +
  10120. +#endif
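A minimal usage sketch of the simple waitqueue API declared above (the head, flag, and function names are illustrative): a consumer sleeps in swait_event() until a producer publishes the condition and calls swait_wake().

#include <linux/wait-simple.h>

static DEFINE_SWAIT_HEAD(example_wait);
static bool example_ready;

/* Consumer: sleeps TASK_UNINTERRUPTIBLE until example_ready becomes true. */
static void example_consumer(void)
{
	swait_event(example_wait, example_ready);
}

/* Producer: publish the condition, then wake one waiter. */
static void example_producer(void)
{
	example_ready = true;
	smp_wmb();		/* make the condition visible before the wakeup */
	swait_wake(&example_wait);
}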
  10121. diff -Nur linux-3.18.12.orig/include/linux/work-simple.h linux-3.18.12/include/linux/work-simple.h
  10122. --- linux-3.18.12.orig/include/linux/work-simple.h 1969-12-31 18:00:00.000000000 -0600
  10123. +++ linux-3.18.12/include/linux/work-simple.h 2015-04-26 13:32:22.427684003 -0500
  10124. @@ -0,0 +1,24 @@
  10125. +#ifndef _LINUX_SWORK_H
  10126. +#define _LINUX_SWORK_H
  10127. +
  10128. +#include <linux/list.h>
  10129. +
  10130. +struct swork_event {
  10131. + struct list_head item;
  10132. + unsigned long flags;
  10133. + void (*func)(struct swork_event *);
  10134. +};
  10135. +
  10136. +static inline void INIT_SWORK(struct swork_event *event,
  10137. + void (*func)(struct swork_event *))
  10138. +{
  10139. + event->flags = 0;
  10140. + event->func = func;
  10141. +}
  10142. +
  10143. +bool swork_queue(struct swork_event *sev);
  10144. +
  10145. +int swork_get(void);
  10146. +void swork_put(void);
  10147. +
  10148. +#endif /* _LINUX_SWORK_H */
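A hedged sketch of the simple work API above (all names hypothetical): swork_get() pins the worker infrastructure while events may be queued, INIT_SWORK() binds a callback, and swork_queue() hands the event to the worker thread.

#include <linux/work-simple.h>

static void example_swork_fn(struct swork_event *sev)
{
	/* runs in process context from the simple-work worker thread */
}

static struct swork_event example_event;

static int example_init(void)
{
	int err = swork_get();		/* take a reference on the worker */

	if (err)
		return err;

	INIT_SWORK(&example_event, example_swork_fn);
	swork_queue(&example_event);
	return 0;
}

static void example_exit(void)
{
	swork_put();			/* drop the reference taken in example_init() */
}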
  10149. diff -Nur linux-3.18.12.orig/include/net/dst.h linux-3.18.12/include/net/dst.h
  10150. --- linux-3.18.12.orig/include/net/dst.h 2015-04-20 14:48:02.000000000 -0500
  10151. +++ linux-3.18.12/include/net/dst.h 2015-04-26 13:32:22.427684003 -0500
  10152. @@ -403,7 +403,7 @@
  10153. static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
  10154. struct sk_buff *skb)
  10155. {
  10156. - const struct hh_cache *hh;
  10157. + struct hh_cache *hh;
  10158. if (dst->pending_confirm) {
  10159. unsigned long now = jiffies;
  10160. diff -Nur linux-3.18.12.orig/include/net/neighbour.h linux-3.18.12/include/net/neighbour.h
  10161. --- linux-3.18.12.orig/include/net/neighbour.h 2015-04-20 14:48:02.000000000 -0500
  10162. +++ linux-3.18.12/include/net/neighbour.h 2015-04-26 13:32:22.427684003 -0500
  10163. @@ -387,7 +387,7 @@
  10164. }
  10165. #endif
  10166. -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
  10167. +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
  10168. {
  10169. unsigned int seq;
  10170. int hh_len;
  10171. @@ -442,7 +442,7 @@
  10172. #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
  10173. -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
  10174. +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
  10175. const struct net_device *dev)
  10176. {
  10177. unsigned int seq;
  10178. diff -Nur linux-3.18.12.orig/include/net/netns/ipv4.h linux-3.18.12/include/net/netns/ipv4.h
  10179. --- linux-3.18.12.orig/include/net/netns/ipv4.h 2015-04-20 14:48:02.000000000 -0500
  10180. +++ linux-3.18.12/include/net/netns/ipv4.h 2015-04-26 13:32:22.427684003 -0500
  10181. @@ -67,6 +67,7 @@
  10182. int sysctl_icmp_echo_ignore_all;
  10183. int sysctl_icmp_echo_ignore_broadcasts;
  10184. + int sysctl_icmp_echo_sysrq;
  10185. int sysctl_icmp_ignore_bogus_error_responses;
  10186. int sysctl_icmp_ratelimit;
  10187. int sysctl_icmp_ratemask;
  10188. diff -Nur linux-3.18.12.orig/include/trace/events/hist.h linux-3.18.12/include/trace/events/hist.h
  10189. --- linux-3.18.12.orig/include/trace/events/hist.h 1969-12-31 18:00:00.000000000 -0600
  10190. +++ linux-3.18.12/include/trace/events/hist.h 2015-04-26 13:32:22.427684003 -0500
  10191. @@ -0,0 +1,72 @@
  10192. +#undef TRACE_SYSTEM
  10193. +#define TRACE_SYSTEM hist
  10194. +
  10195. +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
  10196. +#define _TRACE_HIST_H
  10197. +
  10198. +#include "latency_hist.h"
  10199. +#include <linux/tracepoint.h>
  10200. +
  10201. +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
  10202. +#define trace_preemptirqsoff_hist(a, b)
  10203. +#else
  10204. +TRACE_EVENT(preemptirqsoff_hist,
  10205. +
  10206. + TP_PROTO(int reason, int starthist),
  10207. +
  10208. + TP_ARGS(reason, starthist),
  10209. +
  10210. + TP_STRUCT__entry(
  10211. + __field(int, reason)
  10212. + __field(int, starthist)
  10213. + ),
  10214. +
  10215. + TP_fast_assign(
  10216. + __entry->reason = reason;
  10217. + __entry->starthist = starthist;
  10218. + ),
  10219. +
  10220. + TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
  10221. + __entry->starthist ? "start" : "stop")
  10222. +);
  10223. +#endif
  10224. +
  10225. +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
  10226. +#define trace_hrtimer_interrupt(a, b, c, d)
  10227. +#else
  10228. +TRACE_EVENT(hrtimer_interrupt,
  10229. +
  10230. + TP_PROTO(int cpu, long long offset, struct task_struct *curr,
  10231. + struct task_struct *task),
  10232. +
  10233. + TP_ARGS(cpu, offset, curr, task),
  10234. +
  10235. + TP_STRUCT__entry(
  10236. + __field(int, cpu)
  10237. + __field(long long, offset)
  10238. + __array(char, ccomm, TASK_COMM_LEN)
  10239. + __field(int, cprio)
  10240. + __array(char, tcomm, TASK_COMM_LEN)
  10241. + __field(int, tprio)
  10242. + ),
  10243. +
  10244. + TP_fast_assign(
  10245. + __entry->cpu = cpu;
  10246. + __entry->offset = offset;
  10247. + memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
  10248. + __entry->cprio = curr->prio;
  10249. + memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
  10250. + task != NULL ? TASK_COMM_LEN : 7);
  10251. + __entry->tprio = task != NULL ? task->prio : -1;
  10252. + ),
  10253. +
  10254. + TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
  10255. + __entry->cpu, __entry->offset, __entry->ccomm,
  10256. + __entry->cprio, __entry->tcomm, __entry->tprio)
  10257. +);
  10258. +#endif
  10259. +
  10260. +#endif /* _TRACE_HIST_H */
  10261. +
  10262. +/* This part must be outside protection */
  10263. +#include <trace/define_trace.h>
  10264. diff -Nur linux-3.18.12.orig/include/trace/events/latency_hist.h linux-3.18.12/include/trace/events/latency_hist.h
  10265. --- linux-3.18.12.orig/include/trace/events/latency_hist.h 1969-12-31 18:00:00.000000000 -0600
  10266. +++ linux-3.18.12/include/trace/events/latency_hist.h 2015-04-26 13:32:22.427684003 -0500
  10267. @@ -0,0 +1,29 @@
  10268. +#ifndef _LATENCY_HIST_H
  10269. +#define _LATENCY_HIST_H
  10270. +
  10271. +enum hist_action {
  10272. + IRQS_ON,
  10273. + PREEMPT_ON,
  10274. + TRACE_STOP,
  10275. + IRQS_OFF,
  10276. + PREEMPT_OFF,
  10277. + TRACE_START,
  10278. +};
  10279. +
  10280. +static char *actions[] = {
  10281. + "IRQS_ON",
  10282. + "PREEMPT_ON",
  10283. + "TRACE_STOP",
  10284. + "IRQS_OFF",
  10285. + "PREEMPT_OFF",
  10286. + "TRACE_START",
  10287. +};
  10288. +
  10289. +static inline char *getaction(int action)
  10290. +{
10291. + if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
  10292. + return actions[action];
  10293. + return "unknown";
  10294. +}
  10295. +
  10296. +#endif /* _LATENCY_HIST_H */
  10297. diff -Nur linux-3.18.12.orig/init/Kconfig linux-3.18.12/init/Kconfig
  10298. --- linux-3.18.12.orig/init/Kconfig 2015-04-20 14:48:02.000000000 -0500
  10299. +++ linux-3.18.12/init/Kconfig 2015-04-26 13:32:22.427684003 -0500
  10300. @@ -635,7 +635,7 @@
  10301. config RCU_FAST_NO_HZ
  10302. bool "Accelerate last non-dyntick-idle CPU's grace periods"
  10303. - depends on NO_HZ_COMMON && SMP
  10304. + depends on NO_HZ_COMMON && SMP && !PREEMPT_RT_FULL
  10305. default n
  10306. help
  10307. This option permits CPUs to enter dynticks-idle state even if
  10308. @@ -662,7 +662,7 @@
  10309. config RCU_BOOST
  10310. bool "Enable RCU priority boosting"
  10311. depends on RT_MUTEXES && PREEMPT_RCU
  10312. - default n
  10313. + default y if PREEMPT_RT_FULL
  10314. help
  10315. This option boosts the priority of preempted RCU readers that
  10316. block the current preemptible RCU grace period for too long.
  10317. @@ -1106,6 +1106,7 @@
  10318. config RT_GROUP_SCHED
  10319. bool "Group scheduling for SCHED_RR/FIFO"
  10320. depends on CGROUP_SCHED
  10321. + depends on !PREEMPT_RT_FULL
  10322. default n
  10323. help
  10324. This feature lets you explicitly allocate real CPU bandwidth
  10325. @@ -1677,6 +1678,7 @@
  10326. config SLAB
  10327. bool "SLAB"
  10328. + depends on !PREEMPT_RT_FULL
  10329. help
  10330. The regular slab allocator that is established and known to work
  10331. well in all environments. It organizes cache hot objects in
  10332. @@ -1695,6 +1697,7 @@
  10333. config SLOB
  10334. depends on EXPERT
  10335. bool "SLOB (Simple Allocator)"
  10336. + depends on !PREEMPT_RT_FULL
  10337. help
  10338. SLOB replaces the stock allocator with a drastically simpler
  10339. allocator. SLOB is generally more space efficient but
  10340. diff -Nur linux-3.18.12.orig/init/main.c linux-3.18.12/init/main.c
  10341. --- linux-3.18.12.orig/init/main.c 2015-04-20 14:48:02.000000000 -0500
  10342. +++ linux-3.18.12/init/main.c 2015-04-26 13:32:22.427684003 -0500
  10343. @@ -533,6 +533,7 @@
  10344. setup_command_line(command_line);
  10345. setup_nr_cpu_ids();
  10346. setup_per_cpu_areas();
  10347. + softirq_early_init();
  10348. smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
  10349. build_all_zonelists(NULL, NULL);
  10350. diff -Nur linux-3.18.12.orig/init/Makefile linux-3.18.12/init/Makefile
  10351. --- linux-3.18.12.orig/init/Makefile 2015-04-20 14:48:02.000000000 -0500
  10352. +++ linux-3.18.12/init/Makefile 2015-04-26 13:32:22.427684003 -0500
  10353. @@ -33,4 +33,4 @@
  10354. include/generated/compile.h: FORCE
  10355. @$($(quiet)chk_compile.h)
  10356. $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
  10357. - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
  10358. + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
  10359. diff -Nur linux-3.18.12.orig/ipc/mqueue.c linux-3.18.12/ipc/mqueue.c
  10360. --- linux-3.18.12.orig/ipc/mqueue.c 2015-04-20 14:48:02.000000000 -0500
  10361. +++ linux-3.18.12/ipc/mqueue.c 2015-04-26 13:32:22.427684003 -0500
  10362. @@ -923,12 +923,17 @@
  10363. struct msg_msg *message,
  10364. struct ext_wait_queue *receiver)
  10365. {
  10366. + /*
  10367. + * Keep them in one critical section for PREEMPT_RT:
  10368. + */
  10369. + preempt_disable_rt();
  10370. receiver->msg = message;
  10371. list_del(&receiver->list);
  10372. receiver->state = STATE_PENDING;
  10373. wake_up_process(receiver->task);
  10374. smp_wmb();
  10375. receiver->state = STATE_READY;
  10376. + preempt_enable_rt();
  10377. }
  10378. /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
  10379. @@ -942,13 +947,18 @@
  10380. wake_up_interruptible(&info->wait_q);
  10381. return;
  10382. }
  10383. - if (msg_insert(sender->msg, info))
  10384. - return;
  10385. - list_del(&sender->list);
  10386. - sender->state = STATE_PENDING;
  10387. - wake_up_process(sender->task);
  10388. - smp_wmb();
  10389. - sender->state = STATE_READY;
  10390. + /*
  10391. + * Keep them in one critical section for PREEMPT_RT:
  10392. + */
  10393. + preempt_disable_rt();
  10394. + if (!msg_insert(sender->msg, info)) {
  10395. + list_del(&sender->list);
  10396. + sender->state = STATE_PENDING;
  10397. + wake_up_process(sender->task);
  10398. + smp_wmb();
  10399. + sender->state = STATE_READY;
  10400. + }
  10401. + preempt_enable_rt();
  10402. }
  10403. SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
  10404. diff -Nur linux-3.18.12.orig/ipc/msg.c linux-3.18.12/ipc/msg.c
  10405. --- linux-3.18.12.orig/ipc/msg.c 2015-04-20 14:48:02.000000000 -0500
  10406. +++ linux-3.18.12/ipc/msg.c 2015-04-26 13:32:22.427684003 -0500
  10407. @@ -188,6 +188,12 @@
  10408. struct msg_receiver *msr, *t;
  10409. list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
  10410. + /*
10411. + * Make sure that the wakeup doesn't preempt
  10412. + * this CPU prematurely. (on PREEMPT_RT)
  10413. + */
  10414. + preempt_disable_rt();
  10415. +
  10416. msr->r_msg = NULL; /* initialize expunge ordering */
  10417. wake_up_process(msr->r_tsk);
  10418. /*
  10419. @@ -198,6 +204,8 @@
  10420. */
  10421. smp_mb();
  10422. msr->r_msg = ERR_PTR(res);
  10423. +
  10424. + preempt_enable_rt();
  10425. }
  10426. }
  10427. @@ -574,6 +582,11 @@
  10428. if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&
  10429. !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
  10430. msr->r_msgtype, msr->r_mode)) {
  10431. + /*
10432. + * Make sure that the wakeup doesn't preempt
  10433. + * this CPU prematurely. (on PREEMPT_RT)
  10434. + */
  10435. + preempt_disable_rt();
  10436. list_del(&msr->r_list);
  10437. if (msr->r_maxsize < msg->m_ts) {
  10438. @@ -595,12 +608,13 @@
  10439. */
  10440. smp_mb();
  10441. msr->r_msg = msg;
  10442. + preempt_enable_rt();
  10443. return 1;
  10444. }
  10445. + preempt_enable_rt();
  10446. }
  10447. }
  10448. -
  10449. return 0;
  10450. }
  10451. diff -Nur linux-3.18.12.orig/ipc/sem.c linux-3.18.12/ipc/sem.c
  10452. --- linux-3.18.12.orig/ipc/sem.c 2015-04-20 14:48:02.000000000 -0500
  10453. +++ linux-3.18.12/ipc/sem.c 2015-04-26 13:32:22.431684003 -0500
  10454. @@ -673,6 +673,13 @@
  10455. static void wake_up_sem_queue_prepare(struct list_head *pt,
  10456. struct sem_queue *q, int error)
  10457. {
  10458. +#ifdef CONFIG_PREEMPT_RT_BASE
  10459. + struct task_struct *p = q->sleeper;
  10460. + get_task_struct(p);
  10461. + q->status = error;
  10462. + wake_up_process(p);
  10463. + put_task_struct(p);
  10464. +#else
  10465. if (list_empty(pt)) {
  10466. /*
  10467. * Hold preempt off so that we don't get preempted and have the
  10468. @@ -684,6 +691,7 @@
  10469. q->pid = error;
  10470. list_add_tail(&q->list, pt);
  10471. +#endif
  10472. }
  10473. /**
  10474. @@ -697,6 +705,7 @@
  10475. */
  10476. static void wake_up_sem_queue_do(struct list_head *pt)
  10477. {
  10478. +#ifndef CONFIG_PREEMPT_RT_BASE
  10479. struct sem_queue *q, *t;
  10480. int did_something;
  10481. @@ -709,6 +718,7 @@
  10482. }
  10483. if (did_something)
  10484. preempt_enable();
  10485. +#endif
  10486. }
  10487. static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
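The mqueue, msg and sem changes above all apply one pattern: on PREEMPT_RT the wakeup and the final state hand-off stay inside a single preempt-disabled region, so the woken task cannot run early and observe a half-updated receiver/sender. A condensed, hypothetical sketch of that pattern:

/* Hypothetical hand-off: publish the result atomically with respect to the wakeup. */
static void example_pipelined_wakeup(struct task_struct *waiter, void **slot, void *result)
{
	preempt_disable_rt();		/* compiles away on !PREEMPT_RT */
	wake_up_process(waiter);
	smp_wmb();
	*slot = result;			/* the waiter must only observe the fully published result */
	preempt_enable_rt();
}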
  10488. diff -Nur linux-3.18.12.orig/kernel/cgroup.c linux-3.18.12/kernel/cgroup.c
  10489. --- linux-3.18.12.orig/kernel/cgroup.c 2015-04-20 14:48:02.000000000 -0500
  10490. +++ linux-3.18.12/kernel/cgroup.c 2015-04-26 13:32:22.431684003 -0500
  10491. @@ -4355,10 +4355,10 @@
  10492. queue_work(cgroup_destroy_wq, &css->destroy_work);
  10493. }
  10494. -static void css_release_work_fn(struct work_struct *work)
  10495. +static void css_release_work_fn(struct swork_event *sev)
  10496. {
  10497. struct cgroup_subsys_state *css =
  10498. - container_of(work, struct cgroup_subsys_state, destroy_work);
  10499. + container_of(sev, struct cgroup_subsys_state, destroy_swork);
  10500. struct cgroup_subsys *ss = css->ss;
  10501. struct cgroup *cgrp = css->cgroup;
  10502. @@ -4395,8 +4395,8 @@
  10503. struct cgroup_subsys_state *css =
  10504. container_of(ref, struct cgroup_subsys_state, refcnt);
  10505. - INIT_WORK(&css->destroy_work, css_release_work_fn);
  10506. - queue_work(cgroup_destroy_wq, &css->destroy_work);
  10507. + INIT_SWORK(&css->destroy_swork, css_release_work_fn);
  10508. + swork_queue(&css->destroy_swork);
  10509. }
  10510. static void init_and_link_css(struct cgroup_subsys_state *css,
  10511. @@ -4997,6 +4997,7 @@
  10512. */
  10513. cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
  10514. BUG_ON(!cgroup_destroy_wq);
  10515. + BUG_ON(swork_get());
  10516. /*
  10517. * Used to destroy pidlists and separate to serve as flush domain.
  10518. diff -Nur linux-3.18.12.orig/kernel/cpu.c linux-3.18.12/kernel/cpu.c
  10519. --- linux-3.18.12.orig/kernel/cpu.c 2015-04-20 14:48:02.000000000 -0500
  10520. +++ linux-3.18.12/kernel/cpu.c 2015-04-26 13:32:22.431684003 -0500
  10521. @@ -86,6 +86,290 @@
  10522. #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
  10523. #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
  10524. +/**
  10525. + * hotplug_pcp - per cpu hotplug descriptor
  10526. + * @unplug: set when pin_current_cpu() needs to sync tasks
  10527. + * @sync_tsk: the task that waits for tasks to finish pinned sections
  10528. + * @refcount: counter of tasks in pinned sections
  10529. + * @grab_lock: set when the tasks entering pinned sections should wait
  10530. + * @synced: notifier for @sync_tsk to tell cpu_down it's finished
  10531. + * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
  10532. + * @mutex_init: zero if the mutex hasn't been initialized yet.
  10533. + *
  10534. + * Although @unplug and @sync_tsk may point to the same task, the @unplug
  10535. + * is used as a flag and still exists after @sync_tsk has exited and
  10536. + * @sync_tsk set to NULL.
  10537. + */
  10538. +struct hotplug_pcp {
  10539. + struct task_struct *unplug;
  10540. + struct task_struct *sync_tsk;
  10541. + int refcount;
  10542. + int grab_lock;
  10543. + struct completion synced;
  10544. + struct completion unplug_wait;
  10545. +#ifdef CONFIG_PREEMPT_RT_FULL
  10546. + /*
  10547. + * Note, on PREEMPT_RT, the hotplug lock must save the state of
  10548. + * the task, otherwise the mutex will cause the task to fail
  10549. + * to sleep when required. (Because it's called from migrate_disable())
  10550. + *
  10551. + * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
  10552. + * state.
  10553. + */
  10554. + spinlock_t lock;
  10555. +#else
  10556. + struct mutex mutex;
  10557. +#endif
  10558. + int mutex_init;
  10559. +};
  10560. +
  10561. +#ifdef CONFIG_PREEMPT_RT_FULL
  10562. +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
  10563. +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
  10564. +#else
  10565. +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
  10566. +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
  10567. +#endif
  10568. +
  10569. +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
  10570. +
  10571. +/**
  10572. + * pin_current_cpu - Prevent the current cpu from being unplugged
  10573. + *
  10574. + * Lightweight version of get_online_cpus() to prevent cpu from being
  10575. + * unplugged when code runs in a migration disabled region.
  10576. + *
  10577. + * Must be called with preemption disabled (preempt_count = 1)!
  10578. + */
  10579. +void pin_current_cpu(void)
  10580. +{
  10581. + struct hotplug_pcp *hp;
  10582. + int force = 0;
  10583. +
  10584. +retry:
  10585. + hp = &__get_cpu_var(hotplug_pcp);
  10586. +
  10587. + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
  10588. + hp->unplug == current) {
  10589. + hp->refcount++;
  10590. + return;
  10591. + }
  10592. + if (hp->grab_lock) {
  10593. + preempt_enable();
  10594. + hotplug_lock(hp);
  10595. + hotplug_unlock(hp);
  10596. + } else {
  10597. + preempt_enable();
  10598. + /*
  10599. + * Try to push this task off of this CPU.
  10600. + */
  10601. + if (!migrate_me()) {
  10602. + preempt_disable();
  10603. + hp = &__get_cpu_var(hotplug_pcp);
  10604. + if (!hp->grab_lock) {
  10605. + /*
10606. + * Just let it continue, it's already pinned
  10607. + * or about to sleep.
  10608. + */
  10609. + force = 1;
  10610. + goto retry;
  10611. + }
  10612. + preempt_enable();
  10613. + }
  10614. + }
  10615. + preempt_disable();
  10616. + goto retry;
  10617. +}
  10618. +
  10619. +/**
  10620. + * unpin_current_cpu - Allow unplug of current cpu
  10621. + *
  10622. + * Must be called with preemption or interrupts disabled!
  10623. + */
  10624. +void unpin_current_cpu(void)
  10625. +{
  10626. + struct hotplug_pcp *hp = &__get_cpu_var(hotplug_pcp);
  10627. +
  10628. + WARN_ON(hp->refcount <= 0);
  10629. +
  10630. + /* This is safe. sync_unplug_thread is pinned to this cpu */
  10631. + if (!--hp->refcount && hp->unplug && hp->unplug != current)
  10632. + wake_up_process(hp->unplug);
  10633. +}
  10634. +
  10635. +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
  10636. +{
  10637. + set_current_state(TASK_UNINTERRUPTIBLE);
  10638. + while (hp->refcount) {
  10639. + schedule_preempt_disabled();
  10640. + set_current_state(TASK_UNINTERRUPTIBLE);
  10641. + }
  10642. +}
  10643. +
  10644. +static int sync_unplug_thread(void *data)
  10645. +{
  10646. + struct hotplug_pcp *hp = data;
  10647. +
  10648. + wait_for_completion(&hp->unplug_wait);
  10649. + preempt_disable();
  10650. + hp->unplug = current;
  10651. + wait_for_pinned_cpus(hp);
  10652. +
  10653. + /*
  10654. + * This thread will synchronize the cpu_down() with threads
  10655. + * that have pinned the CPU. When the pinned CPU count reaches
  10656. + * zero, we inform the cpu_down code to continue to the next step.
  10657. + */
  10658. + set_current_state(TASK_UNINTERRUPTIBLE);
  10659. + preempt_enable();
  10660. + complete(&hp->synced);
  10661. +
  10662. + /*
  10663. + * If all succeeds, the next step will need tasks to wait till
  10664. + * the CPU is offline before continuing. To do this, the grab_lock
  10665. + * is set and tasks going into pin_current_cpu() will block on the
  10666. + * mutex. But we still need to wait for those that are already in
  10667. + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
  10668. + * will kick this thread out.
  10669. + */
  10670. + while (!hp->grab_lock && !kthread_should_stop()) {
  10671. + schedule();
  10672. + set_current_state(TASK_UNINTERRUPTIBLE);
  10673. + }
  10674. +
  10675. + /* Make sure grab_lock is seen before we see a stale completion */
  10676. + smp_mb();
  10677. +
  10678. + /*
  10679. + * Now just before cpu_down() enters stop machine, we need to make
  10680. + * sure all tasks that are in pinned CPU sections are out, and new
  10681. + * tasks will now grab the lock, keeping them from entering pinned
  10682. + * CPU sections.
  10683. + */
  10684. + if (!kthread_should_stop()) {
  10685. + preempt_disable();
  10686. + wait_for_pinned_cpus(hp);
  10687. + preempt_enable();
  10688. + complete(&hp->synced);
  10689. + }
  10690. +
  10691. + set_current_state(TASK_UNINTERRUPTIBLE);
  10692. + while (!kthread_should_stop()) {
  10693. + schedule();
  10694. + set_current_state(TASK_UNINTERRUPTIBLE);
  10695. + }
  10696. + set_current_state(TASK_RUNNING);
  10697. +
  10698. + /*
  10699. + * Force this thread off this CPU as it's going down and
  10700. + * we don't want any more work on this CPU.
  10701. + */
  10702. + current->flags &= ~PF_NO_SETAFFINITY;
  10703. + set_cpus_allowed_ptr(current, cpu_present_mask);
  10704. + migrate_me();
  10705. + return 0;
  10706. +}
  10707. +
  10708. +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
  10709. +{
  10710. + wake_up_process(hp->sync_tsk);
  10711. + wait_for_completion(&hp->synced);
  10712. +}
  10713. +
  10714. +static void __cpu_unplug_wait(unsigned int cpu)
  10715. +{
  10716. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  10717. +
  10718. + complete(&hp->unplug_wait);
  10719. + wait_for_completion(&hp->synced);
  10720. +}
  10721. +
  10722. +/*
  10723. + * Start the sync_unplug_thread on the target cpu and wait for it to
  10724. + * complete.
  10725. + */
  10726. +static int cpu_unplug_begin(unsigned int cpu)
  10727. +{
  10728. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  10729. + int err;
  10730. +
  10731. + /* Protected by cpu_hotplug.lock */
  10732. + if (!hp->mutex_init) {
  10733. +#ifdef CONFIG_PREEMPT_RT_FULL
  10734. + spin_lock_init(&hp->lock);
  10735. +#else
  10736. + mutex_init(&hp->mutex);
  10737. +#endif
  10738. + hp->mutex_init = 1;
  10739. + }
  10740. +
  10741. + /* Inform the scheduler to migrate tasks off this CPU */
  10742. + tell_sched_cpu_down_begin(cpu);
  10743. +
  10744. + init_completion(&hp->synced);
  10745. + init_completion(&hp->unplug_wait);
  10746. +
  10747. + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
  10748. + if (IS_ERR(hp->sync_tsk)) {
  10749. + err = PTR_ERR(hp->sync_tsk);
  10750. + hp->sync_tsk = NULL;
  10751. + return err;
  10752. + }
  10753. + kthread_bind(hp->sync_tsk, cpu);
  10754. +
  10755. + /*
  10756. + * Wait for tasks to get out of the pinned sections,
  10757. + * it's still OK if new tasks enter. Some CPU notifiers will
  10758. + * wait for tasks that are going to enter these sections and
  10759. + * we must not have them block.
  10760. + */
  10761. + wake_up_process(hp->sync_tsk);
  10762. + return 0;
  10763. +}
  10764. +
  10765. +static void cpu_unplug_sync(unsigned int cpu)
  10766. +{
  10767. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  10768. +
  10769. + init_completion(&hp->synced);
10770. + /* The completion needs to be initialized before setting grab_lock */
  10771. + smp_wmb();
  10772. +
  10773. + /* Grab the mutex before setting grab_lock */
  10774. + hotplug_lock(hp);
  10775. + hp->grab_lock = 1;
  10776. +
  10777. + /*
  10778. + * The CPU notifiers have been completed.
  10779. + * Wait for tasks to get out of pinned CPU sections and have new
  10780. + * tasks block until the CPU is completely down.
  10781. + */
  10782. + __cpu_unplug_sync(hp);
  10783. +
  10784. + /* All done with the sync thread */
  10785. + kthread_stop(hp->sync_tsk);
  10786. + hp->sync_tsk = NULL;
  10787. +}
  10788. +
  10789. +static void cpu_unplug_done(unsigned int cpu)
  10790. +{
  10791. + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
  10792. +
  10793. + hp->unplug = NULL;
  10794. + /* Let all tasks know cpu unplug is finished before cleaning up */
  10795. + smp_wmb();
  10796. +
  10797. + if (hp->sync_tsk)
  10798. + kthread_stop(hp->sync_tsk);
  10799. +
  10800. + if (hp->grab_lock) {
  10801. + hotplug_unlock(hp);
  10802. + /* protected by cpu_hotplug.lock */
  10803. + hp->grab_lock = 0;
  10804. + }
  10805. + tell_sched_cpu_down_done(cpu);
  10806. +}
  10807. +
  10808. void get_online_cpus(void)
  10809. {
  10810. might_sleep();
  10811. @@ -102,6 +386,7 @@
  10812. {
  10813. if (cpu_hotplug.active_writer == current)
  10814. return true;
  10815. +
  10816. if (!mutex_trylock(&cpu_hotplug.lock))
  10817. return false;
  10818. cpuhp_lock_acquire_tryread();
  10819. @@ -349,13 +634,15 @@
  10820. /* Requires cpu_add_remove_lock to be held */
  10821. static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
  10822. {
  10823. - int err, nr_calls = 0;
  10824. + int mycpu, err, nr_calls = 0;
  10825. void *hcpu = (void *)(long)cpu;
  10826. unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
  10827. struct take_cpu_down_param tcd_param = {
  10828. .mod = mod,
  10829. .hcpu = hcpu,
  10830. };
  10831. + cpumask_var_t cpumask;
  10832. + cpumask_var_t cpumask_org;
  10833. if (num_online_cpus() == 1)
  10834. return -EBUSY;
  10835. @@ -363,7 +650,34 @@
  10836. if (!cpu_online(cpu))
  10837. return -EINVAL;
  10838. + /* Move the downtaker off the unplug cpu */
  10839. + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
  10840. + return -ENOMEM;
  10841. + if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) {
  10842. + free_cpumask_var(cpumask);
  10843. + return -ENOMEM;
  10844. + }
  10845. +
  10846. + cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
  10847. + cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
  10848. + set_cpus_allowed_ptr(current, cpumask);
  10849. + free_cpumask_var(cpumask);
  10850. + migrate_disable();
  10851. + mycpu = smp_processor_id();
  10852. + if (mycpu == cpu) {
10853. + printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
  10854. + migrate_enable();
  10855. + err = -EBUSY;
  10856. + goto restore_cpus;
  10857. + }
  10858. + migrate_enable();
  10859. +
  10860. cpu_hotplug_begin();
  10861. + err = cpu_unplug_begin(cpu);
  10862. + if (err) {
  10863. + printk("cpu_unplug_begin(%d) failed\n", cpu);
  10864. + goto out_cancel;
  10865. + }
  10866. err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
  10867. if (err) {
  10868. @@ -389,8 +703,12 @@
  10869. #endif
  10870. synchronize_rcu();
  10871. + __cpu_unplug_wait(cpu);
  10872. smpboot_park_threads(cpu);
  10873. + /* Notifiers are done. Don't let any more tasks pin this CPU. */
  10874. + cpu_unplug_sync(cpu);
  10875. +
  10876. /*
  10877. * So now all preempt/rcu users must observe !cpu_active().
  10878. */
  10879. @@ -423,9 +741,14 @@
  10880. check_for_tasks(cpu);
  10881. out_release:
  10882. + cpu_unplug_done(cpu);
  10883. +out_cancel:
  10884. cpu_hotplug_done();
  10885. if (!err)
  10886. cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
  10887. +restore_cpus:
  10888. + set_cpus_allowed_ptr(current, cpumask_org);
  10889. + free_cpumask_var(cpumask_org);
  10890. return err;
  10891. }
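The hotplug helpers added above give migrate-disabled code a lightweight way to hold off cpu_down() on its CPU. As the comments on pin_current_cpu()/unpin_current_cpu() require, the pair brackets a preemption-disabled region; a hypothetical sketch:

/* Hypothetical per-cpu section that must not race with cpu_down() on this CPU. */
static void example_pinned_section(void)
{
	preempt_disable();
	pin_current_cpu();	/* waits, or migrates the task away, while this CPU is unplugging */

	/* ... work that must complete on this CPU ... */

	unpin_current_cpu();	/* may wake the sync_unplug thread waiting in cpu_down() */
	preempt_enable();
}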
  10892. diff -Nur linux-3.18.12.orig/kernel/debug/kdb/kdb_io.c linux-3.18.12/kernel/debug/kdb/kdb_io.c
  10893. --- linux-3.18.12.orig/kernel/debug/kdb/kdb_io.c 2015-04-20 14:48:02.000000000 -0500
  10894. +++ linux-3.18.12/kernel/debug/kdb/kdb_io.c 2015-04-26 13:32:22.431684003 -0500
  10895. @@ -554,7 +554,6 @@
  10896. int linecount;
  10897. int colcount;
  10898. int logging, saved_loglevel = 0;
  10899. - int saved_trap_printk;
  10900. int got_printf_lock = 0;
  10901. int retlen = 0;
  10902. int fnd, len;
  10903. @@ -565,8 +564,6 @@
  10904. unsigned long uninitialized_var(flags);
  10905. preempt_disable();
  10906. - saved_trap_printk = kdb_trap_printk;
  10907. - kdb_trap_printk = 0;
  10908. /* Serialize kdb_printf if multiple cpus try to write at once.
  10909. * But if any cpu goes recursive in kdb, just print the output,
  10910. @@ -833,7 +830,6 @@
  10911. } else {
  10912. __release(kdb_printf_lock);
  10913. }
  10914. - kdb_trap_printk = saved_trap_printk;
  10915. preempt_enable();
  10916. return retlen;
  10917. }
  10918. @@ -843,9 +839,11 @@
  10919. va_list ap;
  10920. int r;
  10921. + kdb_trap_printk++;
  10922. va_start(ap, fmt);
  10923. r = vkdb_printf(fmt, ap);
  10924. va_end(ap);
  10925. + kdb_trap_printk--;
  10926. return r;
  10927. }
  10928. diff -Nur linux-3.18.12.orig/kernel/events/core.c linux-3.18.12/kernel/events/core.c
  10929. --- linux-3.18.12.orig/kernel/events/core.c 2015-04-20 14:48:02.000000000 -0500
  10930. +++ linux-3.18.12/kernel/events/core.c 2015-04-26 13:32:22.431684003 -0500
  10931. @@ -6346,6 +6346,7 @@
  10932. hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  10933. hwc->hrtimer.function = perf_swevent_hrtimer;
  10934. + hwc->hrtimer.irqsafe = 1;
  10935. /*
  10936. * Since hrtimers have a fixed rate, we can do a static freq->period
  10937. diff -Nur linux-3.18.12.orig/kernel/events/core.c.orig linux-3.18.12/kernel/events/core.c.orig
  10938. --- linux-3.18.12.orig/kernel/events/core.c.orig 1969-12-31 18:00:00.000000000 -0600
  10939. +++ linux-3.18.12/kernel/events/core.c.orig 2015-04-20 14:48:02.000000000 -0500
  10940. @@ -0,0 +1,8339 @@
  10941. +/*
  10942. + * Performance events core code:
  10943. + *
  10944. + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  10945. + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  10946. + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  10947. + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  10948. + *
  10949. + * For licensing details see kernel-base/COPYING
  10950. + */
  10951. +
  10952. +#include <linux/fs.h>
  10953. +#include <linux/mm.h>
  10954. +#include <linux/cpu.h>
  10955. +#include <linux/smp.h>
  10956. +#include <linux/idr.h>
  10957. +#include <linux/file.h>
  10958. +#include <linux/poll.h>
  10959. +#include <linux/slab.h>
  10960. +#include <linux/hash.h>
  10961. +#include <linux/tick.h>
  10962. +#include <linux/sysfs.h>
  10963. +#include <linux/dcache.h>
  10964. +#include <linux/percpu.h>
  10965. +#include <linux/ptrace.h>
  10966. +#include <linux/reboot.h>
  10967. +#include <linux/vmstat.h>
  10968. +#include <linux/device.h>
  10969. +#include <linux/export.h>
  10970. +#include <linux/vmalloc.h>
  10971. +#include <linux/hardirq.h>
  10972. +#include <linux/rculist.h>
  10973. +#include <linux/uaccess.h>
  10974. +#include <linux/syscalls.h>
  10975. +#include <linux/anon_inodes.h>
  10976. +#include <linux/kernel_stat.h>
  10977. +#include <linux/perf_event.h>
  10978. +#include <linux/ftrace_event.h>
  10979. +#include <linux/hw_breakpoint.h>
  10980. +#include <linux/mm_types.h>
  10981. +#include <linux/cgroup.h>
  10982. +#include <linux/module.h>
  10983. +#include <linux/mman.h>
  10984. +#include <linux/compat.h>
  10985. +
  10986. +#include "internal.h"
  10987. +
  10988. +#include <asm/irq_regs.h>
  10989. +
  10990. +static struct workqueue_struct *perf_wq;
  10991. +
  10992. +struct remote_function_call {
  10993. + struct task_struct *p;
  10994. + int (*func)(void *info);
  10995. + void *info;
  10996. + int ret;
  10997. +};
  10998. +
  10999. +static void remote_function(void *data)
  11000. +{
  11001. + struct remote_function_call *tfc = data;
  11002. + struct task_struct *p = tfc->p;
  11003. +
  11004. + if (p) {
  11005. + tfc->ret = -EAGAIN;
  11006. + if (task_cpu(p) != smp_processor_id() || !task_curr(p))
  11007. + return;
  11008. + }
  11009. +
  11010. + tfc->ret = tfc->func(tfc->info);
  11011. +}
  11012. +
  11013. +/**
  11014. + * task_function_call - call a function on the cpu on which a task runs
  11015. + * @p: the task to evaluate
  11016. + * @func: the function to be called
  11017. + * @info: the function call argument
  11018. + *
  11019. + * Calls the function @func when the task is currently running. This might
  11020. + * be on the current CPU, which just calls the function directly
  11021. + *
  11022. + * returns: @func return value, or
  11023. + * -ESRCH - when the process isn't running
  11024. + * -EAGAIN - when the process moved away
  11025. + */
  11026. +static int
  11027. +task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  11028. +{
  11029. + struct remote_function_call data = {
  11030. + .p = p,
  11031. + .func = func,
  11032. + .info = info,
  11033. + .ret = -ESRCH, /* No such (running) process */
  11034. + };
  11035. +
  11036. + if (task_curr(p))
  11037. + smp_call_function_single(task_cpu(p), remote_function, &data, 1);
  11038. +
  11039. + return data.ret;
  11040. +}
  11041. +
  11042. +/**
  11043. + * cpu_function_call - call a function on the cpu
  11044. + * @func: the function to be called
  11045. + * @info: the function call argument
  11046. + *
  11047. + * Calls the function @func on the remote cpu.
  11048. + *
  11049. + * returns: @func return value or -ENXIO when the cpu is offline
  11050. + */
  11051. +static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
  11052. +{
  11053. + struct remote_function_call data = {
  11054. + .p = NULL,
  11055. + .func = func,
  11056. + .info = info,
  11057. + .ret = -ENXIO, /* No such CPU */
  11058. + };
  11059. +
  11060. + smp_call_function_single(cpu, remote_function, &data, 1);
  11061. +
  11062. + return data.ret;
  11063. +}
  11064. +
  11065. +#define EVENT_OWNER_KERNEL ((void *) -1)
  11066. +
  11067. +static bool is_kernel_event(struct perf_event *event)
  11068. +{
  11069. + return event->owner == EVENT_OWNER_KERNEL;
  11070. +}
  11071. +
  11072. +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
  11073. + PERF_FLAG_FD_OUTPUT |\
  11074. + PERF_FLAG_PID_CGROUP |\
  11075. + PERF_FLAG_FD_CLOEXEC)
  11076. +
  11077. +/*
  11078. + * branch priv levels that need permission checks
  11079. + */
  11080. +#define PERF_SAMPLE_BRANCH_PERM_PLM \
  11081. + (PERF_SAMPLE_BRANCH_KERNEL |\
  11082. + PERF_SAMPLE_BRANCH_HV)
  11083. +
  11084. +enum event_type_t {
  11085. + EVENT_FLEXIBLE = 0x1,
  11086. + EVENT_PINNED = 0x2,
  11087. + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
  11088. +};
  11089. +
  11090. +/*
  11091. + * perf_sched_events : >0 events exist
  11092. + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  11093. + */
  11094. +struct static_key_deferred perf_sched_events __read_mostly;
  11095. +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
  11096. +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
  11097. +
  11098. +static atomic_t nr_mmap_events __read_mostly;
  11099. +static atomic_t nr_comm_events __read_mostly;
  11100. +static atomic_t nr_task_events __read_mostly;
  11101. +static atomic_t nr_freq_events __read_mostly;
  11102. +
  11103. +static LIST_HEAD(pmus);
  11104. +static DEFINE_MUTEX(pmus_lock);
  11105. +static struct srcu_struct pmus_srcu;
  11106. +
  11107. +/*
  11108. + * perf event paranoia level:
  11109. + * -1 - not paranoid at all
  11110. + * 0 - disallow raw tracepoint access for unpriv
  11111. + * 1 - disallow cpu events for unpriv
  11112. + * 2 - disallow kernel profiling for unpriv
  11113. + */
  11114. +int sysctl_perf_event_paranoid __read_mostly = 1;
  11115. +
  11116. +/* Minimum for 512 kiB + 1 user control page */
  11117. +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
  11118. +
  11119. +/*
  11120. + * max perf event sample rate
  11121. + */
  11122. +#define DEFAULT_MAX_SAMPLE_RATE 100000
  11123. +#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
  11124. +#define DEFAULT_CPU_TIME_MAX_PERCENT 25
  11125. +
  11126. +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
  11127. +
  11128. +static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
  11129. +static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
  11130. +
  11131. +static int perf_sample_allowed_ns __read_mostly =
  11132. + DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  11133. +
  11134. +void update_perf_cpu_limits(void)
  11135. +{
  11136. + u64 tmp = perf_sample_period_ns;
  11137. +
  11138. + tmp *= sysctl_perf_cpu_time_max_percent;
  11139. + do_div(tmp, 100);
  11140. + ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
  11141. +}
  11142. +
  11143. +static int perf_rotate_context(struct perf_cpu_context *cpuctx);
  11144. +
  11145. +int perf_proc_update_handler(struct ctl_table *table, int write,
  11146. + void __user *buffer, size_t *lenp,
  11147. + loff_t *ppos)
  11148. +{
  11149. + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  11150. +
  11151. + if (ret || !write)
  11152. + return ret;
  11153. +
  11154. + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
  11155. + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
  11156. + update_perf_cpu_limits();
  11157. +
  11158. + return 0;
  11159. +}
  11160. +
  11161. +int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
  11162. +
  11163. +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  11164. + void __user *buffer, size_t *lenp,
  11165. + loff_t *ppos)
  11166. +{
  11167. + int ret = proc_dointvec(table, write, buffer, lenp, ppos);
  11168. +
  11169. + if (ret || !write)
  11170. + return ret;
  11171. +
  11172. + update_perf_cpu_limits();
  11173. +
  11174. + return 0;
  11175. +}
  11176. +
  11177. +/*
  11178. + * perf samples are done in some very critical code paths (NMIs).
  11179. + * If they take too much CPU time, the system can lock up and not
  11180. + * get any real work done. This will drop the sample rate when
  11181. + * we detect that events are taking too long.
  11182. + */
  11183. +#define NR_ACCUMULATED_SAMPLES 128
  11184. +static DEFINE_PER_CPU(u64, running_sample_length);
  11185. +
  11186. +static void perf_duration_warn(struct irq_work *w)
  11187. +{
  11188. + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
  11189. + u64 avg_local_sample_len;
  11190. + u64 local_samples_len;
  11191. +
  11192. + local_samples_len = __this_cpu_read(running_sample_length);
  11193. + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
  11194. +
  11195. + printk_ratelimited(KERN_WARNING
  11196. + "perf interrupt took too long (%lld > %lld), lowering "
  11197. + "kernel.perf_event_max_sample_rate to %d\n",
  11198. + avg_local_sample_len, allowed_ns >> 1,
  11199. + sysctl_perf_event_sample_rate);
  11200. +}
  11201. +
  11202. +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
  11203. +
  11204. +void perf_sample_event_took(u64 sample_len_ns)
  11205. +{
  11206. + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
  11207. + u64 avg_local_sample_len;
  11208. + u64 local_samples_len;
  11209. +
  11210. + if (allowed_ns == 0)
  11211. + return;
  11212. +
  11213. + /* decay the counter by 1 average sample */
  11214. + local_samples_len = __this_cpu_read(running_sample_length);
  11215. + local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
  11216. + local_samples_len += sample_len_ns;
  11217. + __this_cpu_write(running_sample_length, local_samples_len);
  11218. +
  11219. + /*
  11220. + * note: this will be biased artifically low until we have
  11221. + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
  11222. + * from having to maintain a count.
  11223. + */
  11224. + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
  11225. +
  11226. + if (avg_local_sample_len <= allowed_ns)
  11227. + return;
  11228. +
  11229. + if (max_samples_per_tick <= 1)
  11230. + return;
  11231. +
  11232. + max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
  11233. + sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
  11234. + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
  11235. +
  11236. + update_perf_cpu_limits();
  11237. +
  11238. + if (!irq_work_queue(&perf_duration_work)) {
  11239. + early_printk("perf interrupt took too long (%lld > %lld), lowering "
  11240. + "kernel.perf_event_max_sample_rate to %d\n",
  11241. + avg_local_sample_len, allowed_ns >> 1,
  11242. + sysctl_perf_event_sample_rate);
  11243. + }
  11244. +}
  11245. +
  11246. +static atomic64_t perf_event_id;
  11247. +
  11248. +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
  11249. + enum event_type_t event_type);
  11250. +
  11251. +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  11252. + enum event_type_t event_type,
  11253. + struct task_struct *task);
  11254. +
  11255. +static void update_context_time(struct perf_event_context *ctx);
  11256. +static u64 perf_event_time(struct perf_event *event);
  11257. +
  11258. +void __weak perf_event_print_debug(void) { }
  11259. +
  11260. +extern __weak const char *perf_pmu_name(void)
  11261. +{
  11262. + return "pmu";
  11263. +}
  11264. +
  11265. +static inline u64 perf_clock(void)
  11266. +{
  11267. + return local_clock();
  11268. +}
  11269. +
  11270. +static inline struct perf_cpu_context *
  11271. +__get_cpu_context(struct perf_event_context *ctx)
  11272. +{
  11273. + return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
  11274. +}
  11275. +
  11276. +static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
  11277. + struct perf_event_context *ctx)
  11278. +{
  11279. + raw_spin_lock(&cpuctx->ctx.lock);
  11280. + if (ctx)
  11281. + raw_spin_lock(&ctx->lock);
  11282. +}
  11283. +
  11284. +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
  11285. + struct perf_event_context *ctx)
  11286. +{
  11287. + if (ctx)
  11288. + raw_spin_unlock(&ctx->lock);
  11289. + raw_spin_unlock(&cpuctx->ctx.lock);
  11290. +}
  11291. +
  11292. +#ifdef CONFIG_CGROUP_PERF
  11293. +
  11294. +/*
  11295. + * perf_cgroup_info keeps track of time_enabled for a cgroup.
  11296. + * This is a per-cpu dynamically allocated data structure.
  11297. + */
  11298. +struct perf_cgroup_info {
  11299. + u64 time;
  11300. + u64 timestamp;
  11301. +};
  11302. +
  11303. +struct perf_cgroup {
  11304. + struct cgroup_subsys_state css;
  11305. + struct perf_cgroup_info __percpu *info;
  11306. +};
  11307. +
  11308. +/*
  11309. + * Must ensure cgroup is pinned (css_get) before calling
  11310. + * this function. In other words, we cannot call this function
  11311. + * if there is no cgroup event for the current CPU context.
  11312. + */
  11313. +static inline struct perf_cgroup *
  11314. +perf_cgroup_from_task(struct task_struct *task)
  11315. +{
  11316. + return container_of(task_css(task, perf_event_cgrp_id),
  11317. + struct perf_cgroup, css);
  11318. +}
  11319. +
  11320. +static inline bool
  11321. +perf_cgroup_match(struct perf_event *event)
  11322. +{
  11323. + struct perf_event_context *ctx = event->ctx;
  11324. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  11325. +
  11326. + /* @event doesn't care about cgroup */
  11327. + if (!event->cgrp)
  11328. + return true;
  11329. +
  11330. + /* wants specific cgroup scope but @cpuctx isn't associated with any */
  11331. + if (!cpuctx->cgrp)
  11332. + return false;
  11333. +
  11334. + /*
  11335. + * Cgroup scoping is recursive. An event enabled for a cgroup is
  11336. + * also enabled for all its descendant cgroups. If @cpuctx's
  11337. + * cgroup is a descendant of @event's (the test covers identity
  11338. + * case), it's a match.
  11339. + */
  11340. + return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
  11341. + event->cgrp->css.cgroup);
  11342. +}
  11343. +
  11344. +static inline void perf_detach_cgroup(struct perf_event *event)
  11345. +{
  11346. + css_put(&event->cgrp->css);
  11347. + event->cgrp = NULL;
  11348. +}
  11349. +
  11350. +static inline int is_cgroup_event(struct perf_event *event)
  11351. +{
  11352. + return event->cgrp != NULL;
  11353. +}
  11354. +
  11355. +static inline u64 perf_cgroup_event_time(struct perf_event *event)
  11356. +{
  11357. + struct perf_cgroup_info *t;
  11358. +
  11359. + t = per_cpu_ptr(event->cgrp->info, event->cpu);
  11360. + return t->time;
  11361. +}
  11362. +
  11363. +static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
  11364. +{
  11365. + struct perf_cgroup_info *info;
  11366. + u64 now;
  11367. +
  11368. + now = perf_clock();
  11369. +
  11370. + info = this_cpu_ptr(cgrp->info);
  11371. +
  11372. + info->time += now - info->timestamp;
  11373. + info->timestamp = now;
  11374. +}
  11375. +
  11376. +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
  11377. +{
  11378. + struct perf_cgroup *cgrp_out = cpuctx->cgrp;
  11379. + if (cgrp_out)
  11380. + __update_cgrp_time(cgrp_out);
  11381. +}
  11382. +
  11383. +static inline void update_cgrp_time_from_event(struct perf_event *event)
  11384. +{
  11385. + struct perf_cgroup *cgrp;
  11386. +
  11387. + /*
  11388. + * ensure we access cgroup data only when needed and
  11389. + * when we know the cgroup is pinned (css_get)
  11390. + */
  11391. + if (!is_cgroup_event(event))
  11392. + return;
  11393. +
  11394. + cgrp = perf_cgroup_from_task(current);
  11395. + /*
  11396. + * Do not update time when cgroup is not active
  11397. + */
  11398. + if (cgrp == event->cgrp)
  11399. + __update_cgrp_time(event->cgrp);
  11400. +}
  11401. +
  11402. +static inline void
  11403. +perf_cgroup_set_timestamp(struct task_struct *task,
  11404. + struct perf_event_context *ctx)
  11405. +{
  11406. + struct perf_cgroup *cgrp;
  11407. + struct perf_cgroup_info *info;
  11408. +
  11409. + /*
  11410. + * ctx->lock held by caller
  11411. + * ensure we do not access cgroup data
  11412. + * unless we have the cgroup pinned (css_get)
  11413. + */
  11414. + if (!task || !ctx->nr_cgroups)
  11415. + return;
  11416. +
  11417. + cgrp = perf_cgroup_from_task(task);
  11418. + info = this_cpu_ptr(cgrp->info);
  11419. + info->timestamp = ctx->timestamp;
  11420. +}
  11421. +
  11422. +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
  11423. +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
  11424. +
  11425. +/*
  11426. + * reschedule events based on the cgroup constraint of task.
  11427. + *
  11428. + * mode SWOUT : schedule out everything
  11429. + * mode SWIN : schedule in based on cgroup for next
  11430. + */
  11431. +void perf_cgroup_switch(struct task_struct *task, int mode)
  11432. +{
  11433. + struct perf_cpu_context *cpuctx;
  11434. + struct pmu *pmu;
  11435. + unsigned long flags;
  11436. +
  11437. + /*
  11438. + * disable interrupts to avoid getting nr_cgroup
  11439. + * changes via __perf_event_disable(). Also
  11440. + * avoids preemption.
  11441. + */
  11442. + local_irq_save(flags);
  11443. +
  11444. + /*
  11445. + * we reschedule only in the presence of cgroup
  11446. + * constrained events.
  11447. + */
  11448. + rcu_read_lock();
  11449. +
  11450. + list_for_each_entry_rcu(pmu, &pmus, entry) {
  11451. + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
  11452. + if (cpuctx->unique_pmu != pmu)
  11453. + continue; /* ensure we process each cpuctx once */
  11454. +
  11455. + /*
  11456. + * perf_cgroup_events says at least one
  11457. + * context on this CPU has cgroup events.
  11458. + *
  11459. + * ctx->nr_cgroups reports the number of cgroup
  11460. + * events for a context.
  11461. + */
  11462. + if (cpuctx->ctx.nr_cgroups > 0) {
  11463. + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
  11464. + perf_pmu_disable(cpuctx->ctx.pmu);
  11465. +
  11466. + if (mode & PERF_CGROUP_SWOUT) {
  11467. + cpu_ctx_sched_out(cpuctx, EVENT_ALL);
  11468. + /*
  11469. + * must not be done before ctxswout due
  11470. + * to event_filter_match() in event_sched_out()
  11471. + */
  11472. + cpuctx->cgrp = NULL;
  11473. + }
  11474. +
  11475. + if (mode & PERF_CGROUP_SWIN) {
  11476. + WARN_ON_ONCE(cpuctx->cgrp);
  11477. + /*
  11478. + * set cgrp before ctxsw in to allow
  11479. + * event_filter_match() to not have to pass
  11480. + * task around
  11481. + */
  11482. + cpuctx->cgrp = perf_cgroup_from_task(task);
  11483. + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
  11484. + }
  11485. + perf_pmu_enable(cpuctx->ctx.pmu);
  11486. + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  11487. + }
  11488. + }
  11489. +
  11490. + rcu_read_unlock();
  11491. +
  11492. + local_irq_restore(flags);
  11493. +}
  11494. +
  11495. +static inline void perf_cgroup_sched_out(struct task_struct *task,
  11496. + struct task_struct *next)
  11497. +{
  11498. + struct perf_cgroup *cgrp1;
  11499. + struct perf_cgroup *cgrp2 = NULL;
  11500. +
  11501. + /*
  11502. + * we come here when we know perf_cgroup_events > 0
  11503. + */
  11504. + cgrp1 = perf_cgroup_from_task(task);
  11505. +
  11506. + /*
  11507. + * next is NULL when called from perf_event_enable_on_exec()
  11508. + * that will systematically cause a cgroup_switch()
  11509. + */
  11510. + if (next)
  11511. + cgrp2 = perf_cgroup_from_task(next);
  11512. +
  11513. + /*
  11514. + * only schedule out current cgroup events if we know
  11515. + * that we are switching to a different cgroup. Otherwise,
  11516. + * do not touch the cgroup events.
  11517. + */
  11518. + if (cgrp1 != cgrp2)
  11519. + perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
  11520. +}
  11521. +
  11522. +static inline void perf_cgroup_sched_in(struct task_struct *prev,
  11523. + struct task_struct *task)
  11524. +{
  11525. + struct perf_cgroup *cgrp1;
  11526. + struct perf_cgroup *cgrp2 = NULL;
  11527. +
  11528. + /*
  11529. + * we come here when we know perf_cgroup_events > 0
  11530. + */
  11531. + cgrp1 = perf_cgroup_from_task(task);
  11532. +
  11533. + /* prev can never be NULL */
  11534. + cgrp2 = perf_cgroup_from_task(prev);
  11535. +
  11536. + /*
  11537. + * only need to schedule in cgroup events if we are changing
  11538. + * cgroup during ctxsw. Cgroup events were not scheduled
  11539. + * out during ctxsw if that was not the case.
  11540. + */
  11541. + if (cgrp1 != cgrp2)
  11542. + perf_cgroup_switch(task, PERF_CGROUP_SWIN);
  11543. +}
  11544. +
  11545. +static inline int perf_cgroup_connect(int fd, struct perf_event *event,
  11546. + struct perf_event_attr *attr,
  11547. + struct perf_event *group_leader)
  11548. +{
  11549. + struct perf_cgroup *cgrp;
  11550. + struct cgroup_subsys_state *css;
  11551. + struct fd f = fdget(fd);
  11552. + int ret = 0;
  11553. +
  11554. + if (!f.file)
  11555. + return -EBADF;
  11556. +
  11557. + css = css_tryget_online_from_dir(f.file->f_dentry,
  11558. + &perf_event_cgrp_subsys);
  11559. + if (IS_ERR(css)) {
  11560. + ret = PTR_ERR(css);
  11561. + goto out;
  11562. + }
  11563. +
  11564. + cgrp = container_of(css, struct perf_cgroup, css);
  11565. + event->cgrp = cgrp;
  11566. +
  11567. + /*
  11568. + * all events in a group must monitor
  11569. + * the same cgroup because a task belongs
  11570. + * to only one perf cgroup at a time
  11571. + */
  11572. + if (group_leader && group_leader->cgrp != cgrp) {
  11573. + perf_detach_cgroup(event);
  11574. + ret = -EINVAL;
  11575. + }
  11576. +out:
  11577. + fdput(f);
  11578. + return ret;
  11579. +}
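perf_cgroup_connect() is reached from perf_event_open() when user space passes PERF_FLAG_PID_CGROUP, in which case the pid argument carries a file descriptor for a cgroup directory instead of a task. A hedged user-space sketch of that call (the cgroup path is hypothetical and error handling is minimal):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int cgroup_fd, fd;

	/* hypothetical perf_event cgroup created beforehand by the admin */
	cgroup_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgroup_fd < 0) {
		perror("open cgroup");
		return 1;
	}

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* cgroup events are per-CPU: pid carries the cgroup fd, cpu must be >= 0 */
	fd = sys_perf_event_open(&attr, cgroup_fd, 0, -1, PERF_FLAG_PID_CGROUP);
	if (fd < 0)
		perror("perf_event_open");
	return fd < 0;
}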
  11580. +
  11581. +static inline void
  11582. +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
  11583. +{
  11584. + struct perf_cgroup_info *t;
  11585. + t = per_cpu_ptr(event->cgrp->info, event->cpu);
  11586. + event->shadow_ctx_time = now - t->timestamp;
  11587. +}
  11588. +
  11589. +static inline void
  11590. +perf_cgroup_defer_enabled(struct perf_event *event)
  11591. +{
  11592. + /*
  11593. + * when the current task's perf cgroup does not match
  11594. + * the event's, we need to remember to call the
  11595. + * perf_mark_enable() function the first time a task with
  11596. + * a matching perf cgroup is scheduled in.
  11597. + */
  11598. + if (is_cgroup_event(event) && !perf_cgroup_match(event))
  11599. + event->cgrp_defer_enabled = 1;
  11600. +}
  11601. +
  11602. +static inline void
  11603. +perf_cgroup_mark_enabled(struct perf_event *event,
  11604. + struct perf_event_context *ctx)
  11605. +{
  11606. + struct perf_event *sub;
  11607. + u64 tstamp = perf_event_time(event);
  11608. +
  11609. + if (!event->cgrp_defer_enabled)
  11610. + return;
  11611. +
  11612. + event->cgrp_defer_enabled = 0;
  11613. +
  11614. + event->tstamp_enabled = tstamp - event->total_time_enabled;
  11615. + list_for_each_entry(sub, &event->sibling_list, group_entry) {
  11616. + if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
  11617. + sub->tstamp_enabled = tstamp - sub->total_time_enabled;
  11618. + sub->cgrp_defer_enabled = 0;
  11619. + }
  11620. + }
  11621. +}
  11622. +#else /* !CONFIG_CGROUP_PERF */
  11623. +
  11624. +static inline bool
  11625. +perf_cgroup_match(struct perf_event *event)
  11626. +{
  11627. + return true;
  11628. +}
  11629. +
  11630. +static inline void perf_detach_cgroup(struct perf_event *event)
  11631. +{}
  11632. +
  11633. +static inline int is_cgroup_event(struct perf_event *event)
  11634. +{
  11635. + return 0;
  11636. +}
  11637. +
  11638. +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
  11639. +{
  11640. + return 0;
  11641. +}
  11642. +
  11643. +static inline void update_cgrp_time_from_event(struct perf_event *event)
  11644. +{
  11645. +}
  11646. +
  11647. +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
  11648. +{
  11649. +}
  11650. +
  11651. +static inline void perf_cgroup_sched_out(struct task_struct *task,
  11652. + struct task_struct *next)
  11653. +{
  11654. +}
  11655. +
  11656. +static inline void perf_cgroup_sched_in(struct task_struct *prev,
  11657. + struct task_struct *task)
  11658. +{
  11659. +}
  11660. +
  11661. +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
  11662. + struct perf_event_attr *attr,
  11663. + struct perf_event *group_leader)
  11664. +{
  11665. + return -EINVAL;
  11666. +}
  11667. +
  11668. +static inline void
  11669. +perf_cgroup_set_timestamp(struct task_struct *task,
  11670. + struct perf_event_context *ctx)
  11671. +{
  11672. +}
  11673. +
  11674. +void
  11675. +perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
  11676. +{
  11677. +}
  11678. +
  11679. +static inline void
  11680. +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
  11681. +{
  11682. +}
  11683. +
  11684. +static inline u64 perf_cgroup_event_time(struct perf_event *event)
  11685. +{
  11686. + return 0;
  11687. +}
  11688. +
  11689. +static inline void
  11690. +perf_cgroup_defer_enabled(struct perf_event *event)
  11691. +{
  11692. +}
  11693. +
  11694. +static inline void
  11695. +perf_cgroup_mark_enabled(struct perf_event *event,
  11696. + struct perf_event_context *ctx)
  11697. +{
  11698. +}
  11699. +#endif
  11700. +
  11701. +/*
  11702. + * set the default to be dependent on the timer tick, just
  11703. + * like the original code
  11704. + */
  11705. +#define PERF_CPU_HRTIMER (1000 / HZ)
  11706. +/*
  11707. + * function must be called with interrupts disabled
  11708. + */
  11709. +static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
  11710. +{
  11711. + struct perf_cpu_context *cpuctx;
  11712. + enum hrtimer_restart ret = HRTIMER_NORESTART;
  11713. + int rotations = 0;
  11714. +
  11715. + WARN_ON(!irqs_disabled());
  11716. +
  11717. + cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
  11718. +
  11719. + rotations = perf_rotate_context(cpuctx);
  11720. +
  11721. + /*
  11722. + * arm timer if needed
  11723. + */
  11724. + if (rotations) {
  11725. + hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
  11726. + ret = HRTIMER_RESTART;
  11727. + }
  11728. +
  11729. + return ret;
  11730. +}
  11731. +
  11732. +/* CPU is going down */
  11733. +void perf_cpu_hrtimer_cancel(int cpu)
  11734. +{
  11735. + struct perf_cpu_context *cpuctx;
  11736. + struct pmu *pmu;
  11737. + unsigned long flags;
  11738. +
  11739. + if (WARN_ON(cpu != smp_processor_id()))
  11740. + return;
  11741. +
  11742. + local_irq_save(flags);
  11743. +
  11744. + rcu_read_lock();
  11745. +
  11746. + list_for_each_entry_rcu(pmu, &pmus, entry) {
  11747. + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
  11748. +
  11749. + if (pmu->task_ctx_nr == perf_sw_context)
  11750. + continue;
  11751. +
  11752. + hrtimer_cancel(&cpuctx->hrtimer);
  11753. + }
  11754. +
  11755. + rcu_read_unlock();
  11756. +
  11757. + local_irq_restore(flags);
  11758. +}
  11759. +
  11760. +static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
  11761. +{
  11762. + struct hrtimer *hr = &cpuctx->hrtimer;
  11763. + struct pmu *pmu = cpuctx->ctx.pmu;
  11764. + int timer;
  11765. +
  11766. + /* no multiplexing needed for SW PMU */
  11767. + if (pmu->task_ctx_nr == perf_sw_context)
  11768. + return;
  11769. +
  11770. + /*
  11771. + * check default is sane, if not set then force to
  11772. + * default interval (1/tick)
  11773. + */
  11774. + timer = pmu->hrtimer_interval_ms;
  11775. + if (timer < 1)
  11776. + timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
  11777. +
  11778. + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  11779. +
  11780. + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
  11781. + hr->function = perf_cpu_hrtimer_handler;
  11782. +}
  11783. +
  11784. +static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
  11785. +{
  11786. + struct hrtimer *hr = &cpuctx->hrtimer;
  11787. + struct pmu *pmu = cpuctx->ctx.pmu;
  11788. +
  11789. + /* not for SW PMU */
  11790. + if (pmu->task_ctx_nr == perf_sw_context)
  11791. + return;
  11792. +
  11793. + if (hrtimer_active(hr))
  11794. + return;
  11795. +
  11796. + if (!hrtimer_callback_running(hr))
  11797. + __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
  11798. + 0, HRTIMER_MODE_REL_PINNED, 0);
  11799. +}
  11800. +
  11801. +void perf_pmu_disable(struct pmu *pmu)
  11802. +{
  11803. + int *count = this_cpu_ptr(pmu->pmu_disable_count);
  11804. + if (!(*count)++)
  11805. + pmu->pmu_disable(pmu);
  11806. +}
  11807. +
  11808. +void perf_pmu_enable(struct pmu *pmu)
  11809. +{
  11810. + int *count = this_cpu_ptr(pmu->pmu_disable_count);
  11811. + if (!--(*count))
  11812. + pmu->pmu_enable(pmu);
  11813. +}
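perf_pmu_disable()/perf_pmu_enable() form a nestable pair: only the outermost disable and the matching outermost enable actually touch the hardware. A small user-space model of that counting scheme (a plain global stands in for the per-CPU pmu_disable_count; this is an illustration, not kernel code):

#include <stdio.h>

static int disable_count;

static void hw_disable(void) { printf("hardware disabled\n"); }
static void hw_enable(void)  { printf("hardware enabled\n"); }

static void pmu_disable(void)
{
	if (!disable_count++)	/* only the outermost disable touches the hardware */
		hw_disable();
}

static void pmu_enable(void)
{
	if (!--disable_count)	/* only the matching outermost enable re-enables it */
		hw_enable();
}

int main(void)
{
	pmu_disable();		/* hardware disabled */
	pmu_disable();		/* nested: no-op */
	pmu_enable();		/* nested: no-op */
	pmu_enable();		/* hardware enabled again */
	return 0;
}

This lets callers bracket arbitrary critical sections with disable/enable without caring whether an outer caller already disabled the PMU.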
  11814. +
  11815. +static DEFINE_PER_CPU(struct list_head, rotation_list);
  11816. +
  11817. +/*
  11818. + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
  11819. + * because they're strictly cpu affine and rotate_start is called with IRQs
  11820. + * disabled, while rotate_context is called from IRQ context.
  11821. + */
  11822. +static void perf_pmu_rotate_start(struct pmu *pmu)
  11823. +{
  11824. + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
  11825. + struct list_head *head = this_cpu_ptr(&rotation_list);
  11826. +
  11827. + WARN_ON(!irqs_disabled());
  11828. +
  11829. + if (list_empty(&cpuctx->rotation_list))
  11830. + list_add(&cpuctx->rotation_list, head);
  11831. +}
  11832. +
  11833. +static void get_ctx(struct perf_event_context *ctx)
  11834. +{
  11835. + WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
  11836. +}
  11837. +
  11838. +static void put_ctx(struct perf_event_context *ctx)
  11839. +{
  11840. + if (atomic_dec_and_test(&ctx->refcount)) {
  11841. + if (ctx->parent_ctx)
  11842. + put_ctx(ctx->parent_ctx);
  11843. + if (ctx->task)
  11844. + put_task_struct(ctx->task);
  11845. + kfree_rcu(ctx, rcu_head);
  11846. + }
  11847. +}
  11848. +
  11849. +/*
  11850. + * This must be done under the ctx->lock, so as to serialize against
  11851. + * context_equiv(), therefore we cannot call put_ctx() since that might end up
  11852. + * calling scheduler related locks and ctx->lock nests inside those.
  11853. + */
  11854. +static __must_check struct perf_event_context *
  11855. +unclone_ctx(struct perf_event_context *ctx)
  11856. +{
  11857. + struct perf_event_context *parent_ctx = ctx->parent_ctx;
  11858. +
  11859. + lockdep_assert_held(&ctx->lock);
  11860. +
  11861. + if (parent_ctx)
  11862. + ctx->parent_ctx = NULL;
  11863. + ctx->generation++;
  11864. +
  11865. + return parent_ctx;
  11866. +}
  11867. +
  11868. +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
  11869. +{
  11870. + /*
  11871. + * only top level events have the pid namespace they were created in
  11872. + */
  11873. + if (event->parent)
  11874. + event = event->parent;
  11875. +
  11876. + return task_tgid_nr_ns(p, event->ns);
  11877. +}
  11878. +
  11879. +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
  11880. +{
  11881. + /*
  11882. + * only top level events have the pid namespace they were created in
  11883. + */
  11884. + if (event->parent)
  11885. + event = event->parent;
  11886. +
  11887. + return task_pid_nr_ns(p, event->ns);
  11888. +}
  11889. +
  11890. +/*
  11891. + * If we inherit events we want to return the parent event id
  11892. + * to userspace.
  11893. + */
  11894. +static u64 primary_event_id(struct perf_event *event)
  11895. +{
  11896. + u64 id = event->id;
  11897. +
  11898. + if (event->parent)
  11899. + id = event->parent->id;
  11900. +
  11901. + return id;
  11902. +}
  11903. +
  11904. +/*
  11905. + * Get the perf_event_context for a task and lock it.
  11906. + * This has to cope with the fact that until it is locked,
  11907. + * the context could get moved to another task.
  11908. + */
  11909. +static struct perf_event_context *
  11910. +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
  11911. +{
  11912. + struct perf_event_context *ctx;
  11913. +
  11914. +retry:
  11915. + /*
  11916. + * One of the few rules of preemptible RCU is that one cannot do
  11917. + * rcu_read_unlock() while holding a scheduler (or nested) lock when
  11918. + * part of the read side critical section was preemptible -- see
  11919. + * rcu_read_unlock_special().
  11920. + *
  11921. + * Since ctx->lock nests under rq->lock we must ensure the entire read
  11922. + * side critical section is non-preemptible.
  11923. + */
  11924. + preempt_disable();
  11925. + rcu_read_lock();
  11926. + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
  11927. + if (ctx) {
  11928. + /*
  11929. + * If this context is a clone of another, it might
  11930. + * get swapped for another underneath us by
  11931. + * perf_event_task_sched_out, though the
  11932. + * rcu_read_lock() protects us from any context
  11933. + * getting freed. Lock the context and check if it
  11934. + * got swapped before we could get the lock, and retry
  11935. + * if so. If we locked the right context, then it
  11936. + * can't get swapped on us any more.
  11937. + */
  11938. + raw_spin_lock_irqsave(&ctx->lock, *flags);
  11939. + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
  11940. + raw_spin_unlock_irqrestore(&ctx->lock, *flags);
  11941. + rcu_read_unlock();
  11942. + preempt_enable();
  11943. + goto retry;
  11944. + }
  11945. +
  11946. + if (!atomic_inc_not_zero(&ctx->refcount)) {
  11947. + raw_spin_unlock_irqrestore(&ctx->lock, *flags);
  11948. + ctx = NULL;
  11949. + }
  11950. + }
  11951. + rcu_read_unlock();
  11952. + preempt_enable();
  11953. + return ctx;
  11954. +}
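Stripped of the RCU and preemption details, perf_lock_task_context() is the classic "lock, then revalidate, then retry" idiom for locking an object that a shared pointer may stop referring to at any moment. A user-space analogue using pthreads and GCC/Clang atomic builtins (the two context objects and the pointer swap are hypothetical; only the control flow matters):

#include <pthread.h>
#include <stdio.h>

struct ctx {
	pthread_mutex_t lock;
	int data;
};

static struct ctx ctx_a = { PTHREAD_MUTEX_INITIALIZER, 1 };
static struct ctx ctx_b = { PTHREAD_MUTEX_INITIALIZER, 2 };
static struct ctx *current_ctx = &ctx_a;	/* may be retargeted by another thread */

/*
 * Lock whatever context the shared pointer refers to right now:
 * take the candidate's lock, then re-check the pointer; if it moved
 * underneath us, drop the lock and try again.
 */
static struct ctx *lock_current_ctx(void)
{
	struct ctx *c;

retry:
	c = __atomic_load_n(&current_ctx, __ATOMIC_ACQUIRE);
	pthread_mutex_lock(&c->lock);
	if (c != __atomic_load_n(&current_ctx, __ATOMIC_ACQUIRE)) {
		pthread_mutex_unlock(&c->lock);
		goto retry;
	}
	return c;	/* locked and guaranteed still current */
}

int main(void)
{
	struct ctx *c;

	/* pretend some other thread retargeted the shared pointer */
	__atomic_store_n(&current_ctx, &ctx_b, __ATOMIC_RELEASE);

	c = lock_current_ctx();
	printf("locked context with data %d\n", c->data);
	pthread_mutex_unlock(&c->lock);
	return 0;
}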
  11955. +
  11956. +/*
  11957. + * Get the context for a task and increment its pin_count so it
  11958. + * can't get swapped to another task. This also increments its
  11959. + * reference count so that the context can't get freed.
  11960. + */
  11961. +static struct perf_event_context *
  11962. +perf_pin_task_context(struct task_struct *task, int ctxn)
  11963. +{
  11964. + struct perf_event_context *ctx;
  11965. + unsigned long flags;
  11966. +
  11967. + ctx = perf_lock_task_context(task, ctxn, &flags);
  11968. + if (ctx) {
  11969. + ++ctx->pin_count;
  11970. + raw_spin_unlock_irqrestore(&ctx->lock, flags);
  11971. + }
  11972. + return ctx;
  11973. +}
  11974. +
  11975. +static void perf_unpin_context(struct perf_event_context *ctx)
  11976. +{
  11977. + unsigned long flags;
  11978. +
  11979. + raw_spin_lock_irqsave(&ctx->lock, flags);
  11980. + --ctx->pin_count;
  11981. + raw_spin_unlock_irqrestore(&ctx->lock, flags);
  11982. +}
  11983. +
  11984. +/*
  11985. + * Update the record of the current time in a context.
  11986. + */
  11987. +static void update_context_time(struct perf_event_context *ctx)
  11988. +{
  11989. + u64 now = perf_clock();
  11990. +
  11991. + ctx->time += now - ctx->timestamp;
  11992. + ctx->timestamp = now;
  11993. +}
  11994. +
  11995. +static u64 perf_event_time(struct perf_event *event)
  11996. +{
  11997. + struct perf_event_context *ctx = event->ctx;
  11998. +
  11999. + if (is_cgroup_event(event))
  12000. + return perf_cgroup_event_time(event);
  12001. +
  12002. + return ctx ? ctx->time : 0;
  12003. +}
  12004. +
  12005. +/*
  12006. + * Update the total_time_enabled and total_time_running fields for an event.
  12007. + * The caller of this function needs to hold the ctx->lock.
  12008. + */
  12009. +static void update_event_times(struct perf_event *event)
  12010. +{
  12011. + struct perf_event_context *ctx = event->ctx;
  12012. + u64 run_end;
  12013. +
  12014. + if (event->state < PERF_EVENT_STATE_INACTIVE ||
  12015. + event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
  12016. + return;
  12017. + /*
  12018. + * in cgroup mode, time_enabled represents
  12019. + * the time the event was enabled AND active
  12020. + * tasks were in the monitored cgroup. This is
  12021. + * independent of the activity of the context as
  12022. + * there may be a mix of cgroup and non-cgroup events.
  12023. + *
  12024. + * That is why we treat cgroup events differently
  12025. + * here.
  12026. + */
  12027. + if (is_cgroup_event(event))
  12028. + run_end = perf_cgroup_event_time(event);
  12029. + else if (ctx->is_active)
  12030. + run_end = ctx->time;
  12031. + else
  12032. + run_end = event->tstamp_stopped;
  12033. +
  12034. + event->total_time_enabled = run_end - event->tstamp_enabled;
  12035. +
  12036. + if (event->state == PERF_EVENT_STATE_INACTIVE)
  12037. + run_end = event->tstamp_stopped;
  12038. + else
  12039. + run_end = perf_event_time(event);
  12040. +
  12041. + event->total_time_running = run_end - event->tstamp_running;
  12042. +
  12043. +}
  12044. +
  12045. +/*
  12046. + * Update total_time_enabled and total_time_running for all events in a group.
  12047. + */
  12048. +static void update_group_times(struct perf_event *leader)
  12049. +{
  12050. + struct perf_event *event;
  12051. +
  12052. + update_event_times(leader);
  12053. + list_for_each_entry(event, &leader->sibling_list, group_entry)
  12054. + update_event_times(event);
  12055. +}
  12056. +
  12057. +static struct list_head *
  12058. +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
  12059. +{
  12060. + if (event->attr.pinned)
  12061. + return &ctx->pinned_groups;
  12062. + else
  12063. + return &ctx->flexible_groups;
  12064. +}
  12065. +
  12066. +/*
  12067. + * Add an event to the lists for its context.
  12068. + * Must be called with ctx->mutex and ctx->lock held.
  12069. + */
  12070. +static void
  12071. +list_add_event(struct perf_event *event, struct perf_event_context *ctx)
  12072. +{
  12073. + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
  12074. + event->attach_state |= PERF_ATTACH_CONTEXT;
  12075. +
  12076. + /*
  12077. + * If we're a stand alone event or group leader, we go to the context
  12078. + * list, group events are kept attached to the group so that
  12079. + * perf_group_detach can, at all times, locate all siblings.
  12080. + */
  12081. + if (event->group_leader == event) {
  12082. + struct list_head *list;
  12083. +
  12084. + if (is_software_event(event))
  12085. + event->group_flags |= PERF_GROUP_SOFTWARE;
  12086. +
  12087. + list = ctx_group_list(event, ctx);
  12088. + list_add_tail(&event->group_entry, list);
  12089. + }
  12090. +
  12091. + if (is_cgroup_event(event))
  12092. + ctx->nr_cgroups++;
  12093. +
  12094. + if (has_branch_stack(event))
  12095. + ctx->nr_branch_stack++;
  12096. +
  12097. + list_add_rcu(&event->event_entry, &ctx->event_list);
  12098. + if (!ctx->nr_events)
  12099. + perf_pmu_rotate_start(ctx->pmu);
  12100. + ctx->nr_events++;
  12101. + if (event->attr.inherit_stat)
  12102. + ctx->nr_stat++;
  12103. +
  12104. + ctx->generation++;
  12105. +}
  12106. +
  12107. +/*
  12108. + * Initialize event state based on the perf_event_attr::disabled.
  12109. + */
  12110. +static inline void perf_event__state_init(struct perf_event *event)
  12111. +{
  12112. + event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
  12113. + PERF_EVENT_STATE_INACTIVE;
  12114. +}
  12115. +
  12116. +/*
  12117. + * Called at perf_event creation and when events are attached/detached from a
  12118. + * group.
  12119. + */
  12120. +static void perf_event__read_size(struct perf_event *event)
  12121. +{
  12122. + int entry = sizeof(u64); /* value */
  12123. + int size = 0;
  12124. + int nr = 1;
  12125. +
  12126. + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
  12127. + size += sizeof(u64);
  12128. +
  12129. + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
  12130. + size += sizeof(u64);
  12131. +
  12132. + if (event->attr.read_format & PERF_FORMAT_ID)
  12133. + entry += sizeof(u64);
  12134. +
  12135. + if (event->attr.read_format & PERF_FORMAT_GROUP) {
  12136. + nr += event->group_leader->nr_siblings;
  12137. + size += sizeof(u64);
  12138. + }
  12139. +
  12140. + size += entry * nr;
  12141. + event->read_size = size;
  12142. +}
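The byte count computed above matches the read() format documented in perf_event_open(2). Assuming all of PERF_FORMAT_TOTAL_TIME_ENABLED, PERF_FORMAT_TOTAL_TIME_RUNNING, PERF_FORMAT_ID and PERF_FORMAT_GROUP are requested, the stream has the shape below; the small program just checks the arithmetic for a hypothetical group of one leader plus three siblings:

#include <stdio.h>
#include <stdint.h>

/* layout returned by read() with PERF_FORMAT_GROUP and all time/id bits set */
struct read_format_group {
	uint64_t nr;		/* number of group members that follow */
	uint64_t time_enabled;	/* PERF_FORMAT_TOTAL_TIME_ENABLED */
	uint64_t time_running;	/* PERF_FORMAT_TOTAL_TIME_RUNNING */
	struct {
		uint64_t value;
		uint64_t id;	/* PERF_FORMAT_ID */
	} values[];
};

int main(void)
{
	unsigned int nr_siblings = 3;	/* hypothetical group: leader + 3 siblings */
	size_t size = sizeof(struct read_format_group) +
		      (nr_siblings + 1) * 2 * sizeof(uint64_t);

	/* prints 88, the same number perf_event__read_size() would compute */
	printf("expected read() size: %zu bytes\n", size);
	return 0;
}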
  12143. +
  12144. +static void perf_event__header_size(struct perf_event *event)
  12145. +{
  12146. + struct perf_sample_data *data;
  12147. + u64 sample_type = event->attr.sample_type;
  12148. + u16 size = 0;
  12149. +
  12150. + perf_event__read_size(event);
  12151. +
  12152. + if (sample_type & PERF_SAMPLE_IP)
  12153. + size += sizeof(data->ip);
  12154. +
  12155. + if (sample_type & PERF_SAMPLE_ADDR)
  12156. + size += sizeof(data->addr);
  12157. +
  12158. + if (sample_type & PERF_SAMPLE_PERIOD)
  12159. + size += sizeof(data->period);
  12160. +
  12161. + if (sample_type & PERF_SAMPLE_WEIGHT)
  12162. + size += sizeof(data->weight);
  12163. +
  12164. + if (sample_type & PERF_SAMPLE_READ)
  12165. + size += event->read_size;
  12166. +
  12167. + if (sample_type & PERF_SAMPLE_DATA_SRC)
  12168. + size += sizeof(data->data_src.val);
  12169. +
  12170. + if (sample_type & PERF_SAMPLE_TRANSACTION)
  12171. + size += sizeof(data->txn);
  12172. +
  12173. + event->header_size = size;
  12174. +}
  12175. +
  12176. +static void perf_event__id_header_size(struct perf_event *event)
  12177. +{
  12178. + struct perf_sample_data *data;
  12179. + u64 sample_type = event->attr.sample_type;
  12180. + u16 size = 0;
  12181. +
  12182. + if (sample_type & PERF_SAMPLE_TID)
  12183. + size += sizeof(data->tid_entry);
  12184. +
  12185. + if (sample_type & PERF_SAMPLE_TIME)
  12186. + size += sizeof(data->time);
  12187. +
  12188. + if (sample_type & PERF_SAMPLE_IDENTIFIER)
  12189. + size += sizeof(data->id);
  12190. +
  12191. + if (sample_type & PERF_SAMPLE_ID)
  12192. + size += sizeof(data->id);
  12193. +
  12194. + if (sample_type & PERF_SAMPLE_STREAM_ID)
  12195. + size += sizeof(data->stream_id);
  12196. +
  12197. + if (sample_type & PERF_SAMPLE_CPU)
  12198. + size += sizeof(data->cpu_entry);
  12199. +
  12200. + event->id_header_size = size;
  12201. +}
  12202. +
  12203. +static void perf_group_attach(struct perf_event *event)
  12204. +{
  12205. + struct perf_event *group_leader = event->group_leader, *pos;
  12206. +
  12207. + /*
  12208. + * We can have double attach due to group movement in perf_event_open.
  12209. + */
  12210. + if (event->attach_state & PERF_ATTACH_GROUP)
  12211. + return;
  12212. +
  12213. + event->attach_state |= PERF_ATTACH_GROUP;
  12214. +
  12215. + if (group_leader == event)
  12216. + return;
  12217. +
  12218. + if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
  12219. + !is_software_event(event))
  12220. + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
  12221. +
  12222. + list_add_tail(&event->group_entry, &group_leader->sibling_list);
  12223. + group_leader->nr_siblings++;
  12224. +
  12225. + perf_event__header_size(group_leader);
  12226. +
  12227. + list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
  12228. + perf_event__header_size(pos);
  12229. +}
  12230. +
  12231. +/*
  12232. + * Remove an event from the lists for its context.
  12233. + * Must be called with ctx->mutex and ctx->lock held.
  12234. + */
  12235. +static void
  12236. +list_del_event(struct perf_event *event, struct perf_event_context *ctx)
  12237. +{
  12238. + struct perf_cpu_context *cpuctx;
  12239. + /*
  12240. + * We can have double detach due to exit/hot-unplug + close.
  12241. + */
  12242. + if (!(event->attach_state & PERF_ATTACH_CONTEXT))
  12243. + return;
  12244. +
  12245. + event->attach_state &= ~PERF_ATTACH_CONTEXT;
  12246. +
  12247. + if (is_cgroup_event(event)) {
  12248. + ctx->nr_cgroups--;
  12249. + cpuctx = __get_cpu_context(ctx);
  12250. + /*
  12251. + * if there are no more cgroup events
  12252. + * then clear cgrp to avoid a stale pointer
  12253. + * in update_cgrp_time_from_cpuctx()
  12254. + */
  12255. + if (!ctx->nr_cgroups)
  12256. + cpuctx->cgrp = NULL;
  12257. + }
  12258. +
  12259. + if (has_branch_stack(event))
  12260. + ctx->nr_branch_stack--;
  12261. +
  12262. + ctx->nr_events--;
  12263. + if (event->attr.inherit_stat)
  12264. + ctx->nr_stat--;
  12265. +
  12266. + list_del_rcu(&event->event_entry);
  12267. +
  12268. + if (event->group_leader == event)
  12269. + list_del_init(&event->group_entry);
  12270. +
  12271. + update_group_times(event);
  12272. +
  12273. + /*
  12274. + * If event was in error state, then keep it
  12275. + * that way, otherwise bogus counts will be
  12276. + * returned on read(). The only way to get out
  12277. + * of error state is by explicit re-enabling
  12278. + * of the event
  12279. + */
  12280. + if (event->state > PERF_EVENT_STATE_OFF)
  12281. + event->state = PERF_EVENT_STATE_OFF;
  12282. +
  12283. + ctx->generation++;
  12284. +}
  12285. +
  12286. +static void perf_group_detach(struct perf_event *event)
  12287. +{
  12288. + struct perf_event *sibling, *tmp;
  12289. + struct list_head *list = NULL;
  12290. +
  12291. + /*
  12292. + * We can have double detach due to exit/hot-unplug + close.
  12293. + */
  12294. + if (!(event->attach_state & PERF_ATTACH_GROUP))
  12295. + return;
  12296. +
  12297. + event->attach_state &= ~PERF_ATTACH_GROUP;
  12298. +
  12299. + /*
  12300. + * If this is a sibling, remove it from its group.
  12301. + */
  12302. + if (event->group_leader != event) {
  12303. + list_del_init(&event->group_entry);
  12304. + event->group_leader->nr_siblings--;
  12305. + goto out;
  12306. + }
  12307. +
  12308. + if (!list_empty(&event->group_entry))
  12309. + list = &event->group_entry;
  12310. +
  12311. + /*
  12312. + * If this was a group event with sibling events then
  12313. + * upgrade the siblings to singleton events by adding them
  12314. + * to whatever list we are on.
  12315. + */
  12316. + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
  12317. + if (list)
  12318. + list_move_tail(&sibling->group_entry, list);
  12319. + sibling->group_leader = sibling;
  12320. +
  12321. + /* Inherit group flags from the previous leader */
  12322. + sibling->group_flags = event->group_flags;
  12323. + }
  12324. +
  12325. +out:
  12326. + perf_event__header_size(event->group_leader);
  12327. +
  12328. + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
  12329. + perf_event__header_size(tmp);
  12330. +}
  12331. +
  12332. +/*
  12333. + * User event without the task.
  12334. + */
  12335. +static bool is_orphaned_event(struct perf_event *event)
  12336. +{
  12337. + return event && !is_kernel_event(event) && !event->owner;
  12338. +}
  12339. +
  12340. +/*
  12341. + * Event has a parent but the parent's task finished and it's
  12342. + * alive only because of children holding a reference.
  12343. + */
  12344. +static bool is_orphaned_child(struct perf_event *event)
  12345. +{
  12346. + return is_orphaned_event(event->parent);
  12347. +}
  12348. +
  12349. +static void orphans_remove_work(struct work_struct *work);
  12350. +
  12351. +static void schedule_orphans_remove(struct perf_event_context *ctx)
  12352. +{
  12353. + if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
  12354. + return;
  12355. +
  12356. + if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
  12357. + get_ctx(ctx);
  12358. + ctx->orphans_remove_sched = true;
  12359. + }
  12360. +}
  12361. +
  12362. +static int __init perf_workqueue_init(void)
  12363. +{
  12364. + perf_wq = create_singlethread_workqueue("perf");
  12365. + WARN(!perf_wq, "failed to create perf workqueue\n");
  12366. + return perf_wq ? 0 : -1;
  12367. +}
  12368. +
  12369. +core_initcall(perf_workqueue_init);
  12370. +
  12371. +static inline int
  12372. +event_filter_match(struct perf_event *event)
  12373. +{
  12374. + return (event->cpu == -1 || event->cpu == smp_processor_id())
  12375. + && perf_cgroup_match(event);
  12376. +}
  12377. +
  12378. +static void
  12379. +event_sched_out(struct perf_event *event,
  12380. + struct perf_cpu_context *cpuctx,
  12381. + struct perf_event_context *ctx)
  12382. +{
  12383. + u64 tstamp = perf_event_time(event);
  12384. + u64 delta;
  12385. + /*
  12386. + * An event which could not be activated because of
  12387. + * filter mismatch still needs to have its timings
  12388. + * maintained, otherwise bogus information is returned
  12389. + * via read() for time_enabled, time_running:
  12390. + */
  12391. + if (event->state == PERF_EVENT_STATE_INACTIVE
  12392. + && !event_filter_match(event)) {
  12393. + delta = tstamp - event->tstamp_stopped;
  12394. + event->tstamp_running += delta;
  12395. + event->tstamp_stopped = tstamp;
  12396. + }
  12397. +
  12398. + if (event->state != PERF_EVENT_STATE_ACTIVE)
  12399. + return;
  12400. +
  12401. + perf_pmu_disable(event->pmu);
  12402. +
  12403. + event->state = PERF_EVENT_STATE_INACTIVE;
  12404. + if (event->pending_disable) {
  12405. + event->pending_disable = 0;
  12406. + event->state = PERF_EVENT_STATE_OFF;
  12407. + }
  12408. + event->tstamp_stopped = tstamp;
  12409. + event->pmu->del(event, 0);
  12410. + event->oncpu = -1;
  12411. +
  12412. + if (!is_software_event(event))
  12413. + cpuctx->active_oncpu--;
  12414. + ctx->nr_active--;
  12415. + if (event->attr.freq && event->attr.sample_freq)
  12416. + ctx->nr_freq--;
  12417. + if (event->attr.exclusive || !cpuctx->active_oncpu)
  12418. + cpuctx->exclusive = 0;
  12419. +
  12420. + if (is_orphaned_child(event))
  12421. + schedule_orphans_remove(ctx);
  12422. +
  12423. + perf_pmu_enable(event->pmu);
  12424. +}
  12425. +
  12426. +static void
  12427. +group_sched_out(struct perf_event *group_event,
  12428. + struct perf_cpu_context *cpuctx,
  12429. + struct perf_event_context *ctx)
  12430. +{
  12431. + struct perf_event *event;
  12432. + int state = group_event->state;
  12433. +
  12434. + event_sched_out(group_event, cpuctx, ctx);
  12435. +
  12436. + /*
  12437. + * Schedule out siblings (if any):
  12438. + */
  12439. + list_for_each_entry(event, &group_event->sibling_list, group_entry)
  12440. + event_sched_out(event, cpuctx, ctx);
  12441. +
  12442. + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
  12443. + cpuctx->exclusive = 0;
  12444. +}
  12445. +
  12446. +struct remove_event {
  12447. + struct perf_event *event;
  12448. + bool detach_group;
  12449. +};
  12450. +
  12451. +/*
  12452. + * Cross CPU call to remove a performance event
  12453. + *
  12454. + * We disable the event on the hardware level first. After that we
  12455. + * remove it from the context list.
  12456. + */
  12457. +static int __perf_remove_from_context(void *info)
  12458. +{
  12459. + struct remove_event *re = info;
  12460. + struct perf_event *event = re->event;
  12461. + struct perf_event_context *ctx = event->ctx;
  12462. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  12463. +
  12464. + raw_spin_lock(&ctx->lock);
  12465. + event_sched_out(event, cpuctx, ctx);
  12466. + if (re->detach_group)
  12467. + perf_group_detach(event);
  12468. + list_del_event(event, ctx);
  12469. + if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
  12470. + ctx->is_active = 0;
  12471. + cpuctx->task_ctx = NULL;
  12472. + }
  12473. + raw_spin_unlock(&ctx->lock);
  12474. +
  12475. + return 0;
  12476. +}
  12477. +
  12478. +
  12479. +/*
  12480. + * Remove the event from a task's (or a CPU's) list of events.
  12481. + *
  12482. + * CPU events are removed with an smp call. For task events we only
  12483. + * call when the task is on a CPU.
  12484. + *
  12485. + * If event->ctx is a cloned context, callers must make sure that
  12486. + * every task struct that event->ctx->task could possibly point to
  12487. + * remains valid. This is OK when called from perf_release since
  12488. + * that only calls us on the top-level context, which can't be a clone.
  12489. + * When called from perf_event_exit_task, it's OK because the
  12490. + * context has been detached from its task.
  12491. + */
  12492. +static void perf_remove_from_context(struct perf_event *event, bool detach_group)
  12493. +{
  12494. + struct perf_event_context *ctx = event->ctx;
  12495. + struct task_struct *task = ctx->task;
  12496. + struct remove_event re = {
  12497. + .event = event,
  12498. + .detach_group = detach_group,
  12499. + };
  12500. +
  12501. + lockdep_assert_held(&ctx->mutex);
  12502. +
  12503. + if (!task) {
  12504. + /*
  12505. + * Per cpu events are removed via an smp call. The removal can
  12506. + * fail if the CPU is currently offline, but in that case we
  12507. + * already called __perf_remove_from_context from
  12508. + * perf_event_exit_cpu.
  12509. + */
  12510. + cpu_function_call(event->cpu, __perf_remove_from_context, &re);
  12511. + return;
  12512. + }
  12513. +
  12514. +retry:
  12515. + if (!task_function_call(task, __perf_remove_from_context, &re))
  12516. + return;
  12517. +
  12518. + raw_spin_lock_irq(&ctx->lock);
  12519. + /*
  12520. + * If we failed to find a running task, but find the context active now
  12521. + * that we've acquired the ctx->lock, retry.
  12522. + */
  12523. + if (ctx->is_active) {
  12524. + raw_spin_unlock_irq(&ctx->lock);
  12525. + /*
  12526. + * Reload the task pointer, it might have been changed by
  12527. + * a concurrent perf_event_context_sched_out().
  12528. + */
  12529. + task = ctx->task;
  12530. + goto retry;
  12531. + }
  12532. +
  12533. + /*
  12534. + * Since the task isn't running, it's safe to remove the event;
  12535. + * holding the ctx->lock ensures the task won't get scheduled in.
  12536. + */
  12537. + if (detach_group)
  12538. + perf_group_detach(event);
  12539. + list_del_event(event, ctx);
  12540. + raw_spin_unlock_irq(&ctx->lock);
  12541. +}
  12542. +
  12543. +/*
  12544. + * Cross CPU call to disable a performance event
  12545. + */
  12546. +int __perf_event_disable(void *info)
  12547. +{
  12548. + struct perf_event *event = info;
  12549. + struct perf_event_context *ctx = event->ctx;
  12550. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  12551. +
  12552. + /*
  12553. + * If this is a per-task event, we need to check whether this
  12554. + * event's task is the current task on this cpu.
  12555. + *
  12556. + * Can trigger due to concurrent perf_event_context_sched_out()
  12557. + * flipping contexts around.
  12558. + */
  12559. + if (ctx->task && cpuctx->task_ctx != ctx)
  12560. + return -EINVAL;
  12561. +
  12562. + raw_spin_lock(&ctx->lock);
  12563. +
  12564. + /*
  12565. + * If the event is on, turn it off.
  12566. + * If it is in error state, leave it in error state.
  12567. + */
  12568. + if (event->state >= PERF_EVENT_STATE_INACTIVE) {
  12569. + update_context_time(ctx);
  12570. + update_cgrp_time_from_event(event);
  12571. + update_group_times(event);
  12572. + if (event == event->group_leader)
  12573. + group_sched_out(event, cpuctx, ctx);
  12574. + else
  12575. + event_sched_out(event, cpuctx, ctx);
  12576. + event->state = PERF_EVENT_STATE_OFF;
  12577. + }
  12578. +
  12579. + raw_spin_unlock(&ctx->lock);
  12580. +
  12581. + return 0;
  12582. +}
  12583. +
  12584. +/*
  12585. + * Disable an event.
  12586. + *
  12587. + * If event->ctx is a cloned context, callers must make sure that
  12588. + * every task struct that event->ctx->task could possibly point to
  12589. + * remains valid. This condition is satisfied when called through
  12590. + * perf_event_for_each_child or perf_event_for_each because they
  12591. + * hold the top-level event's child_mutex, so any descendant that
  12592. + * goes to exit will block in sync_child_event.
  12593. + * When called from perf_pending_event it's OK because event->ctx
  12594. + * is the current context on this CPU and preemption is disabled,
  12595. + * hence we can't get into perf_event_task_sched_out for this context.
  12596. + */
  12597. +void perf_event_disable(struct perf_event *event)
  12598. +{
  12599. + struct perf_event_context *ctx = event->ctx;
  12600. + struct task_struct *task = ctx->task;
  12601. +
  12602. + if (!task) {
  12603. + /*
  12604. + * Disable the event on the cpu that it's on
  12605. + */
  12606. + cpu_function_call(event->cpu, __perf_event_disable, event);
  12607. + return;
  12608. + }
  12609. +
  12610. +retry:
  12611. + if (!task_function_call(task, __perf_event_disable, event))
  12612. + return;
  12613. +
  12614. + raw_spin_lock_irq(&ctx->lock);
  12615. + /*
  12616. + * If the event is still active, we need to retry the cross-call.
  12617. + */
  12618. + if (event->state == PERF_EVENT_STATE_ACTIVE) {
  12619. + raw_spin_unlock_irq(&ctx->lock);
  12620. + /*
  12621. + * Reload the task pointer, it might have been changed by
  12622. + * a concurrent perf_event_context_sched_out().
  12623. + */
  12624. + task = ctx->task;
  12625. + goto retry;
  12626. + }
  12627. +
  12628. + /*
  12629. + * Since we have the lock this context can't be scheduled
  12630. + * in, so we can change the state safely.
  12631. + */
  12632. + if (event->state == PERF_EVENT_STATE_INACTIVE) {
  12633. + update_group_times(event);
  12634. + event->state = PERF_EVENT_STATE_OFF;
  12635. + }
  12636. + raw_spin_unlock_irq(&ctx->lock);
  12637. +}
  12638. +EXPORT_SYMBOL_GPL(perf_event_disable);
  12639. +
  12640. +static void perf_set_shadow_time(struct perf_event *event,
  12641. + struct perf_event_context *ctx,
  12642. + u64 tstamp)
  12643. +{
  12644. + /*
  12645. + * use the correct time source for the time snapshot
  12646. + *
  12647. + * We could get by without this by leveraging the
  12648. + * fact that to get to this function, the caller
  12649. + * has most likely already called update_context_time()
  12650. + * and update_cgrp_time_xx() and thus both timestamps
  12651. + * are identical (or very close). Given that tstamp is
  12652. + * already adjusted for cgroup, we could say that:
  12653. + * tstamp - ctx->timestamp
  12654. + * is equivalent to
  12655. + * tstamp - cgrp->timestamp.
  12656. + *
  12657. + * Then, in perf_output_read(), the calculation would
  12658. + * work with no changes because:
  12659. + * - event is guaranteed scheduled in
  12660. + * - no scheduled out in between
  12661. + * - thus the timestamp would be the same
  12662. + *
  12663. + * But this is a bit hairy.
  12664. + *
  12665. + * So instead, we have an explicit cgroup call to remain
  12666. + * within the same time source all along. We believe it
  12667. + * is cleaner and simpler to understand.
  12668. + */
  12669. + if (is_cgroup_event(event))
  12670. + perf_cgroup_set_shadow_time(event, tstamp);
  12671. + else
  12672. + event->shadow_ctx_time = tstamp - ctx->timestamp;
  12673. +}
  12674. +
  12675. +#define MAX_INTERRUPTS (~0ULL)
  12676. +
  12677. +static void perf_log_throttle(struct perf_event *event, int enable);
  12678. +
  12679. +static int
  12680. +event_sched_in(struct perf_event *event,
  12681. + struct perf_cpu_context *cpuctx,
  12682. + struct perf_event_context *ctx)
  12683. +{
  12684. + u64 tstamp = perf_event_time(event);
  12685. + int ret = 0;
  12686. +
  12687. + lockdep_assert_held(&ctx->lock);
  12688. +
  12689. + if (event->state <= PERF_EVENT_STATE_OFF)
  12690. + return 0;
  12691. +
  12692. + event->state = PERF_EVENT_STATE_ACTIVE;
  12693. + event->oncpu = smp_processor_id();
  12694. +
  12695. + /*
  12696. + * Unthrottle events, since we scheduled we might have missed several
  12697. + * ticks already, also for a heavily scheduling task there is little
  12698. + * guarantee it'll get a tick in a timely manner.
  12699. + */
  12700. + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
  12701. + perf_log_throttle(event, 1);
  12702. + event->hw.interrupts = 0;
  12703. + }
  12704. +
  12705. + /*
  12706. + * The new state must be visible before we turn it on in the hardware:
  12707. + */
  12708. + smp_wmb();
  12709. +
  12710. + perf_pmu_disable(event->pmu);
  12711. +
  12712. + if (event->pmu->add(event, PERF_EF_START)) {
  12713. + event->state = PERF_EVENT_STATE_INACTIVE;
  12714. + event->oncpu = -1;
  12715. + ret = -EAGAIN;
  12716. + goto out;
  12717. + }
  12718. +
  12719. + event->tstamp_running += tstamp - event->tstamp_stopped;
  12720. +
  12721. + perf_set_shadow_time(event, ctx, tstamp);
  12722. +
  12723. + if (!is_software_event(event))
  12724. + cpuctx->active_oncpu++;
  12725. + ctx->nr_active++;
  12726. + if (event->attr.freq && event->attr.sample_freq)
  12727. + ctx->nr_freq++;
  12728. +
  12729. + if (event->attr.exclusive)
  12730. + cpuctx->exclusive = 1;
  12731. +
  12732. + if (is_orphaned_child(event))
  12733. + schedule_orphans_remove(ctx);
  12734. +
  12735. +out:
  12736. + perf_pmu_enable(event->pmu);
  12737. +
  12738. + return ret;
  12739. +}
  12740. +
  12741. +static int
  12742. +group_sched_in(struct perf_event *group_event,
  12743. + struct perf_cpu_context *cpuctx,
  12744. + struct perf_event_context *ctx)
  12745. +{
  12746. + struct perf_event *event, *partial_group = NULL;
  12747. + struct pmu *pmu = ctx->pmu;
  12748. + u64 now = ctx->time;
  12749. + bool simulate = false;
  12750. +
  12751. + if (group_event->state == PERF_EVENT_STATE_OFF)
  12752. + return 0;
  12753. +
  12754. + pmu->start_txn(pmu);
  12755. +
  12756. + if (event_sched_in(group_event, cpuctx, ctx)) {
  12757. + pmu->cancel_txn(pmu);
  12758. + perf_cpu_hrtimer_restart(cpuctx);
  12759. + return -EAGAIN;
  12760. + }
  12761. +
  12762. + /*
  12763. + * Schedule in siblings as one group (if any):
  12764. + */
  12765. + list_for_each_entry(event, &group_event->sibling_list, group_entry) {
  12766. + if (event_sched_in(event, cpuctx, ctx)) {
  12767. + partial_group = event;
  12768. + goto group_error;
  12769. + }
  12770. + }
  12771. +
  12772. + if (!pmu->commit_txn(pmu))
  12773. + return 0;
  12774. +
  12775. +group_error:
  12776. + /*
  12777. + * Groups can be scheduled in as one unit only, so undo any
  12778. + * partial group before returning:
  12779. + * The events up to the failed event are scheduled out normally,
  12780. + * tstamp_stopped will be updated.
  12781. + *
  12782. + * The failed events and the remaining siblings need to have
  12783. + * their timings updated as if they had gone through event_sched_in()
  12784. + * and event_sched_out(). This is required to get consistent timings
  12785. + * across the group. This also takes care of the case where the group
  12786. + * could never be scheduled by ensuring tstamp_stopped is set to mark
  12787. + * the time the event was actually stopped, such that time delta
  12788. + * calculation in update_event_times() is correct.
  12789. + */
  12790. + list_for_each_entry(event, &group_event->sibling_list, group_entry) {
  12791. + if (event == partial_group)
  12792. + simulate = true;
  12793. +
  12794. + if (simulate) {
  12795. + event->tstamp_running += now - event->tstamp_stopped;
  12796. + event->tstamp_stopped = now;
  12797. + } else {
  12798. + event_sched_out(event, cpuctx, ctx);
  12799. + }
  12800. + }
  12801. + event_sched_out(group_event, cpuctx, ctx);
  12802. +
  12803. + pmu->cancel_txn(pmu);
  12804. +
  12805. + perf_cpu_hrtimer_restart(cpuctx);
  12806. +
  12807. + return -EAGAIN;
  12808. +}
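group_sched_in() relies on the PMU transaction hooks to give all-or-nothing semantics: either every member of the group goes onto the hardware or the whole attempt is rolled back. A toy stand-alone C model of that contract (two fake counter slots and three group members; no relation to any real PMU driver):

#include <stdio.h>

#define NR_MEMBERS 3

/* toy resource: only two counter slots are available */
static int slots_used, slots_max = 2;

static int counter_add(int id)
{
	if (slots_used == slots_max)
		return -1;
	slots_used++;
	printf("member %d scheduled\n", id);
	return 0;
}

static void counter_del(int id)
{
	slots_used--;
	printf("member %d rolled back\n", id);
}

int main(void)
{
	int i, failed = -1;

	/* all-or-nothing: a group only counts if every member fits */
	for (i = 0; i < NR_MEMBERS; i++) {
		if (counter_add(i)) {
			failed = i;
			break;
		}
	}
	if (failed >= 0) {
		while (--i >= 0)
			counter_del(i);
		printf("group does not fit, scheduled out again\n");
	}
	return 0;
}

If any member fails to fit, everything added so far is undone, mirroring the group_error/cancel_txn() path above.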
  12809. +
  12810. +/*
  12811. + * Work out whether we can put this event group on the CPU now.
  12812. + */
  12813. +static int group_can_go_on(struct perf_event *event,
  12814. + struct perf_cpu_context *cpuctx,
  12815. + int can_add_hw)
  12816. +{
  12817. + /*
  12818. + * Groups consisting entirely of software events can always go on.
  12819. + */
  12820. + if (event->group_flags & PERF_GROUP_SOFTWARE)
  12821. + return 1;
  12822. + /*
  12823. + * If an exclusive group is already on, no other hardware
  12824. + * events can go on.
  12825. + */
  12826. + if (cpuctx->exclusive)
  12827. + return 0;
  12828. + /*
  12829. + * If this group is exclusive and there are already
  12830. + * events on the CPU, it can't go on.
  12831. + */
  12832. + if (event->attr.exclusive && cpuctx->active_oncpu)
  12833. + return 0;
  12834. + /*
  12835. + * Otherwise, try to add it if all previous groups were able
  12836. + * to go on.
  12837. + */
  12838. + return can_add_hw;
  12839. +}
  12840. +
  12841. +static void add_event_to_ctx(struct perf_event *event,
  12842. + struct perf_event_context *ctx)
  12843. +{
  12844. + u64 tstamp = perf_event_time(event);
  12845. +
  12846. + list_add_event(event, ctx);
  12847. + perf_group_attach(event);
  12848. + event->tstamp_enabled = tstamp;
  12849. + event->tstamp_running = tstamp;
  12850. + event->tstamp_stopped = tstamp;
  12851. +}
  12852. +
  12853. +static void task_ctx_sched_out(struct perf_event_context *ctx);
  12854. +static void
  12855. +ctx_sched_in(struct perf_event_context *ctx,
  12856. + struct perf_cpu_context *cpuctx,
  12857. + enum event_type_t event_type,
  12858. + struct task_struct *task);
  12859. +
  12860. +static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
  12861. + struct perf_event_context *ctx,
  12862. + struct task_struct *task)
  12863. +{
  12864. + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
  12865. + if (ctx)
  12866. + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
  12867. + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
  12868. + if (ctx)
  12869. + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
  12870. +}
  12871. +
  12872. +/*
  12873. + * Cross CPU call to install and enable a performance event
  12874. + *
  12875. + * Must be called with ctx->mutex held
  12876. + */
  12877. +static int __perf_install_in_context(void *info)
  12878. +{
  12879. + struct perf_event *event = info;
  12880. + struct perf_event_context *ctx = event->ctx;
  12881. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  12882. + struct perf_event_context *task_ctx = cpuctx->task_ctx;
  12883. + struct task_struct *task = current;
  12884. +
  12885. + perf_ctx_lock(cpuctx, task_ctx);
  12886. + perf_pmu_disable(cpuctx->ctx.pmu);
  12887. +
  12888. + /*
  12889. + * If there was an active task_ctx schedule it out.
  12890. + */
  12891. + if (task_ctx)
  12892. + task_ctx_sched_out(task_ctx);
  12893. +
  12894. + /*
  12895. + * If the context we're installing events in is not the
  12896. + * active task_ctx, flip them.
  12897. + */
  12898. + if (ctx->task && task_ctx != ctx) {
  12899. + if (task_ctx)
  12900. + raw_spin_unlock(&task_ctx->lock);
  12901. + raw_spin_lock(&ctx->lock);
  12902. + task_ctx = ctx;
  12903. + }
  12904. +
  12905. + if (task_ctx) {
  12906. + cpuctx->task_ctx = task_ctx;
  12907. + task = task_ctx->task;
  12908. + }
  12909. +
  12910. + cpu_ctx_sched_out(cpuctx, EVENT_ALL);
  12911. +
  12912. + update_context_time(ctx);
  12913. + /*
  12914. + * update cgrp time only if current cgrp
  12915. + * matches event->cgrp. Must be done before
  12916. + * calling add_event_to_ctx()
  12917. + */
  12918. + update_cgrp_time_from_event(event);
  12919. +
  12920. + add_event_to_ctx(event, ctx);
  12921. +
  12922. + /*
  12923. + * Schedule everything back in
  12924. + */
  12925. + perf_event_sched_in(cpuctx, task_ctx, task);
  12926. +
  12927. + perf_pmu_enable(cpuctx->ctx.pmu);
  12928. + perf_ctx_unlock(cpuctx, task_ctx);
  12929. +
  12930. + return 0;
  12931. +}
  12932. +
  12933. +/*
  12934. + * Attach a performance event to a context
  12935. + *
  12936. + * First we add the event to the list with the hardware enable bit
  12937. + * in event->hw_config cleared.
  12938. + *
  12939. + * If the event is attached to a task which is on a CPU we use an smp
  12940. + * call to enable it in the task context. The task might have been
  12941. + * scheduled away, but we check this in the smp call again.
  12942. + */
  12943. +static void
  12944. +perf_install_in_context(struct perf_event_context *ctx,
  12945. + struct perf_event *event,
  12946. + int cpu)
  12947. +{
  12948. + struct task_struct *task = ctx->task;
  12949. +
  12950. + lockdep_assert_held(&ctx->mutex);
  12951. +
  12952. + event->ctx = ctx;
  12953. + if (event->cpu != -1)
  12954. + event->cpu = cpu;
  12955. +
  12956. + if (!task) {
  12957. + /*
  12958. + * Per cpu events are installed via an smp call and
  12959. + * the install is always successful.
  12960. + */
  12961. + cpu_function_call(cpu, __perf_install_in_context, event);
  12962. + return;
  12963. + }
  12964. +
  12965. +retry:
  12966. + if (!task_function_call(task, __perf_install_in_context, event))
  12967. + return;
  12968. +
  12969. + raw_spin_lock_irq(&ctx->lock);
  12970. + /*
  12971. + * If we failed to find a running task, but find the context active now
  12972. + * that we've acquired the ctx->lock, retry.
  12973. + */
  12974. + if (ctx->is_active) {
  12975. + raw_spin_unlock_irq(&ctx->lock);
  12976. + /*
  12977. + * Reload the task pointer, it might have been changed by
  12978. + * a concurrent perf_event_context_sched_out().
  12979. + */
  12980. + task = ctx->task;
  12981. + goto retry;
  12982. + }
  12983. +
  12984. + /*
  12985. + * Since the task isn't running, it's safe to add the event; our holding
  12986. + * the ctx->lock ensures the task won't get scheduled in.
  12987. + */
  12988. + add_event_to_ctx(event, ctx);
  12989. + raw_spin_unlock_irq(&ctx->lock);
  12990. +}
  12991. +
  12992. +/*
  12993. + * Put an event into inactive state and update time fields.
  12994. + * Enabling the leader of a group effectively enables all
  12995. + * the group members that aren't explicitly disabled, so we
  12996. + * have to update their ->tstamp_enabled also.
  12997. + * Note: this works for group members as well as group leaders
  12998. + * since the non-leader members' sibling_lists will be empty.
  12999. + */
  13000. +static void __perf_event_mark_enabled(struct perf_event *event)
  13001. +{
  13002. + struct perf_event *sub;
  13003. + u64 tstamp = perf_event_time(event);
  13004. +
  13005. + event->state = PERF_EVENT_STATE_INACTIVE;
  13006. + event->tstamp_enabled = tstamp - event->total_time_enabled;
  13007. + list_for_each_entry(sub, &event->sibling_list, group_entry) {
  13008. + if (sub->state >= PERF_EVENT_STATE_INACTIVE)
  13009. + sub->tstamp_enabled = tstamp - sub->total_time_enabled;
  13010. + }
  13011. +}
  13012. +
  13013. +/*
  13014. + * Cross CPU call to enable a performance event
  13015. + */
  13016. +static int __perf_event_enable(void *info)
  13017. +{
  13018. + struct perf_event *event = info;
  13019. + struct perf_event_context *ctx = event->ctx;
  13020. + struct perf_event *leader = event->group_leader;
  13021. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  13022. + int err;
  13023. +
  13024. + /*
  13025. + * There's a time window between the 'ctx->is_active' check in
  13026. + * perf_event_enable() and this place, with:
  13027. + * - IRQs on
  13028. + * - ctx->lock unlocked
  13029. + *
  13030. + * where the task could be killed and 'ctx' deactivated
  13031. + * by perf_event_exit_task.
  13032. + */
  13033. + if (!ctx->is_active)
  13034. + return -EINVAL;
  13035. +
  13036. + raw_spin_lock(&ctx->lock);
  13037. + update_context_time(ctx);
  13038. +
  13039. + if (event->state >= PERF_EVENT_STATE_INACTIVE)
  13040. + goto unlock;
  13041. +
  13042. + /*
  13043. + * set current task's cgroup time reference point
  13044. + */
  13045. + perf_cgroup_set_timestamp(current, ctx);
  13046. +
  13047. + __perf_event_mark_enabled(event);
  13048. +
  13049. + if (!event_filter_match(event)) {
  13050. + if (is_cgroup_event(event))
  13051. + perf_cgroup_defer_enabled(event);
  13052. + goto unlock;
  13053. + }
  13054. +
  13055. + /*
  13056. + * If the event is in a group and isn't the group leader,
  13057. + * then don't put it on unless the group is on.
  13058. + */
  13059. + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
  13060. + goto unlock;
  13061. +
  13062. + if (!group_can_go_on(event, cpuctx, 1)) {
  13063. + err = -EEXIST;
  13064. + } else {
  13065. + if (event == leader)
  13066. + err = group_sched_in(event, cpuctx, ctx);
  13067. + else
  13068. + err = event_sched_in(event, cpuctx, ctx);
  13069. + }
  13070. +
  13071. + if (err) {
  13072. + /*
  13073. + * If this event can't go on and it's part of a
  13074. + * group, then the whole group has to come off.
  13075. + */
  13076. + if (leader != event) {
  13077. + group_sched_out(leader, cpuctx, ctx);
  13078. + perf_cpu_hrtimer_restart(cpuctx);
  13079. + }
  13080. + if (leader->attr.pinned) {
  13081. + update_group_times(leader);
  13082. + leader->state = PERF_EVENT_STATE_ERROR;
  13083. + }
  13084. + }
  13085. +
  13086. +unlock:
  13087. + raw_spin_unlock(&ctx->lock);
  13088. +
  13089. + return 0;
  13090. +}
  13091. +
  13092. +/*
  13093. + * Enable an event.
  13094. + *
  13095. + * If event->ctx is a cloned context, callers must make sure that
  13096. + * every task struct that event->ctx->task could possibly point to
  13097. + * remains valid. This condition is satisfied when called through
  13098. + * perf_event_for_each_child or perf_event_for_each as described
  13099. + * for perf_event_disable.
  13100. + */
  13101. +void perf_event_enable(struct perf_event *event)
  13102. +{
  13103. + struct perf_event_context *ctx = event->ctx;
  13104. + struct task_struct *task = ctx->task;
  13105. +
  13106. + if (!task) {
  13107. + /*
  13108. + * Enable the event on the cpu that it's on
  13109. + */
  13110. + cpu_function_call(event->cpu, __perf_event_enable, event);
  13111. + return;
  13112. + }
  13113. +
  13114. + raw_spin_lock_irq(&ctx->lock);
  13115. + if (event->state >= PERF_EVENT_STATE_INACTIVE)
  13116. + goto out;
  13117. +
  13118. + /*
  13119. + * If the event is in error state, clear that first.
  13120. + * That way, if we see the event in error state below, we
  13121. + * know that it has gone back into error state, as distinct
  13122. + * from the task having been scheduled away before the
  13123. + * cross-call arrived.
  13124. + */
  13125. + if (event->state == PERF_EVENT_STATE_ERROR)
  13126. + event->state = PERF_EVENT_STATE_OFF;
  13127. +
  13128. +retry:
  13129. + if (!ctx->is_active) {
  13130. + __perf_event_mark_enabled(event);
  13131. + goto out;
  13132. + }
  13133. +
  13134. + raw_spin_unlock_irq(&ctx->lock);
  13135. +
  13136. + if (!task_function_call(task, __perf_event_enable, event))
  13137. + return;
  13138. +
  13139. + raw_spin_lock_irq(&ctx->lock);
  13140. +
  13141. + /*
  13142. + * If the context is active and the event is still off,
  13143. + * we need to retry the cross-call.
  13144. + */
  13145. + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
  13146. + /*
  13147. + * task could have been flipped by a concurrent
  13148. + * perf_event_context_sched_out()
  13149. + */
  13150. + task = ctx->task;
  13151. + goto retry;
  13152. + }
  13153. +
  13154. +out:
  13155. + raw_spin_unlock_irq(&ctx->lock);
  13156. +}
  13157. +EXPORT_SYMBOL_GPL(perf_event_enable);
  13158. +
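+/*
+ * Allow @refresh more overflows before the event is auto-disabled
+ * (bumps event->event_limit) and re-enable it.
+ */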
  13159. +int perf_event_refresh(struct perf_event *event, int refresh)
  13160. +{
  13161. + /*
  13162. + * not supported on inherited events
  13163. + */
  13164. + if (event->attr.inherit || !is_sampling_event(event))
  13165. + return -EINVAL;
  13166. +
  13167. + atomic_add(refresh, &event->event_limit);
  13168. + perf_event_enable(event);
  13169. +
  13170. + return 0;
  13171. +}
  13172. +EXPORT_SYMBOL_GPL(perf_event_refresh);
  13173. +
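+/*
+ * Schedule out the pinned and/or flexible groups of a context,
+ * depending on @event_type, updating context and cgroup time on the
+ * way out.
+ */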
  13174. +static void ctx_sched_out(struct perf_event_context *ctx,
  13175. + struct perf_cpu_context *cpuctx,
  13176. + enum event_type_t event_type)
  13177. +{
  13178. + struct perf_event *event;
  13179. + int is_active = ctx->is_active;
  13180. +
  13181. + ctx->is_active &= ~event_type;
  13182. + if (likely(!ctx->nr_events))
  13183. + return;
  13184. +
  13185. + update_context_time(ctx);
  13186. + update_cgrp_time_from_cpuctx(cpuctx);
  13187. + if (!ctx->nr_active)
  13188. + return;
  13189. +
  13190. + perf_pmu_disable(ctx->pmu);
  13191. + if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
  13192. + list_for_each_entry(event, &ctx->pinned_groups, group_entry)
  13193. + group_sched_out(event, cpuctx, ctx);
  13194. + }
  13195. +
  13196. + if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
  13197. + list_for_each_entry(event, &ctx->flexible_groups, group_entry)
  13198. + group_sched_out(event, cpuctx, ctx);
  13199. + }
  13200. + perf_pmu_enable(ctx->pmu);
  13201. +}
  13202. +
  13203. +/*
  13204. + * Test whether two contexts are equivalent, i.e. whether they have both been
  13205. + * cloned from the same version of the same context.
  13206. + *
  13207. + * Equivalence is measured using a generation number in the context that is
  13208. + * incremented on each modification to it; see unclone_ctx(), list_add_event()
  13209. + * and list_del_event().
  13210. + */
  13211. +static int context_equiv(struct perf_event_context *ctx1,
  13212. + struct perf_event_context *ctx2)
  13213. +{
  13214. + lockdep_assert_held(&ctx1->lock);
  13215. + lockdep_assert_held(&ctx2->lock);
  13216. +
  13217. + /* Pinning disables the swap optimization */
  13218. + if (ctx1->pin_count || ctx2->pin_count)
  13219. + return 0;
  13220. +
  13221. + /* If ctx1 is the parent of ctx2 */
  13222. + if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
  13223. + return 1;
  13224. +
  13225. + /* If ctx2 is the parent of ctx1 */
  13226. + if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
  13227. + return 1;
  13228. +
  13229. + /*
  13230. + * If ctx1 and ctx2 have the same parent, we flatten the parent
  13231. + * hierarchy, see perf_event_init_context().
  13232. + */
  13233. + if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
  13234. + ctx1->parent_gen == ctx2->parent_gen)
  13235. + return 1;
  13236. +
  13237. + /* Unmatched */
  13238. + return 0;
  13239. +}
  13240. +
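+/*
+ * For inherit_stat events, swap the count and time values of an event
+ * with its counterpart in the other (cloned) context, so per-task
+ * statistics stay with the logical task when contexts are switched
+ * by pointer flip.
+ */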
  13241. +static void __perf_event_sync_stat(struct perf_event *event,
  13242. + struct perf_event *next_event)
  13243. +{
  13244. + u64 value;
  13245. +
  13246. + if (!event->attr.inherit_stat)
  13247. + return;
  13248. +
  13249. + /*
  13250. + * Update the event value; we cannot use perf_event_read()
  13251. + * because we're in the middle of a context switch and have IRQs
  13252. + * disabled, which upsets smp_call_function_single(). However,
  13253. + * we know the event must be on the current CPU, therefore we
  13254. + * don't need to use it.
  13255. + */
  13256. + switch (event->state) {
  13257. + case PERF_EVENT_STATE_ACTIVE:
  13258. + event->pmu->read(event);
  13259. + /* fall-through */
  13260. +
  13261. + case PERF_EVENT_STATE_INACTIVE:
  13262. + update_event_times(event);
  13263. + break;
  13264. +
  13265. + default:
  13266. + break;
  13267. + }
  13268. +
  13269. + /*
  13270. + * In order to keep per-task stats reliable we need to flip the event
  13271. + * values when we flip the contexts.
  13272. + */
  13273. + value = local64_read(&next_event->count);
  13274. + value = local64_xchg(&event->count, value);
  13275. + local64_set(&next_event->count, value);
  13276. +
  13277. + swap(event->total_time_enabled, next_event->total_time_enabled);
  13278. + swap(event->total_time_running, next_event->total_time_running);
  13279. +
  13280. + /*
  13281. + * Since we swizzled the values, update the user visible data too.
  13282. + */
  13283. + perf_event_update_userpage(event);
  13284. + perf_event_update_userpage(next_event);
  13285. +}
  13286. +
  13287. +static void perf_event_sync_stat(struct perf_event_context *ctx,
  13288. + struct perf_event_context *next_ctx)
  13289. +{
  13290. + struct perf_event *event, *next_event;
  13291. +
  13292. + if (!ctx->nr_stat)
  13293. + return;
  13294. +
  13295. + update_context_time(ctx);
  13296. +
  13297. + event = list_first_entry(&ctx->event_list,
  13298. + struct perf_event, event_entry);
  13299. +
  13300. + next_event = list_first_entry(&next_ctx->event_list,
  13301. + struct perf_event, event_entry);
  13302. +
  13303. + while (&event->event_entry != &ctx->event_list &&
  13304. + &next_event->event_entry != &next_ctx->event_list) {
  13305. +
  13306. + __perf_event_sync_stat(event, next_event);
  13307. +
  13308. + event = list_next_entry(event, event_entry);
  13309. + next_event = list_next_entry(next_event, event_entry);
  13310. + }
  13311. +}
  13312. +
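+/*
+ * Schedule out one of the outgoing task's contexts. If the outgoing
+ * and incoming tasks have equivalent (cloned) contexts, swap the
+ * context pointers instead of doing a full unschedule/reschedule.
+ */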
  13313. +static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
  13314. + struct task_struct *next)
  13315. +{
  13316. + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
  13317. + struct perf_event_context *next_ctx;
  13318. + struct perf_event_context *parent, *next_parent;
  13319. + struct perf_cpu_context *cpuctx;
  13320. + int do_switch = 1;
  13321. +
  13322. + if (likely(!ctx))
  13323. + return;
  13324. +
  13325. + cpuctx = __get_cpu_context(ctx);
  13326. + if (!cpuctx->task_ctx)
  13327. + return;
  13328. +
  13329. + rcu_read_lock();
  13330. + next_ctx = next->perf_event_ctxp[ctxn];
  13331. + if (!next_ctx)
  13332. + goto unlock;
  13333. +
  13334. + parent = rcu_dereference(ctx->parent_ctx);
  13335. + next_parent = rcu_dereference(next_ctx->parent_ctx);
  13336. +
  13337. + /* If neither context has a parent context, they cannot be clones. */
  13338. + if (!parent && !next_parent)
  13339. + goto unlock;
  13340. +
  13341. + if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
  13342. + /*
  13343. + * Looks like the two contexts are clones, so we might be
  13344. + * able to optimize the context switch. We lock both
  13345. + * contexts and check that they are clones under the
  13346. + * lock (including re-checking that neither has been
  13347. + * uncloned in the meantime). It doesn't matter which
  13348. + * order we take the locks because no other cpu could
  13349. + * be trying to lock both of these tasks.
  13350. + */
  13351. + raw_spin_lock(&ctx->lock);
  13352. + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
  13353. + if (context_equiv(ctx, next_ctx)) {
  13354. + /*
  13355. + * XXX do we need a memory barrier of sorts
  13356. + * wrt to rcu_dereference() of perf_event_ctxp
  13357. + */
  13358. + task->perf_event_ctxp[ctxn] = next_ctx;
  13359. + next->perf_event_ctxp[ctxn] = ctx;
  13360. + ctx->task = next;
  13361. + next_ctx->task = task;
  13362. + do_switch = 0;
  13363. +
  13364. + perf_event_sync_stat(ctx, next_ctx);
  13365. + }
  13366. + raw_spin_unlock(&next_ctx->lock);
  13367. + raw_spin_unlock(&ctx->lock);
  13368. + }
  13369. +unlock:
  13370. + rcu_read_unlock();
  13371. +
  13372. + if (do_switch) {
  13373. + raw_spin_lock(&ctx->lock);
  13374. + ctx_sched_out(ctx, cpuctx, EVENT_ALL);
  13375. + cpuctx->task_ctx = NULL;
  13376. + raw_spin_unlock(&ctx->lock);
  13377. + }
  13378. +}
  13379. +
  13380. +#define for_each_task_context_nr(ctxn) \
  13381. + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
  13382. +
  13383. +/*
  13384. + * Called from scheduler to remove the events of the current task,
  13385. + * with interrupts disabled.
  13386. + *
  13387. + * We stop each event and update the event value in event->count.
  13388. + *
  13389. + * This does not protect us against NMI, but disable()
  13390. + * sets the disabled bit in the control field of event _before_
  13391. + * accessing the event control register. If an NMI hits, then it will
  13392. + * not restart the event.
  13393. + */
  13394. +void __perf_event_task_sched_out(struct task_struct *task,
  13395. + struct task_struct *next)
  13396. +{
  13397. + int ctxn;
  13398. +
  13399. + for_each_task_context_nr(ctxn)
  13400. + perf_event_context_sched_out(task, ctxn, next);
  13401. +
  13402. + /*
  13403. + * if cgroup events exist on this CPU, then we need
  13404. + * to check if we have to switch out PMU state.
  13405. + * cgroup events are system-wide mode only
  13406. + */
  13407. + if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
  13408. + perf_cgroup_sched_out(task, next);
  13409. +}
  13410. +
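+/*
+ * Unschedule all events of the currently installed task context and
+ * clear cpuctx->task_ctx.
+ */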
  13411. +static void task_ctx_sched_out(struct perf_event_context *ctx)
  13412. +{
  13413. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  13414. +
  13415. + if (!cpuctx->task_ctx)
  13416. + return;
  13417. +
  13418. + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
  13419. + return;
  13420. +
  13421. + ctx_sched_out(ctx, cpuctx, EVENT_ALL);
  13422. + cpuctx->task_ctx = NULL;
  13423. +}
  13424. +
  13425. +/*
  13426. + * Called with IRQs disabled
  13427. + */
  13428. +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
  13429. + enum event_type_t event_type)
  13430. +{
  13431. + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
  13432. +}
  13433. +
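+/*
+ * Schedule in the pinned groups of a context; a pinned group that
+ * cannot get onto the PMU is put into error state.
+ */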
  13434. +static void
  13435. +ctx_pinned_sched_in(struct perf_event_context *ctx,
  13436. + struct perf_cpu_context *cpuctx)
  13437. +{
  13438. + struct perf_event *event;
  13439. +
  13440. + list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
  13441. + if (event->state <= PERF_EVENT_STATE_OFF)
  13442. + continue;
  13443. + if (!event_filter_match(event))
  13444. + continue;
  13445. +
  13446. + /* may need to reset tstamp_enabled */
  13447. + if (is_cgroup_event(event))
  13448. + perf_cgroup_mark_enabled(event, ctx);
  13449. +
  13450. + if (group_can_go_on(event, cpuctx, 1))
  13451. + group_sched_in(event, cpuctx, ctx);
  13452. +
  13453. + /*
  13454. + * If this pinned group hasn't been scheduled,
  13455. + * put it in error state.
  13456. + */
  13457. + if (event->state == PERF_EVENT_STATE_INACTIVE) {
  13458. + update_group_times(event);
  13459. + event->state = PERF_EVENT_STATE_ERROR;
  13460. + }
  13461. + }
  13462. +}
  13463. +
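+/*
+ * Schedule in as many flexible groups as will fit; once one group
+ * fails to go on, stop trying to add further hardware groups.
+ */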
  13464. +static void
  13465. +ctx_flexible_sched_in(struct perf_event_context *ctx,
  13466. + struct perf_cpu_context *cpuctx)
  13467. +{
  13468. + struct perf_event *event;
  13469. + int can_add_hw = 1;
  13470. +
  13471. + list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
  13472. + /* Ignore events in OFF or ERROR state */
  13473. + if (event->state <= PERF_EVENT_STATE_OFF)
  13474. + continue;
  13475. + /*
  13476. + * Listen to the 'cpu' scheduling filter constraint
  13477. + * of events:
  13478. + */
  13479. + if (!event_filter_match(event))
  13480. + continue;
  13481. +
  13482. + /* may need to reset tstamp_enabled */
  13483. + if (is_cgroup_event(event))
  13484. + perf_cgroup_mark_enabled(event, ctx);
  13485. +
  13486. + if (group_can_go_on(event, cpuctx, can_add_hw)) {
  13487. + if (group_sched_in(event, cpuctx, ctx))
  13488. + can_add_hw = 0;
  13489. + }
  13490. + }
  13491. +}
  13492. +
  13493. +static void
  13494. +ctx_sched_in(struct perf_event_context *ctx,
  13495. + struct perf_cpu_context *cpuctx,
  13496. + enum event_type_t event_type,
  13497. + struct task_struct *task)
  13498. +{
  13499. + u64 now;
  13500. + int is_active = ctx->is_active;
  13501. +
  13502. + ctx->is_active |= event_type;
  13503. + if (likely(!ctx->nr_events))
  13504. + return;
  13505. +
  13506. + now = perf_clock();
  13507. + ctx->timestamp = now;
  13508. + perf_cgroup_set_timestamp(task, ctx);
  13509. + /*
  13510. + * First go through the list and put on any pinned groups
  13511. + * in order to give them the best chance of going on.
  13512. + */
  13513. + if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
  13514. + ctx_pinned_sched_in(ctx, cpuctx);
  13515. +
  13516. + /* Then walk through the lower prio flexible groups */
  13517. + if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
  13518. + ctx_flexible_sched_in(ctx, cpuctx);
  13519. +}
  13520. +
  13521. +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
  13522. + enum event_type_t event_type,
  13523. + struct task_struct *task)
  13524. +{
  13525. + struct perf_event_context *ctx = &cpuctx->ctx;
  13526. +
  13527. + ctx_sched_in(ctx, cpuctx, event_type, task);
  13528. +}
  13529. +
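+/*
+ * Install @ctx as the task context of this CPU and schedule it in,
+ * kicking out cpu-flexible events first to preserve the
+ * cpu pinned, task pinned, cpu flexible, task flexible priority order.
+ */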
  13530. +static void perf_event_context_sched_in(struct perf_event_context *ctx,
  13531. + struct task_struct *task)
  13532. +{
  13533. + struct perf_cpu_context *cpuctx;
  13534. +
  13535. + cpuctx = __get_cpu_context(ctx);
  13536. + if (cpuctx->task_ctx == ctx)
  13537. + return;
  13538. +
  13539. + perf_ctx_lock(cpuctx, ctx);
  13540. + perf_pmu_disable(ctx->pmu);
  13541. + /*
  13542. + * We want to keep the following priority order:
  13543. + * cpu pinned (that don't need to move), task pinned,
  13544. + * cpu flexible, task flexible.
  13545. + */
  13546. + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  13547. +
  13548. + if (ctx->nr_events)
  13549. + cpuctx->task_ctx = ctx;
  13550. +
  13551. + perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
  13552. +
  13553. + perf_pmu_enable(ctx->pmu);
  13554. + perf_ctx_unlock(cpuctx, ctx);
  13555. +
  13556. + /*
  13557. + * Since these rotations are per-cpu, we need to ensure the
  13558. + * cpu-context we got scheduled on is actually rotating.
  13559. + */
  13560. + perf_pmu_rotate_start(ctx->pmu);
  13561. +}
  13562. +
  13563. +/*
  13564. + * When sampling the branch stack in system-wide mode, it may be necessary
  13565. + * to flush the stack on context switch. This happens when the branch
  13566. + * stack does not tag its entries with the pid of the current task.
  13567. + * Otherwise it becomes impossible to associate a branch entry with a
  13568. + * task. This ambiguity is more likely to appear when the branch stack
  13569. + * supports priv level filtering and the user sets it to monitor only
  13570. + * at the user level (which could be a useful measurement in system-wide
  13571. + * mode). In that case, the risk is high of having a branch stack with
  13572. + * branches from multiple tasks. Flushing may mean dropping the existing
  13573. + * entries or stashing them somewhere in the PMU specific code layer.
  13574. + *
  13575. + * This function provides the context switch callback to the lower code
  13576. + * layer. It is invoked ONLY when there is at least one system-wide context
  13577. + * with at least one active event using taken branch sampling.
  13578. + */
  13579. +static void perf_branch_stack_sched_in(struct task_struct *prev,
  13580. + struct task_struct *task)
  13581. +{
  13582. + struct perf_cpu_context *cpuctx;
  13583. + struct pmu *pmu;
  13584. + unsigned long flags;
  13585. +
  13586. + /* no need to flush branch stack if not changing task */
  13587. + if (prev == task)
  13588. + return;
  13589. +
  13590. + local_irq_save(flags);
  13591. +
  13592. + rcu_read_lock();
  13593. +
  13594. + list_for_each_entry_rcu(pmu, &pmus, entry) {
  13595. + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
  13596. +
  13597. + /*
  13598. + * check if the context has at least one
  13599. + * event using PERF_SAMPLE_BRANCH_STACK
  13600. + */
  13601. + if (cpuctx->ctx.nr_branch_stack > 0
  13602. + && pmu->flush_branch_stack) {
  13603. +
  13604. + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
  13605. +
  13606. + perf_pmu_disable(pmu);
  13607. +
  13608. + pmu->flush_branch_stack();
  13609. +
  13610. + perf_pmu_enable(pmu);
  13611. +
  13612. + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  13613. + }
  13614. + }
  13615. +
  13616. + rcu_read_unlock();
  13617. +
  13618. + local_irq_restore(flags);
  13619. +}
  13620. +
  13621. +/*
  13622. + * Called from scheduler to add the events of the current task
  13623. + * with interrupts disabled.
  13624. + *
  13625. + * We restore the event value and then enable it.
  13626. + *
  13627. + * This does not protect us against NMI, but enable()
  13628. + * sets the enabled bit in the control field of event _before_
  13629. + * accessing the event control register. If an NMI hits, then it will
  13630. + * keep the event running.
  13631. + */
  13632. +void __perf_event_task_sched_in(struct task_struct *prev,
  13633. + struct task_struct *task)
  13634. +{
  13635. + struct perf_event_context *ctx;
  13636. + int ctxn;
  13637. +
  13638. + for_each_task_context_nr(ctxn) {
  13639. + ctx = task->perf_event_ctxp[ctxn];
  13640. + if (likely(!ctx))
  13641. + continue;
  13642. +
  13643. + perf_event_context_sched_in(ctx, task);
  13644. + }
  13645. + /*
  13646. + * if cgroup events exist on this CPU, then we need
  13647. + * to check if we have to switch in PMU state.
  13648. + * cgroup events are system-wide mode only
  13649. + */
  13650. + if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
  13651. + perf_cgroup_sched_in(prev, task);
  13652. +
  13653. + /* check for system-wide branch_stack events */
  13654. + if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
  13655. + perf_branch_stack_sched_in(prev, task);
  13656. +}
  13657. +
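+/*
+ * Given that the event counted @count in @nsec, compute the sample
+ * period that would yield attr.sample_freq samples per second,
+ * shedding low-order bits as needed to keep the intermediate
+ * products within 64 bits.
+ */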
  13658. +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
  13659. +{
  13660. + u64 frequency = event->attr.sample_freq;
  13661. + u64 sec = NSEC_PER_SEC;
  13662. + u64 divisor, dividend;
  13663. +
  13664. + int count_fls, nsec_fls, frequency_fls, sec_fls;
  13665. +
  13666. + count_fls = fls64(count);
  13667. + nsec_fls = fls64(nsec);
  13668. + frequency_fls = fls64(frequency);
  13669. + sec_fls = 30;
  13670. +
  13671. + /*
  13672. + * We got @count in @nsec, with a target of sample_freq HZ
  13673. + * the target period becomes:
  13674. + *
  13675. + *             @count * 10^9
  13676. + * period = -------------------
  13677. + *          @nsec * sample_freq
  13678. + *
  13679. + */
  13680. +
  13681. + /*
  13682. + * Reduce accuracy by one bit such that @a and @b converge
  13683. + * to a similar magnitude.
  13684. + */
  13685. +#define REDUCE_FLS(a, b) \
  13686. +do { \
  13687. + if (a##_fls > b##_fls) { \
  13688. + a >>= 1; \
  13689. + a##_fls--; \
  13690. + } else { \
  13691. + b >>= 1; \
  13692. + b##_fls--; \
  13693. + } \
  13694. +} while (0)
  13695. +
  13696. + /*
  13697. + * Reduce accuracy until either term fits in a u64, then proceed with
  13698. + * the other, so that finally we can do a u64/u64 division.
  13699. + */
  13700. + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
  13701. + REDUCE_FLS(nsec, frequency);
  13702. + REDUCE_FLS(sec, count);
  13703. + }
  13704. +
  13705. + if (count_fls + sec_fls > 64) {
  13706. + divisor = nsec * frequency;
  13707. +
  13708. + while (count_fls + sec_fls > 64) {
  13709. + REDUCE_FLS(count, sec);
  13710. + divisor >>= 1;
  13711. + }
  13712. +
  13713. + dividend = count * sec;
  13714. + } else {
  13715. + dividend = count * sec;
  13716. +
  13717. + while (nsec_fls + frequency_fls > 64) {
  13718. + REDUCE_FLS(nsec, frequency);
  13719. + dividend >>= 1;
  13720. + }
  13721. +
  13722. + divisor = nsec * frequency;
  13723. + }
  13724. +
  13725. + if (!divisor)
  13726. + return dividend;
  13727. +
  13728. + return div64_u64(dividend, divisor);
  13729. +}
  13730. +
  13731. +static DEFINE_PER_CPU(int, perf_throttled_count);
  13732. +static DEFINE_PER_CPU(u64, perf_throttled_seq);
  13733. +
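+/*
+ * Nudge hwc->sample_period towards the newly calculated period with a
+ * simple low-pass filter and reset an excessive period_left so the new
+ * period takes effect promptly. If @disable is set, the event is still
+ * running and must be stopped/restarted around that reset.
+ */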
  13734. +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
  13735. +{
  13736. + struct hw_perf_event *hwc = &event->hw;
  13737. + s64 period, sample_period;
  13738. + s64 delta;
  13739. +
  13740. + period = perf_calculate_period(event, nsec, count);
  13741. +
  13742. + delta = (s64)(period - hwc->sample_period);
  13743. + delta = (delta + 7) / 8; /* low pass filter */
  13744. +
  13745. + sample_period = hwc->sample_period + delta;
  13746. +
  13747. + if (!sample_period)
  13748. + sample_period = 1;
  13749. +
  13750. + hwc->sample_period = sample_period;
  13751. +
  13752. + if (local64_read(&hwc->period_left) > 8*sample_period) {
  13753. + if (disable)
  13754. + event->pmu->stop(event, PERF_EF_UPDATE);
  13755. +
  13756. + local64_set(&hwc->period_left, 0);
  13757. +
  13758. + if (disable)
  13759. + event->pmu->start(event, PERF_EF_RELOAD);
  13760. + }
  13761. +}
  13762. +
  13763. +/*
  13764. + * combine freq adjustment with unthrottling to avoid two passes over the
  13765. + * events. At the same time, make sure, having freq events does not change
  13766. + * the rate of unthrottling as that would introduce bias.
  13767. + */
  13768. +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
  13769. + int needs_unthr)
  13770. +{
  13771. + struct perf_event *event;
  13772. + struct hw_perf_event *hwc;
  13773. + u64 now, period = TICK_NSEC;
  13774. + s64 delta;
  13775. +
  13776. + /*
  13777. + * only need to iterate over all events if:
  13778. + * - the context has events in frequency mode (needs freq adjust), or
  13779. + * - there are events to unthrottle on this cpu
  13780. + */
  13781. + if (!(ctx->nr_freq || needs_unthr))
  13782. + return;
  13783. +
  13784. + raw_spin_lock(&ctx->lock);
  13785. + perf_pmu_disable(ctx->pmu);
  13786. +
  13787. + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
  13788. + if (event->state != PERF_EVENT_STATE_ACTIVE)
  13789. + continue;
  13790. +
  13791. + if (!event_filter_match(event))
  13792. + continue;
  13793. +
  13794. + perf_pmu_disable(event->pmu);
  13795. +
  13796. + hwc = &event->hw;
  13797. +
  13798. + if (hwc->interrupts == MAX_INTERRUPTS) {
  13799. + hwc->interrupts = 0;
  13800. + perf_log_throttle(event, 1);
  13801. + event->pmu->start(event, 0);
  13802. + }
  13803. +
  13804. + if (!event->attr.freq || !event->attr.sample_freq)
  13805. + goto next;
  13806. +
  13807. + /*
  13808. + * stop the event and update event->count
  13809. + */
  13810. + event->pmu->stop(event, PERF_EF_UPDATE);
  13811. +
  13812. + now = local64_read(&event->count);
  13813. + delta = now - hwc->freq_count_stamp;
  13814. + hwc->freq_count_stamp = now;
  13815. +
  13816. + /*
  13817. + * Restart the event; reload only if the value
  13818. + * has changed.
  13819. + * We have stopped the event already, so tell
  13820. + * perf_adjust_period() not to stop it again
  13821. + * (i.e. avoid stopping it twice).
  13822. + */
  13823. + if (delta > 0)
  13824. + perf_adjust_period(event, period, delta, false);
  13825. +
  13826. + event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
  13827. + next:
  13828. + perf_pmu_enable(event->pmu);
  13829. + }
  13830. +
  13831. + perf_pmu_enable(ctx->pmu);
  13832. + raw_spin_unlock(&ctx->lock);
  13833. +}
  13834. +
  13835. +/*
  13836. + * Round-robin a context's events:
  13837. + */
  13838. +static void rotate_ctx(struct perf_event_context *ctx)
  13839. +{
  13840. + /*
  13841. + * Rotate the first non-pinned group entry to the end. Rotation might be
  13842. + * disabled by the inheritance code.
  13843. + */
  13844. + if (!ctx->rotate_disable)
  13845. + list_rotate_left(&ctx->flexible_groups);
  13846. +}
  13847. +
  13848. +/*
  13849. + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
  13850. + * because they're strictly cpu affine and rotate_start is called with IRQs
  13851. + * disabled, while rotate_context is called from IRQ context.
  13852. + */
  13853. +static int perf_rotate_context(struct perf_cpu_context *cpuctx)
  13854. +{
  13855. + struct perf_event_context *ctx = NULL;
  13856. + int rotate = 0, remove = 1;
  13857. +
  13858. + if (cpuctx->ctx.nr_events) {
  13859. + remove = 0;
  13860. + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
  13861. + rotate = 1;
  13862. + }
  13863. +
  13864. + ctx = cpuctx->task_ctx;
  13865. + if (ctx && ctx->nr_events) {
  13866. + remove = 0;
  13867. + if (ctx->nr_events != ctx->nr_active)
  13868. + rotate = 1;
  13869. + }
  13870. +
  13871. + if (!rotate)
  13872. + goto done;
  13873. +
  13874. + perf_ctx_lock(cpuctx, cpuctx->task_ctx);
  13875. + perf_pmu_disable(cpuctx->ctx.pmu);
  13876. +
  13877. + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  13878. + if (ctx)
  13879. + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
  13880. +
  13881. + rotate_ctx(&cpuctx->ctx);
  13882. + if (ctx)
  13883. + rotate_ctx(ctx);
  13884. +
  13885. + perf_event_sched_in(cpuctx, ctx, current);
  13886. +
  13887. + perf_pmu_enable(cpuctx->ctx.pmu);
  13888. + perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
  13889. +done:
  13890. + if (remove)
  13891. + list_del_init(&cpuctx->rotation_list);
  13892. +
  13893. + return rotate;
  13894. +}
  13895. +
  13896. +#ifdef CONFIG_NO_HZ_FULL
  13897. +bool perf_event_can_stop_tick(void)
  13898. +{
  13899. + if (atomic_read(&nr_freq_events) ||
  13900. + __this_cpu_read(perf_throttled_count))
  13901. + return false;
  13902. + else
  13903. + return true;
  13904. +}
  13905. +#endif
  13906. +
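+/*
+ * Per-tick housekeeping: unthrottle events that hit MAX_INTERRUPTS and
+ * re-adjust the period of frequency based events, for every context on
+ * this CPU's rotation list.
+ */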
  13907. +void perf_event_task_tick(void)
  13908. +{
  13909. + struct list_head *head = this_cpu_ptr(&rotation_list);
  13910. + struct perf_cpu_context *cpuctx, *tmp;
  13911. + struct perf_event_context *ctx;
  13912. + int throttled;
  13913. +
  13914. + WARN_ON(!irqs_disabled());
  13915. +
  13916. + __this_cpu_inc(perf_throttled_seq);
  13917. + throttled = __this_cpu_xchg(perf_throttled_count, 0);
  13918. +
  13919. + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
  13920. + ctx = &cpuctx->ctx;
  13921. + perf_adjust_freq_unthr_context(ctx, throttled);
  13922. +
  13923. + ctx = cpuctx->task_ctx;
  13924. + if (ctx)
  13925. + perf_adjust_freq_unthr_context(ctx, throttled);
  13926. + }
  13927. +}
  13928. +
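+/*
+ * Clear the enable_on_exec bit and mark the event INACTIVE if it was
+ * still off; returns 1 if the event state actually changed.
+ */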
  13929. +static int event_enable_on_exec(struct perf_event *event,
  13930. + struct perf_event_context *ctx)
  13931. +{
  13932. + if (!event->attr.enable_on_exec)
  13933. + return 0;
  13934. +
  13935. + event->attr.enable_on_exec = 0;
  13936. + if (event->state >= PERF_EVENT_STATE_INACTIVE)
  13937. + return 0;
  13938. +
  13939. + __perf_event_mark_enabled(event);
  13940. +
  13941. + return 1;
  13942. +}
  13943. +
  13944. +/*
  13945. + * Enable all of a task's events that have been marked enable-on-exec.
  13946. + * This expects task == current.
  13947. + */
  13948. +static void perf_event_enable_on_exec(struct perf_event_context *ctx)
  13949. +{
  13950. + struct perf_event_context *clone_ctx = NULL;
  13951. + struct perf_event *event;
  13952. + unsigned long flags;
  13953. + int enabled = 0;
  13954. + int ret;
  13955. +
  13956. + local_irq_save(flags);
  13957. + if (!ctx || !ctx->nr_events)
  13958. + goto out;
  13959. +
  13960. + /*
  13961. + * We must context-switch out cgroup events to avoid a conflict
  13962. + * when invoking perf_task_event_sched_in() later on
  13963. + * in this function. Otherwise we end up trying to
  13964. + * context-switch in cgroup events which are already scheduled
  13965. + * in.
  13966. + */
  13967. + perf_cgroup_sched_out(current, NULL);
  13968. +
  13969. + raw_spin_lock(&ctx->lock);
  13970. + task_ctx_sched_out(ctx);
  13971. +
  13972. + list_for_each_entry(event, &ctx->event_list, event_entry) {
  13973. + ret = event_enable_on_exec(event, ctx);
  13974. + if (ret)
  13975. + enabled = 1;
  13976. + }
  13977. +
  13978. + /*
  13979. + * Unclone this context if we enabled any event.
  13980. + */
  13981. + if (enabled)
  13982. + clone_ctx = unclone_ctx(ctx);
  13983. +
  13984. + raw_spin_unlock(&ctx->lock);
  13985. +
  13986. + /*
  13987. + * Also calls ctxswin for cgroup events, if any:
  13988. + */
  13989. + perf_event_context_sched_in(ctx, ctx->task);
  13990. +out:
  13991. + local_irq_restore(flags);
  13992. +
  13993. + if (clone_ctx)
  13994. + put_ctx(clone_ctx);
  13995. +}
  13996. +
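+/*
+ * Called on exec: run enable-on-exec processing for every perf context
+ * of the current task.
+ */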
  13997. +void perf_event_exec(void)
  13998. +{
  13999. + struct perf_event_context *ctx;
  14000. + int ctxn;
  14001. +
  14002. + rcu_read_lock();
  14003. + for_each_task_context_nr(ctxn) {
  14004. + ctx = current->perf_event_ctxp[ctxn];
  14005. + if (!ctx)
  14006. + continue;
  14007. +
  14008. + perf_event_enable_on_exec(ctx);
  14009. + }
  14010. + rcu_read_unlock();
  14011. +}
  14012. +
  14013. +/*
  14014. + * Cross CPU call to read the hardware event
  14015. + */
  14016. +static void __perf_event_read(void *info)
  14017. +{
  14018. + struct perf_event *event = info;
  14019. + struct perf_event_context *ctx = event->ctx;
  14020. + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
  14021. +
  14022. + /*
  14023. + * If this is a task context, we need to check whether it is
  14024. + * the current task context of this cpu. If not, it has been
  14025. + * scheduled out before the smp call arrived. In that case
  14026. + * event->count would have been updated to a recent sample
  14027. + * when the event was scheduled out.
  14028. + */
  14029. + if (ctx->task && cpuctx->task_ctx != ctx)
  14030. + return;
  14031. +
  14032. + raw_spin_lock(&ctx->lock);
  14033. + if (ctx->is_active) {
  14034. + update_context_time(ctx);
  14035. + update_cgrp_time_from_event(event);
  14036. + }
  14037. + update_event_times(event);
  14038. + if (event->state == PERF_EVENT_STATE_ACTIVE)
  14039. + event->pmu->read(event);
  14040. + raw_spin_unlock(&ctx->lock);
  14041. +}
  14042. +
  14043. +static inline u64 perf_event_count(struct perf_event *event)
  14044. +{
  14045. + return local64_read(&event->count) + atomic64_read(&event->child_count);
  14046. +}
  14047. +
  14048. +static u64 perf_event_read(struct perf_event *event)
  14049. +{
  14050. + /*
  14051. + * If event is enabled and currently active on a CPU, update the
  14052. + * value in the event structure:
  14053. + */
  14054. + if (event->state == PERF_EVENT_STATE_ACTIVE) {
  14055. + smp_call_function_single(event->oncpu,
  14056. + __perf_event_read, event, 1);
  14057. + } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
  14058. + struct perf_event_context *ctx = event->ctx;
  14059. + unsigned long flags;
  14060. +
  14061. + raw_spin_lock_irqsave(&ctx->lock, flags);
  14062. + /*
  14063. + * may read while context is not active
  14064. + * (e.g., thread is blocked); in that case
  14065. + * we cannot update context time
  14066. + */
  14067. + if (ctx->is_active) {
  14068. + update_context_time(ctx);
  14069. + update_cgrp_time_from_event(event);
  14070. + }
  14071. + update_event_times(event);
  14072. + raw_spin_unlock_irqrestore(&ctx->lock, flags);
  14073. + }
  14074. +
  14075. + return perf_event_count(event);
  14076. +}
  14077. +
  14078. +/*
  14079. + * Initialize the perf_event context in a task_struct:
  14080. + */
  14081. +static void __perf_event_init_context(struct perf_event_context *ctx)
  14082. +{
  14083. + raw_spin_lock_init(&ctx->lock);
  14084. + mutex_init(&ctx->mutex);
  14085. + INIT_LIST_HEAD(&ctx->pinned_groups);
  14086. + INIT_LIST_HEAD(&ctx->flexible_groups);
  14087. + INIT_LIST_HEAD(&ctx->event_list);
  14088. + atomic_set(&ctx->refcount, 1);
  14089. + INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
  14090. +}
  14091. +
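+/*
+ * Allocate and initialise a new perf_event_context; takes a reference
+ * on @task when one is given.
+ */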
  14092. +static struct perf_event_context *
  14093. +alloc_perf_context(struct pmu *pmu, struct task_struct *task)
  14094. +{
  14095. + struct perf_event_context *ctx;
  14096. +
  14097. + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
  14098. + if (!ctx)
  14099. + return NULL;
  14100. +
  14101. + __perf_event_init_context(ctx);
  14102. + if (task) {
  14103. + ctx->task = task;
  14104. + get_task_struct(task);
  14105. + }
  14106. + ctx->pmu = pmu;
  14107. +
  14108. + return ctx;
  14109. +}
  14110. +
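+/*
+ * Resolve @vpid to a task (current when @vpid is 0), take a reference
+ * on it and apply the ptrace read-permission check.
+ */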
  14111. +static struct task_struct *
  14112. +find_lively_task_by_vpid(pid_t vpid)
  14113. +{
  14114. + struct task_struct *task;
  14115. + int err;
  14116. +
  14117. + rcu_read_lock();
  14118. + if (!vpid)
  14119. + task = current;
  14120. + else
  14121. + task = find_task_by_vpid(vpid);
  14122. + if (task)
  14123. + get_task_struct(task);
  14124. + rcu_read_unlock();
  14125. +
  14126. + if (!task)
  14127. + return ERR_PTR(-ESRCH);
  14128. +
  14129. + /* Reuse ptrace permission checks for now. */
  14130. + err = -EACCES;
  14131. + if (!ptrace_may_access(task, PTRACE_MODE_READ))
  14132. + goto errout;
  14133. +
  14134. + return task;
  14135. +errout:
  14136. + put_task_struct(task);
  14137. + return ERR_PTR(err);
  14138. +
  14139. +}
  14140. +
  14141. +/*
  14142. + * Returns a matching context with refcount and pincount.
  14143. + */
  14144. +static struct perf_event_context *
  14145. +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
  14146. +{
  14147. + struct perf_event_context *ctx, *clone_ctx = NULL;
  14148. + struct perf_cpu_context *cpuctx;
  14149. + unsigned long flags;
  14150. + int ctxn, err;
  14151. +
  14152. + if (!task) {
  14153. + /* Must be root to operate on a CPU event: */
  14154. + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
  14155. + return ERR_PTR(-EACCES);
  14156. +
  14157. + /*
  14158. + * We could be clever and allow attaching an event to an
  14159. + * offline CPU and activate it when the CPU comes up, but
  14160. + * that's for later.
  14161. + */
  14162. + if (!cpu_online(cpu))
  14163. + return ERR_PTR(-ENODEV);
  14164. +
  14165. + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  14166. + ctx = &cpuctx->ctx;
  14167. + get_ctx(ctx);
  14168. + ++ctx->pin_count;
  14169. +
  14170. + return ctx;
  14171. + }
  14172. +
  14173. + err = -EINVAL;
  14174. + ctxn = pmu->task_ctx_nr;
  14175. + if (ctxn < 0)
  14176. + goto errout;
  14177. +
  14178. +retry:
  14179. + ctx = perf_lock_task_context(task, ctxn, &flags);
  14180. + if (ctx) {
  14181. + clone_ctx = unclone_ctx(ctx);
  14182. + ++ctx->pin_count;
  14183. + raw_spin_unlock_irqrestore(&ctx->lock, flags);
  14184. +
  14185. + if (clone_ctx)
  14186. + put_ctx(clone_ctx);
  14187. + } else {
  14188. + ctx = alloc_perf_context(pmu, task);
  14189. + err = -ENOMEM;
  14190. + if (!ctx)
  14191. + goto errout;
  14192. +
  14193. + err = 0;
  14194. + mutex_lock(&task->perf_event_mutex);
  14195. + /*
  14196. + * If it has already passed perf_event_exit_task(),
  14197. + * we must see PF_EXITING; it takes this mutex too.
  14198. + */
  14199. + if (task->flags & PF_EXITING)
  14200. + err = -ESRCH;
  14201. + else if (task->perf_event_ctxp[ctxn])
  14202. + err = -EAGAIN;
  14203. + else {
  14204. + get_ctx(ctx);
  14205. + ++ctx->pin_count;
  14206. + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
  14207. + }
  14208. + mutex_unlock(&task->perf_event_mutex);
  14209. +
  14210. + if (unlikely(err)) {
  14211. + put_ctx(ctx);
  14212. +
  14213. + if (err == -EAGAIN)
  14214. + goto retry;
  14215. + goto errout;
  14216. + }
  14217. + }
  14218. +
  14219. + return ctx;
  14220. +
  14221. +errout:
  14222. + return ERR_PTR(err);
  14223. +}
  14224. +
  14225. +static void perf_event_free_filter(struct perf_event *event);
  14226. +
  14227. +static void free_event_rcu(struct rcu_head *head)
  14228. +{
  14229. + struct perf_event *event;
  14230. +
  14231. + event = container_of(head, struct perf_event, rcu_head);
  14232. + if (event->ns)
  14233. + put_pid_ns(event->ns);
  14234. + perf_event_free_filter(event);
  14235. + kfree(event);
  14236. +}
  14237. +
  14238. +static void ring_buffer_put(struct ring_buffer *rb);
  14239. +static void ring_buffer_attach(struct perf_event *event,
  14240. + struct ring_buffer *rb);
  14241. +
  14242. +static void unaccount_event_cpu(struct perf_event *event, int cpu)
  14243. +{
  14244. + if (event->parent)
  14245. + return;
  14246. +
  14247. + if (has_branch_stack(event)) {
  14248. + if (!(event->attach_state & PERF_ATTACH_TASK))
  14249. + atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
  14250. + }
  14251. + if (is_cgroup_event(event))
  14252. + atomic_dec(&per_cpu(perf_cgroup_events, cpu));
  14253. +}
  14254. +
  14255. +static void unaccount_event(struct perf_event *event)
  14256. +{
  14257. + if (event->parent)
  14258. + return;
  14259. +
  14260. + if (event->attach_state & PERF_ATTACH_TASK)
  14261. + static_key_slow_dec_deferred(&perf_sched_events);
  14262. + if (event->attr.mmap || event->attr.mmap_data)
  14263. + atomic_dec(&nr_mmap_events);
  14264. + if (event->attr.comm)
  14265. + atomic_dec(&nr_comm_events);
  14266. + if (event->attr.task)
  14267. + atomic_dec(&nr_task_events);
  14268. + if (event->attr.freq)
  14269. + atomic_dec(&nr_freq_events);
  14270. + if (is_cgroup_event(event))
  14271. + static_key_slow_dec_deferred(&perf_sched_events);
  14272. + if (has_branch_stack(event))
  14273. + static_key_slow_dec_deferred(&perf_sched_events);
  14274. +
  14275. + unaccount_event_cpu(event, event->cpu);
  14276. +}
  14277. +
  14278. +static void __free_event(struct perf_event *event)
  14279. +{
  14280. + if (!event->parent) {
  14281. + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
  14282. + put_callchain_buffers();
  14283. + }
  14284. +
  14285. + if (event->destroy)
  14286. + event->destroy(event);
  14287. +
  14288. + if (event->ctx)
  14289. + put_ctx(event->ctx);
  14290. +
  14291. + if (event->pmu)
  14292. + module_put(event->pmu->module);
  14293. +
  14294. + call_rcu(&event->rcu_head, free_event_rcu);
  14295. +}
  14296. +
  14297. +static void _free_event(struct perf_event *event)
  14298. +{
  14299. + irq_work_sync(&event->pending);
  14300. +
  14301. + unaccount_event(event);
  14302. +
  14303. + if (event->rb) {
  14304. + /*
  14305. + * Can happen when we close an event with re-directed output.
  14306. + *
  14307. + * Since we have a 0 refcount, perf_mmap_close() will skip
  14308. + * over us; possibly making our ring_buffer_put() the last.
  14309. + */
  14310. + mutex_lock(&event->mmap_mutex);
  14311. + ring_buffer_attach(event, NULL);
  14312. + mutex_unlock(&event->mmap_mutex);
  14313. + }
  14314. +
  14315. + if (is_cgroup_event(event))
  14316. + perf_detach_cgroup(event);
  14317. +
  14318. + __free_event(event);
  14319. +}
  14320. +
  14321. +/*
  14322. + * Used to free events which have a known refcount of 1, such as in error paths
  14323. + * where the event isn't exposed yet and inherited events.
  14324. + */
  14325. +static void free_event(struct perf_event *event)
  14326. +{
  14327. + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
  14328. + "unexpected event refcount: %ld; ptr=%p\n",
  14329. + atomic_long_read(&event->refcount), event)) {
  14330. + /* leak to avoid use-after-free */
  14331. + return;
  14332. + }
  14333. +
  14334. + _free_event(event);
  14335. +}
  14336. +
  14337. +/*
  14338. + * Remove user event from the owner task.
  14339. + */
  14340. +static void perf_remove_from_owner(struct perf_event *event)
  14341. +{
  14342. + struct task_struct *owner;
  14343. +
  14344. + rcu_read_lock();
  14345. + owner = ACCESS_ONCE(event->owner);
  14346. + /*
  14347. + * Matches the smp_wmb() in perf_event_exit_task(). If we observe
  14348. + * !owner it means the list deletion is complete and we can indeed
  14349. + * free this event, otherwise we need to serialize on
  14350. + * owner->perf_event_mutex.
  14351. + */
  14352. + smp_read_barrier_depends();
  14353. + if (owner) {
  14354. + /*
  14355. + * Since delayed_put_task_struct() also drops the last
  14356. + * task reference we can safely take a new reference
  14357. + * while holding the rcu_read_lock().
  14358. + */
  14359. + get_task_struct(owner);
  14360. + }
  14361. + rcu_read_unlock();
  14362. +
  14363. + if (owner) {
  14364. + mutex_lock(&owner->perf_event_mutex);
  14365. + /*
  14366. + * We have to re-check the event->owner field; if it is cleared,
  14367. + * we raced with perf_event_exit_task(). Acquiring the mutex
  14368. + * ensured they're done, and we can proceed with freeing the
  14369. + * event.
  14370. + */
  14371. + if (event->owner)
  14372. + list_del_init(&event->owner_entry);
  14373. + mutex_unlock(&owner->perf_event_mutex);
  14374. + put_task_struct(owner);
  14375. + }
  14376. +}
  14377. +
  14378. +/*
  14379. + * Called when the last reference to the file is gone.
  14380. + */
  14381. +static void put_event(struct perf_event *event)
  14382. +{
  14383. + struct perf_event_context *ctx = event->ctx;
  14384. +
  14385. + if (!atomic_long_dec_and_test(&event->refcount))
  14386. + return;
  14387. +
  14388. + if (!is_kernel_event(event))
  14389. + perf_remove_from_owner(event);
  14390. +
  14391. + WARN_ON_ONCE(ctx->parent_ctx);
  14392. + /*
  14393. + * There are two ways this annotation is useful:
  14394. + *
  14395. + * 1) there is a lock recursion from perf_event_exit_task
  14396. + * see the comment there.
  14397. + *
  14398. + * 2) there is a lock-inversion with mmap_sem through
  14399. + * perf_event_read_group(), which takes faults while
  14400. + * holding ctx->mutex, however this is called after
  14401. + * the last filedesc died, so there is no possibility
  14402. + * to trigger the AB-BA case.
  14403. + */
  14404. + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
  14405. + perf_remove_from_context(event, true);
  14406. + mutex_unlock(&ctx->mutex);
  14407. +
  14408. + _free_event(event);
  14409. +}
  14410. +
  14411. +int perf_event_release_kernel(struct perf_event *event)
  14412. +{
  14413. + put_event(event);
  14414. + return 0;
  14415. +}
  14416. +EXPORT_SYMBOL_GPL(perf_event_release_kernel);
  14417. +
  14418. +static int perf_release(struct inode *inode, struct file *file)
  14419. +{
  14420. + put_event(file->private_data);
  14421. + return 0;
  14422. +}
  14423. +
  14424. +/*
  14425. + * Remove all orphaned events from the context.
  14426. + */
  14427. +static void orphans_remove_work(struct work_struct *work)
  14428. +{
  14429. + struct perf_event_context *ctx;
  14430. + struct perf_event *event, *tmp;
  14431. +
  14432. + ctx = container_of(work, struct perf_event_context,
  14433. + orphans_remove.work);
  14434. +
  14435. + mutex_lock(&ctx->mutex);
  14436. + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
  14437. + struct perf_event *parent_event = event->parent;
  14438. +
  14439. + if (!is_orphaned_child(event))
  14440. + continue;
  14441. +
  14442. + perf_remove_from_context(event, true);
  14443. +
  14444. + mutex_lock(&parent_event->child_mutex);
  14445. + list_del_init(&event->child_list);
  14446. + mutex_unlock(&parent_event->child_mutex);
  14447. +
  14448. + free_event(event);
  14449. + put_event(parent_event);
  14450. + }
  14451. +
  14452. + raw_spin_lock_irq(&ctx->lock);
  14453. + ctx->orphans_remove_sched = false;
  14454. + raw_spin_unlock_irq(&ctx->lock);
  14455. + mutex_unlock(&ctx->mutex);
  14456. +
  14457. + put_ctx(ctx);
  14458. +}
  14459. +
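+/*
+ * Read an event's value summed over all of its inherited children,
+ * also reporting the aggregated enabled and running times.
+ */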
  14460. +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
  14461. +{
  14462. + struct perf_event *child;
  14463. + u64 total = 0;
  14464. +
  14465. + *enabled = 0;
  14466. + *running = 0;
  14467. +
  14468. + mutex_lock(&event->child_mutex);
  14469. + total += perf_event_read(event);
  14470. + *enabled += event->total_time_enabled +
  14471. + atomic64_read(&event->child_total_time_enabled);
  14472. + *running += event->total_time_running +
  14473. + atomic64_read(&event->child_total_time_running);
  14474. +
  14475. + list_for_each_entry(child, &event->child_list, child_list) {
  14476. + total += perf_event_read(child);
  14477. + *enabled += child->total_time_enabled;
  14478. + *running += child->total_time_running;
  14479. + }
  14480. + mutex_unlock(&event->child_mutex);
  14481. +
  14482. + return total;
  14483. +}
  14484. +EXPORT_SYMBOL_GPL(perf_event_read_value);
  14485. +
  14486. +static int perf_event_read_group(struct perf_event *event,
  14487. + u64 read_format, char __user *buf)
  14488. +{
  14489. + struct perf_event *leader = event->group_leader, *sub;
  14490. + int n = 0, size = 0, ret = -EFAULT;
  14491. + struct perf_event_context *ctx = leader->ctx;
  14492. + u64 values[5];
  14493. + u64 count, enabled, running;
  14494. +
  14495. + mutex_lock(&ctx->mutex);
  14496. + count = perf_event_read_value(leader, &enabled, &running);
  14497. +
  14498. + values[n++] = 1 + leader->nr_siblings;
  14499. + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
  14500. + values[n++] = enabled;
  14501. + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
  14502. + values[n++] = running;
  14503. + values[n++] = count;
  14504. + if (read_format & PERF_FORMAT_ID)
  14505. + values[n++] = primary_event_id(leader);
  14506. +
  14507. + size = n * sizeof(u64);
  14508. +
  14509. + if (copy_to_user(buf, values, size))
  14510. + goto unlock;
  14511. +
  14512. + ret = size;
  14513. +
  14514. + list_for_each_entry(sub, &leader->sibling_list, group_entry) {
  14515. + n = 0;
  14516. +
  14517. + values[n++] = perf_event_read_value(sub, &enabled, &running);
  14518. + if (read_format & PERF_FORMAT_ID)
  14519. + values[n++] = primary_event_id(sub);
  14520. +
  14521. + size = n * sizeof(u64);
  14522. +
  14523. + if (copy_to_user(buf + ret, values, size)) {
  14524. + ret = -EFAULT;
  14525. + goto unlock;
  14526. + }
  14527. +
  14528. + ret += size;
  14529. + }
  14530. +unlock:
  14531. + mutex_unlock(&ctx->mutex);
  14532. +
  14533. + return ret;
  14534. +}
  14535. +
  14536. +static int perf_event_read_one(struct perf_event *event,
  14537. + u64 read_format, char __user *buf)
  14538. +{
  14539. + u64 enabled, running;
  14540. + u64 values[4];
  14541. + int n = 0;
  14542. +
  14543. + values[n++] = perf_event_read_value(event, &enabled, &running);
  14544. + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
  14545. + values[n++] = enabled;
  14546. + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
  14547. + values[n++] = running;
  14548. + if (read_format & PERF_FORMAT_ID)
  14549. + values[n++] = primary_event_id(event);
  14550. +
  14551. + if (copy_to_user(buf, values, n * sizeof(u64)))
  14552. + return -EFAULT;
  14553. +
  14554. + return n * sizeof(u64);
  14555. +}
  14556. +
  14557. +static bool is_event_hup(struct perf_event *event)
  14558. +{
  14559. + bool no_children;
  14560. +
  14561. + if (event->state != PERF_EVENT_STATE_EXIT)
  14562. + return false;
  14563. +
  14564. + mutex_lock(&event->child_mutex);
  14565. + no_children = list_empty(&event->child_list);
  14566. + mutex_unlock(&event->child_mutex);
  14567. + return no_children;
  14568. +}
  14569. +
  14570. +/*
  14571. + * Read the performance event - simple non-blocking version for now
  14572. + */
  14573. +static ssize_t
  14574. +perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
  14575. +{
  14576. + u64 read_format = event->attr.read_format;
  14577. + int ret;
  14578. +
  14579. + /*
  14580. + * Return end-of-file for a read on an event that is in
  14581. + * error state (i.e. because it was pinned but it couldn't be
  14582. + * scheduled on to the CPU at some point).
  14583. + */
  14584. + if (event->state == PERF_EVENT_STATE_ERROR)
  14585. + return 0;
  14586. +
  14587. + if (count < event->read_size)
  14588. + return -ENOSPC;
  14589. +
  14590. + WARN_ON_ONCE(event->ctx->parent_ctx);
  14591. + if (read_format & PERF_FORMAT_GROUP)
  14592. + ret = perf_event_read_group(event, read_format, buf);
  14593. + else
  14594. + ret = perf_event_read_one(event, read_format, buf);
  14595. +
  14596. + return ret;
  14597. +}
  14598. +
  14599. +static ssize_t
  14600. +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
  14601. +{
  14602. + struct perf_event *event = file->private_data;
  14603. +
  14604. + return perf_read_hw(event, buf, count);
  14605. +}
  14606. +
  14607. +static unsigned int perf_poll(struct file *file, poll_table *wait)
  14608. +{
  14609. + struct perf_event *event = file->private_data;
  14610. + struct ring_buffer *rb;
  14611. + unsigned int events = POLLHUP;
  14612. +
  14613. + poll_wait(file, &event->waitq, wait);
  14614. +
  14615. + if (is_event_hup(event))
  14616. + return events;
  14617. +
  14618. + /*
  14619. + * Pin the event->rb by taking event->mmap_mutex; otherwise
  14620. + * perf_event_set_output() can swizzle our rb and make us miss wakeups.
  14621. + */
  14622. + mutex_lock(&event->mmap_mutex);
  14623. + rb = event->rb;
  14624. + if (rb)
  14625. + events = atomic_xchg(&rb->poll, 0);
  14626. + mutex_unlock(&event->mmap_mutex);
  14627. + return events;
  14628. +}
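On the user-space side, poll() on the event fd reports readable data when the ring buffer crosses the wakeup threshold and POLLHUP once the event and all of its children have exited, matching is_event_hup() above. A small illustrative loop body (function name is an assumption):

#include <poll.h>

/* Illustrative: wait for ring-buffer data or hangup on a perf event fd. */
static int wait_for_samples(int perf_fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret <= 0)
		return ret;                 /* error or timeout */
	if (pfd.revents & POLLHUP)
		return 0;                   /* event exited, no more data coming */
	return (pfd.revents & POLLIN) ? 1 : 0;
}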
  14629. +
  14630. +static void perf_event_reset(struct perf_event *event)
  14631. +{
  14632. + (void)perf_event_read(event);
  14633. + local64_set(&event->count, 0);
  14634. + perf_event_update_userpage(event);
  14635. +}
  14636. +
  14637. +/*
  14638. + * Holding the top-level event's child_mutex means that any
  14639. + * descendant process that has inherited this event will block
  14640. + * in sync_child_event if it goes to exit, thus satisfying the
  14641. + * task existence requirements of perf_event_enable/disable.
  14642. + */
  14643. +static void perf_event_for_each_child(struct perf_event *event,
  14644. + void (*func)(struct perf_event *))
  14645. +{
  14646. + struct perf_event *child;
  14647. +
  14648. + WARN_ON_ONCE(event->ctx->parent_ctx);
  14649. + mutex_lock(&event->child_mutex);
  14650. + func(event);
  14651. + list_for_each_entry(child, &event->child_list, child_list)
  14652. + func(child);
  14653. + mutex_unlock(&event->child_mutex);
  14654. +}
  14655. +
  14656. +static void perf_event_for_each(struct perf_event *event,
  14657. + void (*func)(struct perf_event *))
  14658. +{
  14659. + struct perf_event_context *ctx = event->ctx;
  14660. + struct perf_event *sibling;
  14661. +
  14662. + WARN_ON_ONCE(ctx->parent_ctx);
  14663. + mutex_lock(&ctx->mutex);
  14664. + event = event->group_leader;
  14665. +
  14666. + perf_event_for_each_child(event, func);
  14667. + list_for_each_entry(sibling, &event->sibling_list, group_entry)
  14668. + perf_event_for_each_child(sibling, func);
  14669. + mutex_unlock(&ctx->mutex);
  14670. +}
  14671. +
  14672. +static int perf_event_period(struct perf_event *event, u64 __user *arg)
  14673. +{
  14674. + struct perf_event_context *ctx = event->ctx;
  14675. + int ret = 0, active;
  14676. + u64 value;
  14677. +
  14678. + if (!is_sampling_event(event))
  14679. + return -EINVAL;
  14680. +
  14681. + if (copy_from_user(&value, arg, sizeof(value)))
  14682. + return -EFAULT;
  14683. +
  14684. + if (!value)
  14685. + return -EINVAL;
  14686. +
  14687. + raw_spin_lock_irq(&ctx->lock);
  14688. + if (event->attr.freq) {
  14689. + if (value > sysctl_perf_event_sample_rate) {
  14690. + ret = -EINVAL;
  14691. + goto unlock;
  14692. + }
  14693. +
  14694. + event->attr.sample_freq = value;
  14695. + } else {
  14696. + event->attr.sample_period = value;
  14697. + event->hw.sample_period = value;
  14698. + }
  14699. +
  14700. + active = (event->state == PERF_EVENT_STATE_ACTIVE);
  14701. + if (active) {
  14702. + perf_pmu_disable(ctx->pmu);
  14703. + event->pmu->stop(event, PERF_EF_UPDATE);
  14704. + }
  14705. +
  14706. + local64_set(&event->hw.period_left, 0);
  14707. +
  14708. + if (active) {
  14709. + event->pmu->start(event, PERF_EF_RELOAD);
  14710. + perf_pmu_enable(ctx->pmu);
  14711. + }
  14712. +
  14713. +unlock:
  14714. + raw_spin_unlock_irq(&ctx->lock);
  14715. +
  14716. + return ret;
  14717. +}
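From user space the corresponding call is the PERF_EVENT_IOC_PERIOD ioctl, which takes a pointer to a u64 (interpreted as a frequency if attr.freq was set when the event was opened, otherwise as a period), exactly as perf_event_period() reads it above. A minimal sketch, with an illustrative helper name:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Sketch: change the sample period (or frequency, if attr.freq was set)
 * of an already-open sampling event. */
static int set_sample_period(int perf_fd, uint64_t period)
{
	return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period);
}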
  14718. +
  14719. +static const struct file_operations perf_fops;
  14720. +
  14721. +static inline int perf_fget_light(int fd, struct fd *p)
  14722. +{
  14723. + struct fd f = fdget(fd);
  14724. + if (!f.file)
  14725. + return -EBADF;
  14726. +
  14727. + if (f.file->f_op != &perf_fops) {
  14728. + fdput(f);
  14729. + return -EBADF;
  14730. + }
  14731. + *p = f;
  14732. + return 0;
  14733. +}
  14734. +
  14735. +static int perf_event_set_output(struct perf_event *event,
  14736. + struct perf_event *output_event);
  14737. +static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  14738. +
  14739. +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  14740. +{
  14741. + struct perf_event *event = file->private_data;
  14742. + void (*func)(struct perf_event *);
  14743. + u32 flags = arg;
  14744. +
  14745. + switch (cmd) {
  14746. + case PERF_EVENT_IOC_ENABLE:
  14747. + func = perf_event_enable;
  14748. + break;
  14749. + case PERF_EVENT_IOC_DISABLE:
  14750. + func = perf_event_disable;
  14751. + break;
  14752. + case PERF_EVENT_IOC_RESET:
  14753. + func = perf_event_reset;
  14754. + break;
  14755. +
  14756. + case PERF_EVENT_IOC_REFRESH:
  14757. + return perf_event_refresh(event, arg);
  14758. +
  14759. + case PERF_EVENT_IOC_PERIOD:
  14760. + return perf_event_period(event, (u64 __user *)arg);
  14761. +
  14762. + case PERF_EVENT_IOC_ID:
  14763. + {
  14764. + u64 id = primary_event_id(event);
  14765. +
  14766. + if (copy_to_user((void __user *)arg, &id, sizeof(id)))
  14767. + return -EFAULT;
  14768. + return 0;
  14769. + }
  14770. +
  14771. + case PERF_EVENT_IOC_SET_OUTPUT:
  14772. + {
  14773. + int ret;
  14774. + if (arg != -1) {
  14775. + struct perf_event *output_event;
  14776. + struct fd output;
  14777. + ret = perf_fget_light(arg, &output);
  14778. + if (ret)
  14779. + return ret;
  14780. + output_event = output.file->private_data;
  14781. + ret = perf_event_set_output(event, output_event);
  14782. + fdput(output);
  14783. + } else {
  14784. + ret = perf_event_set_output(event, NULL);
  14785. + }
  14786. + return ret;
  14787. + }
  14788. +
  14789. + case PERF_EVENT_IOC_SET_FILTER:
  14790. + return perf_event_set_filter(event, (void __user *)arg);
  14791. +
  14792. + default:
  14793. + return -ENOTTY;
  14794. + }
  14795. +
  14796. + if (flags & PERF_IOC_FLAG_GROUP)
  14797. + perf_event_for_each(event, func);
  14798. + else
  14799. + perf_event_for_each_child(event, func);
  14800. +
  14801. + return 0;
  14802. +}
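The PERF_IOC_FLAG_GROUP argument handled at the end of the switch is what lets user space apply enable/disable/reset to a whole group through the leader's file descriptor. A short sketch (helper name is illustrative):

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Sketch: reset and start every member of an event group via the
 * group leader's file descriptor. */
static int start_group(int leader_fd)
{
	if (ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP))
		return -1;
	return ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}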
  14803. +
  14804. +#ifdef CONFIG_COMPAT
  14805. +static long perf_compat_ioctl(struct file *file, unsigned int cmd,
  14806. + unsigned long arg)
  14807. +{
  14808. + switch (_IOC_NR(cmd)) {
  14809. + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
  14810. + case _IOC_NR(PERF_EVENT_IOC_ID):
  14811. + /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
  14812. + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
  14813. + cmd &= ~IOCSIZE_MASK;
  14814. + cmd |= sizeof(void *) << IOCSIZE_SHIFT;
  14815. + }
  14816. + break;
  14817. + }
  14818. + return perf_ioctl(file, cmd, arg);
  14819. +}
  14820. +#else
  14821. +# define perf_compat_ioctl NULL
  14822. +#endif
  14823. +
  14824. +int perf_event_task_enable(void)
  14825. +{
  14826. + struct perf_event *event;
  14827. +
  14828. + mutex_lock(&current->perf_event_mutex);
  14829. + list_for_each_entry(event, &current->perf_event_list, owner_entry)
  14830. + perf_event_for_each_child(event, perf_event_enable);
  14831. + mutex_unlock(&current->perf_event_mutex);
  14832. +
  14833. + return 0;
  14834. +}
  14835. +
  14836. +int perf_event_task_disable(void)
  14837. +{
  14838. + struct perf_event *event;
  14839. +
  14840. + mutex_lock(&current->perf_event_mutex);
  14841. + list_for_each_entry(event, &current->perf_event_list, owner_entry)
  14842. + perf_event_for_each_child(event, perf_event_disable);
  14843. + mutex_unlock(&current->perf_event_mutex);
  14844. +
  14845. + return 0;
  14846. +}
  14847. +
  14848. +static int perf_event_index(struct perf_event *event)
  14849. +{
  14850. + if (event->hw.state & PERF_HES_STOPPED)
  14851. + return 0;
  14852. +
  14853. + if (event->state != PERF_EVENT_STATE_ACTIVE)
  14854. + return 0;
  14855. +
  14856. + return event->pmu->event_idx(event);
  14857. +}
  14858. +
  14859. +static void calc_timer_values(struct perf_event *event,
  14860. + u64 *now,
  14861. + u64 *enabled,
  14862. + u64 *running)
  14863. +{
  14864. + u64 ctx_time;
  14865. +
  14866. + *now = perf_clock();
  14867. + ctx_time = event->shadow_ctx_time + *now;
  14868. + *enabled = ctx_time - event->tstamp_enabled;
  14869. + *running = ctx_time - event->tstamp_running;
  14870. +}
  14871. +
  14872. +static void perf_event_init_userpage(struct perf_event *event)
  14873. +{
  14874. + struct perf_event_mmap_page *userpg;
  14875. + struct ring_buffer *rb;
  14876. +
  14877. + rcu_read_lock();
  14878. + rb = rcu_dereference(event->rb);
  14879. + if (!rb)
  14880. + goto unlock;
  14881. +
  14882. + userpg = rb->user_page;
  14883. +
  14884. + /* Allow new userspace to detect that bit 0 is deprecated */
  14885. + userpg->cap_bit0_is_deprecated = 1;
  14886. + userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
  14887. +
  14888. +unlock:
  14889. + rcu_read_unlock();
  14890. +}
  14891. +
  14892. +void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
  14893. +{
  14894. +}
  14895. +
  14896. +/*
  14897. + * Callers need to ensure there can be no nesting of this function, otherwise
  14898. + * the seqlock logic goes bad. We cannot serialize this because the arch
  14899. + * code calls this from NMI context.
  14900. + */
  14901. +void perf_event_update_userpage(struct perf_event *event)
  14902. +{
  14903. + struct perf_event_mmap_page *userpg;
  14904. + struct ring_buffer *rb;
  14905. + u64 enabled, running, now;
  14906. +
  14907. + rcu_read_lock();
  14908. + rb = rcu_dereference(event->rb);
  14909. + if (!rb)
  14910. + goto unlock;
  14911. +
  14912. + /*
  14913. + * compute total_time_enabled, total_time_running
  14914. + * based on snapshot values taken when the event
  14915. + * was last scheduled in.
  14916. + *
  14917. + * we cannot simply call update_context_time()
  14918. + * because of locking issues, as we can be called in
  14919. + * NMI context
  14920. + */
  14921. + calc_timer_values(event, &now, &enabled, &running);
  14922. +
  14923. + userpg = rb->user_page;
  14924. + /*
  14925. + * Disable preemption so as to not let the corresponding user-space
  14926. + * spin too long if we get preempted.
  14927. + */
  14928. + preempt_disable();
  14929. + ++userpg->lock;
  14930. + barrier();
  14931. + userpg->index = perf_event_index(event);
  14932. + userpg->offset = perf_event_count(event);
  14933. + if (userpg->index)
  14934. + userpg->offset -= local64_read(&event->hw.prev_count);
  14935. +
  14936. + userpg->time_enabled = enabled +
  14937. + atomic64_read(&event->child_total_time_enabled);
  14938. +
  14939. + userpg->time_running = running +
  14940. + atomic64_read(&event->child_total_time_running);
  14941. +
  14942. + arch_perf_update_userpage(userpg, now);
  14943. +
  14944. + barrier();
  14945. + ++userpg->lock;
  14946. + preempt_enable();
  14947. +unlock:
  14948. + rcu_read_unlock();
  14949. +}
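The ->lock increments and barriers above form the write side of a seqcount: the counter is bumped before and after the update, so it is even while the fields are stable. A user-space reader is expected to retry until it sees the same value before and after reading. A sketch of that read loop, assuming the mmap()ed metadata page is available as pc and that userspace rdpmc is not used (pc->index == 0); the rmb() macro here is only a compiler barrier, which a real implementation would replace with the architecture's read barrier:

#include <stdint.h>
#include <linux/perf_event.h>

#define rmb()	__asm__ __volatile__("" ::: "memory")  /* compiler barrier; illustrative */

/* Sketch: snapshot offset/time_enabled/time_running from the mmap()ed
 * struct perf_event_mmap_page, retrying while the writer is mid-update. */
static void read_userpage(volatile struct perf_event_mmap_page *pc,
			  int64_t *count, uint64_t *enabled, uint64_t *running)
{
	uint32_t seq;

	do {
		seq = pc->lock;
		rmb();
		*count   = pc->offset;        /* full count when pc->index == 0 */
		*enabled = pc->time_enabled;
		*running = pc->time_running;
		rmb();
	} while (pc->lock != seq);
}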
  14950. +
  14951. +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  14952. +{
  14953. + struct perf_event *event = vma->vm_file->private_data;
  14954. + struct ring_buffer *rb;
  14955. + int ret = VM_FAULT_SIGBUS;
  14956. +
  14957. + if (vmf->flags & FAULT_FLAG_MKWRITE) {
  14958. + if (vmf->pgoff == 0)
  14959. + ret = 0;
  14960. + return ret;
  14961. + }
  14962. +
  14963. + rcu_read_lock();
  14964. + rb = rcu_dereference(event->rb);
  14965. + if (!rb)
  14966. + goto unlock;
  14967. +
  14968. + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
  14969. + goto unlock;
  14970. +
  14971. + vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
  14972. + if (!vmf->page)
  14973. + goto unlock;
  14974. +
  14975. + get_page(vmf->page);
  14976. + vmf->page->mapping = vma->vm_file->f_mapping;
  14977. + vmf->page->index = vmf->pgoff;
  14978. +
  14979. + ret = 0;
  14980. +unlock:
  14981. + rcu_read_unlock();
  14982. +
  14983. + return ret;
  14984. +}
  14985. +
  14986. +static void ring_buffer_attach(struct perf_event *event,
  14987. + struct ring_buffer *rb)
  14988. +{
  14989. + struct ring_buffer *old_rb = NULL;
  14990. + unsigned long flags;
  14991. +
  14992. + if (event->rb) {
  14993. + /*
  14994. + * Should be impossible, we set this when removing
  14995. + * event->rb_entry and wait/clear when adding event->rb_entry.
  14996. + */
  14997. + WARN_ON_ONCE(event->rcu_pending);
  14998. +
  14999. + old_rb = event->rb;
  15000. + event->rcu_batches = get_state_synchronize_rcu();
  15001. + event->rcu_pending = 1;
  15002. +
  15003. + spin_lock_irqsave(&old_rb->event_lock, flags);
  15004. + list_del_rcu(&event->rb_entry);
  15005. + spin_unlock_irqrestore(&old_rb->event_lock, flags);
  15006. + }
  15007. +
  15008. + if (event->rcu_pending && rb) {
  15009. + cond_synchronize_rcu(event->rcu_batches);
  15010. + event->rcu_pending = 0;
  15011. + }
  15012. +
  15013. + if (rb) {
  15014. + spin_lock_irqsave(&rb->event_lock, flags);
  15015. + list_add_rcu(&event->rb_entry, &rb->event_list);
  15016. + spin_unlock_irqrestore(&rb->event_lock, flags);
  15017. + }
  15018. +
  15019. + rcu_assign_pointer(event->rb, rb);
  15020. +
  15021. + if (old_rb) {
  15022. + ring_buffer_put(old_rb);
  15023. + /*
  15024. + * Since we detached the old rb before attaching the new one,
  15025. + * we could have missed a wakeup.
  15026. + * Provide it now.
  15027. + */
  15028. + wake_up_all(&event->waitq);
  15029. + }
  15030. +}
  15031. +
  15032. +static void ring_buffer_wakeup(struct perf_event *event)
  15033. +{
  15034. + struct ring_buffer *rb;
  15035. +
  15036. + rcu_read_lock();
  15037. + rb = rcu_dereference(event->rb);
  15038. + if (rb) {
  15039. + list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
  15040. + wake_up_all(&event->waitq);
  15041. + }
  15042. + rcu_read_unlock();
  15043. +}
  15044. +
  15045. +static void rb_free_rcu(struct rcu_head *rcu_head)
  15046. +{
  15047. + struct ring_buffer *rb;
  15048. +
  15049. + rb = container_of(rcu_head, struct ring_buffer, rcu_head);
  15050. + rb_free(rb);
  15051. +}
  15052. +
  15053. +static struct ring_buffer *ring_buffer_get(struct perf_event *event)
  15054. +{
  15055. + struct ring_buffer *rb;
  15056. +
  15057. + rcu_read_lock();
  15058. + rb = rcu_dereference(event->rb);
  15059. + if (rb) {
  15060. + if (!atomic_inc_not_zero(&rb->refcount))
  15061. + rb = NULL;
  15062. + }
  15063. + rcu_read_unlock();
  15064. +
  15065. + return rb;
  15066. +}
  15067. +
  15068. +static void ring_buffer_put(struct ring_buffer *rb)
  15069. +{
  15070. + if (!atomic_dec_and_test(&rb->refcount))
  15071. + return;
  15072. +
  15073. + WARN_ON_ONCE(!list_empty(&rb->event_list));
  15074. +
  15075. + call_rcu(&rb->rcu_head, rb_free_rcu);
  15076. +}
  15077. +
  15078. +static void perf_mmap_open(struct vm_area_struct *vma)
  15079. +{
  15080. + struct perf_event *event = vma->vm_file->private_data;
  15081. +
  15082. + atomic_inc(&event->mmap_count);
  15083. + atomic_inc(&event->rb->mmap_count);
  15084. +}
  15085. +
  15086. +/*
  15087. + * A buffer can be mmap()ed multiple times; either directly through the same
  15088. + * event, or through other events by use of perf_event_set_output().
  15089. + *
  15090. + * In order to undo the VM accounting done by perf_mmap() we need to destroy
  15091. + * the buffer here, where we still have a VM context. This means we need
  15092. + * to detach all events redirecting to us.
  15093. + */
  15094. +static void perf_mmap_close(struct vm_area_struct *vma)
  15095. +{
  15096. + struct perf_event *event = vma->vm_file->private_data;
  15097. +
  15098. + struct ring_buffer *rb = ring_buffer_get(event);
  15099. + struct user_struct *mmap_user = rb->mmap_user;
  15100. + int mmap_locked = rb->mmap_locked;
  15101. + unsigned long size = perf_data_size(rb);
  15102. +
  15103. + atomic_dec(&rb->mmap_count);
  15104. +
  15105. + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
  15106. + goto out_put;
  15107. +
  15108. + ring_buffer_attach(event, NULL);
  15109. + mutex_unlock(&event->mmap_mutex);
  15110. +
  15111. + /* If there are still other mmap()s of this buffer, we're done. */
  15112. + if (atomic_read(&rb->mmap_count))
  15113. + goto out_put;
  15114. +
  15115. + /*
  15116. + * No other mmap()s, detach from all other events that might redirect
  15117. + * into the now unreachable buffer. Somewhat complicated by the
  15118. + * fact that rb::event_lock otherwise nests inside mmap_mutex.
  15119. + */
  15120. +again:
  15121. + rcu_read_lock();
  15122. + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
  15123. + if (!atomic_long_inc_not_zero(&event->refcount)) {
  15124. + /*
  15125. + * This event is en-route to free_event() which will
  15126. + * detach it and remove it from the list.
  15127. + */
  15128. + continue;
  15129. + }
  15130. + rcu_read_unlock();
  15131. +
  15132. + mutex_lock(&event->mmap_mutex);
  15133. + /*
  15134. + * Check we didn't race with perf_event_set_output() which can
  15135. + * swizzle the rb from under us while we were waiting to
  15136. + * acquire mmap_mutex.
  15137. + *
  15138. + * If we find a different rb, ignore this event; the next
  15139. + * iteration will no longer find it on the list. We still have
  15140. + * to restart the iteration to make sure we're not now
  15141. + * iterating the wrong list.
  15142. + */
  15143. + if (event->rb == rb)
  15144. + ring_buffer_attach(event, NULL);
  15145. +
  15146. + mutex_unlock(&event->mmap_mutex);
  15147. + put_event(event);
  15148. +
  15149. + /*
  15150. + * Restart the iteration; either we're on the wrong list or
  15151. + * we destroyed its integrity by doing a deletion.
  15152. + */
  15153. + goto again;
  15154. + }
  15155. + rcu_read_unlock();
  15156. +
  15157. + /*
  15158. + * It could be that there are still a few 0-ref events on the list; they'll
  15159. + * get cleaned up by free_event() -- they'll also still have their
  15160. + * ref on the rb and will free it whenever they are done with it.
  15161. + *
  15162. + * Aside from that, this buffer is 'fully' detached and unmapped,
  15163. + * undo the VM accounting.
  15164. + */
  15165. +
  15166. + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
  15167. + vma->vm_mm->pinned_vm -= mmap_locked;
  15168. + free_uid(mmap_user);
  15169. +
  15170. +out_put:
  15171. + ring_buffer_put(rb); /* could be last */
  15172. +}
  15173. +
  15174. +static const struct vm_operations_struct perf_mmap_vmops = {
  15175. + .open = perf_mmap_open,
  15176. + .close = perf_mmap_close,
  15177. + .fault = perf_mmap_fault,
  15178. + .page_mkwrite = perf_mmap_fault,
  15179. +};
  15180. +
  15181. +static int perf_mmap(struct file *file, struct vm_area_struct *vma)
  15182. +{
  15183. + struct perf_event *event = file->private_data;
  15184. + unsigned long user_locked, user_lock_limit;
  15185. + struct user_struct *user = current_user();
  15186. + unsigned long locked, lock_limit;
  15187. + struct ring_buffer *rb;
  15188. + unsigned long vma_size;
  15189. + unsigned long nr_pages;
  15190. + long user_extra, extra;
  15191. + int ret = 0, flags = 0;
  15192. +
  15193. + /*
  15194. + * Don't allow mmap() of inherited per-task counters. This would
  15195. + * create a performance issue due to all children writing to the
  15196. + * same rb.
  15197. + */
  15198. + if (event->cpu == -1 && event->attr.inherit)
  15199. + return -EINVAL;
  15200. +
  15201. + if (!(vma->vm_flags & VM_SHARED))
  15202. + return -EINVAL;
  15203. +
  15204. + vma_size = vma->vm_end - vma->vm_start;
  15205. + nr_pages = (vma_size / PAGE_SIZE) - 1;
  15206. +
  15207. + /*
  15208. + * If we have rb pages ensure they're a power-of-two number, so we
  15209. + * can do bitmasks instead of modulo.
  15210. + */
  15211. + if (nr_pages != 0 && !is_power_of_2(nr_pages))
  15212. + return -EINVAL;
  15213. +
  15214. + if (vma_size != PAGE_SIZE * (1 + nr_pages))
  15215. + return -EINVAL;
  15216. +
  15217. + if (vma->vm_pgoff != 0)
  15218. + return -EINVAL;
  15219. +
  15220. + WARN_ON_ONCE(event->ctx->parent_ctx);
  15221. +again:
  15222. + mutex_lock(&event->mmap_mutex);
  15223. + if (event->rb) {
  15224. + if (event->rb->nr_pages != nr_pages) {
  15225. + ret = -EINVAL;
  15226. + goto unlock;
  15227. + }
  15228. +
  15229. + if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
  15230. + /*
  15231. + * Raced against perf_mmap_close() through
  15232. + * perf_event_set_output(). Try again, hope for better
  15233. + * luck.
  15234. + */
  15235. + mutex_unlock(&event->mmap_mutex);
  15236. + goto again;
  15237. + }
  15238. +
  15239. + goto unlock;
  15240. + }
  15241. +
  15242. + user_extra = nr_pages + 1;
  15243. + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
  15244. +
  15245. + /*
  15246. + * Increase the limit linearly with more CPUs:
  15247. + */
  15248. + user_lock_limit *= num_online_cpus();
  15249. +
  15250. + user_locked = atomic_long_read(&user->locked_vm) + user_extra;
  15251. +
  15252. + extra = 0;
  15253. + if (user_locked > user_lock_limit)
  15254. + extra = user_locked - user_lock_limit;
  15255. +
  15256. + lock_limit = rlimit(RLIMIT_MEMLOCK);
  15257. + lock_limit >>= PAGE_SHIFT;
  15258. + locked = vma->vm_mm->pinned_vm + extra;
  15259. +
  15260. + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
  15261. + !capable(CAP_IPC_LOCK)) {
  15262. + ret = -EPERM;
  15263. + goto unlock;
  15264. + }
  15265. +
  15266. + WARN_ON(event->rb);
  15267. +
  15268. + if (vma->vm_flags & VM_WRITE)
  15269. + flags |= RING_BUFFER_WRITABLE;
  15270. +
  15271. + rb = rb_alloc(nr_pages,
  15272. + event->attr.watermark ? event->attr.wakeup_watermark : 0,
  15273. + event->cpu, flags);
  15274. +
  15275. + if (!rb) {
  15276. + ret = -ENOMEM;
  15277. + goto unlock;
  15278. + }
  15279. +
  15280. + atomic_set(&rb->mmap_count, 1);
  15281. + rb->mmap_locked = extra;
  15282. + rb->mmap_user = get_current_user();
  15283. +
  15284. + atomic_long_add(user_extra, &user->locked_vm);
  15285. + vma->vm_mm->pinned_vm += extra;
  15286. +
  15287. + ring_buffer_attach(event, rb);
  15288. +
  15289. + perf_event_init_userpage(event);
  15290. + perf_event_update_userpage(event);
  15291. +
  15292. +unlock:
  15293. + if (!ret)
  15294. + atomic_inc(&event->mmap_count);
  15295. + mutex_unlock(&event->mmap_mutex);
  15296. +
  15297. + /*
  15298. + * Since pinned accounting is per vm we cannot allow fork() to copy our
  15299. + * vma.
  15300. + */
  15301. + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
  15302. + vma->vm_ops = &perf_mmap_vmops;
  15303. +
  15304. + return ret;
  15305. +}
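The size checks above mean user space must map 1 + 2^n pages at offset 0: one metadata page (struct perf_event_mmap_page) followed by a power-of-two number of data pages. A minimal sketch of that mapping (helper name is illustrative):

#include <sys/mman.h>
#include <unistd.h>
#include <linux/perf_event.h>

/* Sketch: map the perf ring buffer -- one metadata page plus 2^n data
 * pages, as enforced by perf_mmap() above. */
static struct perf_event_mmap_page *map_ring(int perf_fd, unsigned int data_pages)
{
	size_t page = sysconf(_SC_PAGESIZE);
	size_t len = (1 + data_pages) * page;   /* data_pages must be a power of two */
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, perf_fd, 0);

	return base == MAP_FAILED ? NULL : base;
}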
  15306. +
  15307. +static int perf_fasync(int fd, struct file *filp, int on)
  15308. +{
  15309. + struct inode *inode = file_inode(filp);
  15310. + struct perf_event *event = filp->private_data;
  15311. + int retval;
  15312. +
  15313. + mutex_lock(&inode->i_mutex);
  15314. + retval = fasync_helper(fd, filp, on, &event->fasync);
  15315. + mutex_unlock(&inode->i_mutex);
  15316. +
  15317. + if (retval < 0)
  15318. + return retval;
  15319. +
  15320. + return 0;
  15321. +}
  15322. +
  15323. +static const struct file_operations perf_fops = {
  15324. + .llseek = no_llseek,
  15325. + .release = perf_release,
  15326. + .read = perf_read,
  15327. + .poll = perf_poll,
  15328. + .unlocked_ioctl = perf_ioctl,
  15329. + .compat_ioctl = perf_compat_ioctl,
  15330. + .mmap = perf_mmap,
  15331. + .fasync = perf_fasync,
  15332. +};
  15333. +
  15334. +/*
  15335. + * Perf event wakeup
  15336. + *
  15337. + * If there's data, ensure we set the poll() state and publish everything
  15338. + * to user-space before waking everybody up.
  15339. + */
  15340. +
  15341. +void perf_event_wakeup(struct perf_event *event)
  15342. +{
  15343. + ring_buffer_wakeup(event);
  15344. +
  15345. + if (event->pending_kill) {
  15346. + kill_fasync(&event->fasync, SIGIO, event->pending_kill);
  15347. + event->pending_kill = 0;
  15348. + }
  15349. +}
  15350. +
  15351. +static void perf_pending_event(struct irq_work *entry)
  15352. +{
  15353. + struct perf_event *event = container_of(entry,
  15354. + struct perf_event, pending);
  15355. + int rctx;
  15356. +
  15357. + rctx = perf_swevent_get_recursion_context();
  15358. + /*
  15359. + * If we 'fail' here, that's OK, it means recursion is already disabled
  15360. + * and we won't recurse 'further'.
  15361. + */
  15362. +
  15363. + if (event->pending_disable) {
  15364. + event->pending_disable = 0;
  15365. + __perf_event_disable(event);
  15366. + }
  15367. +
  15368. + if (event->pending_wakeup) {
  15369. + event->pending_wakeup = 0;
  15370. + perf_event_wakeup(event);
  15371. + }
  15372. +
  15373. + if (rctx >= 0)
  15374. + perf_swevent_put_recursion_context(rctx);
  15375. +}
  15376. +
  15377. +/*
  15378. + * We assume there is only KVM supporting the callbacks.
  15379. + * Later on, we might change it to a list if there is
  15380. + * another virtualization implementation supporting the callbacks.
  15381. + */
  15382. +struct perf_guest_info_callbacks *perf_guest_cbs;
  15383. +
  15384. +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
  15385. +{
  15386. + perf_guest_cbs = cbs;
  15387. + return 0;
  15388. +}
  15389. +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
  15390. +
  15391. +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
  15392. +{
  15393. + perf_guest_cbs = NULL;
  15394. + return 0;
  15395. +}
  15396. +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
  15397. +
  15398. +static void
  15399. +perf_output_sample_regs(struct perf_output_handle *handle,
  15400. + struct pt_regs *regs, u64 mask)
  15401. +{
  15402. + int bit;
  15403. +
  15404. + for_each_set_bit(bit, (const unsigned long *) &mask,
  15405. + sizeof(mask) * BITS_PER_BYTE) {
  15406. + u64 val;
  15407. +
  15408. + val = perf_reg_value(regs, bit);
  15409. + perf_output_put(handle, val);
  15410. + }
  15411. +}
  15412. +
  15413. +static void perf_sample_regs_user(struct perf_regs_user *regs_user,
  15414. + struct pt_regs *regs)
  15415. +{
  15416. + if (!user_mode(regs)) {
  15417. + if (current->mm)
  15418. + regs = task_pt_regs(current);
  15419. + else
  15420. + regs = NULL;
  15421. + }
  15422. +
  15423. + if (regs) {
  15424. + regs_user->regs = regs;
  15425. + regs_user->abi = perf_reg_abi(current);
  15426. + }
  15427. +}
  15428. +
  15429. +/*
  15430. + * Get remaining task size from user stack pointer.
  15431. + *
  15432. + * It'd be better to take the stack vma map and limit this more
  15433. + * precisely, but there's no way to get it safely under interrupt,
  15434. + * so we use TASK_SIZE as the limit.
  15435. + */
  15436. +static u64 perf_ustack_task_size(struct pt_regs *regs)
  15437. +{
  15438. + unsigned long addr = perf_user_stack_pointer(regs);
  15439. +
  15440. + if (!addr || addr >= TASK_SIZE)
  15441. + return 0;
  15442. +
  15443. + return TASK_SIZE - addr;
  15444. +}
  15445. +
  15446. +static u16
  15447. +perf_sample_ustack_size(u16 stack_size, u16 header_size,
  15448. + struct pt_regs *regs)
  15449. +{
  15450. + u64 task_size;
  15451. +
  15452. + /* No regs, no stack pointer, no dump. */
  15453. + if (!regs)
  15454. + return 0;
  15455. +
  15456. + /*
  15457. + * Check whether the requested stack size fits within:
  15458. + * - TASK_SIZE
  15459. + * If it doesn't, we limit the size to TASK_SIZE.
  15460. + *
  15461. + * - the remaining sample size
  15462. + * If it doesn't, we shrink the stack size to
  15463. + * fit into the remaining sample size.
  15464. + */
  15465. +
  15466. + task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
  15467. + stack_size = min(stack_size, (u16) task_size);
  15468. +
  15469. + /* Current header size plus static size and dynamic size. */
  15470. + header_size += 2 * sizeof(u64);
  15471. +
  15472. + /* Do we fit in with the current stack dump size? */
  15473. + if ((u16) (header_size + stack_size) < header_size) {
  15474. + /*
  15475. + * If we overflow the maximum size for the sample,
  15476. + * we customize the stack dump size to fit in.
  15477. + */
  15478. + stack_size = USHRT_MAX - header_size - sizeof(u64);
  15479. + stack_size = round_up(stack_size, sizeof(u64));
  15480. + }
  15481. +
  15482. + return stack_size;
  15483. +}
  15484. +
  15485. +static void
  15486. +perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
  15487. + struct pt_regs *regs)
  15488. +{
  15489. + /* Case of a kernel thread, nothing to dump */
  15490. + if (!regs) {
  15491. + u64 size = 0;
  15492. + perf_output_put(handle, size);
  15493. + } else {
  15494. + unsigned long sp;
  15495. + unsigned int rem;
  15496. + u64 dyn_size;
  15497. +
  15498. + /*
  15499. + * We dump:
  15500. + * static size
  15501. + * - the size requested by the user, or the best one we can fit
  15502. + * into the sample max size
  15503. + * data
  15504. + * - user stack dump data
  15505. + * dynamic size
  15506. + * - the actual dumped size
  15507. + */
  15508. +
  15509. + /* Static size. */
  15510. + perf_output_put(handle, dump_size);
  15511. +
  15512. + /* Data. */
  15513. + sp = perf_user_stack_pointer(regs);
  15514. + rem = __output_copy_user(handle, (void *) sp, dump_size);
  15515. + dyn_size = dump_size - rem;
  15516. +
  15517. + perf_output_skip(handle, rem);
  15518. +
  15519. + /* Dynamic size. */
  15520. + perf_output_put(handle, dyn_size);
  15521. + }
  15522. +}
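On the consumer side this produces, inside a PERF_SAMPLE_STACK_USER sample, a u64 static size, then size bytes of raw stack data, then a trailing u64 with the dynamically dumped size; when size is zero (kernel thread) only the leading u64 is present. A parsing sketch over a raw byte cursor (struct and function names are illustrative):

#include <stdint.h>
#include <string.h>

/* Sketch: walk the PERF_SAMPLE_STACK_USER area laid out by
 * perf_output_sample_ustack(); p points at the static-size field. */
struct ustack_dump {
	uint64_t size;        /* requested/clamped dump size */
	const void *data;     /* raw user stack bytes */
	uint64_t dyn_size;    /* bytes actually dumped */
};

static const uint8_t *parse_ustack(const uint8_t *p, struct ustack_dump *out)
{
	memcpy(&out->size, p, sizeof(uint64_t));
	p += sizeof(uint64_t);
	if (!out->size) {              /* kernel thread: nothing follows */
		out->data = NULL;
		out->dyn_size = 0;
		return p;
	}
	out->data = p;
	p += out->size;
	memcpy(&out->dyn_size, p, sizeof(uint64_t));
	return p + sizeof(uint64_t);
}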
  15523. +
  15524. +static void __perf_event_header__init_id(struct perf_event_header *header,
  15525. + struct perf_sample_data *data,
  15526. + struct perf_event *event)
  15527. +{
  15528. + u64 sample_type = event->attr.sample_type;
  15529. +
  15530. + data->type = sample_type;
  15531. + header->size += event->id_header_size;
  15532. +
  15533. + if (sample_type & PERF_SAMPLE_TID) {
  15534. + /* namespace issues */
  15535. + data->tid_entry.pid = perf_event_pid(event, current);
  15536. + data->tid_entry.tid = perf_event_tid(event, current);
  15537. + }
  15538. +
  15539. + if (sample_type & PERF_SAMPLE_TIME)
  15540. + data->time = perf_clock();
  15541. +
  15542. + if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
  15543. + data->id = primary_event_id(event);
  15544. +
  15545. + if (sample_type & PERF_SAMPLE_STREAM_ID)
  15546. + data->stream_id = event->id;
  15547. +
  15548. + if (sample_type & PERF_SAMPLE_CPU) {
  15549. + data->cpu_entry.cpu = raw_smp_processor_id();
  15550. + data->cpu_entry.reserved = 0;
  15551. + }
  15552. +}
  15553. +
  15554. +void perf_event_header__init_id(struct perf_event_header *header,
  15555. + struct perf_sample_data *data,
  15556. + struct perf_event *event)
  15557. +{
  15558. + if (event->attr.sample_id_all)
  15559. + __perf_event_header__init_id(header, data, event);
  15560. +}
  15561. +
  15562. +static void __perf_event__output_id_sample(struct perf_output_handle *handle,
  15563. + struct perf_sample_data *data)
  15564. +{
  15565. + u64 sample_type = data->type;
  15566. +
  15567. + if (sample_type & PERF_SAMPLE_TID)
  15568. + perf_output_put(handle, data->tid_entry);
  15569. +
  15570. + if (sample_type & PERF_SAMPLE_TIME)
  15571. + perf_output_put(handle, data->time);
  15572. +
  15573. + if (sample_type & PERF_SAMPLE_ID)
  15574. + perf_output_put(handle, data->id);
  15575. +
  15576. + if (sample_type & PERF_SAMPLE_STREAM_ID)
  15577. + perf_output_put(handle, data->stream_id);
  15578. +
  15579. + if (sample_type & PERF_SAMPLE_CPU)
  15580. + perf_output_put(handle, data->cpu_entry);
  15581. +
  15582. + if (sample_type & PERF_SAMPLE_IDENTIFIER)
  15583. + perf_output_put(handle, data->id);
  15584. +}
  15585. +
  15586. +void perf_event__output_id_sample(struct perf_event *event,
  15587. + struct perf_output_handle *handle,
  15588. + struct perf_sample_data *sample)
  15589. +{
  15590. + if (event->attr.sample_id_all)
  15591. + __perf_event__output_id_sample(handle, sample);
  15592. +}
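When sample_id_all is set, non-sample records carry this trailer at their end in exactly the order emitted above: the TID pair, time, id, stream id, the CPU pair, and finally the identifier. A decoding sketch, assuming all of those sample_type bits are enabled (struct and function names are illustrative):

#include <stdint.h>
#include <string.h>

/* Sketch: decode the sample_id_all trailer appended by
 * __perf_event__output_id_sample() when sample_type includes
 * TID | TIME | ID | STREAM_ID | CPU | IDENTIFIER. */
struct sample_id {
	uint32_t pid, tid;
	uint64_t time;
	uint64_t id;
	uint64_t stream_id;
	uint32_t cpu, res;
	uint64_t identifier;
};

static void parse_sample_id(const void *trailer, struct sample_id *out)
{
	/* With all bits enabled the trailer is this naturally aligned,
	 * padding-free sequence of u32/u64 fields. */
	memcpy(out, trailer, sizeof(*out));
}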
  15593. +
  15594. +static void perf_output_read_one(struct perf_output_handle *handle,
  15595. + struct perf_event *event,
  15596. + u64 enabled, u64 running)
  15597. +{
  15598. + u64 read_format = event->attr.read_format;
  15599. + u64 values[4];
  15600. + int n = 0;
  15601. +
  15602. + values[n++] = perf_event_count(event);
  15603. + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
  15604. + values[n++] = enabled +
  15605. + atomic64_read(&event->child_total_time_enabled);
  15606. + }
  15607. + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
  15608. + values[n++] = running +
  15609. + atomic64_read(&event->child_total_time_running);
  15610. + }
  15611. + if (read_format & PERF_FORMAT_ID)
  15612. + values[n++] = primary_event_id(event);
  15613. +
  15614. + __output_copy(handle, values, n * sizeof(u64));
  15615. +}
  15616. +
  15617. +/*
  15618. + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
  15619. + */
  15620. +static void perf_output_read_group(struct perf_output_handle *handle,
  15621. + struct perf_event *event,
  15622. + u64 enabled, u64 running)
  15623. +{
  15624. + struct perf_event *leader = event->group_leader, *sub;
  15625. + u64 read_format = event->attr.read_format;
  15626. + u64 values[5];
  15627. + int n = 0;
  15628. +
  15629. + values[n++] = 1 + leader->nr_siblings;
  15630. +
  15631. + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
  15632. + values[n++] = enabled;
  15633. +
  15634. + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
  15635. + values[n++] = running;
  15636. +
  15637. + if (leader != event)
  15638. + leader->pmu->read(leader);
  15639. +
  15640. + values[n++] = perf_event_count(leader);
  15641. + if (read_format & PERF_FORMAT_ID)
  15642. + values[n++] = primary_event_id(leader);
  15643. +
  15644. + __output_copy(handle, values, n * sizeof(u64));
  15645. +
  15646. + list_for_each_entry(sub, &leader->sibling_list, group_entry) {
  15647. + n = 0;
  15648. +
  15649. + if ((sub != event) &&
  15650. + (sub->state == PERF_EVENT_STATE_ACTIVE))
  15651. + sub->pmu->read(sub);
  15652. +
  15653. + values[n++] = perf_event_count(sub);
  15654. + if (read_format & PERF_FORMAT_ID)
  15655. + values[n++] = primary_event_id(sub);
  15656. +
  15657. + __output_copy(handle, values, n * sizeof(u64));
  15658. + }
  15659. +}
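For comparison, the user-space view of the PERF_FORMAT_GROUP layout written above is { nr, [time_enabled], [time_running], { value, [id] } * nr }. A parsing sketch for the common case with both times and IDs enabled (MAX_GROUP and the function name are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: read a group leader opened with read_format =
 * PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID. */
#define MAX_GROUP 16

struct group_read {
	uint64_t nr;
	uint64_t time_enabled;
	uint64_t time_running;
	struct { uint64_t value, id; } cnt[MAX_GROUP];
};

static void dump_group(int leader_fd)
{
	struct group_read gr;
	uint64_t i;

	if (read(leader_fd, &gr, sizeof(gr)) < (ssize_t)(3 * sizeof(uint64_t)))
		return;
	for (i = 0; i < gr.nr && i < MAX_GROUP; i++)
		printf("id %llu: %llu\n",
		       (unsigned long long)gr.cnt[i].id,
		       (unsigned long long)gr.cnt[i].value);
}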
  15660. +
  15661. +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
  15662. + PERF_FORMAT_TOTAL_TIME_RUNNING)
  15663. +
  15664. +static void perf_output_read(struct perf_output_handle *handle,
  15665. + struct perf_event *event)
  15666. +{
  15667. + u64 enabled = 0, running = 0, now;
  15668. + u64 read_format = event->attr.read_format;
  15669. +
  15670. + /*
  15671. + * compute total_time_enabled, total_time_running
  15672. + * based on snapshot values taken when the event
  15673. + * was last scheduled in.
  15674. + *
  15675. + * we cannot simply call update_context_time()
  15676. + * because of locking issues, as we are called in
  15677. + * NMI context
  15678. + */
  15679. + if (read_format & PERF_FORMAT_TOTAL_TIMES)
  15680. + calc_timer_values(event, &now, &enabled, &running);
  15681. +
  15682. + if (event->attr.read_format & PERF_FORMAT_GROUP)
  15683. + perf_output_read_group(handle, event, enabled, running);
  15684. + else
  15685. + perf_output_read_one(handle, event, enabled, running);
  15686. +}
  15687. +
  15688. +void perf_output_sample(struct perf_output_handle *handle,
  15689. + struct perf_event_header *header,
  15690. + struct perf_sample_data *data,
  15691. + struct perf_event *event)
  15692. +{
  15693. + u64 sample_type = data->type;
  15694. +
  15695. + perf_output_put(handle, *header);
  15696. +
  15697. + if (sample_type & PERF_SAMPLE_IDENTIFIER)
  15698. + perf_output_put(handle, data->id);
  15699. +
  15700. + if (sample_type & PERF_SAMPLE_IP)
  15701. + perf_output_put(handle, data->ip);
  15702. +
  15703. + if (sample_type & PERF_SAMPLE_TID)
  15704. + perf_output_put(handle, data->tid_entry);
  15705. +
  15706. + if (sample_type & PERF_SAMPLE_TIME)
  15707. + perf_output_put(handle, data->time);
  15708. +
  15709. + if (sample_type & PERF_SAMPLE_ADDR)
  15710. + perf_output_put(handle, data->addr);
  15711. +
  15712. + if (sample_type & PERF_SAMPLE_ID)
  15713. + perf_output_put(handle, data->id);
  15714. +
  15715. + if (sample_type & PERF_SAMPLE_STREAM_ID)
  15716. + perf_output_put(handle, data->stream_id);
  15717. +
  15718. + if (sample_type & PERF_SAMPLE_CPU)
  15719. + perf_output_put(handle, data->cpu_entry);
  15720. +
  15721. + if (sample_type & PERF_SAMPLE_PERIOD)
  15722. + perf_output_put(handle, data->period);
  15723. +
  15724. + if (sample_type & PERF_SAMPLE_READ)
  15725. + perf_output_read(handle, event);
  15726. +
  15727. + if (sample_type & PERF_SAMPLE_CALLCHAIN) {
  15728. + if (data->callchain) {
  15729. + int size = 1;
  15730. +
  15731. + if (data->callchain)
  15732. + size += data->callchain->nr;
  15733. +
  15734. + size *= sizeof(u64);
  15735. +
  15736. + __output_copy(handle, data->callchain, size);
  15737. + } else {
  15738. + u64 nr = 0;
  15739. + perf_output_put(handle, nr);
  15740. + }
  15741. + }
  15742. +
  15743. + if (sample_type & PERF_SAMPLE_RAW) {
  15744. + if (data->raw) {
  15745. + perf_output_put(handle, data->raw->size);
  15746. + __output_copy(handle, data->raw->data,
  15747. + data->raw->size);
  15748. + } else {
  15749. + struct {
  15750. + u32 size;
  15751. + u32 data;
  15752. + } raw = {
  15753. + .size = sizeof(u32),
  15754. + .data = 0,
  15755. + };
  15756. + perf_output_put(handle, raw);
  15757. + }
  15758. + }
  15759. +
  15760. + if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
  15761. + if (data->br_stack) {
  15762. + size_t size;
  15763. +
  15764. + size = data->br_stack->nr
  15765. + * sizeof(struct perf_branch_entry);
  15766. +
  15767. + perf_output_put(handle, data->br_stack->nr);
  15768. + perf_output_copy(handle, data->br_stack->entries, size);
  15769. + } else {
  15770. + /*
  15771. + * we always store at least the value of nr
  15772. + */
  15773. + u64 nr = 0;
  15774. + perf_output_put(handle, nr);
  15775. + }
  15776. + }
  15777. +
  15778. + if (sample_type & PERF_SAMPLE_REGS_USER) {
  15779. + u64 abi = data->regs_user.abi;
  15780. +
  15781. + /*
  15782. + * If there are no regs to dump, signal it by the
  15783. + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
  15784. + */
  15785. + perf_output_put(handle, abi);
  15786. +
  15787. + if (abi) {
  15788. + u64 mask = event->attr.sample_regs_user;
  15789. + perf_output_sample_regs(handle,
  15790. + data->regs_user.regs,
  15791. + mask);
  15792. + }
  15793. + }
  15794. +
  15795. + if (sample_type & PERF_SAMPLE_STACK_USER) {
  15796. + perf_output_sample_ustack(handle,
  15797. + data->stack_user_size,
  15798. + data->regs_user.regs);
  15799. + }
  15800. +
  15801. + if (sample_type & PERF_SAMPLE_WEIGHT)
  15802. + perf_output_put(handle, data->weight);
  15803. +
  15804. + if (sample_type & PERF_SAMPLE_DATA_SRC)
  15805. + perf_output_put(handle, data->data_src.val);
  15806. +
  15807. + if (sample_type & PERF_SAMPLE_TRANSACTION)
  15808. + perf_output_put(handle, data->txn);
  15809. +
  15810. + if (!event->attr.watermark) {
  15811. + int wakeup_events = event->attr.wakeup_events;
  15812. +
  15813. + if (wakeup_events) {
  15814. + struct ring_buffer *rb = handle->rb;
  15815. + int events = local_inc_return(&rb->events);
  15816. +
  15817. + if (events >= wakeup_events) {
  15818. + local_sub(wakeup_events, &rb->events);
  15819. + local_inc(&rb->wakeup);
  15820. + }
  15821. + }
  15822. + }
  15823. +}
  15824. +
  15825. +void perf_prepare_sample(struct perf_event_header *header,
  15826. + struct perf_sample_data *data,
  15827. + struct perf_event *event,
  15828. + struct pt_regs *regs)
  15829. +{
  15830. + u64 sample_type = event->attr.sample_type;
  15831. +
  15832. + header->type = PERF_RECORD_SAMPLE;
  15833. + header->size = sizeof(*header) + event->header_size;
  15834. +
  15835. + header->misc = 0;
  15836. + header->misc |= perf_misc_flags(regs);
  15837. +
  15838. + __perf_event_header__init_id(header, data, event);
  15839. +
  15840. + if (sample_type & PERF_SAMPLE_IP)
  15841. + data->ip = perf_instruction_pointer(regs);
  15842. +
  15843. + if (sample_type & PERF_SAMPLE_CALLCHAIN) {
  15844. + int size = 1;
  15845. +
  15846. + data->callchain = perf_callchain(event, regs);
  15847. +
  15848. + if (data->callchain)
  15849. + size += data->callchain->nr;
  15850. +
  15851. + header->size += size * sizeof(u64);
  15852. + }
  15853. +
  15854. + if (sample_type & PERF_SAMPLE_RAW) {
  15855. + int size = sizeof(u32);
  15856. +
  15857. + if (data->raw)
  15858. + size += data->raw->size;
  15859. + else
  15860. + size += sizeof(u32);
  15861. +
  15862. + WARN_ON_ONCE(size & (sizeof(u64)-1));
  15863. + header->size += size;
  15864. + }
  15865. +
  15866. + if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
  15867. + int size = sizeof(u64); /* nr */
  15868. + if (data->br_stack) {
  15869. + size += data->br_stack->nr
  15870. + * sizeof(struct perf_branch_entry);
  15871. + }
  15872. + header->size += size;
  15873. + }
  15874. +
  15875. + if (sample_type & PERF_SAMPLE_REGS_USER) {
  15876. + /* regs dump ABI info */
  15877. + int size = sizeof(u64);
  15878. +
  15879. + perf_sample_regs_user(&data->regs_user, regs);
  15880. +
  15881. + if (data->regs_user.regs) {
  15882. + u64 mask = event->attr.sample_regs_user;
  15883. + size += hweight64(mask) * sizeof(u64);
  15884. + }
  15885. +
  15886. + header->size += size;
  15887. + }
  15888. +
  15889. + if (sample_type & PERF_SAMPLE_STACK_USER) {
  15890. + /*
  15891. + * Either the PERF_SAMPLE_STACK_USER bit needs to always be
  15892. + * processed last, or an additional check needs to be added
  15893. + * whenever a new sample type is added, because we could eat
  15894. + * up the rest of the sample size.
  15895. + */
  15896. + struct perf_regs_user *uregs = &data->regs_user;
  15897. + u16 stack_size = event->attr.sample_stack_user;
  15898. + u16 size = sizeof(u64);
  15899. +
  15900. + if (!uregs->abi)
  15901. + perf_sample_regs_user(uregs, regs);
  15902. +
  15903. + stack_size = perf_sample_ustack_size(stack_size, header->size,
  15904. + uregs->regs);
  15905. +
  15906. + /*
  15907. + * If there is something to dump, add space for the dump
  15908. + * itself and for the field that tells the dynamic size,
  15909. + * which is how many have been actually dumped.
  15910. + */
  15911. + if (stack_size)
  15912. + size += sizeof(u64) + stack_size;
  15913. +
  15914. + data->stack_user_size = stack_size;
  15915. + header->size += size;
  15916. + }
  15917. +}
  15918. +
  15919. +static void perf_event_output(struct perf_event *event,
  15920. + struct perf_sample_data *data,
  15921. + struct pt_regs *regs)
  15922. +{
  15923. + struct perf_output_handle handle;
  15924. + struct perf_event_header header;
  15925. +
  15926. + /* protect the callchain buffers */
  15927. + rcu_read_lock();
  15928. +
  15929. + perf_prepare_sample(&header, data, event, regs);
  15930. +
  15931. + if (perf_output_begin(&handle, event, header.size))
  15932. + goto exit;
  15933. +
  15934. + perf_output_sample(&handle, &header, data, event);
  15935. +
  15936. + perf_output_end(&handle);
  15937. +
  15938. +exit:
  15939. + rcu_read_unlock();
  15940. +}
  15941. +
  15942. +/*
  15943. + * read event_id
  15944. + */
  15945. +
  15946. +struct perf_read_event {
  15947. + struct perf_event_header header;
  15948. +
  15949. + u32 pid;
  15950. + u32 tid;
  15951. +};
  15952. +
  15953. +static void
  15954. +perf_event_read_event(struct perf_event *event,
  15955. + struct task_struct *task)
  15956. +{
  15957. + struct perf_output_handle handle;
  15958. + struct perf_sample_data sample;
  15959. + struct perf_read_event read_event = {
  15960. + .header = {
  15961. + .type = PERF_RECORD_READ,
  15962. + .misc = 0,
  15963. + .size = sizeof(read_event) + event->read_size,
  15964. + },
  15965. + .pid = perf_event_pid(event, task),
  15966. + .tid = perf_event_tid(event, task),
  15967. + };
  15968. + int ret;
  15969. +
  15970. + perf_event_header__init_id(&read_event.header, &sample, event);
  15971. + ret = perf_output_begin(&handle, event, read_event.header.size);
  15972. + if (ret)
  15973. + return;
  15974. +
  15975. + perf_output_put(&handle, read_event);
  15976. + perf_output_read(&handle, event);
  15977. + perf_event__output_id_sample(event, &handle, &sample);
  15978. +
  15979. + perf_output_end(&handle);
  15980. +}
  15981. +
  15982. +typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
  15983. +
  15984. +static void
  15985. +perf_event_aux_ctx(struct perf_event_context *ctx,
  15986. + perf_event_aux_output_cb output,
  15987. + void *data)
  15988. +{
  15989. + struct perf_event *event;
  15990. +
  15991. + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
  15992. + if (event->state < PERF_EVENT_STATE_INACTIVE)
  15993. + continue;
  15994. + if (!event_filter_match(event))
  15995. + continue;
  15996. + output(event, data);
  15997. + }
  15998. +}
  15999. +
  16000. +static void
  16001. +perf_event_aux(perf_event_aux_output_cb output, void *data,
  16002. + struct perf_event_context *task_ctx)
  16003. +{
  16004. + struct perf_cpu_context *cpuctx;
  16005. + struct perf_event_context *ctx;
  16006. + struct pmu *pmu;
  16007. + int ctxn;
  16008. +
  16009. + rcu_read_lock();
  16010. + list_for_each_entry_rcu(pmu, &pmus, entry) {
  16011. + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
  16012. + if (cpuctx->unique_pmu != pmu)
  16013. + goto next;
  16014. + perf_event_aux_ctx(&cpuctx->ctx, output, data);
  16015. + if (task_ctx)
  16016. + goto next;
  16017. + ctxn = pmu->task_ctx_nr;
  16018. + if (ctxn < 0)
  16019. + goto next;
  16020. + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
  16021. + if (ctx)
  16022. + perf_event_aux_ctx(ctx, output, data);
  16023. +next:
  16024. + put_cpu_ptr(pmu->pmu_cpu_context);
  16025. + }
  16026. +
  16027. + if (task_ctx) {
  16028. + preempt_disable();
  16029. + perf_event_aux_ctx(task_ctx, output, data);
  16030. + preempt_enable();
  16031. + }
  16032. + rcu_read_unlock();
  16033. +}
  16034. +
  16035. +/*
  16036. + * task tracking -- fork/exit
  16037. + *
  16038. + * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
  16039. + */
  16040. +
  16041. +struct perf_task_event {
  16042. + struct task_struct *task;
  16043. + struct perf_event_context *task_ctx;
  16044. +
  16045. + struct {
  16046. + struct perf_event_header header;
  16047. +
  16048. + u32 pid;
  16049. + u32 ppid;
  16050. + u32 tid;
  16051. + u32 ptid;
  16052. + u64 time;
  16053. + } event_id;
  16054. +};
  16055. +
  16056. +static int perf_event_task_match(struct perf_event *event)
  16057. +{
  16058. + return event->attr.comm || event->attr.mmap ||
  16059. + event->attr.mmap2 || event->attr.mmap_data ||
  16060. + event->attr.task;
  16061. +}
  16062. +
  16063. +static void perf_event_task_output(struct perf_event *event,
  16064. + void *data)
  16065. +{
  16066. + struct perf_task_event *task_event = data;
  16067. + struct perf_output_handle handle;
  16068. + struct perf_sample_data sample;
  16069. + struct task_struct *task = task_event->task;
  16070. + int ret, size = task_event->event_id.header.size;
  16071. +
  16072. + if (!perf_event_task_match(event))
  16073. + return;
  16074. +
  16075. + perf_event_header__init_id(&task_event->event_id.header, &sample, event);
  16076. +
  16077. + ret = perf_output_begin(&handle, event,
  16078. + task_event->event_id.header.size);
  16079. + if (ret)
  16080. + goto out;
  16081. +
  16082. + task_event->event_id.pid = perf_event_pid(event, task);
  16083. + task_event->event_id.ppid = perf_event_pid(event, current);
  16084. +
  16085. + task_event->event_id.tid = perf_event_tid(event, task);
  16086. + task_event->event_id.ptid = perf_event_tid(event, current);
  16087. +
  16088. + perf_output_put(&handle, task_event->event_id);
  16089. +
  16090. + perf_event__output_id_sample(event, &handle, &sample);
  16091. +
  16092. + perf_output_end(&handle);
  16093. +out:
  16094. + task_event->event_id.header.size = size;
  16095. +}
  16096. +
  16097. +static void perf_event_task(struct task_struct *task,
  16098. + struct perf_event_context *task_ctx,
  16099. + int new)
  16100. +{
  16101. + struct perf_task_event task_event;
  16102. +
  16103. + if (!atomic_read(&nr_comm_events) &&
  16104. + !atomic_read(&nr_mmap_events) &&
  16105. + !atomic_read(&nr_task_events))
  16106. + return;
  16107. +
  16108. + task_event = (struct perf_task_event){
  16109. + .task = task,
  16110. + .task_ctx = task_ctx,
  16111. + .event_id = {
  16112. + .header = {
  16113. + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
  16114. + .misc = 0,
  16115. + .size = sizeof(task_event.event_id),
  16116. + },
  16117. + /* .pid */
  16118. + /* .ppid */
  16119. + /* .tid */
  16120. + /* .ptid */
  16121. + .time = perf_clock(),
  16122. + },
  16123. + };
  16124. +
  16125. + perf_event_aux(perf_event_task_output,
  16126. + &task_event,
  16127. + task_ctx);
  16128. +}
  16129. +
  16130. +void perf_event_fork(struct task_struct *task)
  16131. +{
  16132. + perf_event_task(task, NULL, 1);
  16133. +}
  16134. +
  16135. +/*
  16136. + * comm tracking
  16137. + */
  16138. +
  16139. +struct perf_comm_event {
  16140. + struct task_struct *task;
  16141. + char *comm;
  16142. + int comm_size;
  16143. +
  16144. + struct {
  16145. + struct perf_event_header header;
  16146. +
  16147. + u32 pid;
  16148. + u32 tid;
  16149. + } event_id;
  16150. +};
  16151. +
  16152. +static int perf_event_comm_match(struct perf_event *event)
  16153. +{
  16154. + return event->attr.comm;
  16155. +}
  16156. +
  16157. +static void perf_event_comm_output(struct perf_event *event,
  16158. + void *data)
  16159. +{
  16160. + struct perf_comm_event *comm_event = data;
  16161. + struct perf_output_handle handle;
  16162. + struct perf_sample_data sample;
  16163. + int size = comm_event->event_id.header.size;
  16164. + int ret;
  16165. +
  16166. + if (!perf_event_comm_match(event))
  16167. + return;
  16168. +
  16169. + perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
  16170. + ret = perf_output_begin(&handle, event,
  16171. + comm_event->event_id.header.size);
  16172. +
  16173. + if (ret)
  16174. + goto out;
  16175. +
  16176. + comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
  16177. + comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
  16178. +
  16179. + perf_output_put(&handle, comm_event->event_id);
  16180. + __output_copy(&handle, comm_event->comm,
  16181. + comm_event->comm_size);
  16182. +
  16183. + perf_event__output_id_sample(event, &handle, &sample);
  16184. +
  16185. + perf_output_end(&handle);
  16186. +out:
  16187. + comm_event->event_id.header.size = size;
  16188. +}
  16189. +
  16190. +static void perf_event_comm_event(struct perf_comm_event *comm_event)
  16191. +{
  16192. + char comm[TASK_COMM_LEN];
  16193. + unsigned int size;
  16194. +
  16195. + memset(comm, 0, sizeof(comm));
  16196. + strlcpy(comm, comm_event->task->comm, sizeof(comm));
  16197. + size = ALIGN(strlen(comm)+1, sizeof(u64));
  16198. +
  16199. + comm_event->comm = comm;
  16200. + comm_event->comm_size = size;
  16201. +
  16202. + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
  16203. +
  16204. + perf_event_aux(perf_event_comm_output,
  16205. + comm_event,
  16206. + NULL);
  16207. +}
  16208. +
  16209. +void perf_event_comm(struct task_struct *task, bool exec)
  16210. +{
  16211. + struct perf_comm_event comm_event;
  16212. +
  16213. + if (!atomic_read(&nr_comm_events))
  16214. + return;
  16215. +
  16216. + comm_event = (struct perf_comm_event){
  16217. + .task = task,
  16218. + /* .comm */
  16219. + /* .comm_size */
  16220. + .event_id = {
  16221. + .header = {
  16222. + .type = PERF_RECORD_COMM,
  16223. + .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
  16224. + /* .size */
  16225. + },
  16226. + /* .pid */
  16227. + /* .tid */
  16228. + },
  16229. + };
  16230. +
  16231. + perf_event_comm_event(&comm_event);
  16232. +}
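These records are only generated when at least one event sets attr.comm; the record body user space then sees is { struct perf_event_header, u32 pid, u32 tid, NUL-padded comm[] }, plus the sample_id trailer when sample_id_all is set. A sketch of the attribute setup on a software dummy event (assuming a kernel that exposes PERF_COUNT_SW_DUMMY and the comm_exec bit used by the exec path above):

#include <string.h>
#include <linux/perf_event.h>

/* Sketch: request COMM side-band records (and flag exec-triggered ones
 * with PERF_RECORD_MISC_COMM_EXEC) on a software dummy event. */
static void init_comm_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_SOFTWARE;
	attr->config = PERF_COUNT_SW_DUMMY;
	attr->comm = 1;
	attr->comm_exec = 1;     /* assumes kernel support for the exec flag */
	attr->sample_id_all = 1; /* append the id sample trailer to side-band records */
}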
  16233. +
  16234. +/*
  16235. + * mmap tracking
  16236. + */
  16237. +
  16238. +struct perf_mmap_event {
  16239. + struct vm_area_struct *vma;
  16240. +
  16241. + const char *file_name;
  16242. + int file_size;
  16243. + int maj, min;
  16244. + u64 ino;
  16245. + u64 ino_generation;
  16246. + u32 prot, flags;
  16247. +
  16248. + struct {
  16249. + struct perf_event_header header;
  16250. +
  16251. + u32 pid;
  16252. + u32 tid;
  16253. + u64 start;
  16254. + u64 len;
  16255. + u64 pgoff;
  16256. + } event_id;
  16257. +};
  16258. +
  16259. +static int perf_event_mmap_match(struct perf_event *event,
  16260. + void *data)
  16261. +{
  16262. + struct perf_mmap_event *mmap_event = data;
  16263. + struct vm_area_struct *vma = mmap_event->vma;
  16264. + int executable = vma->vm_flags & VM_EXEC;
  16265. +
  16266. + return (!executable && event->attr.mmap_data) ||
  16267. + (executable && (event->attr.mmap || event->attr.mmap2));
  16268. +}
  16269. +
  16270. +static void perf_event_mmap_output(struct perf_event *event,
  16271. + void *data)
  16272. +{
  16273. + struct perf_mmap_event *mmap_event = data;
  16274. + struct perf_output_handle handle;
  16275. + struct perf_sample_data sample;
  16276. + int size = mmap_event->event_id.header.size;
  16277. + int ret;
  16278. +
  16279. + if (!perf_event_mmap_match(event, data))
  16280. + return;
  16281. +
  16282. + if (event->attr.mmap2) {
  16283. + mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
  16284. + mmap_event->event_id.header.size += sizeof(mmap_event->maj);
  16285. + mmap_event->event_id.header.size += sizeof(mmap_event->min);
  16286. + mmap_event->event_id.header.size += sizeof(mmap_event->ino);
  16287. + mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
  16288. + mmap_event->event_id.header.size += sizeof(mmap_event->prot);
  16289. + mmap_event->event_id.header.size += sizeof(mmap_event->flags);
  16290. + }
  16291. +
  16292. + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
  16293. + ret = perf_output_begin(&handle, event,
  16294. + mmap_event->event_id.header.size);
  16295. + if (ret)
  16296. + goto out;
  16297. +
  16298. + mmap_event->event_id.pid = perf_event_pid(event, current);
  16299. + mmap_event->event_id.tid = perf_event_tid(event, current);
  16300. +
  16301. + perf_output_put(&handle, mmap_event->event_id);
  16302. +
  16303. + if (event->attr.mmap2) {
  16304. + perf_output_put(&handle, mmap_event->maj);
  16305. + perf_output_put(&handle, mmap_event->min);
  16306. + perf_output_put(&handle, mmap_event->ino);
  16307. + perf_output_put(&handle, mmap_event->ino_generation);
  16308. + perf_output_put(&handle, mmap_event->prot);
  16309. + perf_output_put(&handle, mmap_event->flags);
  16310. + }
  16311. +
  16312. + __output_copy(&handle, mmap_event->file_name,
  16313. + mmap_event->file_size);
  16314. +
  16315. + perf_event__output_id_sample(event, &handle, &sample);
  16316. +
  16317. + perf_output_end(&handle);
  16318. +out:
  16319. + mmap_event->event_id.header.size = size;
  16320. +}
  16321. +
  16322. +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
  16323. +{
  16324. + struct vm_area_struct *vma = mmap_event->vma;
  16325. + struct file *file = vma->vm_file;
  16326. + int maj = 0, min = 0;
  16327. + u64 ino = 0, gen = 0;
  16328. + u32 prot = 0, flags = 0;
  16329. + unsigned int size;
  16330. + char tmp[16];
  16331. + char *buf = NULL;
  16332. + char *name;
  16333. +
  16334. + if (file) {
  16335. + struct inode *inode;
  16336. + dev_t dev;
  16337. +
  16338. + buf = kmalloc(PATH_MAX, GFP_KERNEL);
  16339. + if (!buf) {
  16340. + name = "//enomem";
  16341. + goto cpy_name;
  16342. + }
  16343. + /*
  16344. + * d_path() works from the end of the buffer backwards, so we
  16345. + * need to add enough zero bytes after the string to handle
  16346. + * the 64-bit alignment we do later.
  16347. + */
  16348. + name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
  16349. + if (IS_ERR(name)) {
  16350. + name = "//toolong";
  16351. + goto cpy_name;
  16352. + }
  16353. + inode = file_inode(vma->vm_file);
  16354. + dev = inode->i_sb->s_dev;
  16355. + ino = inode->i_ino;
  16356. + gen = inode->i_generation;
  16357. + maj = MAJOR(dev);
  16358. + min = MINOR(dev);
  16359. +
  16360. + if (vma->vm_flags & VM_READ)
  16361. + prot |= PROT_READ;
  16362. + if (vma->vm_flags & VM_WRITE)
  16363. + prot |= PROT_WRITE;
  16364. + if (vma->vm_flags & VM_EXEC)
  16365. + prot |= PROT_EXEC;
  16366. +
  16367. + if (vma->vm_flags & VM_MAYSHARE)
  16368. + flags = MAP_SHARED;
  16369. + else
  16370. + flags = MAP_PRIVATE;
  16371. +
  16372. + if (vma->vm_flags & VM_DENYWRITE)
  16373. + flags |= MAP_DENYWRITE;
  16374. + if (vma->vm_flags & VM_MAYEXEC)
  16375. + flags |= MAP_EXECUTABLE;
  16376. + if (vma->vm_flags & VM_LOCKED)
  16377. + flags |= MAP_LOCKED;
  16378. + if (vma->vm_flags & VM_HUGETLB)
  16379. + flags |= MAP_HUGETLB;
  16380. +
  16381. + goto got_name;
  16382. + } else {
  16383. + if (vma->vm_ops && vma->vm_ops->name) {
  16384. + name = (char *) vma->vm_ops->name(vma);
  16385. + if (name)
  16386. + goto cpy_name;
  16387. + }
  16388. +
  16389. + name = (char *)arch_vma_name(vma);
  16390. + if (name)
  16391. + goto cpy_name;
  16392. +
  16393. + if (vma->vm_start <= vma->vm_mm->start_brk &&
  16394. + vma->vm_end >= vma->vm_mm->brk) {
  16395. + name = "[heap]";
  16396. + goto cpy_name;
  16397. + }
  16398. + if (vma->vm_start <= vma->vm_mm->start_stack &&
  16399. + vma->vm_end >= vma->vm_mm->start_stack) {
  16400. + name = "[stack]";
  16401. + goto cpy_name;
  16402. + }
  16403. +
  16404. + name = "//anon";
  16405. + goto cpy_name;
  16406. + }
  16407. +
  16408. +cpy_name:
  16409. + strlcpy(tmp, name, sizeof(tmp));
  16410. + name = tmp;
  16411. +got_name:
  16412. + /*
  16413. + * Since our buffer works in 8 byte units we need to align our string
  16414. + * size to a multiple of 8. However, we must guarantee the tail end is
16415. + * zeroed out to avoid leaking random bits to userspace.
  16416. + */
  16417. + size = strlen(name)+1;
  16418. + while (!IS_ALIGNED(size, sizeof(u64)))
  16419. + name[size++] = '\0';
  16420. +
  16421. + mmap_event->file_name = name;
  16422. + mmap_event->file_size = size;
  16423. + mmap_event->maj = maj;
  16424. + mmap_event->min = min;
  16425. + mmap_event->ino = ino;
  16426. + mmap_event->ino_generation = gen;
  16427. + mmap_event->prot = prot;
  16428. + mmap_event->flags = flags;
  16429. +
  16430. + if (!(vma->vm_flags & VM_EXEC))
  16431. + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
  16432. +
  16433. + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
  16434. +
  16435. + perf_event_aux(perf_event_mmap_output,
  16436. + mmap_event,
  16437. + NULL);
  16438. +
  16439. + kfree(buf);
  16440. +}
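/*
 * A minimal userspace sketch of the 8-byte padding applied to the mapping
 * name above: the string length is rounded up to a multiple of sizeof(u64)
 * with explicit zero bytes so no stale data leaks. The buffer and name
 * below are illustrative, not taken from the patch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[32] = "libfoo.so";		/* hypothetical mapping name */
	size_t size = strlen(buf) + 1;		/* 10 bytes including the NUL */

	/* pad with zero bytes until the length is a multiple of 8 */
	while (size % sizeof(uint64_t))
		buf[size++] = '\0';

	printf("padded size: %zu\n", size);	/* prints 16 */
	return 0;
}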
  16441. +
  16442. +void perf_event_mmap(struct vm_area_struct *vma)
  16443. +{
  16444. + struct perf_mmap_event mmap_event;
  16445. +
  16446. + if (!atomic_read(&nr_mmap_events))
  16447. + return;
  16448. +
  16449. + mmap_event = (struct perf_mmap_event){
  16450. + .vma = vma,
  16451. + /* .file_name */
  16452. + /* .file_size */
  16453. + .event_id = {
  16454. + .header = {
  16455. + .type = PERF_RECORD_MMAP,
  16456. + .misc = PERF_RECORD_MISC_USER,
  16457. + /* .size */
  16458. + },
  16459. + /* .pid */
  16460. + /* .tid */
  16461. + .start = vma->vm_start,
  16462. + .len = vma->vm_end - vma->vm_start,
  16463. + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
  16464. + },
  16465. + /* .maj (attr_mmap2 only) */
  16466. + /* .min (attr_mmap2 only) */
  16467. + /* .ino (attr_mmap2 only) */
  16468. + /* .ino_generation (attr_mmap2 only) */
  16469. + /* .prot (attr_mmap2 only) */
  16470. + /* .flags (attr_mmap2 only) */
  16471. + };
  16472. +
  16473. + perf_event_mmap_event(&mmap_event);
  16474. +}
  16475. +
  16476. +/*
  16477. + * IRQ throttle logging
  16478. + */
  16479. +
  16480. +static void perf_log_throttle(struct perf_event *event, int enable)
  16481. +{
  16482. + struct perf_output_handle handle;
  16483. + struct perf_sample_data sample;
  16484. + int ret;
  16485. +
  16486. + struct {
  16487. + struct perf_event_header header;
  16488. + u64 time;
  16489. + u64 id;
  16490. + u64 stream_id;
  16491. + } throttle_event = {
  16492. + .header = {
  16493. + .type = PERF_RECORD_THROTTLE,
  16494. + .misc = 0,
  16495. + .size = sizeof(throttle_event),
  16496. + },
  16497. + .time = perf_clock(),
  16498. + .id = primary_event_id(event),
  16499. + .stream_id = event->id,
  16500. + };
  16501. +
  16502. + if (enable)
  16503. + throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
  16504. +
  16505. + perf_event_header__init_id(&throttle_event.header, &sample, event);
  16506. +
  16507. + ret = perf_output_begin(&handle, event,
  16508. + throttle_event.header.size);
  16509. + if (ret)
  16510. + return;
  16511. +
  16512. + perf_output_put(&handle, throttle_event);
  16513. + perf_event__output_id_sample(event, &handle, &sample);
  16514. + perf_output_end(&handle);
  16515. +}
  16516. +
  16517. +/*
  16518. + * Generic event overflow handling, sampling.
  16519. + */
  16520. +
  16521. +static int __perf_event_overflow(struct perf_event *event,
  16522. + int throttle, struct perf_sample_data *data,
  16523. + struct pt_regs *regs)
  16524. +{
  16525. + int events = atomic_read(&event->event_limit);
  16526. + struct hw_perf_event *hwc = &event->hw;
  16527. + u64 seq;
  16528. + int ret = 0;
  16529. +
  16530. + /*
  16531. + * Non-sampling counters might still use the PMI to fold short
  16532. + * hardware counters, ignore those.
  16533. + */
  16534. + if (unlikely(!is_sampling_event(event)))
  16535. + return 0;
  16536. +
  16537. + seq = __this_cpu_read(perf_throttled_seq);
  16538. + if (seq != hwc->interrupts_seq) {
  16539. + hwc->interrupts_seq = seq;
  16540. + hwc->interrupts = 1;
  16541. + } else {
  16542. + hwc->interrupts++;
  16543. + if (unlikely(throttle
  16544. + && hwc->interrupts >= max_samples_per_tick)) {
  16545. + __this_cpu_inc(perf_throttled_count);
  16546. + hwc->interrupts = MAX_INTERRUPTS;
  16547. + perf_log_throttle(event, 0);
  16548. + tick_nohz_full_kick();
  16549. + ret = 1;
  16550. + }
  16551. + }
  16552. +
  16553. + if (event->attr.freq) {
  16554. + u64 now = perf_clock();
  16555. + s64 delta = now - hwc->freq_time_stamp;
  16556. +
  16557. + hwc->freq_time_stamp = now;
  16558. +
  16559. + if (delta > 0 && delta < 2*TICK_NSEC)
  16560. + perf_adjust_period(event, delta, hwc->last_period, true);
  16561. + }
  16562. +
  16563. + /*
  16564. + * XXX event_limit might not quite work as expected on inherited
  16565. + * events
  16566. + */
  16567. +
  16568. + event->pending_kill = POLL_IN;
  16569. + if (events && atomic_dec_and_test(&event->event_limit)) {
  16570. + ret = 1;
  16571. + event->pending_kill = POLL_HUP;
  16572. + event->pending_disable = 1;
  16573. + irq_work_queue(&event->pending);
  16574. + }
  16575. +
  16576. + if (event->overflow_handler)
  16577. + event->overflow_handler(event, data, regs);
  16578. + else
  16579. + perf_event_output(event, data, regs);
  16580. +
  16581. + if (event->fasync && event->pending_kill) {
  16582. + event->pending_wakeup = 1;
  16583. + irq_work_queue(&event->pending);
  16584. + }
  16585. +
  16586. + return ret;
  16587. +}
  16588. +
  16589. +int perf_event_overflow(struct perf_event *event,
  16590. + struct perf_sample_data *data,
  16591. + struct pt_regs *regs)
  16592. +{
  16593. + return __perf_event_overflow(event, 1, data, regs);
  16594. +}
  16595. +
  16596. +/*
  16597. + * Generic software event infrastructure
  16598. + */
  16599. +
  16600. +struct swevent_htable {
  16601. + struct swevent_hlist *swevent_hlist;
  16602. + struct mutex hlist_mutex;
  16603. + int hlist_refcount;
  16604. +
16605. + /* Recursion avoidance in each context */
  16606. + int recursion[PERF_NR_CONTEXTS];
  16607. +
  16608. + /* Keeps track of cpu being initialized/exited */
  16609. + bool online;
  16610. +};
  16611. +
  16612. +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
  16613. +
  16614. +/*
  16615. + * We directly increment event->count and keep a second value in
16616. + * event->hw.period_left to count intervals. This period counter
16617. + * is kept in the range [-sample_period, 0] so that we can use the
16618. + * sign as a trigger.
  16619. + */
  16620. +
  16621. +u64 perf_swevent_set_period(struct perf_event *event)
  16622. +{
  16623. + struct hw_perf_event *hwc = &event->hw;
  16624. + u64 period = hwc->last_period;
  16625. + u64 nr, offset;
  16626. + s64 old, val;
  16627. +
  16628. + hwc->last_period = hwc->sample_period;
  16629. +
  16630. +again:
  16631. + old = val = local64_read(&hwc->period_left);
  16632. + if (val < 0)
  16633. + return 0;
  16634. +
  16635. + nr = div64_u64(period + val, period);
  16636. + offset = nr * period;
  16637. + val -= offset;
  16638. + if (local64_cmpxchg(&hwc->period_left, old, val) != old)
  16639. + goto again;
  16640. +
  16641. + return nr;
  16642. +}
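/*
 * A minimal userspace sketch of the perf_swevent_set_period() arithmetic
 * above: period_left is kept in [-sample_period, 0]; once it goes
 * non-negative, (period + left) / period gives the number of whole periods
 * that elapsed. The numbers below are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t period = 100;		/* stand-in for hwc->sample_period */
	int64_t  left   = -30;		/* period_left before new events */

	left += 250;			/* 250 new events pushed it to 220 */

	uint64_t nr = (uint64_t)(period + left) / period;	/* 3 overflows */
	left -= (int64_t)(nr * period);				/* back to -80 */

	printf("overflows=%llu period_left=%lld\n",
	       (unsigned long long)nr, (long long)left);
	return 0;
}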
  16643. +
  16644. +static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
  16645. + struct perf_sample_data *data,
  16646. + struct pt_regs *regs)
  16647. +{
  16648. + struct hw_perf_event *hwc = &event->hw;
  16649. + int throttle = 0;
  16650. +
  16651. + if (!overflow)
  16652. + overflow = perf_swevent_set_period(event);
  16653. +
  16654. + if (hwc->interrupts == MAX_INTERRUPTS)
  16655. + return;
  16656. +
  16657. + for (; overflow; overflow--) {
  16658. + if (__perf_event_overflow(event, throttle,
  16659. + data, regs)) {
  16660. + /*
  16661. + * We inhibit the overflow from happening when
  16662. + * hwc->interrupts == MAX_INTERRUPTS.
  16663. + */
  16664. + break;
  16665. + }
  16666. + throttle = 1;
  16667. + }
  16668. +}
  16669. +
  16670. +static void perf_swevent_event(struct perf_event *event, u64 nr,
  16671. + struct perf_sample_data *data,
  16672. + struct pt_regs *regs)
  16673. +{
  16674. + struct hw_perf_event *hwc = &event->hw;
  16675. +
  16676. + local64_add(nr, &event->count);
  16677. +
  16678. + if (!regs)
  16679. + return;
  16680. +
  16681. + if (!is_sampling_event(event))
  16682. + return;
  16683. +
  16684. + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
  16685. + data->period = nr;
  16686. + return perf_swevent_overflow(event, 1, data, regs);
  16687. + } else
  16688. + data->period = event->hw.last_period;
  16689. +
  16690. + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
  16691. + return perf_swevent_overflow(event, 1, data, regs);
  16692. +
  16693. + if (local64_add_negative(nr, &hwc->period_left))
  16694. + return;
  16695. +
  16696. + perf_swevent_overflow(event, 0, data, regs);
  16697. +}
  16698. +
  16699. +static int perf_exclude_event(struct perf_event *event,
  16700. + struct pt_regs *regs)
  16701. +{
  16702. + if (event->hw.state & PERF_HES_STOPPED)
  16703. + return 1;
  16704. +
  16705. + if (regs) {
  16706. + if (event->attr.exclude_user && user_mode(regs))
  16707. + return 1;
  16708. +
  16709. + if (event->attr.exclude_kernel && !user_mode(regs))
  16710. + return 1;
  16711. + }
  16712. +
  16713. + return 0;
  16714. +}
  16715. +
  16716. +static int perf_swevent_match(struct perf_event *event,
  16717. + enum perf_type_id type,
  16718. + u32 event_id,
  16719. + struct perf_sample_data *data,
  16720. + struct pt_regs *regs)
  16721. +{
  16722. + if (event->attr.type != type)
  16723. + return 0;
  16724. +
  16725. + if (event->attr.config != event_id)
  16726. + return 0;
  16727. +
  16728. + if (perf_exclude_event(event, regs))
  16729. + return 0;
  16730. +
  16731. + return 1;
  16732. +}
  16733. +
  16734. +static inline u64 swevent_hash(u64 type, u32 event_id)
  16735. +{
  16736. + u64 val = event_id | (type << 32);
  16737. +
  16738. + return hash_64(val, SWEVENT_HLIST_BITS);
  16739. +}
  16740. +
  16741. +static inline struct hlist_head *
  16742. +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
  16743. +{
  16744. + u64 hash = swevent_hash(type, event_id);
  16745. +
  16746. + return &hlist->heads[hash];
  16747. +}
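/*
 * A minimal userspace sketch of how a (type, event_id) pair is folded into
 * one 64-bit key and bucketed, as in swevent_hash() and
 * __find_swevent_head() above. hash_64() is replaced here by a simple
 * golden-ratio multiplicative hash, and the bucket-bit count is made up.
 */
#include <stdint.h>
#include <stdio.h>

#define HLIST_BITS 8	/* stand-in for SWEVENT_HLIST_BITS */

static unsigned int swevent_bucket(uint64_t type, uint32_t event_id)
{
	uint64_t val = event_id | (type << 32);

	/* multiplicative hash: keep the top HLIST_BITS bits */
	return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - HLIST_BITS));
}

int main(void)
{
	/* PERF_TYPE_SOFTWARE == 1, PERF_COUNT_SW_PAGE_FAULTS == 2 */
	printf("bucket=%u\n", swevent_bucket(1, 2));
	return 0;
}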
  16748. +
  16749. +/* For the read side: events when they trigger */
  16750. +static inline struct hlist_head *
  16751. +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
  16752. +{
  16753. + struct swevent_hlist *hlist;
  16754. +
  16755. + hlist = rcu_dereference(swhash->swevent_hlist);
  16756. + if (!hlist)
  16757. + return NULL;
  16758. +
  16759. + return __find_swevent_head(hlist, type, event_id);
  16760. +}
  16761. +
  16762. +/* For the event head insertion and removal in the hlist */
  16763. +static inline struct hlist_head *
  16764. +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
  16765. +{
  16766. + struct swevent_hlist *hlist;
  16767. + u32 event_id = event->attr.config;
  16768. + u64 type = event->attr.type;
  16769. +
  16770. + /*
  16771. + * Event scheduling is always serialized against hlist allocation
16772. + * and release, which makes the protected version suitable here.
  16773. + * The context lock guarantees that.
  16774. + */
  16775. + hlist = rcu_dereference_protected(swhash->swevent_hlist,
  16776. + lockdep_is_held(&event->ctx->lock));
  16777. + if (!hlist)
  16778. + return NULL;
  16779. +
  16780. + return __find_swevent_head(hlist, type, event_id);
  16781. +}
  16782. +
  16783. +static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
  16784. + u64 nr,
  16785. + struct perf_sample_data *data,
  16786. + struct pt_regs *regs)
  16787. +{
  16788. + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
  16789. + struct perf_event *event;
  16790. + struct hlist_head *head;
  16791. +
  16792. + rcu_read_lock();
  16793. + head = find_swevent_head_rcu(swhash, type, event_id);
  16794. + if (!head)
  16795. + goto end;
  16796. +
  16797. + hlist_for_each_entry_rcu(event, head, hlist_entry) {
  16798. + if (perf_swevent_match(event, type, event_id, data, regs))
  16799. + perf_swevent_event(event, nr, data, regs);
  16800. + }
  16801. +end:
  16802. + rcu_read_unlock();
  16803. +}
  16804. +
  16805. +int perf_swevent_get_recursion_context(void)
  16806. +{
  16807. + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
  16808. +
  16809. + return get_recursion_context(swhash->recursion);
  16810. +}
  16811. +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
  16812. +
  16813. +inline void perf_swevent_put_recursion_context(int rctx)
  16814. +{
  16815. + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
  16816. +
  16817. + put_recursion_context(swhash->recursion, rctx);
  16818. +}
  16819. +
  16820. +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
  16821. +{
  16822. + struct perf_sample_data data;
  16823. + int rctx;
  16824. +
  16825. + preempt_disable_notrace();
  16826. + rctx = perf_swevent_get_recursion_context();
  16827. + if (rctx < 0)
  16828. + return;
  16829. +
  16830. + perf_sample_data_init(&data, addr, 0);
  16831. +
  16832. + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
  16833. +
  16834. + perf_swevent_put_recursion_context(rctx);
  16835. + preempt_enable_notrace();
  16836. +}
  16837. +
  16838. +static void perf_swevent_read(struct perf_event *event)
  16839. +{
  16840. +}
  16841. +
  16842. +static int perf_swevent_add(struct perf_event *event, int flags)
  16843. +{
  16844. + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
  16845. + struct hw_perf_event *hwc = &event->hw;
  16846. + struct hlist_head *head;
  16847. +
  16848. + if (is_sampling_event(event)) {
  16849. + hwc->last_period = hwc->sample_period;
  16850. + perf_swevent_set_period(event);
  16851. + }
  16852. +
  16853. + hwc->state = !(flags & PERF_EF_START);
  16854. +
  16855. + head = find_swevent_head(swhash, event);
  16856. + if (!head) {
  16857. + /*
  16858. + * We can race with cpu hotplug code. Do not
  16859. + * WARN if the cpu just got unplugged.
  16860. + */
  16861. + WARN_ON_ONCE(swhash->online);
  16862. + return -EINVAL;
  16863. + }
  16864. +
  16865. + hlist_add_head_rcu(&event->hlist_entry, head);
  16866. +
  16867. + return 0;
  16868. +}
  16869. +
  16870. +static void perf_swevent_del(struct perf_event *event, int flags)
  16871. +{
  16872. + hlist_del_rcu(&event->hlist_entry);
  16873. +}
  16874. +
  16875. +static void perf_swevent_start(struct perf_event *event, int flags)
  16876. +{
  16877. + event->hw.state = 0;
  16878. +}
  16879. +
  16880. +static void perf_swevent_stop(struct perf_event *event, int flags)
  16881. +{
  16882. + event->hw.state = PERF_HES_STOPPED;
  16883. +}
  16884. +
  16885. +/* Deref the hlist from the update side */
  16886. +static inline struct swevent_hlist *
  16887. +swevent_hlist_deref(struct swevent_htable *swhash)
  16888. +{
  16889. + return rcu_dereference_protected(swhash->swevent_hlist,
  16890. + lockdep_is_held(&swhash->hlist_mutex));
  16891. +}
  16892. +
  16893. +static void swevent_hlist_release(struct swevent_htable *swhash)
  16894. +{
  16895. + struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
  16896. +
  16897. + if (!hlist)
  16898. + return;
  16899. +
  16900. + RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
  16901. + kfree_rcu(hlist, rcu_head);
  16902. +}
  16903. +
  16904. +static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
  16905. +{
  16906. + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  16907. +
  16908. + mutex_lock(&swhash->hlist_mutex);
  16909. +
  16910. + if (!--swhash->hlist_refcount)
  16911. + swevent_hlist_release(swhash);
  16912. +
  16913. + mutex_unlock(&swhash->hlist_mutex);
  16914. +}
  16915. +
  16916. +static void swevent_hlist_put(struct perf_event *event)
  16917. +{
  16918. + int cpu;
  16919. +
  16920. + for_each_possible_cpu(cpu)
  16921. + swevent_hlist_put_cpu(event, cpu);
  16922. +}
  16923. +
  16924. +static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
  16925. +{
  16926. + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  16927. + int err = 0;
  16928. +
  16929. + mutex_lock(&swhash->hlist_mutex);
  16930. +
  16931. + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
  16932. + struct swevent_hlist *hlist;
  16933. +
  16934. + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
  16935. + if (!hlist) {
  16936. + err = -ENOMEM;
  16937. + goto exit;
  16938. + }
  16939. + rcu_assign_pointer(swhash->swevent_hlist, hlist);
  16940. + }
  16941. + swhash->hlist_refcount++;
  16942. +exit:
  16943. + mutex_unlock(&swhash->hlist_mutex);
  16944. +
  16945. + return err;
  16946. +}
  16947. +
  16948. +static int swevent_hlist_get(struct perf_event *event)
  16949. +{
  16950. + int err;
  16951. + int cpu, failed_cpu;
  16952. +
  16953. + get_online_cpus();
  16954. + for_each_possible_cpu(cpu) {
  16955. + err = swevent_hlist_get_cpu(event, cpu);
  16956. + if (err) {
  16957. + failed_cpu = cpu;
  16958. + goto fail;
  16959. + }
  16960. + }
  16961. + put_online_cpus();
  16962. +
  16963. + return 0;
  16964. +fail:
  16965. + for_each_possible_cpu(cpu) {
  16966. + if (cpu == failed_cpu)
  16967. + break;
  16968. + swevent_hlist_put_cpu(event, cpu);
  16969. + }
  16970. +
  16971. + put_online_cpus();
  16972. + return err;
  16973. +}
  16974. +
  16975. +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
  16976. +
  16977. +static void sw_perf_event_destroy(struct perf_event *event)
  16978. +{
  16979. + u64 event_id = event->attr.config;
  16980. +
  16981. + WARN_ON(event->parent);
  16982. +
  16983. + static_key_slow_dec(&perf_swevent_enabled[event_id]);
  16984. + swevent_hlist_put(event);
  16985. +}
  16986. +
  16987. +static int perf_swevent_init(struct perf_event *event)
  16988. +{
  16989. + u64 event_id = event->attr.config;
  16990. +
  16991. + if (event->attr.type != PERF_TYPE_SOFTWARE)
  16992. + return -ENOENT;
  16993. +
  16994. + /*
  16995. + * no branch sampling for software events
  16996. + */
  16997. + if (has_branch_stack(event))
  16998. + return -EOPNOTSUPP;
  16999. +
  17000. + switch (event_id) {
  17001. + case PERF_COUNT_SW_CPU_CLOCK:
  17002. + case PERF_COUNT_SW_TASK_CLOCK:
  17003. + return -ENOENT;
  17004. +
  17005. + default:
  17006. + break;
  17007. + }
  17008. +
  17009. + if (event_id >= PERF_COUNT_SW_MAX)
  17010. + return -ENOENT;
  17011. +
  17012. + if (!event->parent) {
  17013. + int err;
  17014. +
  17015. + err = swevent_hlist_get(event);
  17016. + if (err)
  17017. + return err;
  17018. +
  17019. + static_key_slow_inc(&perf_swevent_enabled[event_id]);
  17020. + event->destroy = sw_perf_event_destroy;
  17021. + }
  17022. +
  17023. + return 0;
  17024. +}
  17025. +
  17026. +static struct pmu perf_swevent = {
  17027. + .task_ctx_nr = perf_sw_context,
  17028. +
  17029. + .event_init = perf_swevent_init,
  17030. + .add = perf_swevent_add,
  17031. + .del = perf_swevent_del,
  17032. + .start = perf_swevent_start,
  17033. + .stop = perf_swevent_stop,
  17034. + .read = perf_swevent_read,
  17035. +};
  17036. +
  17037. +#ifdef CONFIG_EVENT_TRACING
  17038. +
  17039. +static int perf_tp_filter_match(struct perf_event *event,
  17040. + struct perf_sample_data *data)
  17041. +{
  17042. + void *record = data->raw->data;
  17043. +
  17044. + if (likely(!event->filter) || filter_match_preds(event->filter, record))
  17045. + return 1;
  17046. + return 0;
  17047. +}
  17048. +
  17049. +static int perf_tp_event_match(struct perf_event *event,
  17050. + struct perf_sample_data *data,
  17051. + struct pt_regs *regs)
  17052. +{
  17053. + if (event->hw.state & PERF_HES_STOPPED)
  17054. + return 0;
  17055. + /*
  17056. + * All tracepoints are from kernel-space.
  17057. + */
  17058. + if (event->attr.exclude_kernel)
  17059. + return 0;
  17060. +
  17061. + if (!perf_tp_filter_match(event, data))
  17062. + return 0;
  17063. +
  17064. + return 1;
  17065. +}
  17066. +
  17067. +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
  17068. + struct pt_regs *regs, struct hlist_head *head, int rctx,
  17069. + struct task_struct *task)
  17070. +{
  17071. + struct perf_sample_data data;
  17072. + struct perf_event *event;
  17073. +
  17074. + struct perf_raw_record raw = {
  17075. + .size = entry_size,
  17076. + .data = record,
  17077. + };
  17078. +
  17079. + perf_sample_data_init(&data, addr, 0);
  17080. + data.raw = &raw;
  17081. +
  17082. + hlist_for_each_entry_rcu(event, head, hlist_entry) {
  17083. + if (perf_tp_event_match(event, &data, regs))
  17084. + perf_swevent_event(event, count, &data, regs);
  17085. + }
  17086. +
  17087. + /*
17088. + * If a target task was specified, also iterate its context and
  17089. + * deliver this event there too.
  17090. + */
  17091. + if (task && task != current) {
  17092. + struct perf_event_context *ctx;
  17093. + struct trace_entry *entry = record;
  17094. +
  17095. + rcu_read_lock();
  17096. + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
  17097. + if (!ctx)
  17098. + goto unlock;
  17099. +
  17100. + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
  17101. + if (event->attr.type != PERF_TYPE_TRACEPOINT)
  17102. + continue;
  17103. + if (event->attr.config != entry->type)
  17104. + continue;
  17105. + if (perf_tp_event_match(event, &data, regs))
  17106. + perf_swevent_event(event, count, &data, regs);
  17107. + }
  17108. +unlock:
  17109. + rcu_read_unlock();
  17110. + }
  17111. +
  17112. + perf_swevent_put_recursion_context(rctx);
  17113. +}
  17114. +EXPORT_SYMBOL_GPL(perf_tp_event);
  17115. +
  17116. +static void tp_perf_event_destroy(struct perf_event *event)
  17117. +{
  17118. + perf_trace_destroy(event);
  17119. +}
  17120. +
  17121. +static int perf_tp_event_init(struct perf_event *event)
  17122. +{
  17123. + int err;
  17124. +
  17125. + if (event->attr.type != PERF_TYPE_TRACEPOINT)
  17126. + return -ENOENT;
  17127. +
  17128. + /*
  17129. + * no branch sampling for tracepoint events
  17130. + */
  17131. + if (has_branch_stack(event))
  17132. + return -EOPNOTSUPP;
  17133. +
  17134. + err = perf_trace_init(event);
  17135. + if (err)
  17136. + return err;
  17137. +
  17138. + event->destroy = tp_perf_event_destroy;
  17139. +
  17140. + return 0;
  17141. +}
  17142. +
  17143. +static struct pmu perf_tracepoint = {
  17144. + .task_ctx_nr = perf_sw_context,
  17145. +
  17146. + .event_init = perf_tp_event_init,
  17147. + .add = perf_trace_add,
  17148. + .del = perf_trace_del,
  17149. + .start = perf_swevent_start,
  17150. + .stop = perf_swevent_stop,
  17151. + .read = perf_swevent_read,
  17152. +};
  17153. +
  17154. +static inline void perf_tp_register(void)
  17155. +{
  17156. + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
  17157. +}
  17158. +
  17159. +static int perf_event_set_filter(struct perf_event *event, void __user *arg)
  17160. +{
  17161. + char *filter_str;
  17162. + int ret;
  17163. +
  17164. + if (event->attr.type != PERF_TYPE_TRACEPOINT)
  17165. + return -EINVAL;
  17166. +
  17167. + filter_str = strndup_user(arg, PAGE_SIZE);
  17168. + if (IS_ERR(filter_str))
  17169. + return PTR_ERR(filter_str);
  17170. +
  17171. + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
  17172. +
  17173. + kfree(filter_str);
  17174. + return ret;
  17175. +}
  17176. +
  17177. +static void perf_event_free_filter(struct perf_event *event)
  17178. +{
  17179. + ftrace_profile_free_filter(event);
  17180. +}
  17181. +
  17182. +#else
  17183. +
  17184. +static inline void perf_tp_register(void)
  17185. +{
  17186. +}
  17187. +
  17188. +static int perf_event_set_filter(struct perf_event *event, void __user *arg)
  17189. +{
  17190. + return -ENOENT;
  17191. +}
  17192. +
  17193. +static void perf_event_free_filter(struct perf_event *event)
  17194. +{
  17195. +}
  17196. +
  17197. +#endif /* CONFIG_EVENT_TRACING */
  17198. +
  17199. +#ifdef CONFIG_HAVE_HW_BREAKPOINT
  17200. +void perf_bp_event(struct perf_event *bp, void *data)
  17201. +{
  17202. + struct perf_sample_data sample;
  17203. + struct pt_regs *regs = data;
  17204. +
  17205. + perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
  17206. +
  17207. + if (!bp->hw.state && !perf_exclude_event(bp, regs))
  17208. + perf_swevent_event(bp, 1, &sample, regs);
  17209. +}
  17210. +#endif
  17211. +
  17212. +/*
  17213. + * hrtimer based swevent callback
  17214. + */
  17215. +
  17216. +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
  17217. +{
  17218. + enum hrtimer_restart ret = HRTIMER_RESTART;
  17219. + struct perf_sample_data data;
  17220. + struct pt_regs *regs;
  17221. + struct perf_event *event;
  17222. + u64 period;
  17223. +
  17224. + event = container_of(hrtimer, struct perf_event, hw.hrtimer);
  17225. +
  17226. + if (event->state != PERF_EVENT_STATE_ACTIVE)
  17227. + return HRTIMER_NORESTART;
  17228. +
  17229. + event->pmu->read(event);
  17230. +
  17231. + perf_sample_data_init(&data, 0, event->hw.last_period);
  17232. + regs = get_irq_regs();
  17233. +
  17234. + if (regs && !perf_exclude_event(event, regs)) {
  17235. + if (!(event->attr.exclude_idle && is_idle_task(current)))
  17236. + if (__perf_event_overflow(event, 1, &data, regs))
  17237. + ret = HRTIMER_NORESTART;
  17238. + }
  17239. +
  17240. + period = max_t(u64, 10000, event->hw.sample_period);
  17241. + hrtimer_forward_now(hrtimer, ns_to_ktime(period));
  17242. +
  17243. + return ret;
  17244. +}
  17245. +
  17246. +static void perf_swevent_start_hrtimer(struct perf_event *event)
  17247. +{
  17248. + struct hw_perf_event *hwc = &event->hw;
  17249. + s64 period;
  17250. +
  17251. + if (!is_sampling_event(event))
  17252. + return;
  17253. +
  17254. + period = local64_read(&hwc->period_left);
  17255. + if (period) {
  17256. + if (period < 0)
  17257. + period = 10000;
  17258. +
  17259. + local64_set(&hwc->period_left, 0);
  17260. + } else {
  17261. + period = max_t(u64, 10000, hwc->sample_period);
  17262. + }
  17263. + __hrtimer_start_range_ns(&hwc->hrtimer,
  17264. + ns_to_ktime(period), 0,
  17265. + HRTIMER_MODE_REL_PINNED, 0);
  17266. +}
  17267. +
  17268. +static void perf_swevent_cancel_hrtimer(struct perf_event *event)
  17269. +{
  17270. + struct hw_perf_event *hwc = &event->hw;
  17271. +
  17272. + if (is_sampling_event(event)) {
  17273. + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
  17274. + local64_set(&hwc->period_left, ktime_to_ns(remaining));
  17275. +
  17276. + hrtimer_cancel(&hwc->hrtimer);
  17277. + }
  17278. +}
  17279. +
  17280. +static void perf_swevent_init_hrtimer(struct perf_event *event)
  17281. +{
  17282. + struct hw_perf_event *hwc = &event->hw;
  17283. +
  17284. + if (!is_sampling_event(event))
  17285. + return;
  17286. +
  17287. + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  17288. + hwc->hrtimer.function = perf_swevent_hrtimer;
  17289. +
  17290. + /*
  17291. + * Since hrtimers have a fixed rate, we can do a static freq->period
  17292. + * mapping and avoid the whole period adjust feedback stuff.
  17293. + */
  17294. + if (event->attr.freq) {
  17295. + long freq = event->attr.sample_freq;
  17296. +
  17297. + event->attr.sample_period = NSEC_PER_SEC / freq;
  17298. + hwc->sample_period = event->attr.sample_period;
  17299. + local64_set(&hwc->period_left, hwc->sample_period);
  17300. + hwc->last_period = hwc->sample_period;
  17301. + event->attr.freq = 0;
  17302. + }
  17303. +}
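/*
 * A minimal userspace sketch of the static freq->period mapping done in
 * perf_swevent_init_hrtimer() above: a requested sampling frequency is
 * turned into a fixed hrtimer period in nanoseconds. The frequency used
 * below is illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	long freq = 4000;			/* hypothetical attr.sample_freq */
	uint64_t period = NSEC_PER_SEC / freq;	/* 250000 ns between samples */

	printf("hrtimer period: %llu ns\n", (unsigned long long)period);
	return 0;
}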
  17304. +
  17305. +/*
  17306. + * Software event: cpu wall time clock
  17307. + */
  17308. +
  17309. +static void cpu_clock_event_update(struct perf_event *event)
  17310. +{
  17311. + s64 prev;
  17312. + u64 now;
  17313. +
  17314. + now = local_clock();
  17315. + prev = local64_xchg(&event->hw.prev_count, now);
  17316. + local64_add(now - prev, &event->count);
  17317. +}
  17318. +
  17319. +static void cpu_clock_event_start(struct perf_event *event, int flags)
  17320. +{
  17321. + local64_set(&event->hw.prev_count, local_clock());
  17322. + perf_swevent_start_hrtimer(event);
  17323. +}
  17324. +
  17325. +static void cpu_clock_event_stop(struct perf_event *event, int flags)
  17326. +{
  17327. + perf_swevent_cancel_hrtimer(event);
  17328. + cpu_clock_event_update(event);
  17329. +}
  17330. +
  17331. +static int cpu_clock_event_add(struct perf_event *event, int flags)
  17332. +{
  17333. + if (flags & PERF_EF_START)
  17334. + cpu_clock_event_start(event, flags);
  17335. +
  17336. + return 0;
  17337. +}
  17338. +
  17339. +static void cpu_clock_event_del(struct perf_event *event, int flags)
  17340. +{
  17341. + cpu_clock_event_stop(event, flags);
  17342. +}
  17343. +
  17344. +static void cpu_clock_event_read(struct perf_event *event)
  17345. +{
  17346. + cpu_clock_event_update(event);
  17347. +}
  17348. +
  17349. +static int cpu_clock_event_init(struct perf_event *event)
  17350. +{
  17351. + if (event->attr.type != PERF_TYPE_SOFTWARE)
  17352. + return -ENOENT;
  17353. +
  17354. + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
  17355. + return -ENOENT;
  17356. +
  17357. + /*
  17358. + * no branch sampling for software events
  17359. + */
  17360. + if (has_branch_stack(event))
  17361. + return -EOPNOTSUPP;
  17362. +
  17363. + perf_swevent_init_hrtimer(event);
  17364. +
  17365. + return 0;
  17366. +}
  17367. +
  17368. +static struct pmu perf_cpu_clock = {
  17369. + .task_ctx_nr = perf_sw_context,
  17370. +
  17371. + .event_init = cpu_clock_event_init,
  17372. + .add = cpu_clock_event_add,
  17373. + .del = cpu_clock_event_del,
  17374. + .start = cpu_clock_event_start,
  17375. + .stop = cpu_clock_event_stop,
  17376. + .read = cpu_clock_event_read,
  17377. +};
  17378. +
  17379. +/*
  17380. + * Software event: task time clock
  17381. + */
  17382. +
  17383. +static void task_clock_event_update(struct perf_event *event, u64 now)
  17384. +{
  17385. + u64 prev;
  17386. + s64 delta;
  17387. +
  17388. + prev = local64_xchg(&event->hw.prev_count, now);
  17389. + delta = now - prev;
  17390. + local64_add(delta, &event->count);
  17391. +}
  17392. +
  17393. +static void task_clock_event_start(struct perf_event *event, int flags)
  17394. +{
  17395. + local64_set(&event->hw.prev_count, event->ctx->time);
  17396. + perf_swevent_start_hrtimer(event);
  17397. +}
  17398. +
  17399. +static void task_clock_event_stop(struct perf_event *event, int flags)
  17400. +{
  17401. + perf_swevent_cancel_hrtimer(event);
  17402. + task_clock_event_update(event, event->ctx->time);
  17403. +}
  17404. +
  17405. +static int task_clock_event_add(struct perf_event *event, int flags)
  17406. +{
  17407. + if (flags & PERF_EF_START)
  17408. + task_clock_event_start(event, flags);
  17409. +
  17410. + return 0;
  17411. +}
  17412. +
  17413. +static void task_clock_event_del(struct perf_event *event, int flags)
  17414. +{
  17415. + task_clock_event_stop(event, PERF_EF_UPDATE);
  17416. +}
  17417. +
  17418. +static void task_clock_event_read(struct perf_event *event)
  17419. +{
  17420. + u64 now = perf_clock();
  17421. + u64 delta = now - event->ctx->timestamp;
  17422. + u64 time = event->ctx->time + delta;
  17423. +
  17424. + task_clock_event_update(event, time);
  17425. +}
  17426. +
  17427. +static int task_clock_event_init(struct perf_event *event)
  17428. +{
  17429. + if (event->attr.type != PERF_TYPE_SOFTWARE)
  17430. + return -ENOENT;
  17431. +
  17432. + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
  17433. + return -ENOENT;
  17434. +
  17435. + /*
  17436. + * no branch sampling for software events
  17437. + */
  17438. + if (has_branch_stack(event))
  17439. + return -EOPNOTSUPP;
  17440. +
  17441. + perf_swevent_init_hrtimer(event);
  17442. +
  17443. + return 0;
  17444. +}
  17445. +
  17446. +static struct pmu perf_task_clock = {
  17447. + .task_ctx_nr = perf_sw_context,
  17448. +
  17449. + .event_init = task_clock_event_init,
  17450. + .add = task_clock_event_add,
  17451. + .del = task_clock_event_del,
  17452. + .start = task_clock_event_start,
  17453. + .stop = task_clock_event_stop,
  17454. + .read = task_clock_event_read,
  17455. +};
  17456. +
  17457. +static void perf_pmu_nop_void(struct pmu *pmu)
  17458. +{
  17459. +}
  17460. +
  17461. +static int perf_pmu_nop_int(struct pmu *pmu)
  17462. +{
  17463. + return 0;
  17464. +}
  17465. +
  17466. +static void perf_pmu_start_txn(struct pmu *pmu)
  17467. +{
  17468. + perf_pmu_disable(pmu);
  17469. +}
  17470. +
  17471. +static int perf_pmu_commit_txn(struct pmu *pmu)
  17472. +{
  17473. + perf_pmu_enable(pmu);
  17474. + return 0;
  17475. +}
  17476. +
  17477. +static void perf_pmu_cancel_txn(struct pmu *pmu)
  17478. +{
  17479. + perf_pmu_enable(pmu);
  17480. +}
  17481. +
  17482. +static int perf_event_idx_default(struct perf_event *event)
  17483. +{
  17484. + return 0;
  17485. +}
  17486. +
  17487. +/*
  17488. + * Ensures all contexts with the same task_ctx_nr have the same
  17489. + * pmu_cpu_context too.
  17490. + */
  17491. +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
  17492. +{
  17493. + struct pmu *pmu;
  17494. +
  17495. + if (ctxn < 0)
  17496. + return NULL;
  17497. +
  17498. + list_for_each_entry(pmu, &pmus, entry) {
  17499. + if (pmu->task_ctx_nr == ctxn)
  17500. + return pmu->pmu_cpu_context;
  17501. + }
  17502. +
  17503. + return NULL;
  17504. +}
  17505. +
  17506. +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
  17507. +{
  17508. + int cpu;
  17509. +
  17510. + for_each_possible_cpu(cpu) {
  17511. + struct perf_cpu_context *cpuctx;
  17512. +
  17513. + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  17514. +
  17515. + if (cpuctx->unique_pmu == old_pmu)
  17516. + cpuctx->unique_pmu = pmu;
  17517. + }
  17518. +}
  17519. +
  17520. +static void free_pmu_context(struct pmu *pmu)
  17521. +{
  17522. + struct pmu *i;
  17523. +
  17524. + mutex_lock(&pmus_lock);
  17525. + /*
  17526. + * Like a real lame refcount.
  17527. + */
  17528. + list_for_each_entry(i, &pmus, entry) {
  17529. + if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
  17530. + update_pmu_context(i, pmu);
  17531. + goto out;
  17532. + }
  17533. + }
  17534. +
  17535. + free_percpu(pmu->pmu_cpu_context);
  17536. +out:
  17537. + mutex_unlock(&pmus_lock);
  17538. +}
  17539. +static struct idr pmu_idr;
  17540. +
  17541. +static ssize_t
  17542. +type_show(struct device *dev, struct device_attribute *attr, char *page)
  17543. +{
  17544. + struct pmu *pmu = dev_get_drvdata(dev);
  17545. +
  17546. + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
  17547. +}
  17548. +static DEVICE_ATTR_RO(type);
  17549. +
  17550. +static ssize_t
  17551. +perf_event_mux_interval_ms_show(struct device *dev,
  17552. + struct device_attribute *attr,
  17553. + char *page)
  17554. +{
  17555. + struct pmu *pmu = dev_get_drvdata(dev);
  17556. +
  17557. + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
  17558. +}
  17559. +
  17560. +static ssize_t
  17561. +perf_event_mux_interval_ms_store(struct device *dev,
  17562. + struct device_attribute *attr,
  17563. + const char *buf, size_t count)
  17564. +{
  17565. + struct pmu *pmu = dev_get_drvdata(dev);
  17566. + int timer, cpu, ret;
  17567. +
  17568. + ret = kstrtoint(buf, 0, &timer);
  17569. + if (ret)
  17570. + return ret;
  17571. +
  17572. + if (timer < 1)
  17573. + return -EINVAL;
  17574. +
17575. + /* same value, nothing to do */
  17576. + if (timer == pmu->hrtimer_interval_ms)
  17577. + return count;
  17578. +
  17579. + pmu->hrtimer_interval_ms = timer;
  17580. +
  17581. + /* update all cpuctx for this PMU */
  17582. + for_each_possible_cpu(cpu) {
  17583. + struct perf_cpu_context *cpuctx;
  17584. + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  17585. + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
  17586. +
  17587. + if (hrtimer_active(&cpuctx->hrtimer))
  17588. + hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
  17589. + }
  17590. +
  17591. + return count;
  17592. +}
  17593. +static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
  17594. +
  17595. +static struct attribute *pmu_dev_attrs[] = {
  17596. + &dev_attr_type.attr,
  17597. + &dev_attr_perf_event_mux_interval_ms.attr,
  17598. + NULL,
  17599. +};
  17600. +ATTRIBUTE_GROUPS(pmu_dev);
  17601. +
  17602. +static int pmu_bus_running;
  17603. +static struct bus_type pmu_bus = {
  17604. + .name = "event_source",
  17605. + .dev_groups = pmu_dev_groups,
  17606. +};
  17607. +
  17608. +static void pmu_dev_release(struct device *dev)
  17609. +{
  17610. + kfree(dev);
  17611. +}
  17612. +
  17613. +static int pmu_dev_alloc(struct pmu *pmu)
  17614. +{
  17615. + int ret = -ENOMEM;
  17616. +
  17617. + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
  17618. + if (!pmu->dev)
  17619. + goto out;
  17620. +
  17621. + pmu->dev->groups = pmu->attr_groups;
  17622. + device_initialize(pmu->dev);
  17623. + ret = dev_set_name(pmu->dev, "%s", pmu->name);
  17624. + if (ret)
  17625. + goto free_dev;
  17626. +
  17627. + dev_set_drvdata(pmu->dev, pmu);
  17628. + pmu->dev->bus = &pmu_bus;
  17629. + pmu->dev->release = pmu_dev_release;
  17630. + ret = device_add(pmu->dev);
  17631. + if (ret)
  17632. + goto free_dev;
  17633. +
  17634. +out:
  17635. + return ret;
  17636. +
  17637. +free_dev:
  17638. + put_device(pmu->dev);
  17639. + goto out;
  17640. +}
  17641. +
  17642. +static struct lock_class_key cpuctx_mutex;
  17643. +static struct lock_class_key cpuctx_lock;
  17644. +
  17645. +int perf_pmu_register(struct pmu *pmu, const char *name, int type)
  17646. +{
  17647. + int cpu, ret;
  17648. +
  17649. + mutex_lock(&pmus_lock);
  17650. + ret = -ENOMEM;
  17651. + pmu->pmu_disable_count = alloc_percpu(int);
  17652. + if (!pmu->pmu_disable_count)
  17653. + goto unlock;
  17654. +
  17655. + pmu->type = -1;
  17656. + if (!name)
  17657. + goto skip_type;
  17658. + pmu->name = name;
  17659. +
  17660. + if (type < 0) {
  17661. + type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
  17662. + if (type < 0) {
  17663. + ret = type;
  17664. + goto free_pdc;
  17665. + }
  17666. + }
  17667. + pmu->type = type;
  17668. +
  17669. + if (pmu_bus_running) {
  17670. + ret = pmu_dev_alloc(pmu);
  17671. + if (ret)
  17672. + goto free_idr;
  17673. + }
  17674. +
  17675. +skip_type:
  17676. + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
  17677. + if (pmu->pmu_cpu_context)
  17678. + goto got_cpu_context;
  17679. +
  17680. + ret = -ENOMEM;
  17681. + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
  17682. + if (!pmu->pmu_cpu_context)
  17683. + goto free_dev;
  17684. +
  17685. + for_each_possible_cpu(cpu) {
  17686. + struct perf_cpu_context *cpuctx;
  17687. +
  17688. + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
  17689. + __perf_event_init_context(&cpuctx->ctx);
  17690. + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
  17691. + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
  17692. + cpuctx->ctx.type = cpu_context;
  17693. + cpuctx->ctx.pmu = pmu;
  17694. +
  17695. + __perf_cpu_hrtimer_init(cpuctx, cpu);
  17696. +
  17697. + INIT_LIST_HEAD(&cpuctx->rotation_list);
  17698. + cpuctx->unique_pmu = pmu;
  17699. + }
  17700. +
  17701. +got_cpu_context:
  17702. + if (!pmu->start_txn) {
  17703. + if (pmu->pmu_enable) {
  17704. + /*
  17705. + * If we have pmu_enable/pmu_disable calls, install
17706. + * transaction stubs that use them to try and batch
  17707. + * hardware accesses.
  17708. + */
  17709. + pmu->start_txn = perf_pmu_start_txn;
  17710. + pmu->commit_txn = perf_pmu_commit_txn;
  17711. + pmu->cancel_txn = perf_pmu_cancel_txn;
  17712. + } else {
  17713. + pmu->start_txn = perf_pmu_nop_void;
  17714. + pmu->commit_txn = perf_pmu_nop_int;
  17715. + pmu->cancel_txn = perf_pmu_nop_void;
  17716. + }
  17717. + }
  17718. +
  17719. + if (!pmu->pmu_enable) {
  17720. + pmu->pmu_enable = perf_pmu_nop_void;
  17721. + pmu->pmu_disable = perf_pmu_nop_void;
  17722. + }
  17723. +
  17724. + if (!pmu->event_idx)
  17725. + pmu->event_idx = perf_event_idx_default;
  17726. +
  17727. + list_add_rcu(&pmu->entry, &pmus);
  17728. + ret = 0;
  17729. +unlock:
  17730. + mutex_unlock(&pmus_lock);
  17731. +
  17732. + return ret;
  17733. +
  17734. +free_dev:
  17735. + device_del(pmu->dev);
  17736. + put_device(pmu->dev);
  17737. +
  17738. +free_idr:
  17739. + if (pmu->type >= PERF_TYPE_MAX)
  17740. + idr_remove(&pmu_idr, pmu->type);
  17741. +
  17742. +free_pdc:
  17743. + free_percpu(pmu->pmu_disable_count);
  17744. + goto unlock;
  17745. +}
  17746. +EXPORT_SYMBOL_GPL(perf_pmu_register);
  17747. +
  17748. +void perf_pmu_unregister(struct pmu *pmu)
  17749. +{
  17750. + mutex_lock(&pmus_lock);
  17751. + list_del_rcu(&pmu->entry);
  17752. + mutex_unlock(&pmus_lock);
  17753. +
  17754. + /*
  17755. + * We dereference the pmu list under both SRCU and regular RCU, so
  17756. + * synchronize against both of those.
  17757. + */
  17758. + synchronize_srcu(&pmus_srcu);
  17759. + synchronize_rcu();
  17760. +
  17761. + free_percpu(pmu->pmu_disable_count);
  17762. + if (pmu->type >= PERF_TYPE_MAX)
  17763. + idr_remove(&pmu_idr, pmu->type);
  17764. + device_del(pmu->dev);
  17765. + put_device(pmu->dev);
  17766. + free_pmu_context(pmu);
  17767. +}
  17768. +EXPORT_SYMBOL_GPL(perf_pmu_unregister);
  17769. +
  17770. +struct pmu *perf_init_event(struct perf_event *event)
  17771. +{
  17772. + struct pmu *pmu = NULL;
  17773. + int idx;
  17774. + int ret;
  17775. +
  17776. + idx = srcu_read_lock(&pmus_srcu);
  17777. +
  17778. + rcu_read_lock();
  17779. + pmu = idr_find(&pmu_idr, event->attr.type);
  17780. + rcu_read_unlock();
  17781. + if (pmu) {
  17782. + if (!try_module_get(pmu->module)) {
  17783. + pmu = ERR_PTR(-ENODEV);
  17784. + goto unlock;
  17785. + }
  17786. + event->pmu = pmu;
  17787. + ret = pmu->event_init(event);
  17788. + if (ret)
  17789. + pmu = ERR_PTR(ret);
  17790. + goto unlock;
  17791. + }
  17792. +
  17793. + list_for_each_entry_rcu(pmu, &pmus, entry) {
  17794. + if (!try_module_get(pmu->module)) {
  17795. + pmu = ERR_PTR(-ENODEV);
  17796. + goto unlock;
  17797. + }
  17798. + event->pmu = pmu;
  17799. + ret = pmu->event_init(event);
  17800. + if (!ret)
  17801. + goto unlock;
  17802. +
  17803. + if (ret != -ENOENT) {
  17804. + pmu = ERR_PTR(ret);
  17805. + goto unlock;
  17806. + }
  17807. + }
  17808. + pmu = ERR_PTR(-ENOENT);
  17809. +unlock:
  17810. + srcu_read_unlock(&pmus_srcu, idx);
  17811. +
  17812. + return pmu;
  17813. +}
  17814. +
  17815. +static void account_event_cpu(struct perf_event *event, int cpu)
  17816. +{
  17817. + if (event->parent)
  17818. + return;
  17819. +
  17820. + if (has_branch_stack(event)) {
  17821. + if (!(event->attach_state & PERF_ATTACH_TASK))
  17822. + atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
  17823. + }
  17824. + if (is_cgroup_event(event))
  17825. + atomic_inc(&per_cpu(perf_cgroup_events, cpu));
  17826. +}
  17827. +
  17828. +static void account_event(struct perf_event *event)
  17829. +{
  17830. + if (event->parent)
  17831. + return;
  17832. +
  17833. + if (event->attach_state & PERF_ATTACH_TASK)
  17834. + static_key_slow_inc(&perf_sched_events.key);
  17835. + if (event->attr.mmap || event->attr.mmap_data)
  17836. + atomic_inc(&nr_mmap_events);
  17837. + if (event->attr.comm)
  17838. + atomic_inc(&nr_comm_events);
  17839. + if (event->attr.task)
  17840. + atomic_inc(&nr_task_events);
  17841. + if (event->attr.freq) {
  17842. + if (atomic_inc_return(&nr_freq_events) == 1)
  17843. + tick_nohz_full_kick_all();
  17844. + }
  17845. + if (has_branch_stack(event))
  17846. + static_key_slow_inc(&perf_sched_events.key);
  17847. + if (is_cgroup_event(event))
  17848. + static_key_slow_inc(&perf_sched_events.key);
  17849. +
  17850. + account_event_cpu(event, event->cpu);
  17851. +}
  17852. +
  17853. +/*
17854. + * Allocate and initialize an event structure
  17855. + */
  17856. +static struct perf_event *
  17857. +perf_event_alloc(struct perf_event_attr *attr, int cpu,
  17858. + struct task_struct *task,
  17859. + struct perf_event *group_leader,
  17860. + struct perf_event *parent_event,
  17861. + perf_overflow_handler_t overflow_handler,
  17862. + void *context)
  17863. +{
  17864. + struct pmu *pmu;
  17865. + struct perf_event *event;
  17866. + struct hw_perf_event *hwc;
  17867. + long err = -EINVAL;
  17868. +
  17869. + if ((unsigned)cpu >= nr_cpu_ids) {
  17870. + if (!task || cpu != -1)
  17871. + return ERR_PTR(-EINVAL);
  17872. + }
  17873. +
  17874. + event = kzalloc(sizeof(*event), GFP_KERNEL);
  17875. + if (!event)
  17876. + return ERR_PTR(-ENOMEM);
  17877. +
  17878. + /*
  17879. + * Single events are their own group leaders, with an
  17880. + * empty sibling list:
  17881. + */
  17882. + if (!group_leader)
  17883. + group_leader = event;
  17884. +
  17885. + mutex_init(&event->child_mutex);
  17886. + INIT_LIST_HEAD(&event->child_list);
  17887. +
  17888. + INIT_LIST_HEAD(&event->group_entry);
  17889. + INIT_LIST_HEAD(&event->event_entry);
  17890. + INIT_LIST_HEAD(&event->sibling_list);
  17891. + INIT_LIST_HEAD(&event->rb_entry);
  17892. + INIT_LIST_HEAD(&event->active_entry);
  17893. + INIT_HLIST_NODE(&event->hlist_entry);
  17894. +
  17895. +
  17896. + init_waitqueue_head(&event->waitq);
  17897. + init_irq_work(&event->pending, perf_pending_event);
  17898. +
  17899. + mutex_init(&event->mmap_mutex);
  17900. +
  17901. + atomic_long_set(&event->refcount, 1);
  17902. + event->cpu = cpu;
  17903. + event->attr = *attr;
  17904. + event->group_leader = group_leader;
  17905. + event->pmu = NULL;
  17906. + event->oncpu = -1;
  17907. +
  17908. + event->parent = parent_event;
  17909. +
  17910. + event->ns = get_pid_ns(task_active_pid_ns(current));
  17911. + event->id = atomic64_inc_return(&perf_event_id);
  17912. +
  17913. + event->state = PERF_EVENT_STATE_INACTIVE;
  17914. +
  17915. + if (task) {
  17916. + event->attach_state = PERF_ATTACH_TASK;
  17917. +
  17918. + if (attr->type == PERF_TYPE_TRACEPOINT)
  17919. + event->hw.tp_target = task;
  17920. +#ifdef CONFIG_HAVE_HW_BREAKPOINT
  17921. + /*
  17922. + * hw_breakpoint is a bit difficult here..
  17923. + */
  17924. + else if (attr->type == PERF_TYPE_BREAKPOINT)
  17925. + event->hw.bp_target = task;
  17926. +#endif
  17927. + }
  17928. +
  17929. + if (!overflow_handler && parent_event) {
  17930. + overflow_handler = parent_event->overflow_handler;
  17931. + context = parent_event->overflow_handler_context;
  17932. + }
  17933. +
  17934. + event->overflow_handler = overflow_handler;
  17935. + event->overflow_handler_context = context;
  17936. +
  17937. + perf_event__state_init(event);
  17938. +
  17939. + pmu = NULL;
  17940. +
  17941. + hwc = &event->hw;
  17942. + hwc->sample_period = attr->sample_period;
  17943. + if (attr->freq && attr->sample_freq)
  17944. + hwc->sample_period = 1;
  17945. + hwc->last_period = hwc->sample_period;
  17946. +
  17947. + local64_set(&hwc->period_left, hwc->sample_period);
  17948. +
  17949. + /*
  17950. + * we currently do not support PERF_FORMAT_GROUP on inherited events
  17951. + */
  17952. + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
  17953. + goto err_ns;
  17954. +
  17955. + pmu = perf_init_event(event);
  17956. + if (!pmu)
  17957. + goto err_ns;
  17958. + else if (IS_ERR(pmu)) {
  17959. + err = PTR_ERR(pmu);
  17960. + goto err_ns;
  17961. + }
  17962. +
  17963. + if (!event->parent) {
  17964. + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
  17965. + err = get_callchain_buffers();
  17966. + if (err)
  17967. + goto err_pmu;
  17968. + }
  17969. + }
  17970. +
  17971. + return event;
  17972. +
  17973. +err_pmu:
  17974. + if (event->destroy)
  17975. + event->destroy(event);
  17976. + module_put(pmu->module);
  17977. +err_ns:
  17978. + if (event->ns)
  17979. + put_pid_ns(event->ns);
  17980. + kfree(event);
  17981. +
  17982. + return ERR_PTR(err);
  17983. +}
  17984. +
  17985. +static int perf_copy_attr(struct perf_event_attr __user *uattr,
  17986. + struct perf_event_attr *attr)
  17987. +{
  17988. + u32 size;
  17989. + int ret;
  17990. +
  17991. + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
  17992. + return -EFAULT;
  17993. +
  17994. + /*
17995. + * zero the full structure, so that a short copy leaves the tail zeroed.
  17996. + */
  17997. + memset(attr, 0, sizeof(*attr));
  17998. +
  17999. + ret = get_user(size, &uattr->size);
  18000. + if (ret)
  18001. + return ret;
  18002. +
  18003. + if (size > PAGE_SIZE) /* silly large */
  18004. + goto err_size;
  18005. +
  18006. + if (!size) /* abi compat */
  18007. + size = PERF_ATTR_SIZE_VER0;
  18008. +
  18009. + if (size < PERF_ATTR_SIZE_VER0)
  18010. + goto err_size;
  18011. +
  18012. + /*
  18013. + * If we're handed a bigger struct than we know of,
  18014. + * ensure all the unknown bits are 0 - i.e. new
  18015. + * user-space does not rely on any kernel feature
18016. + * extensions we don't know about yet.
  18017. + */
  18018. + if (size > sizeof(*attr)) {
  18019. + unsigned char __user *addr;
  18020. + unsigned char __user *end;
  18021. + unsigned char val;
  18022. +
  18023. + addr = (void __user *)uattr + sizeof(*attr);
  18024. + end = (void __user *)uattr + size;
  18025. +
  18026. + for (; addr < end; addr++) {
  18027. + ret = get_user(val, addr);
  18028. + if (ret)
  18029. + return ret;
  18030. + if (val)
  18031. + goto err_size;
  18032. + }
  18033. + size = sizeof(*attr);
  18034. + }
  18035. +
  18036. + ret = copy_from_user(attr, uattr, size);
  18037. + if (ret)
  18038. + return -EFAULT;
  18039. +
  18040. + if (attr->__reserved_1)
  18041. + return -EINVAL;
  18042. +
  18043. + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
  18044. + return -EINVAL;
  18045. +
  18046. + if (attr->read_format & ~(PERF_FORMAT_MAX-1))
  18047. + return -EINVAL;
  18048. +
  18049. + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
  18050. + u64 mask = attr->branch_sample_type;
  18051. +
  18052. + /* only using defined bits */
  18053. + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
  18054. + return -EINVAL;
  18055. +
  18056. + /* at least one branch bit must be set */
  18057. + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
  18058. + return -EINVAL;
  18059. +
  18060. + /* propagate priv level, when not set for branch */
  18061. + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
  18062. +
  18063. + /* exclude_kernel checked on syscall entry */
  18064. + if (!attr->exclude_kernel)
  18065. + mask |= PERF_SAMPLE_BRANCH_KERNEL;
  18066. +
  18067. + if (!attr->exclude_user)
  18068. + mask |= PERF_SAMPLE_BRANCH_USER;
  18069. +
  18070. + if (!attr->exclude_hv)
  18071. + mask |= PERF_SAMPLE_BRANCH_HV;
  18072. + /*
  18073. + * adjust user setting (for HW filter setup)
  18074. + */
  18075. + attr->branch_sample_type = mask;
  18076. + }
  18077. + /* privileged levels capture (kernel, hv): check permissions */
  18078. + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
  18079. + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
  18080. + return -EACCES;
  18081. + }
  18082. +
  18083. + if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
  18084. + ret = perf_reg_validate(attr->sample_regs_user);
  18085. + if (ret)
  18086. + return ret;
  18087. + }
  18088. +
  18089. + if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
  18090. + if (!arch_perf_have_user_stack_dump())
  18091. + return -ENOSYS;
  18092. +
  18093. + /*
  18094. + * We have __u32 type for the size, but so far
  18095. + * we can only use __u16 as maximum due to the
  18096. + * __u16 sample size limit.
  18097. + */
  18098. + if (attr->sample_stack_user >= USHRT_MAX)
  18099. + ret = -EINVAL;
  18100. + else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
  18101. + ret = -EINVAL;
  18102. + }
  18103. +
  18104. +out:
  18105. + return ret;
  18106. +
  18107. +err_size:
  18108. + put_user(sizeof(*attr), &uattr->size);
  18109. + ret = -E2BIG;
  18110. + goto out;
  18111. +}
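/*
 * A minimal userspace sketch of the forward-compat rule enforced by
 * perf_copy_attr() above: when userspace passes a struct larger than the
 * kernel knows about, every byte beyond the known size must be zero or the
 * call is rejected (E2BIG). Plain memory stands in for the __user pointer;
 * the sizes below are illustrative.
 */
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static int tail_is_zero(const unsigned char *attr, size_t known, size_t given)
{
	for (size_t i = known; i < given; i++)
		if (attr[i])
			return 0;	/* unknown feature bit set: reject */
	return 1;
}

int main(void)
{
	unsigned char attr[128];

	memset(attr, 0, sizeof(attr));
	attr[96] = 1;	/* pretend a field beyond the known size is non-zero */

	printf("accept=%d\n", tail_is_zero(attr, 64, 128));	/* prints 0 */
	return 0;
}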
  18112. +
  18113. +static int
  18114. +perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
  18115. +{
  18116. + struct ring_buffer *rb = NULL;
  18117. + int ret = -EINVAL;
  18118. +
  18119. + if (!output_event)
  18120. + goto set;
  18121. +
  18122. + /* don't allow circular references */
  18123. + if (event == output_event)
  18124. + goto out;
  18125. +
  18126. + /*
  18127. + * Don't allow cross-cpu buffers
  18128. + */
  18129. + if (output_event->cpu != event->cpu)
  18130. + goto out;
  18131. +
  18132. + /*
18133. + * If it's not a per-cpu rb, it must be the same task.
  18134. + */
  18135. + if (output_event->cpu == -1 && output_event->ctx != event->ctx)
  18136. + goto out;
  18137. +
  18138. +set:
  18139. + mutex_lock(&event->mmap_mutex);
  18140. + /* Can't redirect output if we've got an active mmap() */
  18141. + if (atomic_read(&event->mmap_count))
  18142. + goto unlock;
  18143. +
  18144. + if (output_event) {
  18145. + /* get the rb we want to redirect to */
  18146. + rb = ring_buffer_get(output_event);
  18147. + if (!rb)
  18148. + goto unlock;
  18149. + }
  18150. +
  18151. + ring_buffer_attach(event, rb);
  18152. +
  18153. + ret = 0;
  18154. +unlock:
  18155. + mutex_unlock(&event->mmap_mutex);
  18156. +
  18157. +out:
  18158. + return ret;
  18159. +}
  18160. +
  18161. +/**
  18162. + * sys_perf_event_open - open a performance event, associate it to a task/cpu
  18163. + *
  18164. + * @attr_uptr: event_id type attributes for monitoring/sampling
  18165. + * @pid: target pid
  18166. + * @cpu: target cpu
  18167. + * @group_fd: group leader event fd
  18168. + */
  18169. +SYSCALL_DEFINE5(perf_event_open,
  18170. + struct perf_event_attr __user *, attr_uptr,
  18171. + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
  18172. +{
  18173. + struct perf_event *group_leader = NULL, *output_event = NULL;
  18174. + struct perf_event *event, *sibling;
  18175. + struct perf_event_attr attr;
  18176. + struct perf_event_context *ctx;
  18177. + struct file *event_file = NULL;
  18178. + struct fd group = {NULL, 0};
  18179. + struct task_struct *task = NULL;
  18180. + struct pmu *pmu;
  18181. + int event_fd;
  18182. + int move_group = 0;
  18183. + int err;
  18184. + int f_flags = O_RDWR;
  18185. +
  18186. + /* for future expandability... */
  18187. + if (flags & ~PERF_FLAG_ALL)
  18188. + return -EINVAL;
  18189. +
  18190. + err = perf_copy_attr(attr_uptr, &attr);
  18191. + if (err)
  18192. + return err;
  18193. +
  18194. + if (!attr.exclude_kernel) {
  18195. + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
  18196. + return -EACCES;
  18197. + }
  18198. +
  18199. + if (attr.freq) {
  18200. + if (attr.sample_freq > sysctl_perf_event_sample_rate)
  18201. + return -EINVAL;
  18202. + } else {
  18203. + if (attr.sample_period & (1ULL << 63))
  18204. + return -EINVAL;
  18205. + }
  18206. +
  18207. + /*
  18208. + * In cgroup mode, the pid argument is used to pass the fd
  18209. + * opened to the cgroup directory in cgroupfs. The cpu argument
  18210. + * designates the cpu on which to monitor threads from that
  18211. + * cgroup.
  18212. + */
  18213. + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
  18214. + return -EINVAL;
  18215. +
  18216. + if (flags & PERF_FLAG_FD_CLOEXEC)
  18217. + f_flags |= O_CLOEXEC;
  18218. +
  18219. + event_fd = get_unused_fd_flags(f_flags);
  18220. + if (event_fd < 0)
  18221. + return event_fd;
  18222. +
  18223. + if (group_fd != -1) {
  18224. + err = perf_fget_light(group_fd, &group);
  18225. + if (err)
  18226. + goto err_fd;
  18227. + group_leader = group.file->private_data;
  18228. + if (flags & PERF_FLAG_FD_OUTPUT)
  18229. + output_event = group_leader;
  18230. + if (flags & PERF_FLAG_FD_NO_GROUP)
  18231. + group_leader = NULL;
  18232. + }
  18233. +
  18234. + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
  18235. + task = find_lively_task_by_vpid(pid);
  18236. + if (IS_ERR(task)) {
  18237. + err = PTR_ERR(task);
  18238. + goto err_group_fd;
  18239. + }
  18240. + }
  18241. +
  18242. + if (task && group_leader &&
  18243. + group_leader->attr.inherit != attr.inherit) {
  18244. + err = -EINVAL;
  18245. + goto err_task;
  18246. + }
  18247. +
  18248. + get_online_cpus();
  18249. +
  18250. + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
  18251. + NULL, NULL);
  18252. + if (IS_ERR(event)) {
  18253. + err = PTR_ERR(event);
  18254. + goto err_cpus;
  18255. + }
  18256. +
  18257. + if (flags & PERF_FLAG_PID_CGROUP) {
  18258. + err = perf_cgroup_connect(pid, event, &attr, group_leader);
  18259. + if (err) {
  18260. + __free_event(event);
  18261. + goto err_cpus;
  18262. + }
  18263. + }
  18264. +
  18265. + if (is_sampling_event(event)) {
  18266. + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
  18267. + err = -ENOTSUPP;
  18268. + goto err_alloc;
  18269. + }
  18270. + }
  18271. +
  18272. + account_event(event);
  18273. +
  18274. + /*
  18275. + * Special case software events and allow them to be part of
  18276. + * any hardware group.
  18277. + */
  18278. + pmu = event->pmu;
  18279. +
  18280. + if (group_leader &&
  18281. + (is_software_event(event) != is_software_event(group_leader))) {
  18282. + if (is_software_event(event)) {
  18283. + /*
  18284. + * If event and group_leader are not both a software
  18285. + * event, and event is, then group leader is not.
  18286. + *
  18287. + * Allow the addition of software events to !software
  18288. + * groups; this is safe because software events never
  18289. + * fail to schedule.
  18290. + */
  18291. + pmu = group_leader->pmu;
  18292. + } else if (is_software_event(group_leader) &&
  18293. + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
  18294. + /*
  18295. + * In case the group is a pure software group, and we
  18296. + * try to add a hardware event, move the whole group to
  18297. + * the hardware context.
  18298. + */
  18299. + move_group = 1;
  18300. + }
  18301. + }
  18302. +
  18303. + /*
  18304. + * Get the target context (task or percpu):
  18305. + */
  18306. + ctx = find_get_context(pmu, task, event->cpu);
  18307. + if (IS_ERR(ctx)) {
  18308. + err = PTR_ERR(ctx);
  18309. + goto err_alloc;
  18310. + }
  18311. +
  18312. + if (task) {
  18313. + put_task_struct(task);
  18314. + task = NULL;
  18315. + }
  18316. +
  18317. + /*
  18318. + * Look up the group leader (we will attach this event to it):
  18319. + */
  18320. + if (group_leader) {
  18321. + err = -EINVAL;
  18322. +
  18323. + /*
  18324. + * Do not allow a recursive hierarchy (this new sibling
  18325. + * becoming part of another group-sibling):
  18326. + */
  18327. + if (group_leader->group_leader != group_leader)
  18328. + goto err_context;
  18329. + /*
  18330. + * Do not allow to attach to a group in a different
  18331. + * task or CPU context:
  18332. + */
  18333. + if (move_group) {
  18334. + if (group_leader->ctx->type != ctx->type)
  18335. + goto err_context;
  18336. + } else {
  18337. + if (group_leader->ctx != ctx)
  18338. + goto err_context;
  18339. + }
  18340. +
  18341. + /*
  18342. + * Only a group leader can be exclusive or pinned
  18343. + */
  18344. + if (attr.exclusive || attr.pinned)
  18345. + goto err_context;
  18346. + }
  18347. +
  18348. + if (output_event) {
  18349. + err = perf_event_set_output(event, output_event);
  18350. + if (err)
  18351. + goto err_context;
  18352. + }
  18353. +
  18354. + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
  18355. + f_flags);
  18356. + if (IS_ERR(event_file)) {
  18357. + err = PTR_ERR(event_file);
  18358. + goto err_context;
  18359. + }
  18360. +
  18361. + if (move_group) {
  18362. + struct perf_event_context *gctx = group_leader->ctx;
  18363. +
  18364. + mutex_lock(&gctx->mutex);
  18365. + perf_remove_from_context(group_leader, false);
  18366. +
  18367. + /*
  18368. + * Removing from the context ends up with a disabled
  18369. + * event. What we want here is the event in its initial
  18370. + * startup state, ready to be added into the new context.
  18371. + */
  18372. + perf_event__state_init(group_leader);
  18373. + list_for_each_entry(sibling, &group_leader->sibling_list,
  18374. + group_entry) {
  18375. + perf_remove_from_context(sibling, false);
  18376. + perf_event__state_init(sibling);
  18377. + put_ctx(gctx);
  18378. + }
  18379. + mutex_unlock(&gctx->mutex);
  18380. + put_ctx(gctx);
  18381. + }
  18382. +
  18383. + WARN_ON_ONCE(ctx->parent_ctx);
  18384. + mutex_lock(&ctx->mutex);
  18385. +
  18386. + if (move_group) {
  18387. + synchronize_rcu();
  18388. + perf_install_in_context(ctx, group_leader, group_leader->cpu);
  18389. + get_ctx(ctx);
  18390. + list_for_each_entry(sibling, &group_leader->sibling_list,
  18391. + group_entry) {
  18392. + perf_install_in_context(ctx, sibling, sibling->cpu);
  18393. + get_ctx(ctx);
  18394. + }
  18395. + }
  18396. +
  18397. + perf_install_in_context(ctx, event, event->cpu);
  18398. + perf_unpin_context(ctx);
  18399. + mutex_unlock(&ctx->mutex);
  18400. +
  18401. + put_online_cpus();
  18402. +
  18403. + event->owner = current;
  18404. +
  18405. + mutex_lock(&current->perf_event_mutex);
  18406. + list_add_tail(&event->owner_entry, &current->perf_event_list);
  18407. + mutex_unlock(&current->perf_event_mutex);
  18408. +
  18409. + /*
  18410. + * Precalculate sample_data sizes
  18411. + */
  18412. + perf_event__header_size(event);
  18413. + perf_event__id_header_size(event);
  18414. +
  18415. + /*
  18416. + * Drop the reference on the group_event after placing the
  18417. + * new event on the sibling_list. This ensures destruction
  18418. + * of the group leader will find the pointer to itself in
  18419. + * perf_group_detach().
  18420. + */
  18421. + fdput(group);
  18422. + fd_install(event_fd, event_file);
  18423. + return event_fd;
  18424. +
  18425. +err_context:
  18426. + perf_unpin_context(ctx);
  18427. + put_ctx(ctx);
  18428. +err_alloc:
  18429. + free_event(event);
  18430. +err_cpus:
  18431. + put_online_cpus();
  18432. +err_task:
  18433. + if (task)
  18434. + put_task_struct(task);
  18435. +err_group_fd:
  18436. + fdput(group);
  18437. +err_fd:
  18438. + put_unused_fd(event_fd);
  18439. + return err;
  18440. +}
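For reference, a minimal user-space sketch (assumed, not part of the patch) of the syscall defined above: count instructions retired by the calling thread, with exclude_kernel set so the perf_paranoid_kernel() check is not triggered.

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct perf_event_attr attr;
            uint64_t count;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_HARDWARE;
            attr.size = sizeof(attr);
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.disabled = 1;
            attr.exclude_kernel = 1;
            attr.exclude_hv = 1;

            /* pid == 0, cpu == -1: measure the calling thread on any CPU */
            fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
            if (fd < 0)
                    return 1;

            ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
            /* ... workload ... */
            ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("instructions: %llu\n", (unsigned long long)count);
            close(fd);
            return 0;
    }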
  18441. +
  18442. +/**
  18443. + * perf_event_create_kernel_counter
  18444. + *
  18445. + * @attr: attributes of the counter to create
  18446. + * @cpu: cpu in which the counter is bound
  18447. + * @task: task to profile (NULL for percpu)
  18448. + */
  18449. +struct perf_event *
  18450. +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
  18451. + struct task_struct *task,
  18452. + perf_overflow_handler_t overflow_handler,
  18453. + void *context)
  18454. +{
  18455. + struct perf_event_context *ctx;
  18456. + struct perf_event *event;
  18457. + int err;
  18458. +
  18459. + /*
  18460. + * Get the target context (task or percpu):
  18461. + */
  18462. +
  18463. + event = perf_event_alloc(attr, cpu, task, NULL, NULL,
  18464. + overflow_handler, context);
  18465. + if (IS_ERR(event)) {
  18466. + err = PTR_ERR(event);
  18467. + goto err;
  18468. + }
  18469. +
  18470. + /* Mark owner so we could distinguish it from user events. */
  18471. + event->owner = EVENT_OWNER_KERNEL;
  18472. +
  18473. + account_event(event);
  18474. +
  18475. + ctx = find_get_context(event->pmu, task, cpu);
  18476. + if (IS_ERR(ctx)) {
  18477. + err = PTR_ERR(ctx);
  18478. + goto err_free;
  18479. + }
  18480. +
  18481. + WARN_ON_ONCE(ctx->parent_ctx);
  18482. + mutex_lock(&ctx->mutex);
  18483. + perf_install_in_context(ctx, event, cpu);
  18484. + perf_unpin_context(ctx);
  18485. + mutex_unlock(&ctx->mutex);
  18486. +
  18487. + return event;
  18488. +
  18489. +err_free:
  18490. + free_event(event);
  18491. +err:
  18492. + return ERR_PTR(err);
  18493. +}
  18494. +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
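A hedged in-kernel sketch of the export above (callback and attribute values are assumptions, not taken from the patch): create a CPU-bound cycle counter with an overflow handler, in the style of the hw-breakpoint and watchdog users; the event is later released with perf_event_release_kernel().

    static void my_overflow(struct perf_event *event,
                            struct perf_sample_data *data,
                            struct pt_regs *regs)
    {
            /* runs from the PMU overflow interrupt */
    }

    static struct perf_event *start_cycle_counter(int cpu)
    {
            struct perf_event_attr attr = {
                    .type          = PERF_TYPE_HARDWARE,
                    .config        = PERF_COUNT_HW_CPU_CYCLES,
                    .size          = sizeof(struct perf_event_attr),
                    .sample_period = 1000000,
            };

            /* task == NULL binds the counter to @cpu rather than a task */
            return perf_event_create_kernel_counter(&attr, cpu, NULL,
                                                    my_overflow, NULL);
    }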
  18495. +
  18496. +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
  18497. +{
  18498. + struct perf_event_context *src_ctx;
  18499. + struct perf_event_context *dst_ctx;
  18500. + struct perf_event *event, *tmp;
  18501. + LIST_HEAD(events);
  18502. +
  18503. + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
  18504. + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
  18505. +
  18506. + mutex_lock(&src_ctx->mutex);
  18507. + list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
  18508. + event_entry) {
  18509. + perf_remove_from_context(event, false);
  18510. + unaccount_event_cpu(event, src_cpu);
  18511. + put_ctx(src_ctx);
  18512. + list_add(&event->migrate_entry, &events);
  18513. + }
  18514. + mutex_unlock(&src_ctx->mutex);
  18515. +
  18516. + synchronize_rcu();
  18517. +
  18518. + mutex_lock(&dst_ctx->mutex);
  18519. + list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
  18520. + list_del(&event->migrate_entry);
  18521. + if (event->state >= PERF_EVENT_STATE_OFF)
  18522. + event->state = PERF_EVENT_STATE_INACTIVE;
  18523. + account_event_cpu(event, dst_cpu);
  18524. + perf_install_in_context(dst_ctx, event, dst_cpu);
  18525. + get_ctx(dst_ctx);
  18526. + }
  18527. + mutex_unlock(&dst_ctx->mutex);
  18528. +}
  18529. +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
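For context, a hedged fragment (the pmu and CPU variables are placeholders): drivers for shared, package-wide PMUs call this from their CPU-hotplug callbacks so that events bound to an offlining CPU keep counting on a surviving CPU of the same package, e.g.

    /* old_cpu is going offline; new_cpu is another CPU on the same package */
    perf_pmu_migrate_context(&my_uncore_pmu, old_cpu, new_cpu);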
  18530. +
  18531. +static void sync_child_event(struct perf_event *child_event,
  18532. + struct task_struct *child)
  18533. +{
  18534. + struct perf_event *parent_event = child_event->parent;
  18535. + u64 child_val;
  18536. +
  18537. + if (child_event->attr.inherit_stat)
  18538. + perf_event_read_event(child_event, child);
  18539. +
  18540. + child_val = perf_event_count(child_event);
  18541. +
  18542. + /*
  18543. + * Add back the child's count to the parent's count:
  18544. + */
  18545. + atomic64_add(child_val, &parent_event->child_count);
  18546. + atomic64_add(child_event->total_time_enabled,
  18547. + &parent_event->child_total_time_enabled);
  18548. + atomic64_add(child_event->total_time_running,
  18549. + &parent_event->child_total_time_running);
  18550. +
  18551. + /*
  18552. + * Remove this event from the parent's list
  18553. + */
  18554. + WARN_ON_ONCE(parent_event->ctx->parent_ctx);
  18555. + mutex_lock(&parent_event->child_mutex);
  18556. + list_del_init(&child_event->child_list);
  18557. + mutex_unlock(&parent_event->child_mutex);
  18558. +
  18559. + /*
  18560. + * Make sure user/parent gets notified that we just
  18561. + * lost one event.
  18562. + */
  18563. + perf_event_wakeup(parent_event);
  18564. +
  18565. + /*
  18566. + * Release the parent event, if this was the last
  18567. + * reference to it.
  18568. + */
  18569. + put_event(parent_event);
  18570. +}
  18571. +
  18572. +static void
  18573. +__perf_event_exit_task(struct perf_event *child_event,
  18574. + struct perf_event_context *child_ctx,
  18575. + struct task_struct *child)
  18576. +{
  18577. + /*
  18578. + * Do not destroy the 'original' grouping; because of the context
  18579. + * switch optimization the original events could've ended up in a
  18580. + * random child task.
  18581. + *
  18582. + * If we were to destroy the original group, all group related
  18583. + * operations would cease to function properly after this random
  18584. + * child dies.
  18585. + *
  18586. + * Do destroy all inherited groups; we don't care about those
  18587. + * and being thorough is better.
  18588. + */
  18589. + perf_remove_from_context(child_event, !!child_event->parent);
  18590. +
  18591. + /*
  18592. + * It can happen that the parent exits first, and has events
  18593. + * that are still around due to the child reference. These
  18594. + * events need to be zapped.
  18595. + */
  18596. + if (child_event->parent) {
  18597. + sync_child_event(child_event, child);
  18598. + free_event(child_event);
  18599. + } else {
  18600. + child_event->state = PERF_EVENT_STATE_EXIT;
  18601. + perf_event_wakeup(child_event);
  18602. + }
  18603. +}
  18604. +
  18605. +static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
  18606. +{
  18607. + struct perf_event *child_event, *next;
  18608. + struct perf_event_context *child_ctx, *clone_ctx = NULL;
  18609. + unsigned long flags;
  18610. +
  18611. + if (likely(!child->perf_event_ctxp[ctxn])) {
  18612. + perf_event_task(child, NULL, 0);
  18613. + return;
  18614. + }
  18615. +
  18616. + local_irq_save(flags);
  18617. + /*
  18618. + * We can't reschedule here because interrupts are disabled,
  18619. + * and either child is current or it is a task that can't be
  18620. + * scheduled, so we are now safe from rescheduling changing
  18621. + * our context.
  18622. + */
  18623. + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
  18624. +
  18625. + /*
  18626. + * Take the context lock here so that if find_get_context is
  18627. + * reading child->perf_event_ctxp, we wait until it has
  18628. + * incremented the context's refcount before we do put_ctx below.
  18629. + */
  18630. + raw_spin_lock(&child_ctx->lock);
  18631. + task_ctx_sched_out(child_ctx);
  18632. + child->perf_event_ctxp[ctxn] = NULL;
  18633. +
  18634. + /*
  18635. + * If this context is a clone; unclone it so it can't get
  18636. + * swapped to another process while we're removing all
  18637. + * the events from it.
  18638. + */
  18639. + clone_ctx = unclone_ctx(child_ctx);
  18640. + update_context_time(child_ctx);
  18641. + raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
  18642. +
  18643. + if (clone_ctx)
  18644. + put_ctx(clone_ctx);
  18645. +
  18646. + /*
  18647. + * Report the task dead after unscheduling the events so that we
  18648. + * won't get any samples after PERF_RECORD_EXIT. We can however still
  18649. + * get a few PERF_RECORD_READ events.
  18650. + */
  18651. + perf_event_task(child, child_ctx, 0);
  18652. +
  18653. + /*
  18654. + * We can recurse on the same lock type through:
  18655. + *
  18656. + * __perf_event_exit_task()
  18657. + * sync_child_event()
  18658. + * put_event()
  18659. + * mutex_lock(&ctx->mutex)
  18660. + *
  18661. + * But since it's the parent context it won't be the same instance.
  18662. + */
  18663. + mutex_lock(&child_ctx->mutex);
  18664. +
  18665. + list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
  18666. + __perf_event_exit_task(child_event, child_ctx, child);
  18667. +
  18668. + mutex_unlock(&child_ctx->mutex);
  18669. +
  18670. + put_ctx(child_ctx);
  18671. +}
  18672. +
  18673. +/*
  18674. + * When a child task exits, feed back event values to parent events.
  18675. + */
  18676. +void perf_event_exit_task(struct task_struct *child)
  18677. +{
  18678. + struct perf_event *event, *tmp;
  18679. + int ctxn;
  18680. +
  18681. + mutex_lock(&child->perf_event_mutex);
  18682. + list_for_each_entry_safe(event, tmp, &child->perf_event_list,
  18683. + owner_entry) {
  18684. + list_del_init(&event->owner_entry);
  18685. +
  18686. + /*
  18687. + * Ensure the list deletion is visible before we clear
  18688. + * the owner; this closes a race against perf_release() where
  18689. + * we need to serialize on the owner->perf_event_mutex.
  18690. + */
  18691. + smp_wmb();
  18692. + event->owner = NULL;
  18693. + }
  18694. + mutex_unlock(&child->perf_event_mutex);
  18695. +
  18696. + for_each_task_context_nr(ctxn)
  18697. + perf_event_exit_task_context(child, ctxn);
  18698. +}
  18699. +
  18700. +static void perf_free_event(struct perf_event *event,
  18701. + struct perf_event_context *ctx)
  18702. +{
  18703. + struct perf_event *parent = event->parent;
  18704. +
  18705. + if (WARN_ON_ONCE(!parent))
  18706. + return;
  18707. +
  18708. + mutex_lock(&parent->child_mutex);
  18709. + list_del_init(&event->child_list);
  18710. + mutex_unlock(&parent->child_mutex);
  18711. +
  18712. + put_event(parent);
  18713. +
  18714. + perf_group_detach(event);
  18715. + list_del_event(event, ctx);
  18716. + free_event(event);
  18717. +}
  18718. +
  18719. +/*
  18720. + * Free an unexposed, unused context created by inheritance in
  18721. + * perf_event_init_task() below; used by fork() in the failure case.
  18722. + */
  18723. +void perf_event_free_task(struct task_struct *task)
  18724. +{
  18725. + struct perf_event_context *ctx;
  18726. + struct perf_event *event, *tmp;
  18727. + int ctxn;
  18728. +
  18729. + for_each_task_context_nr(ctxn) {
  18730. + ctx = task->perf_event_ctxp[ctxn];
  18731. + if (!ctx)
  18732. + continue;
  18733. +
  18734. + mutex_lock(&ctx->mutex);
  18735. +again:
  18736. + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
  18737. + group_entry)
  18738. + perf_free_event(event, ctx);
  18739. +
  18740. + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
  18741. + group_entry)
  18742. + perf_free_event(event, ctx);
  18743. +
  18744. + if (!list_empty(&ctx->pinned_groups) ||
  18745. + !list_empty(&ctx->flexible_groups))
  18746. + goto again;
  18747. +
  18748. + mutex_unlock(&ctx->mutex);
  18749. +
  18750. + put_ctx(ctx);
  18751. + }
  18752. +}
  18753. +
  18754. +void perf_event_delayed_put(struct task_struct *task)
  18755. +{
  18756. + int ctxn;
  18757. +
  18758. + for_each_task_context_nr(ctxn)
  18759. + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
  18760. +}
  18761. +
  18762. +/*
  18763. + * inherit an event from parent task to child task:
  18764. + */
  18765. +static struct perf_event *
  18766. +inherit_event(struct perf_event *parent_event,
  18767. + struct task_struct *parent,
  18768. + struct perf_event_context *parent_ctx,
  18769. + struct task_struct *child,
  18770. + struct perf_event *group_leader,
  18771. + struct perf_event_context *child_ctx)
  18772. +{
  18773. + enum perf_event_active_state parent_state = parent_event->state;
  18774. + struct perf_event *child_event;
  18775. + unsigned long flags;
  18776. +
  18777. + /*
  18778. + * Instead of creating recursive hierarchies of events,
  18779. + * we link inherited events back to the original parent,
  18780. + * which has a filp for sure, which we use as the reference
  18781. + * count:
  18782. + */
  18783. + if (parent_event->parent)
  18784. + parent_event = parent_event->parent;
  18785. +
  18786. + child_event = perf_event_alloc(&parent_event->attr,
  18787. + parent_event->cpu,
  18788. + child,
  18789. + group_leader, parent_event,
  18790. + NULL, NULL);
  18791. + if (IS_ERR(child_event))
  18792. + return child_event;
  18793. +
  18794. + if (is_orphaned_event(parent_event) ||
  18795. + !atomic_long_inc_not_zero(&parent_event->refcount)) {
  18796. + free_event(child_event);
  18797. + return NULL;
  18798. + }
  18799. +
  18800. + get_ctx(child_ctx);
  18801. +
  18802. + /*
  18803. + * Make the child state follow the state of the parent event,
  18804. + * not its attr.disabled bit. We hold the parent's mutex,
  18805. + * so we won't race with perf_event_{en, dis}able_family.
  18806. + */
  18807. + if (parent_state >= PERF_EVENT_STATE_INACTIVE)
  18808. + child_event->state = PERF_EVENT_STATE_INACTIVE;
  18809. + else
  18810. + child_event->state = PERF_EVENT_STATE_OFF;
  18811. +
  18812. + if (parent_event->attr.freq) {
  18813. + u64 sample_period = parent_event->hw.sample_period;
  18814. + struct hw_perf_event *hwc = &child_event->hw;
  18815. +
  18816. + hwc->sample_period = sample_period;
  18817. + hwc->last_period = sample_period;
  18818. +
  18819. + local64_set(&hwc->period_left, sample_period);
  18820. + }
  18821. +
  18822. + child_event->ctx = child_ctx;
  18823. + child_event->overflow_handler = parent_event->overflow_handler;
  18824. + child_event->overflow_handler_context
  18825. + = parent_event->overflow_handler_context;
  18826. +
  18827. + /*
  18828. + * Precalculate sample_data sizes
  18829. + */
  18830. + perf_event__header_size(child_event);
  18831. + perf_event__id_header_size(child_event);
  18832. +
  18833. + /*
  18834. + * Link it up in the child's context:
  18835. + */
  18836. + raw_spin_lock_irqsave(&child_ctx->lock, flags);
  18837. + add_event_to_ctx(child_event, child_ctx);
  18838. + raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
  18839. +
  18840. + /*
  18841. + * Link this into the parent event's child list
  18842. + */
  18843. + WARN_ON_ONCE(parent_event->ctx->parent_ctx);
  18844. + mutex_lock(&parent_event->child_mutex);
  18845. + list_add_tail(&child_event->child_list, &parent_event->child_list);
  18846. + mutex_unlock(&parent_event->child_mutex);
  18847. +
  18848. + return child_event;
  18849. +}
  18850. +
  18851. +static int inherit_group(struct perf_event *parent_event,
  18852. + struct task_struct *parent,
  18853. + struct perf_event_context *parent_ctx,
  18854. + struct task_struct *child,
  18855. + struct perf_event_context *child_ctx)
  18856. +{
  18857. + struct perf_event *leader;
  18858. + struct perf_event *sub;
  18859. + struct perf_event *child_ctr;
  18860. +
  18861. + leader = inherit_event(parent_event, parent, parent_ctx,
  18862. + child, NULL, child_ctx);
  18863. + if (IS_ERR(leader))
  18864. + return PTR_ERR(leader);
  18865. + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
  18866. + child_ctr = inherit_event(sub, parent, parent_ctx,
  18867. + child, leader, child_ctx);
  18868. + if (IS_ERR(child_ctr))
  18869. + return PTR_ERR(child_ctr);
  18870. + }
  18871. + return 0;
  18872. +}
  18873. +
  18874. +static int
  18875. +inherit_task_group(struct perf_event *event, struct task_struct *parent,
  18876. + struct perf_event_context *parent_ctx,
  18877. + struct task_struct *child, int ctxn,
  18878. + int *inherited_all)
  18879. +{
  18880. + int ret;
  18881. + struct perf_event_context *child_ctx;
  18882. +
  18883. + if (!event->attr.inherit) {
  18884. + *inherited_all = 0;
  18885. + return 0;
  18886. + }
  18887. +
  18888. + child_ctx = child->perf_event_ctxp[ctxn];
  18889. + if (!child_ctx) {
  18890. + /*
  18891. + * This is executed from the parent task context, so
  18892. + * inherit events that have been marked for cloning.
  18893. + * First allocate and initialize a context for the
  18894. + * child.
  18895. + */
  18896. +
  18897. + child_ctx = alloc_perf_context(parent_ctx->pmu, child);
  18898. + if (!child_ctx)
  18899. + return -ENOMEM;
  18900. +
  18901. + child->perf_event_ctxp[ctxn] = child_ctx;
  18902. + }
  18903. +
  18904. + ret = inherit_group(event, parent, parent_ctx,
  18905. + child, child_ctx);
  18906. +
  18907. + if (ret)
  18908. + *inherited_all = 0;
  18909. +
  18910. + return ret;
  18911. +}
  18912. +
  18913. +/*
  18914. + * Initialize the perf_event context in task_struct
  18915. + */
  18916. +static int perf_event_init_context(struct task_struct *child, int ctxn)
  18917. +{
  18918. + struct perf_event_context *child_ctx, *parent_ctx;
  18919. + struct perf_event_context *cloned_ctx;
  18920. + struct perf_event *event;
  18921. + struct task_struct *parent = current;
  18922. + int inherited_all = 1;
  18923. + unsigned long flags;
  18924. + int ret = 0;
  18925. +
  18926. + if (likely(!parent->perf_event_ctxp[ctxn]))
  18927. + return 0;
  18928. +
  18929. + /*
  18930. + * If the parent's context is a clone, pin it so it won't get
  18931. + * swapped under us.
  18932. + */
  18933. + parent_ctx = perf_pin_task_context(parent, ctxn);
  18934. + if (!parent_ctx)
  18935. + return 0;
  18936. +
  18937. + /*
  18938. + * No need to check if parent_ctx != NULL here; since we saw
  18939. + * it non-NULL earlier, the only reason for it to become NULL
  18940. + * is if we exit, and since we're currently in the middle of
  18941. + * a fork we can't be exiting at the same time.
  18942. + */
  18943. +
  18944. + /*
  18945. + * Lock the parent list. No need to lock the child - not PID
  18946. + * hashed yet and not running, so nobody can access it.
  18947. + */
  18948. + mutex_lock(&parent_ctx->mutex);
  18949. +
  18950. + /*
  18951. + * We don't have to disable NMIs - we are only looking at
  18952. + * the list, not manipulating it:
  18953. + */
  18954. + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
  18955. + ret = inherit_task_group(event, parent, parent_ctx,
  18956. + child, ctxn, &inherited_all);
  18957. + if (ret)
  18958. + break;
  18959. + }
  18960. +
  18961. + /*
  18962. + * We can't hold ctx->lock when iterating the ->flexible_group list due
  18963. + * to allocations, but we need to prevent rotation because
  18964. + * rotate_ctx() will change the list from interrupt context.
  18965. + */
  18966. + raw_spin_lock_irqsave(&parent_ctx->lock, flags);
  18967. + parent_ctx->rotate_disable = 1;
  18968. + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  18969. +
  18970. + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
  18971. + ret = inherit_task_group(event, parent, parent_ctx,
  18972. + child, ctxn, &inherited_all);
  18973. + if (ret)
  18974. + break;
  18975. + }
  18976. +
  18977. + raw_spin_lock_irqsave(&parent_ctx->lock, flags);
  18978. + parent_ctx->rotate_disable = 0;
  18979. +
  18980. + child_ctx = child->perf_event_ctxp[ctxn];
  18981. +
  18982. + if (child_ctx && inherited_all) {
  18983. + /*
  18984. + * Mark the child context as a clone of the parent
  18985. + * context, or of whatever the parent is a clone of.
  18986. + *
  18987. + * Note that if the parent is a clone, the holding of
  18988. + * parent_ctx->lock prevents it from being uncloned.
  18989. + */
  18990. + cloned_ctx = parent_ctx->parent_ctx;
  18991. + if (cloned_ctx) {
  18992. + child_ctx->parent_ctx = cloned_ctx;
  18993. + child_ctx->parent_gen = parent_ctx->parent_gen;
  18994. + } else {
  18995. + child_ctx->parent_ctx = parent_ctx;
  18996. + child_ctx->parent_gen = parent_ctx->generation;
  18997. + }
  18998. + get_ctx(child_ctx->parent_ctx);
  18999. + }
  19000. +
  19001. + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  19002. + mutex_unlock(&parent_ctx->mutex);
  19003. +
  19004. + perf_unpin_context(parent_ctx);
  19005. + put_ctx(parent_ctx);
  19006. +
  19007. + return ret;
  19008. +}
  19009. +
  19010. +/*
  19011. + * Initialize the perf_event context in task_struct
  19012. + */
  19013. +int perf_event_init_task(struct task_struct *child)
  19014. +{
  19015. + int ctxn, ret;
  19016. +
  19017. + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
  19018. + mutex_init(&child->perf_event_mutex);
  19019. + INIT_LIST_HEAD(&child->perf_event_list);
  19020. +
  19021. + for_each_task_context_nr(ctxn) {
  19022. + ret = perf_event_init_context(child, ctxn);
  19023. + if (ret) {
  19024. + perf_event_free_task(child);
  19025. + return ret;
  19026. + }
  19027. + }
  19028. +
  19029. + return 0;
  19030. +}
  19031. +
  19032. +static void __init perf_event_init_all_cpus(void)
  19033. +{
  19034. + struct swevent_htable *swhash;
  19035. + int cpu;
  19036. +
  19037. + for_each_possible_cpu(cpu) {
  19038. + swhash = &per_cpu(swevent_htable, cpu);
  19039. + mutex_init(&swhash->hlist_mutex);
  19040. + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
  19041. + }
  19042. +}
  19043. +
  19044. +static void perf_event_init_cpu(int cpu)
  19045. +{
  19046. + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  19047. +
  19048. + mutex_lock(&swhash->hlist_mutex);
  19049. + swhash->online = true;
  19050. + if (swhash->hlist_refcount > 0) {
  19051. + struct swevent_hlist *hlist;
  19052. +
  19053. + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
  19054. + WARN_ON(!hlist);
  19055. + rcu_assign_pointer(swhash->swevent_hlist, hlist);
  19056. + }
  19057. + mutex_unlock(&swhash->hlist_mutex);
  19058. +}
  19059. +
  19060. +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
  19061. +static void perf_pmu_rotate_stop(struct pmu *pmu)
  19062. +{
  19063. + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
  19064. +
  19065. + WARN_ON(!irqs_disabled());
  19066. +
  19067. + list_del_init(&cpuctx->rotation_list);
  19068. +}
  19069. +
  19070. +static void __perf_event_exit_context(void *__info)
  19071. +{
  19072. + struct remove_event re = { .detach_group = true };
  19073. + struct perf_event_context *ctx = __info;
  19074. +
  19075. + perf_pmu_rotate_stop(ctx->pmu);
  19076. +
  19077. + rcu_read_lock();
  19078. + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
  19079. + __perf_remove_from_context(&re);
  19080. + rcu_read_unlock();
  19081. +}
  19082. +
  19083. +static void perf_event_exit_cpu_context(int cpu)
  19084. +{
  19085. + struct perf_event_context *ctx;
  19086. + struct pmu *pmu;
  19087. + int idx;
  19088. +
  19089. + idx = srcu_read_lock(&pmus_srcu);
  19090. + list_for_each_entry_rcu(pmu, &pmus, entry) {
  19091. + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
  19092. +
  19093. + mutex_lock(&ctx->mutex);
  19094. + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
  19095. + mutex_unlock(&ctx->mutex);
  19096. + }
  19097. + srcu_read_unlock(&pmus_srcu, idx);
  19098. +}
  19099. +
  19100. +static void perf_event_exit_cpu(int cpu)
  19101. +{
  19102. + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
  19103. +
  19104. + perf_event_exit_cpu_context(cpu);
  19105. +
  19106. + mutex_lock(&swhash->hlist_mutex);
  19107. + swhash->online = false;
  19108. + swevent_hlist_release(swhash);
  19109. + mutex_unlock(&swhash->hlist_mutex);
  19110. +}
  19111. +#else
  19112. +static inline void perf_event_exit_cpu(int cpu) { }
  19113. +#endif
  19114. +
  19115. +static int
  19116. +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
  19117. +{
  19118. + int cpu;
  19119. +
  19120. + for_each_online_cpu(cpu)
  19121. + perf_event_exit_cpu(cpu);
  19122. +
  19123. + return NOTIFY_OK;
  19124. +}
  19125. +
  19126. +/*
  19127. + * Run the perf reboot notifier at the very last possible moment so that
  19128. + * the generic watchdog code runs as long as possible.
  19129. + */
  19130. +static struct notifier_block perf_reboot_notifier = {
  19131. + .notifier_call = perf_reboot,
  19132. + .priority = INT_MIN,
  19133. +};
  19134. +
  19135. +static int
  19136. +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
  19137. +{
  19138. + unsigned int cpu = (long)hcpu;
  19139. +
  19140. + switch (action & ~CPU_TASKS_FROZEN) {
  19141. +
  19142. + case CPU_UP_PREPARE:
  19143. + case CPU_DOWN_FAILED:
  19144. + perf_event_init_cpu(cpu);
  19145. + break;
  19146. +
  19147. + case CPU_UP_CANCELED:
  19148. + case CPU_DOWN_PREPARE:
  19149. + perf_event_exit_cpu(cpu);
  19150. + break;
  19151. + default:
  19152. + break;
  19153. + }
  19154. +
  19155. + return NOTIFY_OK;
  19156. +}
  19157. +
  19158. +void __init perf_event_init(void)
  19159. +{
  19160. + int ret;
  19161. +
  19162. + idr_init(&pmu_idr);
  19163. +
  19164. + perf_event_init_all_cpus();
  19165. + init_srcu_struct(&pmus_srcu);
  19166. + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
  19167. + perf_pmu_register(&perf_cpu_clock, NULL, -1);
  19168. + perf_pmu_register(&perf_task_clock, NULL, -1);
  19169. + perf_tp_register();
  19170. + perf_cpu_notifier(perf_cpu_notify);
  19171. + register_reboot_notifier(&perf_reboot_notifier);
  19172. +
  19173. + ret = init_hw_breakpoint();
  19174. + WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
  19175. +
  19176. + /* do not patch jump label more than once per second */
  19177. + jump_label_rate_limit(&perf_sched_events, HZ);
  19178. +
  19179. + /*
  19180. + * Build time assertion that we keep the data_head at the intended
  19181. + * location. IOW, validation that we got the __reserved[] size right.
  19182. + */
  19183. + BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
  19184. + != 1024);
  19185. +}
  19186. +
  19187. +static int __init perf_event_sysfs_init(void)
  19188. +{
  19189. + struct pmu *pmu;
  19190. + int ret;
  19191. +
  19192. + mutex_lock(&pmus_lock);
  19193. +
  19194. + ret = bus_register(&pmu_bus);
  19195. + if (ret)
  19196. + goto unlock;
  19197. +
  19198. + list_for_each_entry(pmu, &pmus, entry) {
  19199. + if (!pmu->name || pmu->type < 0)
  19200. + continue;
  19201. +
  19202. + ret = pmu_dev_alloc(pmu);
  19203. + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
  19204. + }
  19205. + pmu_bus_running = 1;
  19206. + ret = 0;
  19207. +
  19208. +unlock:
  19209. + mutex_unlock(&pmus_lock);
  19210. +
  19211. + return ret;
  19212. +}
  19213. +device_initcall(perf_event_sysfs_init);
  19214. +
  19215. +#ifdef CONFIG_CGROUP_PERF
  19216. +static struct cgroup_subsys_state *
  19217. +perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
  19218. +{
  19219. + struct perf_cgroup *jc;
  19220. +
  19221. + jc = kzalloc(sizeof(*jc), GFP_KERNEL);
  19222. + if (!jc)
  19223. + return ERR_PTR(-ENOMEM);
  19224. +
  19225. + jc->info = alloc_percpu(struct perf_cgroup_info);
  19226. + if (!jc->info) {
  19227. + kfree(jc);
  19228. + return ERR_PTR(-ENOMEM);
  19229. + }
  19230. +
  19231. + return &jc->css;
  19232. +}
  19233. +
  19234. +static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
  19235. +{
  19236. + struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
  19237. +
  19238. + free_percpu(jc->info);
  19239. + kfree(jc);
  19240. +}
  19241. +
  19242. +static int __perf_cgroup_move(void *info)
  19243. +{
  19244. + struct task_struct *task = info;
  19245. + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
  19246. + return 0;
  19247. +}
  19248. +
  19249. +static void perf_cgroup_attach(struct cgroup_subsys_state *css,
  19250. + struct cgroup_taskset *tset)
  19251. +{
  19252. + struct task_struct *task;
  19253. +
  19254. + cgroup_taskset_for_each(task, tset)
  19255. + task_function_call(task, __perf_cgroup_move, task);
  19256. +}
  19257. +
  19258. +static void perf_cgroup_exit(struct cgroup_subsys_state *css,
  19259. + struct cgroup_subsys_state *old_css,
  19260. + struct task_struct *task)
  19261. +{
  19262. + /*
  19263. + * cgroup_exit() is called in the copy_process() failure path.
  19264. + * Ignore this case since the task hasn't run yet; this avoids
  19265. + * trying to poke at half-freed task state from generic code.
  19266. + */
  19267. + if (!(task->flags & PF_EXITING))
  19268. + return;
  19269. +
  19270. + task_function_call(task, __perf_cgroup_move, task);
  19271. +}
  19272. +
  19273. +struct cgroup_subsys perf_event_cgrp_subsys = {
  19274. + .css_alloc = perf_cgroup_css_alloc,
  19275. + .css_free = perf_cgroup_css_free,
  19276. + .exit = perf_cgroup_exit,
  19277. + .attach = perf_cgroup_attach,
  19278. +};
  19279. +#endif /* CONFIG_CGROUP_PERF */
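A user-space sketch (assumed, not part of the patch) of the cgroup mode guarded by CONFIG_CGROUP_PERF: the pid argument carries a file descriptor for the cgroup directory and cpu must name a real CPU, matching the PERF_FLAG_PID_CGROUP check in sys_perf_event_open() above.

    #include <linux/perf_event.h>
    #include <fcntl.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int open_cgroup_counter(const char *cgrp_dir, int cpu)
    {
            struct perf_event_attr attr;
            int cgrp_fd = open(cgrp_dir, O_RDONLY);   /* perf_event cgroup dir */

            if (cgrp_fd < 0)
                    return -1;

            memset(&attr, 0, sizeof(attr));
            attr.type = PERF_TYPE_HARDWARE;
            attr.size = sizeof(attr);
            attr.config = PERF_COUNT_HW_CPU_CYCLES;

            /* pid == cgroup fd, cpu >= 0, flags carry PERF_FLAG_PID_CGROUP */
            return syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu, -1,
                           PERF_FLAG_PID_CGROUP);
    }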
  19280. diff -Nur linux-3.18.12.orig/kernel/exit.c linux-3.18.12/kernel/exit.c
  19281. --- linux-3.18.12.orig/kernel/exit.c 2015-04-20 14:48:02.000000000 -0500
  19282. +++ linux-3.18.12/kernel/exit.c 2015-04-26 13:32:22.431684003 -0500
  19283. @@ -147,7 +147,7 @@
  19284. * Do this under ->siglock, we can race with another thread
  19285. * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
  19286. */
  19287. - flush_sigqueue(&tsk->pending);
  19288. + flush_task_sigqueue(tsk);
  19289. tsk->sighand = NULL;
  19290. spin_unlock(&sighand->siglock);
  19291. diff -Nur linux-3.18.12.orig/kernel/fork.c linux-3.18.12/kernel/fork.c
  19292. --- linux-3.18.12.orig/kernel/fork.c 2015-04-20 14:48:02.000000000 -0500
  19293. +++ linux-3.18.12/kernel/fork.c 2015-04-26 13:32:22.435684003 -0500
  19294. @@ -97,7 +97,7 @@
  19295. DEFINE_PER_CPU(unsigned long, process_counts) = 0;
  19296. -__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
  19297. +DEFINE_RWLOCK(tasklist_lock); /* outer */
  19298. #ifdef CONFIG_PROVE_RCU
  19299. int lockdep_tasklist_lock_is_held(void)
  19300. @@ -233,7 +233,9 @@
  19301. if (atomic_dec_and_test(&sig->sigcnt))
  19302. free_signal_struct(sig);
  19303. }
  19304. -
  19305. +#ifdef CONFIG_PREEMPT_RT_BASE
  19306. +static
  19307. +#endif
  19308. void __put_task_struct(struct task_struct *tsk)
  19309. {
  19310. WARN_ON(!tsk->exit_state);
  19311. @@ -249,7 +251,18 @@
  19312. if (!profile_handoff_task(tsk))
  19313. free_task(tsk);
  19314. }
  19315. +#ifndef CONFIG_PREEMPT_RT_BASE
  19316. EXPORT_SYMBOL_GPL(__put_task_struct);
  19317. +#else
  19318. +void __put_task_struct_cb(struct rcu_head *rhp)
  19319. +{
  19320. + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
  19321. +
  19322. + __put_task_struct(tsk);
  19323. +
  19324. +}
  19325. +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
  19326. +#endif
  19327. void __init __weak arch_task_cache_init(void) { }
  19328. @@ -643,6 +656,19 @@
  19329. }
  19330. EXPORT_SYMBOL_GPL(__mmdrop);
  19331. +#ifdef CONFIG_PREEMPT_RT_BASE
  19332. +/*
  19333. + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
  19334. + * want another facility to make this work.
  19335. + */
  19336. +void __mmdrop_delayed(struct rcu_head *rhp)
  19337. +{
  19338. + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
  19339. +
  19340. + __mmdrop(mm);
  19341. +}
  19342. +#endif
  19343. +
  19344. /*
  19345. * Decrement the use count and release all resources for an mm.
  19346. */
  19347. @@ -1157,6 +1183,9 @@
  19348. */
  19349. static void posix_cpu_timers_init(struct task_struct *tsk)
  19350. {
  19351. +#ifdef CONFIG_PREEMPT_RT_BASE
  19352. + tsk->posix_timer_list = NULL;
  19353. +#endif
  19354. tsk->cputime_expires.prof_exp = 0;
  19355. tsk->cputime_expires.virt_exp = 0;
  19356. tsk->cputime_expires.sched_exp = 0;
  19357. @@ -1284,6 +1313,7 @@
  19358. spin_lock_init(&p->alloc_lock);
  19359. init_sigpending(&p->pending);
  19360. + p->sigqueue_cache = NULL;
  19361. p->utime = p->stime = p->gtime = 0;
  19362. p->utimescaled = p->stimescaled = 0;
  19363. @@ -1291,7 +1321,8 @@
  19364. p->prev_cputime.utime = p->prev_cputime.stime = 0;
  19365. #endif
  19366. #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
  19367. - seqlock_init(&p->vtime_seqlock);
  19368. + raw_spin_lock_init(&p->vtime_lock);
  19369. + seqcount_init(&p->vtime_seq);
  19370. p->vtime_snap = 0;
  19371. p->vtime_snap_whence = VTIME_SLEEPING;
  19372. #endif
  19373. @@ -1342,6 +1373,9 @@
  19374. p->hardirq_context = 0;
  19375. p->softirq_context = 0;
  19376. #endif
  19377. +#ifdef CONFIG_PREEMPT_RT_FULL
  19378. + p->pagefault_disabled = 0;
  19379. +#endif
  19380. #ifdef CONFIG_LOCKDEP
  19381. p->lockdep_depth = 0; /* no locks held yet */
  19382. p->curr_chain_key = 0;
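The matching header change elsewhere in this patch presumably routes put_task_struct() through RCU on PREEMPT_RT_BASE so the final free never runs from a context that cannot sleep; roughly (a hedged reconstruction, not quoted from the patch):

    if (atomic_dec_and_test(&tsk->usage))
            call_rcu(&tsk->put_rcu, __put_task_struct_cb);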
  19383. diff -Nur linux-3.18.12.orig/kernel/futex.c linux-3.18.12/kernel/futex.c
  19384. --- linux-3.18.12.orig/kernel/futex.c 2015-04-20 14:48:02.000000000 -0500
  19385. +++ linux-3.18.12/kernel/futex.c 2015-04-26 13:32:22.435684003 -0500
  19386. @@ -738,7 +738,9 @@
  19387. * task still owns the PI-state:
  19388. */
  19389. if (head->next != next) {
  19390. + raw_spin_unlock_irq(&curr->pi_lock);
  19391. spin_unlock(&hb->lock);
  19392. + raw_spin_lock_irq(&curr->pi_lock);
  19393. continue;
  19394. }
  19395. @@ -1705,6 +1707,16 @@
  19396. requeue_pi_wake_futex(this, &key2, hb2);
  19397. drop_count++;
  19398. continue;
  19399. + } else if (ret == -EAGAIN) {
  19400. + /*
  19401. + * Waiter was woken by timeout or
  19402. + * signal and has set pi_blocked_on to
  19403. + * PI_WAKEUP_INPROGRESS before we
  19404. + * tried to enqueue it on the rtmutex.
  19405. + */
  19406. + this->pi_state = NULL;
  19407. + free_pi_state(pi_state);
  19408. + continue;
  19409. } else if (ret) {
  19410. /* -EDEADLK */
  19411. this->pi_state = NULL;
  19412. @@ -2549,7 +2561,7 @@
  19413. struct hrtimer_sleeper timeout, *to = NULL;
  19414. struct rt_mutex_waiter rt_waiter;
  19415. struct rt_mutex *pi_mutex = NULL;
  19416. - struct futex_hash_bucket *hb;
  19417. + struct futex_hash_bucket *hb, *hb2;
  19418. union futex_key key2 = FUTEX_KEY_INIT;
  19419. struct futex_q q = futex_q_init;
  19420. int res, ret;
  19421. @@ -2574,10 +2586,7 @@
  19422. * The waiter is allocated on our stack, manipulated by the requeue
  19423. * code while we sleep on uaddr.
  19424. */
  19425. - debug_rt_mutex_init_waiter(&rt_waiter);
  19426. - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
  19427. - RB_CLEAR_NODE(&rt_waiter.tree_entry);
  19428. - rt_waiter.task = NULL;
  19429. + rt_mutex_init_waiter(&rt_waiter, false);
  19430. ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
  19431. if (unlikely(ret != 0))
  19432. @@ -2608,20 +2617,55 @@
  19433. /* Queue the futex_q, drop the hb lock, wait for wakeup. */
  19434. futex_wait_queue_me(hb, &q, to);
  19435. - spin_lock(&hb->lock);
  19436. - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
  19437. - spin_unlock(&hb->lock);
  19438. - if (ret)
  19439. - goto out_put_keys;
  19440. + /*
  19441. + * On RT we must avoid races with requeue and trying to block
  19442. + * on two mutexes (hb->lock and uaddr2's rtmutex) by
  19443. + * serializing access to pi_blocked_on with pi_lock.
  19444. + */
  19445. + raw_spin_lock_irq(&current->pi_lock);
  19446. + if (current->pi_blocked_on) {
  19447. + /*
  19448. + * We have been requeued or are in the process of
  19449. + * being requeued.
  19450. + */
  19451. + raw_spin_unlock_irq(&current->pi_lock);
  19452. + } else {
  19453. + /*
  19454. + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
  19455. + * prevents a concurrent requeue from moving us to the
  19456. + * uaddr2 rtmutex. After that we can safely acquire
  19457. + * (and possibly block on) hb->lock.
  19458. + */
  19459. + current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
  19460. + raw_spin_unlock_irq(&current->pi_lock);
  19461. +
  19462. + spin_lock(&hb->lock);
  19463. +
  19464. + /*
  19465. + * Clean up pi_blocked_on. We might leak it otherwise
  19466. + * when we succeeded with the hb->lock in the fast
  19467. + * path.
  19468. + */
  19469. + raw_spin_lock_irq(&current->pi_lock);
  19470. + current->pi_blocked_on = NULL;
  19471. + raw_spin_unlock_irq(&current->pi_lock);
  19472. +
  19473. + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
  19474. + spin_unlock(&hb->lock);
  19475. + if (ret)
  19476. + goto out_put_keys;
  19477. + }
  19478. /*
  19479. - * In order for us to be here, we know our q.key == key2, and since
  19480. - * we took the hb->lock above, we also know that futex_requeue() has
  19481. - * completed and we no longer have to concern ourselves with a wakeup
  19482. - * race with the atomic proxy lock acquisition by the requeue code. The
  19483. - * futex_requeue dropped our key1 reference and incremented our key2
  19484. - * reference count.
  19485. + * In order to be here, we have either been requeued, are in
  19486. + * the process of being requeued, or requeue successfully
  19487. + * acquired uaddr2 on our behalf. If pi_blocked_on was
  19488. + * non-null above, we may be racing with a requeue. Do not
  19489. + * rely on q->lock_ptr to be hb2->lock until after blocking on
  19490. + * hb->lock or hb2->lock. The futex_requeue dropped our key1
  19491. + * reference and incremented our key2 reference count.
  19492. */
  19493. + hb2 = hash_futex(&key2);
  19494. /* Check if the requeue code acquired the second futex for us. */
  19495. if (!q.rt_waiter) {
  19496. @@ -2630,9 +2674,10 @@
  19497. * did a lock-steal - fix up the PI-state in that case.
  19498. */
  19499. if (q.pi_state && (q.pi_state->owner != current)) {
  19500. - spin_lock(q.lock_ptr);
  19501. + spin_lock(&hb2->lock);
  19502. + BUG_ON(&hb2->lock != q.lock_ptr);
  19503. ret = fixup_pi_state_owner(uaddr2, &q, current);
  19504. - spin_unlock(q.lock_ptr);
  19505. + spin_unlock(&hb2->lock);
  19506. }
  19507. } else {
  19508. /*
  19509. @@ -2645,7 +2690,8 @@
  19510. ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
  19511. debug_rt_mutex_free_waiter(&rt_waiter);
  19512. - spin_lock(q.lock_ptr);
  19513. + spin_lock(&hb2->lock);
  19514. + BUG_ON(&hb2->lock != q.lock_ptr);
  19515. /*
  19516. * Fixup the pi_state owner and possibly acquire the lock if we
  19517. * haven't already.
  19518. diff -Nur linux-3.18.12.orig/kernel/irq/handle.c linux-3.18.12/kernel/irq/handle.c
  19519. --- linux-3.18.12.orig/kernel/irq/handle.c 2015-04-20 14:48:02.000000000 -0500
  19520. +++ linux-3.18.12/kernel/irq/handle.c 2015-04-26 13:32:22.435684003 -0500
  19521. @@ -133,6 +133,8 @@
  19522. irqreturn_t
  19523. handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
  19524. {
  19525. + struct pt_regs *regs = get_irq_regs();
  19526. + u64 ip = regs ? instruction_pointer(regs) : 0;
  19527. irqreturn_t retval = IRQ_NONE;
  19528. unsigned int flags = 0, irq = desc->irq_data.irq;
  19529. @@ -173,7 +175,11 @@
  19530. action = action->next;
  19531. } while (action);
  19532. - add_interrupt_randomness(irq, flags);
  19533. +#ifndef CONFIG_PREEMPT_RT_FULL
  19534. + add_interrupt_randomness(irq, flags, ip);
  19535. +#else
  19536. + desc->random_ip = ip;
  19537. +#endif
  19538. if (!noirqdebug)
  19539. note_interrupt(irq, desc, retval);
  19540. diff -Nur linux-3.18.12.orig/kernel/irq/manage.c linux-3.18.12/kernel/irq/manage.c
  19541. --- linux-3.18.12.orig/kernel/irq/manage.c 2015-04-20 14:48:02.000000000 -0500
  19542. +++ linux-3.18.12/kernel/irq/manage.c 2015-04-26 13:32:22.435684003 -0500
  19543. @@ -22,6 +22,7 @@
  19544. #include "internals.h"
  19545. #ifdef CONFIG_IRQ_FORCED_THREADING
  19546. +# ifndef CONFIG_PREEMPT_RT_BASE
  19547. __read_mostly bool force_irqthreads;
  19548. static int __init setup_forced_irqthreads(char *arg)
  19549. @@ -30,6 +31,7 @@
  19550. return 0;
  19551. }
  19552. early_param("threadirqs", setup_forced_irqthreads);
  19553. +# endif
  19554. #endif
  19555. static void __synchronize_hardirq(struct irq_desc *desc)
  19556. @@ -173,6 +175,62 @@
  19557. irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
  19558. #endif
  19559. +#ifdef CONFIG_PREEMPT_RT_FULL
  19560. +static void _irq_affinity_notify(struct irq_affinity_notify *notify);
  19561. +static struct task_struct *set_affinity_helper;
  19562. +static LIST_HEAD(affinity_list);
  19563. +static DEFINE_RAW_SPINLOCK(affinity_list_lock);
  19564. +
  19565. +static int set_affinity_thread(void *unused)
  19566. +{
  19567. + while (1) {
  19568. + struct irq_affinity_notify *notify;
  19569. + int empty;
  19570. +
  19571. + set_current_state(TASK_INTERRUPTIBLE);
  19572. +
  19573. + raw_spin_lock_irq(&affinity_list_lock);
  19574. + empty = list_empty(&affinity_list);
  19575. + raw_spin_unlock_irq(&affinity_list_lock);
  19576. +
  19577. + if (empty)
  19578. + schedule();
  19579. + if (kthread_should_stop())
  19580. + break;
  19581. + set_current_state(TASK_RUNNING);
  19582. +try_next:
  19583. + notify = NULL;
  19584. +
  19585. + raw_spin_lock_irq(&affinity_list_lock);
  19586. + if (!list_empty(&affinity_list)) {
  19587. + notify = list_first_entry(&affinity_list,
  19588. + struct irq_affinity_notify, list);
  19589. + list_del_init(&notify->list);
  19590. + }
  19591. + raw_spin_unlock_irq(&affinity_list_lock);
  19592. +
  19593. + if (!notify)
  19594. + continue;
  19595. + _irq_affinity_notify(notify);
  19596. + goto try_next;
  19597. + }
  19598. + return 0;
  19599. +}
  19600. +
  19601. +static void init_helper_thread(void)
  19602. +{
  19603. + if (set_affinity_helper)
  19604. + return;
  19605. + set_affinity_helper = kthread_run(set_affinity_thread, NULL,
  19606. + "affinity-cb");
  19607. + WARN_ON(IS_ERR(set_affinity_helper));
  19608. +}
  19609. +#else
  19610. +
  19611. +static inline void init_helper_thread(void) { }
  19612. +
  19613. +#endif
  19614. +
  19615. int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
  19616. bool force)
  19617. {
  19618. @@ -211,7 +269,17 @@
  19619. if (desc->affinity_notify) {
  19620. kref_get(&desc->affinity_notify->kref);
  19621. +
  19622. +#ifdef CONFIG_PREEMPT_RT_FULL
  19623. + raw_spin_lock(&affinity_list_lock);
  19624. + if (list_empty(&desc->affinity_notify->list))
  19625. + list_add_tail(&affinity_list,
  19626. + &desc->affinity_notify->list);
  19627. + raw_spin_unlock(&affinity_list_lock);
  19628. + wake_up_process(set_affinity_helper);
  19629. +#else
  19630. schedule_work(&desc->affinity_notify->work);
  19631. +#endif
  19632. }
  19633. irqd_set(data, IRQD_AFFINITY_SET);
  19634. @@ -246,10 +314,8 @@
  19635. }
  19636. EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
  19637. -static void irq_affinity_notify(struct work_struct *work)
  19638. +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
  19639. {
  19640. - struct irq_affinity_notify *notify =
  19641. - container_of(work, struct irq_affinity_notify, work);
  19642. struct irq_desc *desc = irq_to_desc(notify->irq);
  19643. cpumask_var_t cpumask;
  19644. unsigned long flags;
  19645. @@ -271,6 +337,13 @@
  19646. kref_put(&notify->kref, notify->release);
  19647. }
  19648. +static void irq_affinity_notify(struct work_struct *work)
  19649. +{
  19650. + struct irq_affinity_notify *notify =
  19651. + container_of(work, struct irq_affinity_notify, work);
  19652. + _irq_affinity_notify(notify);
  19653. +}
  19654. +
  19655. /**
  19656. * irq_set_affinity_notifier - control notification of IRQ affinity changes
  19657. * @irq: Interrupt for which to enable/disable notification
  19658. @@ -300,6 +373,8 @@
  19659. notify->irq = irq;
  19660. kref_init(&notify->kref);
  19661. INIT_WORK(&notify->work, irq_affinity_notify);
  19662. + INIT_LIST_HEAD(&notify->list);
  19663. + init_helper_thread();
  19664. }
  19665. raw_spin_lock_irqsave(&desc->lock, flags);
  19666. @@ -788,7 +863,15 @@
  19667. local_bh_disable();
  19668. ret = action->thread_fn(action->irq, action->dev_id);
  19669. irq_finalize_oneshot(desc, action);
  19670. - local_bh_enable();
  19671. + /*
  19672. + * Interrupts which have real time requirements can be set up
  19673. + * to avoid softirq processing in the thread handler. This is
  19674. + * safe as these interrupts do not raise soft interrupts.
  19675. + */
  19676. + if (irq_settings_no_softirq_call(desc))
  19677. + _local_bh_enable();
  19678. + else
  19679. + local_bh_enable();
  19680. return ret;
  19681. }
  19682. @@ -871,6 +954,12 @@
  19683. if (action_ret == IRQ_HANDLED)
  19684. atomic_inc(&desc->threads_handled);
  19685. +#ifdef CONFIG_PREEMPT_RT_FULL
  19686. + migrate_disable();
  19687. + add_interrupt_randomness(action->irq, 0,
  19688. + desc->random_ip ^ (unsigned long) action);
  19689. + migrate_enable();
  19690. +#endif
  19691. wake_threads_waitq(desc);
  19692. }
  19693. @@ -1184,6 +1273,9 @@
  19694. irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
  19695. }
  19696. + if (new->flags & IRQF_NO_SOFTIRQ_CALL)
  19697. + irq_settings_set_no_softirq_call(desc);
  19698. +
  19699. /* Set default affinity mask once everything is setup */
  19700. setup_affinity(irq, desc, mask);
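A hedged driver-side sketch (device name and handlers are placeholders) of the new flag wired up above: a force-threaded handler that must not kick softirq processing on return requests its line with IRQF_NO_SOFTIRQ_CALL, so the thread exit path uses _local_bh_enable() instead of local_bh_enable().

    #include <linux/interrupt.h>

    static irqreturn_t my_thread_fn(int irq, void *dev_id)
    {
            /* latency-critical work; must not raise or run softirqs */
            return IRQ_HANDLED;
    }

    static int my_request_irq(unsigned int irq, void *dev_id)
    {
            return request_threaded_irq(irq, NULL, my_thread_fn,
                                        IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
                                        "my-rt-device", dev_id);
    }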
  19701. diff -Nur linux-3.18.12.orig/kernel/irq/settings.h linux-3.18.12/kernel/irq/settings.h
  19702. --- linux-3.18.12.orig/kernel/irq/settings.h 2015-04-20 14:48:02.000000000 -0500
  19703. +++ linux-3.18.12/kernel/irq/settings.h 2015-04-26 13:32:22.435684003 -0500
  19704. @@ -15,6 +15,7 @@
  19705. _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
  19706. _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
  19707. _IRQ_IS_POLLED = IRQ_IS_POLLED,
  19708. + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL,
  19709. _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
  19710. };
  19711. @@ -28,6 +29,7 @@
  19712. #define IRQ_NESTED_THREAD GOT_YOU_MORON
  19713. #define IRQ_PER_CPU_DEVID GOT_YOU_MORON
  19714. #define IRQ_IS_POLLED GOT_YOU_MORON
  19715. +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON
  19716. #undef IRQF_MODIFY_MASK
  19717. #define IRQF_MODIFY_MASK GOT_YOU_MORON
  19718. @@ -38,6 +40,16 @@
  19719. desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
  19720. }
  19721. +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
  19722. +{
  19723. + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
  19724. +}
  19725. +
  19726. +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
  19727. +{
  19728. + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
  19729. +}
  19730. +
  19731. static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
  19732. {
  19733. return desc->status_use_accessors & _IRQ_PER_CPU;
  19734. diff -Nur linux-3.18.12.orig/kernel/irq/spurious.c linux-3.18.12/kernel/irq/spurious.c
  19735. --- linux-3.18.12.orig/kernel/irq/spurious.c 2015-04-20 14:48:02.000000000 -0500
  19736. +++ linux-3.18.12/kernel/irq/spurious.c 2015-04-26 13:32:22.435684003 -0500
  19737. @@ -444,6 +444,10 @@
  19738. static int __init irqfixup_setup(char *str)
  19739. {
  19740. +#ifdef CONFIG_PREEMPT_RT_BASE
  19741. + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
  19742. + return 1;
  19743. +#endif
  19744. irqfixup = 1;
  19745. printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
  19746. printk(KERN_WARNING "This may impact system performance.\n");
  19747. @@ -456,6 +460,10 @@
  19748. static int __init irqpoll_setup(char *str)
  19749. {
  19750. +#ifdef CONFIG_PREEMPT_RT_BASE
  19751. + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
  19752. + return 1;
  19753. +#endif
  19754. irqfixup = 2;
  19755. printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
  19756. "enabled\n");
  19757. diff -Nur linux-3.18.12.orig/kernel/irq_work.c linux-3.18.12/kernel/irq_work.c
  19758. --- linux-3.18.12.orig/kernel/irq_work.c 2015-04-20 14:48:02.000000000 -0500
  19759. +++ linux-3.18.12/kernel/irq_work.c 2015-04-26 13:32:22.435684003 -0500
  19760. @@ -17,12 +17,15 @@
  19761. #include <linux/cpu.h>
  19762. #include <linux/notifier.h>
  19763. #include <linux/smp.h>
  19764. +#include <linux/interrupt.h>
  19765. #include <asm/processor.h>
  19766. static DEFINE_PER_CPU(struct llist_head, raised_list);
  19767. static DEFINE_PER_CPU(struct llist_head, lazy_list);
  19768. -
  19769. +#ifdef CONFIG_PREEMPT_RT_FULL
  19770. +static DEFINE_PER_CPU(struct llist_head, hirq_work_list);
  19771. +#endif
  19772. /*
  19773. * Claim the entry so that no one else will poke at it.
  19774. */
  19775. @@ -65,6 +68,8 @@
  19776. */
  19777. bool irq_work_queue_on(struct irq_work *work, int cpu)
  19778. {
  19779. + bool raise_irqwork;
  19780. +
  19781. /* All work should have been flushed before going offline */
  19782. WARN_ON_ONCE(cpu_is_offline(cpu));
  19783. @@ -75,7 +80,19 @@
  19784. if (!irq_work_claim(work))
  19785. return false;
  19786. - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
  19787. +#ifdef CONFIG_PREEMPT_RT_FULL
  19788. + if (work->flags & IRQ_WORK_HARD_IRQ)
  19789. + raise_irqwork = llist_add(&work->llnode,
  19790. + &per_cpu(hirq_work_list, cpu));
  19791. + else
  19792. + raise_irqwork = llist_add(&work->llnode,
  19793. + &per_cpu(lazy_list, cpu));
  19794. +#else
  19795. + raise_irqwork = llist_add(&work->llnode,
  19796. + &per_cpu(raised_list, cpu));
  19797. +#endif
  19798. +
  19799. + if (raise_irqwork)
  19800. arch_send_call_function_single_ipi(cpu);
  19801. return true;
  19802. @@ -93,7 +110,16 @@
  19803. /* Queue the entry and raise the IPI if needed. */
  19804. preempt_disable();
  19805. - /* If the work is "lazy", handle it from next tick if any */
  19806. +#ifdef CONFIG_PREEMPT_RT_FULL
  19807. + if (work->flags & IRQ_WORK_HARD_IRQ) {
  19808. + if (llist_add(&work->llnode, this_cpu_ptr(&hirq_work_list)))
  19809. + arch_irq_work_raise();
  19810. + } else {
  19811. + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
  19812. + tick_nohz_tick_stopped())
  19813. + raise_softirq(TIMER_SOFTIRQ);
  19814. + }
  19815. +#else
  19816. if (work->flags & IRQ_WORK_LAZY) {
  19817. if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
  19818. tick_nohz_tick_stopped())
  19819. @@ -102,6 +128,7 @@
  19820. if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
  19821. arch_irq_work_raise();
  19822. }
  19823. +#endif
  19824. preempt_enable();
  19825. @@ -116,9 +143,12 @@
  19826. raised = this_cpu_ptr(&raised_list);
  19827. lazy = this_cpu_ptr(&lazy_list);
  19828. - if (llist_empty(raised) || arch_irq_work_has_interrupt())
  19829. + if (llist_empty(raised))
  19830. if (llist_empty(lazy))
  19831. - return false;
  19832. +#ifdef CONFIG_PREEMPT_RT_FULL
  19833. + if (llist_empty(this_cpu_ptr(&hirq_work_list)))
  19834. +#endif
  19835. + return false;
  19836. /* All work should have been flushed before going offline */
  19837. WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
  19838. @@ -132,7 +162,9 @@
  19839. struct irq_work *work;
  19840. struct llist_node *llnode;
  19841. +#ifndef CONFIG_PREEMPT_RT_FULL
  19842. BUG_ON(!irqs_disabled());
  19843. +#endif
  19844. if (llist_empty(list))
  19845. return;
  19846. @@ -168,18 +200,26 @@
  19847. */
  19848. void irq_work_run(void)
  19849. {
  19850. +#ifdef CONFIG_PREEMPT_RT_FULL
  19851. + irq_work_run_list(this_cpu_ptr(&hirq_work_list));
  19852. +#else
  19853. irq_work_run_list(this_cpu_ptr(&raised_list));
  19854. irq_work_run_list(this_cpu_ptr(&lazy_list));
  19855. +#endif
  19856. }
  19857. EXPORT_SYMBOL_GPL(irq_work_run);
  19858. void irq_work_tick(void)
  19859. {
  19860. +#ifdef CONFIG_PREEMPT_RT_FULL
  19861. + irq_work_run_list(this_cpu_ptr(&lazy_list));
  19862. +#else
  19863. struct llist_head *raised = &__get_cpu_var(raised_list);
  19864. if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
  19865. irq_work_run_list(raised);
  19866. irq_work_run_list(&__get_cpu_var(lazy_list));
  19867. +#endif
  19868. }
  19869. /*
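With the split above, only items explicitly marked IRQ_WORK_HARD_IRQ land on hirq_work_list and are executed by irq_work_run() from the hard interrupt on PREEMPT_RT_FULL; everything else goes to lazy_list and runs later from irq_work_tick(). A sketch of how a user marks such an item (callback and item names are invented; the flag is the one used in the hunks above):

#include <linux/irq_work.h>

static void example_hard_cb(struct irq_work *work)
{
	/* runs from irq_work_run(), i.e. real hard interrupt context,
	 * even on an -rt kernel */
}

static struct irq_work example_work = {
	.flags = IRQ_WORK_HARD_IRQ,	/* without this, -rt defers to lazy_list */
	.func  = example_hard_cb,
};

static void example_kick(void)
{
	irq_work_queue(&example_work);	/* or irq_work_queue_on(&example_work, cpu) */
}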
  19870. diff -Nur linux-3.18.12.orig/kernel/Kconfig.locks linux-3.18.12/kernel/Kconfig.locks
  19871. --- linux-3.18.12.orig/kernel/Kconfig.locks 2015-04-20 14:48:02.000000000 -0500
  19872. +++ linux-3.18.12/kernel/Kconfig.locks 2015-04-26 13:32:22.431684003 -0500
  19873. @@ -225,11 +225,11 @@
  19874. config MUTEX_SPIN_ON_OWNER
  19875. def_bool y
  19876. - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
  19877. + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
  19878. config RWSEM_SPIN_ON_OWNER
  19879. def_bool y
  19880. - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
  19881. + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
  19882. config ARCH_USE_QUEUE_RWLOCK
  19883. bool
  19884. diff -Nur linux-3.18.12.orig/kernel/Kconfig.preempt linux-3.18.12/kernel/Kconfig.preempt
  19885. --- linux-3.18.12.orig/kernel/Kconfig.preempt 2015-04-20 14:48:02.000000000 -0500
  19886. +++ linux-3.18.12/kernel/Kconfig.preempt 2015-04-26 13:32:22.431684003 -0500
  19887. @@ -1,3 +1,16 @@
  19888. +config PREEMPT
  19889. + bool
  19890. + select PREEMPT_COUNT
  19891. +
  19892. +config PREEMPT_RT_BASE
  19893. + bool
  19894. + select PREEMPT
  19895. +
  19896. +config HAVE_PREEMPT_LAZY
  19897. + bool
  19898. +
  19899. +config PREEMPT_LAZY
  19900. + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
  19901. choice
  19902. prompt "Preemption Model"
  19903. @@ -33,9 +46,9 @@
  19904. Select this if you are building a kernel for a desktop system.
  19905. -config PREEMPT
  19906. +config PREEMPT__LL
  19907. bool "Preemptible Kernel (Low-Latency Desktop)"
  19908. - select PREEMPT_COUNT
  19909. + select PREEMPT
  19910. select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
  19911. help
  19912. This option reduces the latency of the kernel by making
  19913. @@ -52,6 +65,22 @@
  19914. embedded system with latency requirements in the milliseconds
  19915. range.
  19916. +config PREEMPT_RTB
  19917. + bool "Preemptible Kernel (Basic RT)"
  19918. + select PREEMPT_RT_BASE
  19919. + help
  19920. + This option is basically the same as (Low-Latency Desktop) but
  19921. + enables changes which are preliminary for the full preemptible
  19922. + RT kernel.
  19923. +
  19924. +config PREEMPT_RT_FULL
  19925. + bool "Fully Preemptible Kernel (RT)"
  19926. + depends on IRQ_FORCED_THREADING
  19927. + select PREEMPT_RT_BASE
  19928. + select PREEMPT_RCU
  19929. + help
  19930. + All and everything
  19931. +
  19932. endchoice
  19933. config PREEMPT_COUNT
  19934. diff -Nur linux-3.18.12.orig/kernel/ksysfs.c linux-3.18.12/kernel/ksysfs.c
  19935. --- linux-3.18.12.orig/kernel/ksysfs.c 2015-04-20 14:48:02.000000000 -0500
  19936. +++ linux-3.18.12/kernel/ksysfs.c 2015-04-26 13:32:22.435684003 -0500
  19937. @@ -136,6 +136,15 @@
  19938. #endif /* CONFIG_KEXEC */
  19939. +#if defined(CONFIG_PREEMPT_RT_FULL)
  19940. +static ssize_t realtime_show(struct kobject *kobj,
  19941. + struct kobj_attribute *attr, char *buf)
  19942. +{
  19943. + return sprintf(buf, "%d\n", 1);
  19944. +}
  19945. +KERNEL_ATTR_RO(realtime);
  19946. +#endif
  19947. +
  19948. /* whether file capabilities are enabled */
  19949. static ssize_t fscaps_show(struct kobject *kobj,
  19950. struct kobj_attribute *attr, char *buf)
  19951. @@ -203,6 +212,9 @@
  19952. &vmcoreinfo_attr.attr,
  19953. #endif
  19954. &rcu_expedited_attr.attr,
  19955. +#ifdef CONFIG_PREEMPT_RT_FULL
  19956. + &realtime_attr.attr,
  19957. +#endif
  19958. NULL
  19959. };
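The attribute gives userspace a cheap way to detect a PREEMPT_RT_FULL kernel: /sys/kernel/realtime only exists on such kernels and always reads as "1". A small userspace check (plain C, error handling kept minimal):

#include <stdio.h>

static int kernel_is_rt_full(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int rt = 0;

	if (!f)
		return 0;	/* attribute missing: not a PREEMPT_RT_FULL kernel */
	if (fscanf(f, "%d", &rt) != 1)
		rt = 0;
	fclose(f);
	return rt == 1;
}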
  19960. diff -Nur linux-3.18.12.orig/kernel/locking/lglock.c linux-3.18.12/kernel/locking/lglock.c
  19961. --- linux-3.18.12.orig/kernel/locking/lglock.c 2015-04-20 14:48:02.000000000 -0500
  19962. +++ linux-3.18.12/kernel/locking/lglock.c 2015-04-26 13:32:22.435684003 -0500
  19963. @@ -4,6 +4,15 @@
  19964. #include <linux/cpu.h>
  19965. #include <linux/string.h>
  19966. +#ifndef CONFIG_PREEMPT_RT_FULL
  19967. +# define lg_lock_ptr arch_spinlock_t
  19968. +# define lg_do_lock(l) arch_spin_lock(l)
  19969. +# define lg_do_unlock(l) arch_spin_unlock(l)
  19970. +#else
  19971. +# define lg_lock_ptr struct rt_mutex
  19972. +# define lg_do_lock(l) __rt_spin_lock(l)
  19973. +# define lg_do_unlock(l) __rt_spin_unlock(l)
  19974. +#endif
  19975. /*
  19976. * Note there is no uninit, so lglocks cannot be defined in
  19977. * modules (but it's fine to use them from there)
  19978. @@ -12,51 +21,60 @@
  19979. void lg_lock_init(struct lglock *lg, char *name)
  19980. {
  19981. +#ifdef CONFIG_PREEMPT_RT_FULL
  19982. + int i;
  19983. +
  19984. + for_each_possible_cpu(i) {
  19985. + struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
  19986. +
  19987. + rt_mutex_init(lock);
  19988. + }
  19989. +#endif
  19990. LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
  19991. }
  19992. EXPORT_SYMBOL(lg_lock_init);
  19993. void lg_local_lock(struct lglock *lg)
  19994. {
  19995. - arch_spinlock_t *lock;
  19996. + lg_lock_ptr *lock;
  19997. - preempt_disable();
  19998. + migrate_disable();
  19999. lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  20000. lock = this_cpu_ptr(lg->lock);
  20001. - arch_spin_lock(lock);
  20002. + lg_do_lock(lock);
  20003. }
  20004. EXPORT_SYMBOL(lg_local_lock);
  20005. void lg_local_unlock(struct lglock *lg)
  20006. {
  20007. - arch_spinlock_t *lock;
  20008. + lg_lock_ptr *lock;
  20009. lock_release(&lg->lock_dep_map, 1, _RET_IP_);
  20010. lock = this_cpu_ptr(lg->lock);
  20011. - arch_spin_unlock(lock);
  20012. - preempt_enable();
  20013. + lg_do_unlock(lock);
  20014. + migrate_enable();
  20015. }
  20016. EXPORT_SYMBOL(lg_local_unlock);
  20017. void lg_local_lock_cpu(struct lglock *lg, int cpu)
  20018. {
  20019. - arch_spinlock_t *lock;
  20020. + lg_lock_ptr *lock;
  20021. - preempt_disable();
  20022. + preempt_disable_nort();
  20023. lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  20024. lock = per_cpu_ptr(lg->lock, cpu);
  20025. - arch_spin_lock(lock);
  20026. + lg_do_lock(lock);
  20027. }
  20028. EXPORT_SYMBOL(lg_local_lock_cpu);
  20029. void lg_local_unlock_cpu(struct lglock *lg, int cpu)
  20030. {
  20031. - arch_spinlock_t *lock;
  20032. + lg_lock_ptr *lock;
  20033. lock_release(&lg->lock_dep_map, 1, _RET_IP_);
  20034. lock = per_cpu_ptr(lg->lock, cpu);
  20035. - arch_spin_unlock(lock);
  20036. - preempt_enable();
  20037. + lg_do_unlock(lock);
  20038. + preempt_enable_nort();
  20039. }
  20040. EXPORT_SYMBOL(lg_local_unlock_cpu);
  20041. @@ -64,12 +82,12 @@
  20042. {
  20043. int i;
  20044. - preempt_disable();
  20045. + preempt_disable_nort();
  20046. lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  20047. for_each_possible_cpu(i) {
  20048. - arch_spinlock_t *lock;
  20049. + lg_lock_ptr *lock;
  20050. lock = per_cpu_ptr(lg->lock, i);
  20051. - arch_spin_lock(lock);
  20052. + lg_do_lock(lock);
  20053. }
  20054. }
  20055. EXPORT_SYMBOL(lg_global_lock);
  20056. @@ -80,10 +98,35 @@
  20057. lock_release(&lg->lock_dep_map, 1, _RET_IP_);
  20058. for_each_possible_cpu(i) {
  20059. - arch_spinlock_t *lock;
  20060. + lg_lock_ptr *lock;
  20061. lock = per_cpu_ptr(lg->lock, i);
  20062. - arch_spin_unlock(lock);
  20063. + lg_do_unlock(lock);
  20064. }
  20065. - preempt_enable();
  20066. + preempt_enable_nort();
  20067. }
  20068. EXPORT_SYMBOL(lg_global_unlock);
  20069. +
  20070. +#ifdef CONFIG_PREEMPT_RT_FULL
  20071. +/*
  20072. + * HACK: If you use this, you get to keep the pieces.
  20073. + * Used in queue_stop_cpus_work() when stop machinery
  20074. + * is called from inactive CPU, so we can't schedule.
  20075. + */
  20076. +# define lg_do_trylock_relax(l) \
  20077. + do { \
  20078. + while (!__rt_spin_trylock(l)) \
  20079. + cpu_relax(); \
  20080. + } while (0)
  20081. +
  20082. +void lg_global_trylock_relax(struct lglock *lg)
  20083. +{
  20084. + int i;
  20085. +
  20086. + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
  20087. + for_each_possible_cpu(i) {
  20088. + lg_lock_ptr *lock;
  20089. + lock = per_cpu_ptr(lg->lock, i);
  20090. + lg_do_trylock_relax(lock);
  20091. + }
  20092. +}
  20093. +#endif
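Callers of the lglock API are unchanged by this conversion; only the backing type differs. On !RT the per-CPU locks stay arch_spinlock_t with preemption disabled; on RT they become per-CPU rt_mutexes and only migration is disabled, so lg-protected sections may sleep and be preempted. A usage sketch (the lglock name is invented; DEFINE_LGLOCK and the lg_* calls are the existing interface, and lg_lock_init() must still run once at init, which on RT also initializes the per-CPU rt_mutexes):

#include <linux/lglock.h>

static DEFINE_LGLOCK(example_lglock);
/* during subsystem init: lg_lock_init(&example_lglock, "example_lglock"); */

static void example_local_update(void)
{
	lg_local_lock(&example_lglock);		/* this CPU's lock only */
	/* update this CPU's share of the data */
	lg_local_unlock(&example_lglock);
}

static void example_global_walk(void)
{
	lg_global_lock(&example_lglock);	/* takes every CPU's lock */
	/* walk all per-CPU data consistently */
	lg_global_unlock(&example_lglock);
}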
  20094. diff -Nur linux-3.18.12.orig/kernel/locking/lockdep.c linux-3.18.12/kernel/locking/lockdep.c
  20095. --- linux-3.18.12.orig/kernel/locking/lockdep.c 2015-04-20 14:48:02.000000000 -0500
  20096. +++ linux-3.18.12/kernel/locking/lockdep.c 2015-04-26 13:32:22.435684003 -0500
  20097. @@ -3542,6 +3542,7 @@
  20098. }
  20099. }
  20100. +#ifndef CONFIG_PREEMPT_RT_FULL
  20101. /*
  20102. * We dont accurately track softirq state in e.g.
  20103. * hardirq contexts (such as on 4KSTACKS), so only
  20104. @@ -3556,6 +3557,7 @@
  20105. DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
  20106. }
  20107. }
  20108. +#endif
  20109. if (!debug_locks)
  20110. print_irqtrace_events(current);
  20111. diff -Nur linux-3.18.12.orig/kernel/locking/Makefile linux-3.18.12/kernel/locking/Makefile
  20112. --- linux-3.18.12.orig/kernel/locking/Makefile 2015-04-20 14:48:02.000000000 -0500
  20113. +++ linux-3.18.12/kernel/locking/Makefile 2015-04-26 13:32:22.435684003 -0500
  20114. @@ -1,5 +1,5 @@
  20115. -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o
  20116. +obj-y += semaphore.o mcs_spinlock.o
  20117. ifdef CONFIG_FUNCTION_TRACER
  20118. CFLAGS_REMOVE_lockdep.o = -pg
  20119. @@ -8,7 +8,11 @@
  20120. CFLAGS_REMOVE_rtmutex-debug.o = -pg
  20121. endif
  20122. +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
  20123. +obj-y += mutex.o
  20124. obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
  20125. +obj-y += rwsem.o
  20126. +endif
  20127. obj-$(CONFIG_LOCKDEP) += lockdep.o
  20128. ifeq ($(CONFIG_PROC_FS),y)
  20129. obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
  20130. @@ -21,8 +25,11 @@
  20131. obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
  20132. obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
  20133. obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
  20134. +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
  20135. obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
  20136. obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
  20137. +endif
  20138. obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
  20139. +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
  20140. obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
  20141. obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
  20142. diff -Nur linux-3.18.12.orig/kernel/locking/percpu-rwsem.c linux-3.18.12/kernel/locking/percpu-rwsem.c
  20143. --- linux-3.18.12.orig/kernel/locking/percpu-rwsem.c 2015-04-20 14:48:02.000000000 -0500
  20144. +++ linux-3.18.12/kernel/locking/percpu-rwsem.c 2015-04-26 13:32:22.435684003 -0500
  20145. @@ -84,8 +84,12 @@
  20146. down_read(&brw->rw_sem);
  20147. atomic_inc(&brw->slow_read_ctr);
  20148. +#ifdef CONFIG_PREEMPT_RT_FULL
  20149. + up_read(&brw->rw_sem);
  20150. +#else
  20151. /* avoid up_read()->rwsem_release() */
  20152. __up_read(&brw->rw_sem);
  20153. +#endif
  20154. }
  20155. void percpu_up_read(struct percpu_rw_semaphore *brw)
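From a user's point of view nothing changes here; only the writer-present slow path of the reader differs on RT, where the inner rw_sem is dropped with a regular up_read() because the rtmutex-based rwsem has no lockdep-bypassing __up_read(). The usual pairing stays as is; a tiny sketch (semaphore name invented, percpu_init_rwsem() assumed to have run at init):

#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_brw;

static void example_reader(void)
{
	percpu_down_read(&example_brw);
	/* read-side section; may hit the rw_sem slow path patched above
	 * when a writer is pending */
	percpu_up_read(&example_brw);
}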
  20156. diff -Nur linux-3.18.12.orig/kernel/locking/rt.c linux-3.18.12/kernel/locking/rt.c
  20157. --- linux-3.18.12.orig/kernel/locking/rt.c 1969-12-31 18:00:00.000000000 -0600
  20158. +++ linux-3.18.12/kernel/locking/rt.c 2015-04-26 13:32:22.435684003 -0500
  20159. @@ -0,0 +1,456 @@
  20160. +/*
  20161. + * kernel/rt.c
  20162. + *
  20163. + * Real-Time Preemption Support
  20164. + *
  20165. + * started by Ingo Molnar:
  20166. + *
  20167. + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  20168. + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  20169. + *
  20170. + * historic credit for proving that Linux spinlocks can be implemented via
  20171. + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
  20172. + * and others) who prototyped it on 2.4 and did lots of comparative
  20173. + * research and analysis; TimeSys, for proving that you can implement a
  20174. + * fully preemptible kernel via the use of IRQ threading and mutexes;
  20175. + * Bill Huey for persuasively arguing on lkml that the mutex model is the
  20176. + * right one; and to MontaVista, who ported pmutexes to 2.6.
  20177. + *
  20178. + * This code is a from-scratch implementation and is not based on pmutexes,
  20179. + * but the idea of converting spinlocks to mutexes is used here too.
  20180. + *
  20181. + * lock debugging, locking tree, deadlock detection:
  20182. + *
  20183. + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
  20184. + * Released under the General Public License (GPL).
  20185. + *
  20186. + * Includes portions of the generic R/W semaphore implementation from:
  20187. + *
  20188. + * Copyright (c) 2001 David Howells (dhowells@redhat.com).
  20189. + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
  20190. + * - Derived also from comments by Linus
  20191. + *
  20192. + * Pending ownership of locks and ownership stealing:
  20193. + *
  20194. + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
  20195. + *
  20196. + * (also by Steven Rostedt)
  20197. + * - Converted single pi_lock to individual task locks.
  20198. + *
  20199. + * By Esben Nielsen:
  20200. + * Doing priority inheritance with help of the scheduler.
  20201. + *
  20202. + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  20203. + * - major rework based on Esben Nielsens initial patch
  20204. + * - replaced thread_info references by task_struct refs
  20205. + * - removed task->pending_owner dependency
  20206. + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks
  20207. + * in the scheduler return path as discussed with Steven Rostedt
  20208. + *
  20209. + * Copyright (C) 2006, Kihon Technologies Inc.
  20210. + * Steven Rostedt <rostedt@goodmis.org>
  20211. + * - debugged and patched Thomas Gleixner's rework.
  20212. + * - added back the cmpxchg to the rework.
  20213. + * - turned atomic require back on for SMP.
  20214. + */
  20215. +
  20216. +#include <linux/spinlock.h>
  20217. +#include <linux/rtmutex.h>
  20218. +#include <linux/sched.h>
  20219. +#include <linux/delay.h>
  20220. +#include <linux/module.h>
  20221. +#include <linux/kallsyms.h>
  20222. +#include <linux/syscalls.h>
  20223. +#include <linux/interrupt.h>
  20224. +#include <linux/plist.h>
  20225. +#include <linux/fs.h>
  20226. +#include <linux/futex.h>
  20227. +#include <linux/hrtimer.h>
  20228. +
  20229. +#include "rtmutex_common.h"
  20230. +
  20231. +/*
  20232. + * struct mutex functions
  20233. + */
  20234. +void __mutex_do_init(struct mutex *mutex, const char *name,
  20235. + struct lock_class_key *key)
  20236. +{
  20237. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20238. + /*
  20239. + * Make sure we are not reinitializing a held lock:
  20240. + */
  20241. + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
  20242. + lockdep_init_map(&mutex->dep_map, name, key, 0);
  20243. +#endif
  20244. + mutex->lock.save_state = 0;
  20245. +}
  20246. +EXPORT_SYMBOL(__mutex_do_init);
  20247. +
  20248. +void __lockfunc _mutex_lock(struct mutex *lock)
  20249. +{
  20250. + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20251. + rt_mutex_lock(&lock->lock);
  20252. +}
  20253. +EXPORT_SYMBOL(_mutex_lock);
  20254. +
  20255. +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
  20256. +{
  20257. + int ret;
  20258. +
  20259. + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20260. + ret = rt_mutex_lock_interruptible(&lock->lock);
  20261. + if (ret)
  20262. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20263. + return ret;
  20264. +}
  20265. +EXPORT_SYMBOL(_mutex_lock_interruptible);
  20266. +
  20267. +int __lockfunc _mutex_lock_killable(struct mutex *lock)
  20268. +{
  20269. + int ret;
  20270. +
  20271. + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20272. + ret = rt_mutex_lock_killable(&lock->lock);
  20273. + if (ret)
  20274. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20275. + return ret;
  20276. +}
  20277. +EXPORT_SYMBOL(_mutex_lock_killable);
  20278. +
  20279. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20280. +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
  20281. +{
  20282. + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
  20283. + rt_mutex_lock(&lock->lock);
  20284. +}
  20285. +EXPORT_SYMBOL(_mutex_lock_nested);
  20286. +
  20287. +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
  20288. +{
  20289. + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
  20290. + rt_mutex_lock(&lock->lock);
  20291. +}
  20292. +EXPORT_SYMBOL(_mutex_lock_nest_lock);
  20293. +
  20294. +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
  20295. +{
  20296. + int ret;
  20297. +
  20298. + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
  20299. + ret = rt_mutex_lock_interruptible(&lock->lock);
  20300. + if (ret)
  20301. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20302. + return ret;
  20303. +}
  20304. +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
  20305. +
  20306. +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
  20307. +{
  20308. + int ret;
  20309. +
  20310. + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
  20311. + ret = rt_mutex_lock_killable(&lock->lock);
  20312. + if (ret)
  20313. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20314. + return ret;
  20315. +}
  20316. +EXPORT_SYMBOL(_mutex_lock_killable_nested);
  20317. +#endif
  20318. +
  20319. +int __lockfunc _mutex_trylock(struct mutex *lock)
  20320. +{
  20321. + int ret = rt_mutex_trylock(&lock->lock);
  20322. +
  20323. + if (ret)
  20324. + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  20325. +
  20326. + return ret;
  20327. +}
  20328. +EXPORT_SYMBOL(_mutex_trylock);
  20329. +
  20330. +void __lockfunc _mutex_unlock(struct mutex *lock)
  20331. +{
  20332. + mutex_release(&lock->dep_map, 1, _RET_IP_);
  20333. + rt_mutex_unlock(&lock->lock);
  20334. +}
  20335. +EXPORT_SYMBOL(_mutex_unlock);
  20336. +
  20337. +/*
  20338. + * rwlock_t functions
  20339. + */
  20340. +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
  20341. +{
  20342. + int ret;
  20343. +
  20344. + migrate_disable();
  20345. + ret = rt_mutex_trylock(&rwlock->lock);
  20346. + if (ret)
  20347. + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
  20348. + else
  20349. + migrate_enable();
  20350. +
  20351. + return ret;
  20352. +}
  20353. +EXPORT_SYMBOL(rt_write_trylock);
  20354. +
  20355. +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
  20356. +{
  20357. + int ret;
  20358. +
  20359. + *flags = 0;
  20360. + ret = rt_write_trylock(rwlock);
  20361. + return ret;
  20362. +}
  20363. +EXPORT_SYMBOL(rt_write_trylock_irqsave);
  20364. +
  20365. +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
  20366. +{
  20367. + struct rt_mutex *lock = &rwlock->lock;
  20368. + int ret = 1;
  20369. +
  20370. + /*
  20371. + * recursive read locks succeed when current owns the lock,
  20372. + * but not when read_depth == 0 which means that the lock is
  20373. + * write locked.
  20374. + */
  20375. + if (rt_mutex_owner(lock) != current) {
  20376. + migrate_disable();
  20377. + ret = rt_mutex_trylock(lock);
  20378. + if (ret)
  20379. + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
  20380. + else
  20381. + migrate_enable();
  20382. +
  20383. + } else if (!rwlock->read_depth) {
  20384. + ret = 0;
  20385. + }
  20386. +
  20387. + if (ret)
  20388. + rwlock->read_depth++;
  20389. +
  20390. + return ret;
  20391. +}
  20392. +EXPORT_SYMBOL(rt_read_trylock);
  20393. +
  20394. +void __lockfunc rt_write_lock(rwlock_t *rwlock)
  20395. +{
  20396. + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
  20397. + migrate_disable();
  20398. + __rt_spin_lock(&rwlock->lock);
  20399. +}
  20400. +EXPORT_SYMBOL(rt_write_lock);
  20401. +
  20402. +void __lockfunc rt_read_lock(rwlock_t *rwlock)
  20403. +{
  20404. + struct rt_mutex *lock = &rwlock->lock;
  20405. +
  20406. +
  20407. + /*
  20408. + * recursive read locks succeed when current owns the lock
  20409. + */
  20410. + if (rt_mutex_owner(lock) != current) {
  20411. + migrate_disable();
  20412. + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
  20413. + __rt_spin_lock(lock);
  20414. + }
  20415. + rwlock->read_depth++;
  20416. +}
  20417. +
  20418. +EXPORT_SYMBOL(rt_read_lock);
  20419. +
  20420. +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
  20421. +{
  20422. + /* NOTE: we always pass in '1' for nested, for simplicity */
  20423. + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
  20424. + __rt_spin_unlock(&rwlock->lock);
  20425. + migrate_enable();
  20426. +}
  20427. +EXPORT_SYMBOL(rt_write_unlock);
  20428. +
  20429. +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
  20430. +{
  20431. + /* Release the lock only when read_depth is down to 0 */
  20432. + if (--rwlock->read_depth == 0) {
  20433. + rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
  20434. + __rt_spin_unlock(&rwlock->lock);
  20435. + migrate_enable();
  20436. + }
  20437. +}
  20438. +EXPORT_SYMBOL(rt_read_unlock);
  20439. +
  20440. +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
  20441. +{
  20442. + rt_write_lock(rwlock);
  20443. +
  20444. + return 0;
  20445. +}
  20446. +EXPORT_SYMBOL(rt_write_lock_irqsave);
  20447. +
  20448. +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
  20449. +{
  20450. + rt_read_lock(rwlock);
  20451. +
  20452. + return 0;
  20453. +}
  20454. +EXPORT_SYMBOL(rt_read_lock_irqsave);
  20455. +
  20456. +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
  20457. +{
  20458. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20459. + /*
  20460. + * Make sure we are not reinitializing a held lock:
  20461. + */
  20462. + debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
  20463. + lockdep_init_map(&rwlock->dep_map, name, key, 0);
  20464. +#endif
  20465. + rwlock->lock.save_state = 1;
  20466. + rwlock->read_depth = 0;
  20467. +}
  20468. +EXPORT_SYMBOL(__rt_rwlock_init);
  20469. +
  20470. +/*
  20471. + * rw_semaphores
  20472. + */
  20473. +
  20474. +void rt_up_write(struct rw_semaphore *rwsem)
  20475. +{
  20476. + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
  20477. + rt_mutex_unlock(&rwsem->lock);
  20478. +}
  20479. +EXPORT_SYMBOL(rt_up_write);
  20480. +
  20481. +void rt_up_read(struct rw_semaphore *rwsem)
  20482. +{
  20483. + rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
  20484. + if (--rwsem->read_depth == 0)
  20485. + rt_mutex_unlock(&rwsem->lock);
  20486. +}
  20487. +EXPORT_SYMBOL(rt_up_read);
  20488. +
  20489. +/*
  20490. + * downgrade a write lock into a read lock
  20491. + * - just wake up any readers at the front of the queue
  20492. + */
  20493. +void rt_downgrade_write(struct rw_semaphore *rwsem)
  20494. +{
  20495. + BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
  20496. + rwsem->read_depth = 1;
  20497. +}
  20498. +EXPORT_SYMBOL(rt_downgrade_write);
  20499. +
  20500. +int rt_down_write_trylock(struct rw_semaphore *rwsem)
  20501. +{
  20502. + int ret = rt_mutex_trylock(&rwsem->lock);
  20503. +
  20504. + if (ret)
  20505. + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
  20506. + return ret;
  20507. +}
  20508. +EXPORT_SYMBOL(rt_down_write_trylock);
  20509. +
  20510. +void rt_down_write(struct rw_semaphore *rwsem)
  20511. +{
  20512. + rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
  20513. + rt_mutex_lock(&rwsem->lock);
  20514. +}
  20515. +EXPORT_SYMBOL(rt_down_write);
  20516. +
  20517. +void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
  20518. +{
  20519. + rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
  20520. + rt_mutex_lock(&rwsem->lock);
  20521. +}
  20522. +EXPORT_SYMBOL(rt_down_write_nested);
  20523. +
  20524. +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
  20525. + struct lockdep_map *nest)
  20526. +{
  20527. + rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
  20528. + rt_mutex_lock(&rwsem->lock);
  20529. +}
  20530. +EXPORT_SYMBOL(rt_down_write_nested_lock);
  20531. +
  20532. +int rt_down_read_trylock(struct rw_semaphore *rwsem)
  20533. +{
  20534. + struct rt_mutex *lock = &rwsem->lock;
  20535. + int ret = 1;
  20536. +
  20537. + /*
  20538. + * recursive read locks succeed when current owns the rwsem,
  20539. + * but not when read_depth == 0 which means that the rwsem is
  20540. + * write locked.
  20541. + */
  20542. + if (rt_mutex_owner(lock) != current)
  20543. + ret = rt_mutex_trylock(&rwsem->lock);
  20544. + else if (!rwsem->read_depth)
  20545. + ret = 0;
  20546. +
  20547. + if (ret) {
  20548. + rwsem->read_depth++;
  20549. + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
  20550. + }
  20551. + return ret;
  20552. +}
  20553. +EXPORT_SYMBOL(rt_down_read_trylock);
  20554. +
  20555. +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
  20556. +{
  20557. + struct rt_mutex *lock = &rwsem->lock;
  20558. +
  20559. + rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
  20560. +
  20561. + if (rt_mutex_owner(lock) != current)
  20562. + rt_mutex_lock(&rwsem->lock);
  20563. + rwsem->read_depth++;
  20564. +}
  20565. +
  20566. +void rt_down_read(struct rw_semaphore *rwsem)
  20567. +{
  20568. + __rt_down_read(rwsem, 0);
  20569. +}
  20570. +EXPORT_SYMBOL(rt_down_read);
  20571. +
  20572. +void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
  20573. +{
  20574. + __rt_down_read(rwsem, subclass);
  20575. +}
  20576. +EXPORT_SYMBOL(rt_down_read_nested);
  20577. +
  20578. +void __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
  20579. + struct lock_class_key *key)
  20580. +{
  20581. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20582. + /*
  20583. + * Make sure we are not reinitializing a held lock:
  20584. + */
  20585. + debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
  20586. + lockdep_init_map(&rwsem->dep_map, name, key, 0);
  20587. +#endif
  20588. + rwsem->read_depth = 0;
  20589. + rwsem->lock.save_state = 0;
  20590. +}
  20591. +EXPORT_SYMBOL(__rt_rwsem_init);
  20592. +
  20593. +/**
  20594. + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
  20595. + * @cnt: the atomic which we are to dec
  20596. + * @lock: the mutex to return holding if we dec to 0
  20597. + *
  20598. + * return true and hold lock if we dec to 0, return false otherwise
  20599. + */
  20600. +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
  20601. +{
  20602. + /* dec if we can't possibly hit 0 */
  20603. + if (atomic_add_unless(cnt, -1, 1))
  20604. + return 0;
  20605. + /* we might hit 0, so take the lock */
  20606. + mutex_lock(lock);
  20607. + if (!atomic_dec_and_test(cnt)) {
  20608. + /* when we actually did the dec, we didn't hit 0 */
  20609. + mutex_unlock(lock);
  20610. + return 0;
  20611. + }
  20612. + /* we hit 0, and we hold the lock */
  20613. + return 1;
  20614. +}
  20615. +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
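atomic_dec_and_mutex_lock() keeps its usual semantics here: drop a reference and, only if that drop might reach zero, take the mutex so the final teardown runs under the lock. Typical usage (the object and its fields are hypothetical):

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct example_obj {
	atomic_t	refcnt;
	struct mutex	lock;
};

static void example_obj_put(struct example_obj *obj)
{
	/* fast path: refcnt stays >= 1, the mutex is never taken */
	if (!atomic_dec_and_mutex_lock(&obj->refcnt, &obj->lock))
		return;

	/* refcnt reached zero and obj->lock is held: release sub-objects here */
	mutex_unlock(&obj->lock);
	kfree(obj);
}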
  20616. diff -Nur linux-3.18.12.orig/kernel/locking/rtmutex.c linux-3.18.12/kernel/locking/rtmutex.c
  20617. --- linux-3.18.12.orig/kernel/locking/rtmutex.c 2015-04-20 14:48:02.000000000 -0500
  20618. +++ linux-3.18.12/kernel/locking/rtmutex.c 2015-04-26 13:32:22.439684003 -0500
  20619. @@ -7,6 +7,11 @@
  20620. * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  20621. * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
  20622. * Copyright (C) 2006 Esben Nielsen
  20623. + * Adaptive Spinlocks:
  20624. + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
  20625. + * and Peter Morreale,
  20626. + * Adaptive Spinlocks simplification:
  20627. + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
  20628. *
  20629. * See Documentation/locking/rt-mutex-design.txt for details.
  20630. */
  20631. @@ -16,6 +21,7 @@
  20632. #include <linux/sched/rt.h>
  20633. #include <linux/sched/deadline.h>
  20634. #include <linux/timer.h>
  20635. +#include <linux/ww_mutex.h>
  20636. #include "rtmutex_common.h"
  20637. @@ -69,6 +75,12 @@
  20638. clear_rt_mutex_waiters(lock);
  20639. }
  20640. +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
  20641. +{
  20642. + return waiter && waiter != PI_WAKEUP_INPROGRESS &&
  20643. + waiter != PI_REQUEUE_INPROGRESS;
  20644. +}
  20645. +
  20646. /*
  20647. * We can speed up the acquire/release, if the architecture
  20648. * supports cmpxchg and if there's no debugging state to be set up
  20649. @@ -333,6 +345,14 @@
  20650. return debug_rt_mutex_detect_deadlock(waiter, chwalk);
  20651. }
  20652. +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
  20653. +{
  20654. + if (waiter->savestate)
  20655. + wake_up_lock_sleeper(waiter->task);
  20656. + else
  20657. + wake_up_process(waiter->task);
  20658. +}
  20659. +
  20660. /*
  20661. * Max number of times we'll walk the boosting chain:
  20662. */
  20663. @@ -340,7 +360,8 @@
  20664. static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
  20665. {
  20666. - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
  20667. + return rt_mutex_real_waiter(p->pi_blocked_on) ?
  20668. + p->pi_blocked_on->lock : NULL;
  20669. }
  20670. /*
  20671. @@ -477,7 +498,7 @@
  20672. * reached or the state of the chain has changed while we
  20673. * dropped the locks.
  20674. */
  20675. - if (!waiter)
  20676. + if (!rt_mutex_real_waiter(waiter))
  20677. goto out_unlock_pi;
  20678. /*
  20679. @@ -639,13 +660,16 @@
  20680. * follow here. This is the end of the chain we are walking.
  20681. */
  20682. if (!rt_mutex_owner(lock)) {
  20683. + struct rt_mutex_waiter *lock_top_waiter;
  20684. +
  20685. /*
  20686. * If the requeue [7] above changed the top waiter,
  20687. * then we need to wake the new top waiter up to try
  20688. * to get the lock.
  20689. */
  20690. - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
  20691. - wake_up_process(rt_mutex_top_waiter(lock)->task);
  20692. + lock_top_waiter = rt_mutex_top_waiter(lock);
  20693. + if (prerequeue_top_waiter != lock_top_waiter)
  20694. + rt_mutex_wake_waiter(lock_top_waiter);
  20695. raw_spin_unlock(&lock->wait_lock);
  20696. return 0;
  20697. }
  20698. @@ -738,6 +762,25 @@
  20699. return ret;
  20700. }
  20701. +
  20702. +#define STEAL_NORMAL 0
  20703. +#define STEAL_LATERAL 1
  20704. +
  20705. +/*
  20706. + * Note that RT tasks are excluded from lateral-steals to prevent the
  20707. + * introduction of an unbounded latency
  20708. + */
  20709. +static inline int lock_is_stealable(struct task_struct *task,
  20710. + struct task_struct *pendowner, int mode)
  20711. +{
  20712. + if (mode == STEAL_NORMAL || rt_task(task)) {
  20713. + if (task->prio >= pendowner->prio)
  20714. + return 0;
  20715. + } else if (task->prio > pendowner->prio)
  20716. + return 0;
  20717. + return 1;
  20718. +}
  20719. +
  20720. /*
  20721. * Try to take an rt-mutex
  20722. *
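The two modes differ only in how a non-realtime task compares against the pending owner: STEAL_NORMAL demands a strictly better priority, STEAL_LATERAL also accepts an equal one, and realtime tasks are always held to the strict rule so equal-priority RT tasks cannot keep stealing and stretch latencies. A self-contained illustration of that decision (plain C, not kernel code; remember that a numerically lower prio value means higher priority):

#include <stdbool.h>

enum { EX_STEAL_NORMAL = 0, EX_STEAL_LATERAL = 1 };

/* Mirrors lock_is_stealable(): true means the task may take the lock
 * ahead of the pending owner. */
static bool ex_can_steal(int task_prio, bool task_is_rt, int owner_prio, int mode)
{
	if (mode == EX_STEAL_NORMAL || task_is_rt)
		return task_prio < owner_prio;	/* strictly better only */
	return task_prio <= owner_prio;		/* lateral: equal is enough */
}

/*
 * ex_can_steal(120, false, 120, EX_STEAL_NORMAL)  -> false
 * ex_can_steal(120, false, 120, EX_STEAL_LATERAL) -> true
 * ex_can_steal( 90, true,   90, EX_STEAL_LATERAL) -> false (RT: strict rule)
 */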
  20723. @@ -748,8 +791,9 @@
  20724. * @waiter: The waiter that is queued to the lock's wait list if the
  20725. * callsite called task_blocked_on_lock(), otherwise NULL
  20726. */
  20727. -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
  20728. - struct rt_mutex_waiter *waiter)
  20729. +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
  20730. + struct task_struct *task,
  20731. + struct rt_mutex_waiter *waiter, int mode)
  20732. {
  20733. unsigned long flags;
  20734. @@ -788,8 +832,10 @@
  20735. * If waiter is not the highest priority waiter of
  20736. * @lock, give up.
  20737. */
  20738. - if (waiter != rt_mutex_top_waiter(lock))
  20739. + if (waiter != rt_mutex_top_waiter(lock)) {
  20740. + /* XXX lock_is_stealable() ? */
  20741. return 0;
  20742. + }
  20743. /*
  20744. * We can acquire the lock. Remove the waiter from the
  20745. @@ -807,14 +853,10 @@
  20746. * not need to be dequeued.
  20747. */
  20748. if (rt_mutex_has_waiters(lock)) {
  20749. - /*
  20750. - * If @task->prio is greater than or equal to
  20751. - * the top waiter priority (kernel view),
  20752. - * @task lost.
  20753. - */
  20754. - if (task->prio >= rt_mutex_top_waiter(lock)->prio)
  20755. - return 0;
  20756. + struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
  20757. + if (task != pown && !lock_is_stealable(task, pown, mode))
  20758. + return 0;
  20759. /*
  20760. * The current top waiter stays enqueued. We
  20761. * don't have to change anything in the lock
  20762. @@ -863,6 +905,369 @@
  20763. return 1;
  20764. }
  20765. +#ifdef CONFIG_PREEMPT_RT_FULL
  20766. +/*
  20767. + * preemptible spin_lock functions:
  20768. + */
  20769. +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
  20770. + void (*slowfn)(struct rt_mutex *lock))
  20771. +{
  20772. + might_sleep();
  20773. +
  20774. + if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
  20775. + rt_mutex_deadlock_account_lock(lock, current);
  20776. + else
  20777. + slowfn(lock);
  20778. +}
  20779. +
  20780. +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
  20781. + void (*slowfn)(struct rt_mutex *lock))
  20782. +{
  20783. + if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
  20784. + rt_mutex_deadlock_account_unlock(current);
  20785. + else
  20786. + slowfn(lock);
  20787. +}
  20788. +#ifdef CONFIG_SMP
  20789. +/*
  20790. + * Note that owner is a speculative pointer and dereferencing relies
  20791. + * on rcu_read_lock() and the check against the lock owner.
  20792. + */
  20793. +static int adaptive_wait(struct rt_mutex *lock,
  20794. + struct task_struct *owner)
  20795. +{
  20796. + int res = 0;
  20797. +
  20798. + rcu_read_lock();
  20799. + for (;;) {
  20800. + if (owner != rt_mutex_owner(lock))
  20801. + break;
  20802. + /*
  20803. + * Ensure that owner->on_cpu is dereferenced _after_
  20804. + * checking the above to be valid.
  20805. + */
  20806. + barrier();
  20807. + if (!owner->on_cpu) {
  20808. + res = 1;
  20809. + break;
  20810. + }
  20811. + cpu_relax();
  20812. + }
  20813. + rcu_read_unlock();
  20814. + return res;
  20815. +}
  20816. +#else
  20817. +static int adaptive_wait(struct rt_mutex *lock,
  20818. + struct task_struct *orig_owner)
  20819. +{
  20820. + return 1;
  20821. +}
  20822. +#endif
  20823. +
  20824. +# define pi_lock(lock) raw_spin_lock_irq(lock)
  20825. +# define pi_unlock(lock) raw_spin_unlock_irq(lock)
  20826. +
  20827. +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
  20828. + struct rt_mutex_waiter *waiter,
  20829. + struct task_struct *task,
  20830. + enum rtmutex_chainwalk chwalk);
  20831. +/*
  20832. + * Slow path lock function spin_lock style: this variant is very
  20833. + * careful not to miss any non-lock wakeups.
  20834. + *
  20835. + * We store the current state under p->pi_lock in p->saved_state and
  20836. + * the try_to_wake_up() code handles this accordingly.
  20837. + */
  20838. +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
  20839. +{
  20840. + struct task_struct *lock_owner, *self = current;
  20841. + struct rt_mutex_waiter waiter, *top_waiter;
  20842. + int ret;
  20843. +
  20844. + rt_mutex_init_waiter(&waiter, true);
  20845. +
  20846. + raw_spin_lock(&lock->wait_lock);
  20847. +
  20848. + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
  20849. + raw_spin_unlock(&lock->wait_lock);
  20850. + return;
  20851. + }
  20852. +
  20853. + BUG_ON(rt_mutex_owner(lock) == self);
  20854. +
  20855. + /*
  20856. + * We save whatever state the task is in and we'll restore it
  20857. + * after acquiring the lock taking real wakeups into account
  20858. + * as well. We are serialized via pi_lock against wakeups. See
  20859. + * try_to_wake_up().
  20860. + */
  20861. + pi_lock(&self->pi_lock);
  20862. + self->saved_state = self->state;
  20863. + __set_current_state(TASK_UNINTERRUPTIBLE);
  20864. + pi_unlock(&self->pi_lock);
  20865. +
  20866. + ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0);
  20867. + BUG_ON(ret);
  20868. +
  20869. + for (;;) {
  20870. + /* Try to acquire the lock again. */
  20871. + if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
  20872. + break;
  20873. +
  20874. + top_waiter = rt_mutex_top_waiter(lock);
  20875. + lock_owner = rt_mutex_owner(lock);
  20876. +
  20877. + raw_spin_unlock(&lock->wait_lock);
  20878. +
  20879. + debug_rt_mutex_print_deadlock(&waiter);
  20880. +
  20881. + if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
  20882. + schedule_rt_mutex(lock);
  20883. +
  20884. + raw_spin_lock(&lock->wait_lock);
  20885. +
  20886. + pi_lock(&self->pi_lock);
  20887. + __set_current_state(TASK_UNINTERRUPTIBLE);
  20888. + pi_unlock(&self->pi_lock);
  20889. + }
  20890. +
  20891. + /*
  20892. + * Restore the task state to current->saved_state. We set it
  20893. + * to the original state above and the try_to_wake_up() code
  20894. + * has possibly updated it when a real (non-rtmutex) wakeup
  20895. + * happened while we were blocked. Clear saved_state so
  20896. + * try_to_wakeup() does not get confused.
  20897. + */
  20898. + pi_lock(&self->pi_lock);
  20899. + __set_current_state(self->saved_state);
  20900. + self->saved_state = TASK_RUNNING;
  20901. + pi_unlock(&self->pi_lock);
  20902. +
  20903. + /*
  20904. + * try_to_take_rt_mutex() sets the waiter bit
  20905. + * unconditionally. We might have to fix that up:
  20906. + */
  20907. + fixup_rt_mutex_waiters(lock);
  20908. +
  20909. + BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
  20910. + BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
  20911. +
  20912. + raw_spin_unlock(&lock->wait_lock);
  20913. +
  20914. + debug_rt_mutex_free_waiter(&waiter);
  20915. +}
  20916. +
  20917. +static void wakeup_next_waiter(struct rt_mutex *lock);
  20918. +/*
  20919. + * Slow path to release a rt_mutex spin_lock style
  20920. + */
  20921. +static void __sched __rt_spin_lock_slowunlock(struct rt_mutex *lock)
  20922. +{
  20923. + debug_rt_mutex_unlock(lock);
  20924. +
  20925. + rt_mutex_deadlock_account_unlock(current);
  20926. +
  20927. + if (!rt_mutex_has_waiters(lock)) {
  20928. + lock->owner = NULL;
  20929. + raw_spin_unlock(&lock->wait_lock);
  20930. + return;
  20931. + }
  20932. +
  20933. + wakeup_next_waiter(lock);
  20934. +
  20935. + raw_spin_unlock(&lock->wait_lock);
  20936. +
20937. + /* Undo pi boosting when necessary */
  20938. + rt_mutex_adjust_prio(current);
  20939. +}
  20940. +
  20941. +static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
  20942. +{
  20943. + raw_spin_lock(&lock->wait_lock);
  20944. + __rt_spin_lock_slowunlock(lock);
  20945. +}
  20946. +
  20947. +static void noinline __sched rt_spin_lock_slowunlock_hirq(struct rt_mutex *lock)
  20948. +{
  20949. + int ret;
  20950. +
  20951. + do {
  20952. + ret = raw_spin_trylock(&lock->wait_lock);
  20953. + } while (!ret);
  20954. +
  20955. + __rt_spin_lock_slowunlock(lock);
  20956. +}
  20957. +
  20958. +void __lockfunc rt_spin_lock(spinlock_t *lock)
  20959. +{
  20960. + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  20961. + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
  20962. +}
  20963. +EXPORT_SYMBOL(rt_spin_lock);
  20964. +
  20965. +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
  20966. +{
  20967. + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
  20968. +}
  20969. +EXPORT_SYMBOL(__rt_spin_lock);
  20970. +
  20971. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  20972. +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
  20973. +{
  20974. + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  20975. + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
  20976. +}
  20977. +EXPORT_SYMBOL(rt_spin_lock_nested);
  20978. +#endif
  20979. +
  20980. +void __lockfunc rt_spin_unlock(spinlock_t *lock)
  20981. +{
  20982. + /* NOTE: we always pass in '1' for nested, for simplicity */
  20983. + spin_release(&lock->dep_map, 1, _RET_IP_);
  20984. + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
  20985. +}
  20986. +EXPORT_SYMBOL(rt_spin_unlock);
  20987. +
  20988. +void __lockfunc rt_spin_unlock_after_trylock_in_irq(spinlock_t *lock)
  20989. +{
  20990. + /* NOTE: we always pass in '1' for nested, for simplicity */
  20991. + spin_release(&lock->dep_map, 1, _RET_IP_);
  20992. + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_hirq);
  20993. +}
  20994. +
  20995. +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
  20996. +{
  20997. + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
  20998. +}
  20999. +EXPORT_SYMBOL(__rt_spin_unlock);
  21000. +
  21001. +/*
  21002. + * Wait for the lock to get unlocked: instead of polling for an unlock
  21003. + * (like raw spinlocks do), we lock and unlock, to force the kernel to
  21004. + * schedule if there's contention:
  21005. + */
  21006. +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
  21007. +{
  21008. + spin_lock(lock);
  21009. + spin_unlock(lock);
  21010. +}
  21011. +EXPORT_SYMBOL(rt_spin_unlock_wait);
  21012. +
  21013. +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
  21014. +{
  21015. + return rt_mutex_trylock(lock);
  21016. +}
  21017. +
  21018. +int __lockfunc rt_spin_trylock(spinlock_t *lock)
  21019. +{
  21020. + int ret = rt_mutex_trylock(&lock->lock);
  21021. +
  21022. + if (ret)
  21023. + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  21024. + return ret;
  21025. +}
  21026. +EXPORT_SYMBOL(rt_spin_trylock);
  21027. +
  21028. +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
  21029. +{
  21030. + int ret;
  21031. +
  21032. + local_bh_disable();
  21033. + ret = rt_mutex_trylock(&lock->lock);
  21034. + if (ret) {
  21035. + migrate_disable();
  21036. + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  21037. + } else
  21038. + local_bh_enable();
  21039. + return ret;
  21040. +}
  21041. +EXPORT_SYMBOL(rt_spin_trylock_bh);
  21042. +
  21043. +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
  21044. +{
  21045. + int ret;
  21046. +
  21047. + *flags = 0;
  21048. + ret = rt_mutex_trylock(&lock->lock);
  21049. + if (ret) {
  21050. + migrate_disable();
  21051. + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  21052. + }
  21053. + return ret;
  21054. +}
  21055. +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
  21056. +
  21057. +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
  21058. +{
  21059. + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
  21060. + if (atomic_add_unless(atomic, -1, 1))
  21061. + return 0;
  21062. + migrate_disable();
  21063. + rt_spin_lock(lock);
  21064. + if (atomic_dec_and_test(atomic))
  21065. + return 1;
  21066. + rt_spin_unlock(lock);
  21067. + migrate_enable();
  21068. + return 0;
  21069. +}
  21070. +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
  21071. +
  21072. + void
  21073. +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
  21074. +{
  21075. +#ifdef CONFIG_DEBUG_LOCK_ALLOC
  21076. + /*
  21077. + * Make sure we are not reinitializing a held lock:
  21078. + */
  21079. + debug_check_no_locks_freed((void *)lock, sizeof(*lock));
  21080. + lockdep_init_map(&lock->dep_map, name, key, 0);
  21081. +#endif
  21082. +}
  21083. +EXPORT_SYMBOL(__rt_spin_lock_init);
  21084. +
  21085. +#endif /* PREEMPT_RT_FULL */
  21086. +
  21087. +#ifdef CONFIG_PREEMPT_RT_FULL
  21088. + static inline int __sched
  21089. +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
  21090. +{
  21091. + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
  21092. + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
  21093. +
  21094. + if (!hold_ctx)
  21095. + return 0;
  21096. +
  21097. + if (unlikely(ctx == hold_ctx))
  21098. + return -EALREADY;
  21099. +
  21100. + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
  21101. + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
  21102. +#ifdef CONFIG_DEBUG_MUTEXES
  21103. + DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
  21104. + ctx->contending_lock = ww;
  21105. +#endif
  21106. + return -EDEADLK;
  21107. + }
  21108. +
  21109. + return 0;
  21110. +}
  21111. +#else
  21112. + static inline int __sched
  21113. +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
  21114. +{
  21115. + BUG();
  21116. + return 0;
  21117. +}
  21118. +
  21119. +#endif
  21120. +
  21121. +static inline int
  21122. +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
  21123. + struct rt_mutex_waiter *waiter)
  21124. +{
  21125. + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
  21126. +}
  21127. +
  21128. /*
  21129. * Task blocks on lock.
  21130. *
  21131. @@ -894,6 +1299,23 @@
  21132. return -EDEADLK;
  21133. raw_spin_lock_irqsave(&task->pi_lock, flags);
  21134. +
  21135. + /*
  21136. + * In the case of futex requeue PI, this will be a proxy
  21137. + * lock. The task will wake unaware that it is enqueueed on
  21138. + * this lock. Avoid blocking on two locks and corrupting
  21139. + * pi_blocked_on via the PI_WAKEUP_INPROGRESS
  21140. + * flag. futex_wait_requeue_pi() sets this when it wakes up
  21141. + * before requeue (due to a signal or timeout). Do not enqueue
  21142. + * the task if PI_WAKEUP_INPROGRESS is set.
  21143. + */
  21144. + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
  21145. + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  21146. + return -EAGAIN;
  21147. + }
  21148. +
  21149. + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
  21150. +
  21151. __rt_mutex_adjust_prio(task);
  21152. waiter->task = task;
  21153. waiter->lock = lock;
  21154. @@ -917,7 +1339,7 @@
  21155. rt_mutex_enqueue_pi(owner, waiter);
  21156. __rt_mutex_adjust_prio(owner);
  21157. - if (owner->pi_blocked_on)
  21158. + if (rt_mutex_real_waiter(owner->pi_blocked_on))
  21159. chain_walk = 1;
  21160. } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
  21161. chain_walk = 1;
  21162. @@ -994,7 +1416,7 @@
  21163. * long as we hold lock->wait_lock. The waiter task needs to
  21164. * acquire it in order to dequeue the waiter.
  21165. */
  21166. - wake_up_process(waiter->task);
  21167. + rt_mutex_wake_waiter(waiter);
  21168. }
  21169. /*
  21170. @@ -1008,7 +1430,7 @@
  21171. {
  21172. bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
  21173. struct task_struct *owner = rt_mutex_owner(lock);
  21174. - struct rt_mutex *next_lock;
  21175. + struct rt_mutex *next_lock = NULL;
  21176. unsigned long flags;
  21177. raw_spin_lock_irqsave(&current->pi_lock, flags);
  21178. @@ -1033,7 +1455,8 @@
  21179. __rt_mutex_adjust_prio(owner);
  21180. /* Store the lock on which owner is blocked or NULL */
  21181. - next_lock = task_blocked_on_lock(owner);
  21182. + if (rt_mutex_real_waiter(owner->pi_blocked_on))
  21183. + next_lock = task_blocked_on_lock(owner);
  21184. raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
  21185. @@ -1069,17 +1492,17 @@
  21186. raw_spin_lock_irqsave(&task->pi_lock, flags);
  21187. waiter = task->pi_blocked_on;
  21188. - if (!waiter || (waiter->prio == task->prio &&
  21189. + if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
  21190. !dl_prio(task->prio))) {
  21191. raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  21192. return;
  21193. }
  21194. next_lock = waiter->lock;
  21195. - raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  21196. /* gets dropped in rt_mutex_adjust_prio_chain()! */
  21197. get_task_struct(task);
  21198. + raw_spin_unlock_irqrestore(&task->pi_lock, flags);
  21199. rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
  21200. next_lock, NULL, task);
  21201. }
  21202. @@ -1097,7 +1520,8 @@
  21203. static int __sched
  21204. __rt_mutex_slowlock(struct rt_mutex *lock, int state,
  21205. struct hrtimer_sleeper *timeout,
  21206. - struct rt_mutex_waiter *waiter)
  21207. + struct rt_mutex_waiter *waiter,
  21208. + struct ww_acquire_ctx *ww_ctx)
  21209. {
  21210. int ret = 0;
  21211. @@ -1120,6 +1544,12 @@
  21212. break;
  21213. }
  21214. + if (ww_ctx && ww_ctx->acquired > 0) {
  21215. + ret = __mutex_lock_check_stamp(lock, ww_ctx);
  21216. + if (ret)
  21217. + break;
  21218. + }
  21219. +
  21220. raw_spin_unlock(&lock->wait_lock);
  21221. debug_rt_mutex_print_deadlock(waiter);
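The ww_ctx check wired in above is what lets the rtmutex-based mutex speak the wound/wait protocol: a sleeping context that already holds other ww_mutexes gets -EDEADLK when an older context owns the contended lock (or -EALREADY when it races with itself) instead of deadlocking. Callers keep using the standard ww_mutex API; a sketch of the usual back-off-and-retry pattern for two locks (class, function and lock names are invented):

static DEFINE_WW_CLASS(example_ww_class);

/* Acquire two ww_mutexes in either order, backing off on -EDEADLK. */
static int example_lock_both(struct ww_mutex *a, struct ww_mutex *b)
{
	struct ww_acquire_ctx ctx;
	struct ww_mutex *contended = NULL;
	int ret;

	ww_acquire_init(&ctx, &example_ww_class);
retry:
	if (contended)
		/* blocks until the winning context releases the lock;
		 * on return we hold 'contended' */
		ww_mutex_lock_slow(contended, &ctx);

	if (contended != a) {
		ret = ww_mutex_lock(a, &ctx);
		if (ret == -EDEADLK) {
			if (contended)
				ww_mutex_unlock(contended);
			contended = a;
			goto retry;
		}
	}
	if (contended != b) {
		ret = ww_mutex_lock(b, &ctx);
		if (ret == -EDEADLK) {
			ww_mutex_unlock(a);
			contended = b;
			goto retry;
		}
	}

	ww_acquire_done(&ctx);
	/* ... both locks held: do the work ... */
	ww_mutex_unlock(a);
	ww_mutex_unlock(b);
	ww_acquire_fini(&ctx);
	return 0;
}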
  21222. @@ -1153,25 +1583,102 @@
  21223. }
  21224. }
  21225. +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
  21226. + struct ww_acquire_ctx *ww_ctx)
  21227. +{
  21228. +#ifdef CONFIG_DEBUG_MUTEXES
  21229. + /*
  21230. + * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
  21231. + * but released with a normal mutex_unlock in this call.
  21232. + *
  21233. + * This should never happen, always use ww_mutex_unlock.
  21234. + */
  21235. + DEBUG_LOCKS_WARN_ON(ww->ctx);
  21236. +
  21237. + /*
  21238. + * Not quite done after calling ww_acquire_done() ?
  21239. + */
  21240. + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
  21241. +
  21242. + if (ww_ctx->contending_lock) {
  21243. + /*
  21244. + * After -EDEADLK you tried to
  21245. + * acquire a different ww_mutex? Bad!
  21246. + */
  21247. + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
  21248. +
  21249. + /*
  21250. + * You called ww_mutex_lock after receiving -EDEADLK,
  21251. + * but 'forgot' to unlock everything else first?
  21252. + */
  21253. + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
  21254. + ww_ctx->contending_lock = NULL;
  21255. + }
  21256. +
  21257. + /*
  21258. + * Naughty, using a different class will lead to undefined behavior!
  21259. + */
  21260. + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
  21261. +#endif
  21262. + ww_ctx->acquired++;
  21263. +}
  21264. +
  21265. +#ifdef CONFIG_PREEMPT_RT_FULL
  21266. +static void ww_mutex_account_lock(struct rt_mutex *lock,
  21267. + struct ww_acquire_ctx *ww_ctx)
  21268. +{
  21269. + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
  21270. + struct rt_mutex_waiter *waiter, *n;
  21271. +
  21272. + /*
  21273. + * This branch gets optimized out for the common case,
  21274. + * and is only important for ww_mutex_lock.
  21275. + */
  21276. + ww_mutex_lock_acquired(ww, ww_ctx);
  21277. + ww->ctx = ww_ctx;
  21278. +
  21279. + /*
  21280. + * Give any possible sleeping processes the chance to wake up,
  21281. + * so they can recheck if they have to back off.
  21282. + */
  21283. + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
  21284. + tree_entry) {
  21285. + /* XXX debug rt mutex waiter wakeup */
  21286. +
  21287. + BUG_ON(waiter->lock != lock);
  21288. + rt_mutex_wake_waiter(waiter);
  21289. + }
  21290. +}
  21291. +
  21292. +#else
  21293. +
  21294. +static void ww_mutex_account_lock(struct rt_mutex *lock,
  21295. + struct ww_acquire_ctx *ww_ctx)
  21296. +{
  21297. + BUG();
  21298. +}
  21299. +#endif
  21300. +
  21301. /*
  21302. * Slow path lock function:
  21303. */
  21304. static int __sched
  21305. rt_mutex_slowlock(struct rt_mutex *lock, int state,
  21306. struct hrtimer_sleeper *timeout,
  21307. - enum rtmutex_chainwalk chwalk)
  21308. + enum rtmutex_chainwalk chwalk,
  21309. + struct ww_acquire_ctx *ww_ctx)
  21310. {
  21311. struct rt_mutex_waiter waiter;
  21312. int ret = 0;
  21313. - debug_rt_mutex_init_waiter(&waiter);
  21314. - RB_CLEAR_NODE(&waiter.pi_tree_entry);
  21315. - RB_CLEAR_NODE(&waiter.tree_entry);
  21316. + rt_mutex_init_waiter(&waiter, false);
  21317. raw_spin_lock(&lock->wait_lock);
  21318. /* Try to acquire the lock again: */
  21319. if (try_to_take_rt_mutex(lock, current, NULL)) {
  21320. + if (ww_ctx)
  21321. + ww_mutex_account_lock(lock, ww_ctx);
  21322. raw_spin_unlock(&lock->wait_lock);
  21323. return 0;
  21324. }
  21325. @@ -1188,14 +1695,23 @@
  21326. ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
  21327. if (likely(!ret))
  21328. - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
  21329. + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, ww_ctx);
  21330. + else if (ww_ctx) {
  21331. + /* ww_mutex received EDEADLK, let it become EALREADY */
  21332. + ret = __mutex_lock_check_stamp(lock, ww_ctx);
  21333. + BUG_ON(!ret);
  21334. + }
  21335. set_current_state(TASK_RUNNING);
  21336. if (unlikely(ret)) {
  21337. if (rt_mutex_has_waiters(lock))
  21338. remove_waiter(lock, &waiter);
  21339. - rt_mutex_handle_deadlock(ret, chwalk, &waiter);
21340. + /* ww_mutex wants to report EDEADLK/EALREADY, let them through */
  21341. + if (!ww_ctx)
  21342. + rt_mutex_handle_deadlock(ret, chwalk, &waiter);
  21343. + } else if (ww_ctx) {
  21344. + ww_mutex_account_lock(lock, ww_ctx);
  21345. }
  21346. /*
  21347. @@ -1234,7 +1750,8 @@
  21348. * The mutex has currently no owner. Lock the wait lock and
  21349. * try to acquire the lock.
  21350. */
  21351. - raw_spin_lock(&lock->wait_lock);
  21352. + if (!raw_spin_trylock(&lock->wait_lock))
  21353. + return 0;
  21354. ret = try_to_take_rt_mutex(lock, current, NULL);
  21355. @@ -1320,31 +1837,36 @@
  21356. */
  21357. static inline int
  21358. rt_mutex_fastlock(struct rt_mutex *lock, int state,
  21359. + struct ww_acquire_ctx *ww_ctx,
  21360. int (*slowfn)(struct rt_mutex *lock, int state,
  21361. struct hrtimer_sleeper *timeout,
  21362. - enum rtmutex_chainwalk chwalk))
  21363. + enum rtmutex_chainwalk chwalk,
  21364. + struct ww_acquire_ctx *ww_ctx))
  21365. {
  21366. if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
  21367. rt_mutex_deadlock_account_lock(lock, current);
  21368. return 0;
  21369. } else
  21370. - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
  21371. + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
  21372. + ww_ctx);
  21373. }
  21374. static inline int
  21375. rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
  21376. struct hrtimer_sleeper *timeout,
  21377. enum rtmutex_chainwalk chwalk,
  21378. + struct ww_acquire_ctx *ww_ctx,
  21379. int (*slowfn)(struct rt_mutex *lock, int state,
  21380. struct hrtimer_sleeper *timeout,
  21381. - enum rtmutex_chainwalk chwalk))
  21382. + enum rtmutex_chainwalk chwalk,
  21383. + struct ww_acquire_ctx *ww_ctx))
  21384. {
  21385. if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
  21386. likely(rt_mutex_cmpxchg(lock, NULL, current))) {
  21387. rt_mutex_deadlock_account_lock(lock, current);
  21388. return 0;
  21389. } else
  21390. - return slowfn(lock, state, timeout, chwalk);
  21391. + return slowfn(lock, state, timeout, chwalk, ww_ctx);
  21392. }
  21393. static inline int
  21394. @@ -1377,7 +1899,7 @@
  21395. {
  21396. might_sleep();
  21397. - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
  21398. + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
  21399. }
  21400. EXPORT_SYMBOL_GPL(rt_mutex_lock);
  21401. @@ -1394,7 +1916,7 @@
  21402. {
  21403. might_sleep();
  21404. - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
  21405. + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
  21406. }
  21407. EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
  21408. @@ -1407,11 +1929,30 @@
  21409. might_sleep();
  21410. return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
  21411. - RT_MUTEX_FULL_CHAINWALK,
  21412. + RT_MUTEX_FULL_CHAINWALK, NULL,
  21413. rt_mutex_slowlock);
  21414. }
  21415. /**
+ * rt_mutex_lock_killable - lock a rt_mutex killable
+ *
+ * @lock: the rt_mutex to be locked
+ *
+ * Returns:
+ * 0 on success
+ * -EINTR when interrupted by a fatal signal
+ */
  21426. +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
  21427. +{
  21428. + might_sleep();
  21429. +
  21430. + return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
  21431. +}
  21432. +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
  21433. +
  21434. +/**
  21435. * rt_mutex_timed_lock - lock a rt_mutex interruptible
  21436. * the timeout structure is provided
  21437. * by the caller
  21438. @@ -1431,6 +1972,7 @@
  21439. return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
  21440. RT_MUTEX_MIN_CHAINWALK,
  21441. + NULL,
  21442. rt_mutex_slowlock);
  21443. }
  21444. EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  21445. @@ -1489,13 +2031,12 @@
  21446. void __rt_mutex_init(struct rt_mutex *lock, const char *name)
  21447. {
  21448. lock->owner = NULL;
  21449. - raw_spin_lock_init(&lock->wait_lock);
  21450. lock->waiters = RB_ROOT;
  21451. lock->waiters_leftmost = NULL;
  21452. debug_rt_mutex_init(lock, name);
  21453. }
  21454. -EXPORT_SYMBOL_GPL(__rt_mutex_init);
  21455. +EXPORT_SYMBOL(__rt_mutex_init);
  21456. /**
  21457. * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
  21458. @@ -1510,7 +2051,7 @@
  21459. void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
  21460. struct task_struct *proxy_owner)
  21461. {
  21462. - __rt_mutex_init(lock, NULL);
  21463. + rt_mutex_init(lock);
  21464. debug_rt_mutex_proxy_lock(lock, proxy_owner);
  21465. rt_mutex_set_owner(lock, proxy_owner);
  21466. rt_mutex_deadlock_account_lock(lock, proxy_owner);
  21467. @@ -1558,6 +2099,35 @@
  21468. return 1;
  21469. }
  21470. +#ifdef CONFIG_PREEMPT_RT_FULL
  21471. + /*
  21472. + * In PREEMPT_RT there's an added race.
+ * If the task that we are about to requeue times out, it
+ * can set PI_WAKEUP_INPROGRESS. This tells the requeue code
+ * to skip this task. But right after the task sets
  21476. + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
  21477. + * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
  21478. + * This will replace the PI_WAKEUP_INPROGRESS with the actual
  21479. + * lock that it blocks on. We *must not* place this task
  21480. + * on this proxy lock in that case.
  21481. + *
  21482. + * To prevent this race, we first take the task's pi_lock
  21483. + * and check if it has updated its pi_blocked_on. If it has,
  21484. + * we assume that it woke up and we return -EAGAIN.
  21485. + * Otherwise, we set the task's pi_blocked_on to
  21486. + * PI_REQUEUE_INPROGRESS, so that if the task is waking up
  21487. + * it will know that we are in the process of requeuing it.
  21488. + */
  21489. + raw_spin_lock_irq(&task->pi_lock);
  21490. + if (task->pi_blocked_on) {
  21491. + raw_spin_unlock_irq(&task->pi_lock);
  21492. + raw_spin_unlock(&lock->wait_lock);
  21493. + return -EAGAIN;
  21494. + }
  21495. + task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
  21496. + raw_spin_unlock_irq(&task->pi_lock);
  21497. +#endif
  21498. +
  21499. /* We enforce deadlock detection for futexes */
  21500. ret = task_blocks_on_rt_mutex(lock, waiter, task,
  21501. RT_MUTEX_FULL_CHAINWALK);
  21502. @@ -1627,7 +2197,7 @@
  21503. set_current_state(TASK_INTERRUPTIBLE);
  21504. - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
  21505. + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
  21506. set_current_state(TASK_RUNNING);
  21507. @@ -1644,3 +2214,89 @@
  21508. return ret;
  21509. }
  21510. +
  21511. +static inline int
  21512. +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
  21513. +{
  21514. +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
  21515. + unsigned tmp;
  21516. +
  21517. + if (ctx->deadlock_inject_countdown-- == 0) {
  21518. + tmp = ctx->deadlock_inject_interval;
  21519. + if (tmp > UINT_MAX/4)
  21520. + tmp = UINT_MAX;
  21521. + else
  21522. + tmp = tmp*2 + tmp + tmp/2;
  21523. +
  21524. + ctx->deadlock_inject_interval = tmp;
  21525. + ctx->deadlock_inject_countdown = tmp;
  21526. + ctx->contending_lock = lock;
  21527. +
  21528. + ww_mutex_unlock(lock);
  21529. +
  21530. + return -EDEADLK;
  21531. + }
  21532. +#endif
  21533. +
  21534. + return 0;
  21535. +}
  21536. +
  21537. +#ifdef CONFIG_PREEMPT_RT_FULL
  21538. +int __sched
  21539. +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
  21540. +{
  21541. + int ret;
  21542. +
  21543. + might_sleep();
  21544. +
  21545. + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
  21546. + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
  21547. + if (ret)
  21548. + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
  21549. + else if (!ret && ww_ctx->acquired > 1)
  21550. + return ww_mutex_deadlock_injection(lock, ww_ctx);
  21551. +
  21552. + return ret;
  21553. +}
  21554. +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
  21555. +
  21556. +int __sched
  21557. +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
  21558. +{
  21559. + int ret;
  21560. +
  21561. + might_sleep();
  21562. +
  21563. + mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
  21564. + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
  21565. + if (ret)
  21566. + mutex_release(&lock->base.dep_map, 1, _RET_IP_);
  21567. + else if (!ret && ww_ctx->acquired > 1)
  21568. + return ww_mutex_deadlock_injection(lock, ww_ctx);
  21569. +
  21570. + return ret;
  21571. +}
  21572. +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
  21573. +
  21574. +void __sched ww_mutex_unlock(struct ww_mutex *lock)
  21575. +{
  21576. + int nest = !!lock->ctx;
  21577. +
  21578. + /*
  21579. + * The unlocking fastpath is the 0->1 transition from 'locked'
  21580. + * into 'unlocked' state:
  21581. + */
  21582. + if (nest) {
  21583. +#ifdef CONFIG_DEBUG_MUTEXES
  21584. + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
  21585. +#endif
  21586. + if (lock->ctx->acquired > 0)
  21587. + lock->ctx->acquired--;
  21588. + lock->ctx = NULL;
  21589. + }
  21590. +
  21591. + mutex_release(&lock->base.dep_map, nest, _RET_IP_);
  21592. + rt_mutex_unlock(&lock->base.lock);
  21593. +}
  21594. +EXPORT_SYMBOL(ww_mutex_unlock);
  21595. +#endif
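
An illustrative aside, not part of the patch: the ww_ctx plumbing threaded through rt_mutex_slowlock() above exists so that the stock wound/wait usage pattern keeps working when ww_mutex is backed by rt_mutex on PREEMPT_RT_FULL. A minimal caller-side sketch of that pattern follows; ww_acquire_init()/ww_acquire_done()/ww_acquire_fini() and the final unlocks are assumed to happen in the caller, and the helper name lock_both() is made up.

#include <linux/ww_mutex.h>

/*
 * Sketch only: take two ww_mutexes of the same class, handling one
 * -EDEADLK backoff round. Real callers retry until no -EDEADLK is seen.
 */
static int lock_both(struct ww_mutex *a, struct ww_mutex *b,
		     struct ww_acquire_ctx *ctx)
{
	int ret;

	ret = ww_mutex_lock(a, ctx);
	if (ret)
		return ret;

	ret = ww_mutex_lock(b, ctx);
	if (ret == -EDEADLK) {
		/* Back off: drop everything held, sleep on the contended lock. */
		ww_mutex_unlock(a);
		ww_mutex_lock_slow(b, ctx);

		ret = ww_mutex_lock(a, ctx);
		if (ret) {
			ww_mutex_unlock(b);
			return ret;	/* caller backs off and retries */
		}
		return 0;
	}
	if (ret)
		ww_mutex_unlock(a);
	return ret;
}
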
  21596. diff -Nur linux-3.18.12.orig/kernel/locking/rtmutex_common.h linux-3.18.12/kernel/locking/rtmutex_common.h
  21597. --- linux-3.18.12.orig/kernel/locking/rtmutex_common.h 2015-04-20 14:48:02.000000000 -0500
  21598. +++ linux-3.18.12/kernel/locking/rtmutex_common.h 2015-04-26 13:32:22.439684003 -0500
  21599. @@ -49,6 +49,7 @@
  21600. struct rb_node pi_tree_entry;
  21601. struct task_struct *task;
  21602. struct rt_mutex *lock;
  21603. + bool savestate;
  21604. #ifdef CONFIG_DEBUG_RT_MUTEXES
  21605. unsigned long ip;
  21606. struct pid *deadlock_task_pid;
  21607. @@ -119,6 +120,9 @@
  21608. /*
  21609. * PI-futex support (proxy locking functions, etc.):
  21610. */
  21611. +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1)
  21612. +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2)
  21613. +
  21614. extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
  21615. extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
  21616. struct task_struct *proxy_owner);
  21617. @@ -138,4 +142,14 @@
  21618. # include "rtmutex.h"
  21619. #endif
  21620. +static inline void
  21621. +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
  21622. +{
  21623. + debug_rt_mutex_init_waiter(waiter);
  21624. + waiter->task = NULL;
  21625. + waiter->savestate = savestate;
  21626. + RB_CLEAR_NODE(&waiter->pi_tree_entry);
  21627. + RB_CLEAR_NODE(&waiter->tree_entry);
  21628. +}
  21629. +
  21630. #endif
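
A side note on the two sentinels defined above: they are fake pointer values parked in task->pi_blocked_on, so anything that walks pi_blocked_on must filter them out before dereferencing. A helper along these lines makes that explicit (the name rt_mutex_real_waiter is used here purely for illustration; later -rt series carry a similar helper):

static inline bool rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
{
	/* Filter out the PI_*_INPROGRESS markers, which are not waiters. */
	return waiter && waiter != PI_WAKEUP_INPROGRESS &&
	       waiter != PI_REQUEUE_INPROGRESS;
}

The futex requeue and PI chain-walk paths would then test rt_mutex_real_waiter(task->pi_blocked_on) instead of a bare NULL check.
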
  21631. diff -Nur linux-3.18.12.orig/kernel/locking/spinlock.c linux-3.18.12/kernel/locking/spinlock.c
  21632. --- linux-3.18.12.orig/kernel/locking/spinlock.c 2015-04-20 14:48:02.000000000 -0500
  21633. +++ linux-3.18.12/kernel/locking/spinlock.c 2015-04-26 13:32:22.439684003 -0500
  21634. @@ -124,8 +124,11 @@
  21635. * __[spin|read|write]_lock_bh()
  21636. */
  21637. BUILD_LOCK_OPS(spin, raw_spinlock);
  21638. +
  21639. +#ifndef CONFIG_PREEMPT_RT_FULL
  21640. BUILD_LOCK_OPS(read, rwlock);
  21641. BUILD_LOCK_OPS(write, rwlock);
  21642. +#endif
  21643. #endif
  21644. @@ -209,6 +212,8 @@
  21645. EXPORT_SYMBOL(_raw_spin_unlock_bh);
  21646. #endif
  21647. +#ifndef CONFIG_PREEMPT_RT_FULL
  21648. +
  21649. #ifndef CONFIG_INLINE_READ_TRYLOCK
  21650. int __lockfunc _raw_read_trylock(rwlock_t *lock)
  21651. {
  21652. @@ -353,6 +358,8 @@
  21653. EXPORT_SYMBOL(_raw_write_unlock_bh);
  21654. #endif
  21655. +#endif /* !PREEMPT_RT_FULL */
  21656. +
  21657. #ifdef CONFIG_DEBUG_LOCK_ALLOC
  21658. void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
  21659. diff -Nur linux-3.18.12.orig/kernel/locking/spinlock_debug.c linux-3.18.12/kernel/locking/spinlock_debug.c
  21660. --- linux-3.18.12.orig/kernel/locking/spinlock_debug.c 2015-04-20 14:48:02.000000000 -0500
  21661. +++ linux-3.18.12/kernel/locking/spinlock_debug.c 2015-04-26 13:32:22.439684003 -0500
  21662. @@ -31,6 +31,7 @@
  21663. EXPORT_SYMBOL(__raw_spin_lock_init);
  21664. +#ifndef CONFIG_PREEMPT_RT_FULL
  21665. void __rwlock_init(rwlock_t *lock, const char *name,
  21666. struct lock_class_key *key)
  21667. {
  21668. @@ -48,6 +49,7 @@
  21669. }
  21670. EXPORT_SYMBOL(__rwlock_init);
  21671. +#endif
  21672. static void spin_dump(raw_spinlock_t *lock, const char *msg)
  21673. {
  21674. @@ -159,6 +161,7 @@
  21675. arch_spin_unlock(&lock->raw_lock);
  21676. }
  21677. +#ifndef CONFIG_PREEMPT_RT_FULL
  21678. static void rwlock_bug(rwlock_t *lock, const char *msg)
  21679. {
  21680. if (!debug_locks_off())
  21681. @@ -300,3 +303,5 @@
  21682. debug_write_unlock(lock);
  21683. arch_write_unlock(&lock->raw_lock);
  21684. }
  21685. +
  21686. +#endif
  21687. diff -Nur linux-3.18.12.orig/kernel/panic.c linux-3.18.12/kernel/panic.c
  21688. --- linux-3.18.12.orig/kernel/panic.c 2015-04-20 14:48:02.000000000 -0500
  21689. +++ linux-3.18.12/kernel/panic.c 2015-04-26 13:32:22.439684003 -0500
  21690. @@ -384,9 +384,11 @@
  21691. static int init_oops_id(void)
  21692. {
  21693. +#ifndef CONFIG_PREEMPT_RT_FULL
  21694. if (!oops_id)
  21695. get_random_bytes(&oops_id, sizeof(oops_id));
  21696. else
  21697. +#endif
  21698. oops_id++;
  21699. return 0;
  21700. diff -Nur linux-3.18.12.orig/kernel/power/hibernate.c linux-3.18.12/kernel/power/hibernate.c
  21701. --- linux-3.18.12.orig/kernel/power/hibernate.c 2015-04-20 14:48:02.000000000 -0500
  21702. +++ linux-3.18.12/kernel/power/hibernate.c 2015-04-26 13:32:22.439684003 -0500
  21703. @@ -287,6 +287,8 @@
  21704. local_irq_disable();
  21705. + system_state = SYSTEM_SUSPEND;
  21706. +
  21707. error = syscore_suspend();
  21708. if (error) {
  21709. printk(KERN_ERR "PM: Some system devices failed to power down, "
  21710. @@ -316,6 +318,7 @@
  21711. syscore_resume();
  21712. Enable_irqs:
  21713. + system_state = SYSTEM_RUNNING;
  21714. local_irq_enable();
  21715. Enable_cpus:
  21716. @@ -439,6 +442,7 @@
  21717. goto Enable_cpus;
  21718. local_irq_disable();
  21719. + system_state = SYSTEM_SUSPEND;
  21720. error = syscore_suspend();
  21721. if (error)
  21722. @@ -472,6 +476,7 @@
  21723. syscore_resume();
  21724. Enable_irqs:
  21725. + system_state = SYSTEM_RUNNING;
  21726. local_irq_enable();
  21727. Enable_cpus:
  21728. @@ -557,6 +562,7 @@
  21729. goto Platform_finish;
  21730. local_irq_disable();
  21731. + system_state = SYSTEM_SUSPEND;
  21732. syscore_suspend();
  21733. if (pm_wakeup_pending()) {
  21734. error = -EAGAIN;
  21735. @@ -569,6 +575,7 @@
  21736. Power_up:
  21737. syscore_resume();
  21738. + system_state = SYSTEM_RUNNING;
  21739. local_irq_enable();
  21740. enable_nonboot_cpus();
  21741. diff -Nur linux-3.18.12.orig/kernel/power/suspend.c linux-3.18.12/kernel/power/suspend.c
  21742. --- linux-3.18.12.orig/kernel/power/suspend.c 2015-04-20 14:48:02.000000000 -0500
  21743. +++ linux-3.18.12/kernel/power/suspend.c 2015-04-26 13:32:22.439684003 -0500
  21744. @@ -318,6 +318,8 @@
  21745. arch_suspend_disable_irqs();
  21746. BUG_ON(!irqs_disabled());
  21747. + system_state = SYSTEM_SUSPEND;
  21748. +
  21749. error = syscore_suspend();
  21750. if (!error) {
  21751. *wakeup = pm_wakeup_pending();
  21752. @@ -332,6 +334,8 @@
  21753. syscore_resume();
  21754. }
  21755. + system_state = SYSTEM_RUNNING;
  21756. +
  21757. arch_suspend_enable_irqs();
  21758. BUG_ON(irqs_disabled());
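
Setting system_state to SYSTEM_SUSPEND around the syscore_suspend()/syscore_resume() windows (here and in hibernate.c above) gives irqs-off code a cheap way to detect the late suspend path, where taking an rt_mutex-backed spinlock would be illegal. A hedged sketch of a consumer-side check; the mydrv_ name is made up:

#include <linux/kernel.h>

/* Sketch only: skip sleeping-lock work while in the irqs-off suspend window. */
static bool mydrv_may_take_sleeping_locks(void)
{
	return system_state != SYSTEM_SUSPEND;
}
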
  21759. diff -Nur linux-3.18.12.orig/kernel/printk/printk.c linux-3.18.12/kernel/printk/printk.c
  21760. --- linux-3.18.12.orig/kernel/printk/printk.c 2015-04-20 14:48:02.000000000 -0500
  21761. +++ linux-3.18.12/kernel/printk/printk.c 2015-04-26 13:32:22.439684003 -0500
  21762. @@ -1165,6 +1165,7 @@
  21763. {
  21764. char *text;
  21765. int len = 0;
  21766. + int attempts = 0;
  21767. text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
  21768. if (!text)
  21769. @@ -1176,7 +1177,14 @@
  21770. u64 seq;
  21771. u32 idx;
  21772. enum log_flags prev;
  21773. -
  21774. + int num_msg;
  21775. +try_again:
  21776. + attempts++;
  21777. + if (attempts > 10) {
  21778. + len = -EBUSY;
  21779. + goto out;
  21780. + }
  21781. + num_msg = 0;
  21782. if (clear_seq < log_first_seq) {
  21783. /* messages are gone, move to first available one */
  21784. clear_seq = log_first_seq;
  21785. @@ -1197,6 +1205,14 @@
  21786. prev = msg->flags;
  21787. idx = log_next(idx);
  21788. seq++;
  21789. + num_msg++;
  21790. + if (num_msg > 5) {
  21791. + num_msg = 0;
  21792. + raw_spin_unlock_irq(&logbuf_lock);
  21793. + raw_spin_lock_irq(&logbuf_lock);
  21794. + if (clear_seq < log_first_seq)
  21795. + goto try_again;
  21796. + }
  21797. }
  21798. /* move first record forward until length fits into the buffer */
  21799. @@ -1210,6 +1226,14 @@
  21800. prev = msg->flags;
  21801. idx = log_next(idx);
  21802. seq++;
  21803. + num_msg++;
  21804. + if (num_msg > 5) {
  21805. + num_msg = 0;
  21806. + raw_spin_unlock_irq(&logbuf_lock);
  21807. + raw_spin_lock_irq(&logbuf_lock);
  21808. + if (clear_seq < log_first_seq)
  21809. + goto try_again;
  21810. + }
  21811. }
  21812. /* last message fitting into this dump */
  21813. @@ -1250,6 +1274,7 @@
  21814. clear_seq = log_next_seq;
  21815. clear_idx = log_next_idx;
  21816. }
  21817. +out:
  21818. raw_spin_unlock_irq(&logbuf_lock);
  21819. kfree(text);
  21820. @@ -1407,6 +1432,7 @@
  21821. if (!console_drivers)
  21822. return;
  21823. + migrate_disable();
  21824. for_each_console(con) {
  21825. if (exclusive_console && con != exclusive_console)
  21826. continue;
  21827. @@ -1419,6 +1445,7 @@
  21828. continue;
  21829. con->write(con, text, len);
  21830. }
  21831. + migrate_enable();
  21832. }
  21833. /*
  21834. @@ -1479,6 +1506,15 @@
  21835. static int console_trylock_for_printk(void)
  21836. {
  21837. unsigned int cpu = smp_processor_id();
  21838. +#ifdef CONFIG_PREEMPT_RT_FULL
  21839. + int lock = !early_boot_irqs_disabled && (preempt_count() == 0) &&
  21840. + !irqs_disabled();
  21841. +#else
  21842. + int lock = 1;
  21843. +#endif
  21844. +
  21845. + if (!lock)
  21846. + return 0;
  21847. if (!console_trylock())
  21848. return 0;
  21849. @@ -1613,6 +1649,62 @@
  21850. return textlen;
  21851. }
  21852. +#ifdef CONFIG_EARLY_PRINTK
  21853. +struct console *early_console;
  21854. +
  21855. +void early_vprintk(const char *fmt, va_list ap)
  21856. +{
  21857. + if (early_console) {
  21858. + char buf[512];
  21859. + int n = vscnprintf(buf, sizeof(buf), fmt, ap);
  21860. +
  21861. + early_console->write(early_console, buf, n);
  21862. + }
  21863. +}
  21864. +
  21865. +asmlinkage void early_printk(const char *fmt, ...)
  21866. +{
  21867. + va_list ap;
  21868. +
  21869. + va_start(ap, fmt);
  21870. + early_vprintk(fmt, ap);
  21871. + va_end(ap);
  21872. +}
  21873. +
  21874. +/*
  21875. + * This is independent of any log levels - a global
  21876. + * kill switch that turns off all of printk.
  21877. + *
  21878. + * Used by the NMI watchdog if early-printk is enabled.
  21879. + */
  21880. +static bool __read_mostly printk_killswitch;
  21881. +
  21882. +static int __init force_early_printk_setup(char *str)
  21883. +{
  21884. + printk_killswitch = true;
  21885. + return 0;
  21886. +}
  21887. +early_param("force_early_printk", force_early_printk_setup);
  21888. +
  21889. +void printk_kill(void)
  21890. +{
  21891. + printk_killswitch = true;
  21892. +}
  21893. +
  21894. +static int forced_early_printk(const char *fmt, va_list ap)
  21895. +{
  21896. + if (!printk_killswitch)
  21897. + return 0;
  21898. + early_vprintk(fmt, ap);
  21899. + return 1;
  21900. +}
  21901. +#else
  21902. +static inline int forced_early_printk(const char *fmt, va_list ap)
  21903. +{
  21904. + return 0;
  21905. +}
  21906. +#endif
  21907. +
  21908. asmlinkage int vprintk_emit(int facility, int level,
  21909. const char *dict, size_t dictlen,
  21910. const char *fmt, va_list args)
  21911. @@ -1629,6 +1721,13 @@
  21912. /* cpu currently holding logbuf_lock in this function */
  21913. static volatile unsigned int logbuf_cpu = UINT_MAX;
  21914. + /*
  21915. + * Fall back to early_printk if a debugging subsystem has
  21916. + * killed printk output
  21917. + */
  21918. + if (unlikely(forced_early_printk(fmt, args)))
  21919. + return 1;
  21920. +
  21921. if (level == SCHED_MESSAGE_LOGLEVEL) {
  21922. level = -1;
  21923. in_sched = true;
  21924. @@ -1769,8 +1868,7 @@
  21925. * console_sem which would prevent anyone from printing to
  21926. * console
  21927. */
  21928. - preempt_disable();
  21929. -
  21930. + migrate_disable();
  21931. /*
  21932. * Try to acquire and then immediately release the console
  21933. * semaphore. The release will print out buffers and wake up
  21934. @@ -1778,7 +1876,7 @@
  21935. */
  21936. if (console_trylock_for_printk())
  21937. console_unlock();
  21938. - preempt_enable();
  21939. + migrate_enable();
  21940. lockdep_on();
  21941. }
  21942. @@ -1878,29 +1976,6 @@
  21943. #endif /* CONFIG_PRINTK */
  21944. -#ifdef CONFIG_EARLY_PRINTK
  21945. -struct console *early_console;
  21946. -
  21947. -void early_vprintk(const char *fmt, va_list ap)
  21948. -{
  21949. - if (early_console) {
  21950. - char buf[512];
  21951. - int n = vscnprintf(buf, sizeof(buf), fmt, ap);
  21952. -
  21953. - early_console->write(early_console, buf, n);
  21954. - }
  21955. -}
  21956. -
  21957. -asmlinkage __visible void early_printk(const char *fmt, ...)
  21958. -{
  21959. - va_list ap;
  21960. -
  21961. - va_start(ap, fmt);
  21962. - early_vprintk(fmt, ap);
  21963. - va_end(ap);
  21964. -}
  21965. -#endif
  21966. -
  21967. static int __add_preferred_console(char *name, int idx, char *options,
  21968. char *brl_options)
  21969. {
  21970. @@ -2140,11 +2215,16 @@
  21971. goto out;
  21972. len = cont_print_text(text, size);
  21973. +#ifndef CONFIG_PREEMPT_RT_FULL
  21974. raw_spin_unlock(&logbuf_lock);
  21975. stop_critical_timings();
  21976. call_console_drivers(cont.level, text, len);
  21977. start_critical_timings();
  21978. local_irq_restore(flags);
  21979. +#else
  21980. + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  21981. + call_console_drivers(cont.level, text, len);
  21982. +#endif
  21983. return;
  21984. out:
  21985. raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  21986. @@ -2232,12 +2312,17 @@
  21987. console_idx = log_next(console_idx);
  21988. console_seq++;
  21989. console_prev = msg->flags;
  21990. +#ifdef CONFIG_PREEMPT_RT_FULL
  21991. + raw_spin_unlock_irqrestore(&logbuf_lock, flags);
  21992. + call_console_drivers(level, text, len);
  21993. +#else
  21994. raw_spin_unlock(&logbuf_lock);
  21995. stop_critical_timings(); /* don't trace print latency */
  21996. call_console_drivers(level, text, len);
  21997. start_critical_timings();
  21998. local_irq_restore(flags);
  21999. +#endif
  22000. }
  22001. console_locked = 0;
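
Because the early_printk block now sits outside the CONFIG_PRINTK-only region and gained printk_kill(), a debug facility can abandon the regular printk path (which takes logbuf_lock and console_sem) and report through the raw early console instead. A sketch of such a caller, assuming an early console is still registered, e.g. via the force_early_printk parameter added above; the function name is made up:

#include <linux/printk.h>

/* Sketch only: report a stuck CPU without touching the printk locks. */
static void report_hard_lockup(int cpu)
{
	printk_kill();	/* from now on vprintk_emit() falls back to early_vprintk() */
	early_printk("NMI watchdog: CPU %d stuck\n", cpu);
}
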
  22002. diff -Nur linux-3.18.12.orig/kernel/ptrace.c linux-3.18.12/kernel/ptrace.c
  22003. --- linux-3.18.12.orig/kernel/ptrace.c 2015-04-20 14:48:02.000000000 -0500
  22004. +++ linux-3.18.12/kernel/ptrace.c 2015-04-26 13:32:22.439684003 -0500
  22005. @@ -129,7 +129,12 @@
  22006. spin_lock_irq(&task->sighand->siglock);
  22007. if (task_is_traced(task) && !__fatal_signal_pending(task)) {
  22008. - task->state = __TASK_TRACED;
  22009. + raw_spin_lock_irq(&task->pi_lock);
  22010. + if (task->state & __TASK_TRACED)
  22011. + task->state = __TASK_TRACED;
  22012. + else
  22013. + task->saved_state = __TASK_TRACED;
  22014. + raw_spin_unlock_irq(&task->pi_lock);
  22015. ret = true;
  22016. }
  22017. spin_unlock_irq(&task->sighand->siglock);
  22018. diff -Nur linux-3.18.12.orig/kernel/rcu/tiny.c linux-3.18.12/kernel/rcu/tiny.c
  22019. --- linux-3.18.12.orig/kernel/rcu/tiny.c 2015-04-20 14:48:02.000000000 -0500
  22020. +++ linux-3.18.12/kernel/rcu/tiny.c 2015-04-26 13:32:22.439684003 -0500
  22021. @@ -370,6 +370,7 @@
  22022. }
  22023. EXPORT_SYMBOL_GPL(call_rcu_sched);
  22024. +#ifndef CONFIG_PREEMPT_RT_FULL
  22025. /*
  22026. * Post an RCU bottom-half callback to be invoked after any subsequent
  22027. * quiescent state.
  22028. @@ -379,6 +380,7 @@
  22029. __call_rcu(head, func, &rcu_bh_ctrlblk);
  22030. }
  22031. EXPORT_SYMBOL_GPL(call_rcu_bh);
  22032. +#endif
  22033. void rcu_init(void)
  22034. {
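
call_rcu_bh() is compiled out here because, elsewhere in this series, the bh flavour is folded into plain RCU on PREEMPT_RT_FULL (softirqs run in preemptible thread context, so a separate bh grace period gains nothing). The header side looks roughly like the sketch below; the real defines live in include/linux/rcupdate.h in this series and may differ in detail:

/* Approximate shape of the RT mapping that makes this #ifndef safe. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define call_rcu_bh		call_rcu
# define rcu_barrier_bh		rcu_barrier
# define synchronize_rcu_bh	synchronize_rcu
#endif
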
  22035. diff -Nur linux-3.18.12.orig/kernel/rcu/tree.c linux-3.18.12/kernel/rcu/tree.c
  22036. --- linux-3.18.12.orig/kernel/rcu/tree.c 2015-04-20 14:48:02.000000000 -0500
  22037. +++ linux-3.18.12/kernel/rcu/tree.c 2015-04-26 13:32:22.439684003 -0500
  22038. @@ -56,6 +56,11 @@
  22039. #include <linux/random.h>
  22040. #include <linux/ftrace_event.h>
  22041. #include <linux/suspend.h>
  22042. +#include <linux/delay.h>
  22043. +#include <linux/gfp.h>
  22044. +#include <linux/oom.h>
  22045. +#include <linux/smpboot.h>
  22046. +#include "../time/tick-internal.h"
  22047. #include "tree.h"
  22048. #include "rcu.h"
  22049. @@ -152,8 +157,6 @@
  22050. */
  22051. static int rcu_scheduler_fully_active __read_mostly;
  22052. -#ifdef CONFIG_RCU_BOOST
  22053. -
  22054. /*
  22055. * Control variables for per-CPU and per-rcu_node kthreads. These
  22056. * handle all flavors of RCU.
  22057. @@ -163,8 +166,6 @@
  22058. DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
  22059. DEFINE_PER_CPU(char, rcu_cpu_has_work);
  22060. -#endif /* #ifdef CONFIG_RCU_BOOST */
  22061. -
  22062. static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
  22063. static void invoke_rcu_core(void);
  22064. static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
  22065. @@ -207,6 +208,19 @@
  22066. }
  22067. }
  22068. +#ifdef CONFIG_PREEMPT_RT_FULL
  22069. +static void rcu_preempt_qs(void);
  22070. +
  22071. +void rcu_bh_qs(void)
  22072. +{
  22073. + unsigned long flags;
  22074. +
  22075. + /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
  22076. + local_irq_save(flags);
  22077. + rcu_preempt_qs();
  22078. + local_irq_restore(flags);
  22079. +}
  22080. +#else
  22081. void rcu_bh_qs(void)
  22082. {
  22083. if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
  22084. @@ -216,6 +230,7 @@
  22085. __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
  22086. }
  22087. }
  22088. +#endif
  22089. static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
  22090. @@ -336,6 +351,7 @@
  22091. }
  22092. EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
  22093. +#ifndef CONFIG_PREEMPT_RT_FULL
  22094. /*
  22095. * Return the number of RCU BH batches processed thus far for debug & stats.
  22096. */
  22097. @@ -363,6 +379,13 @@
  22098. }
  22099. EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
  22100. +#else
  22101. +void rcu_force_quiescent_state(void)
  22102. +{
  22103. +}
  22104. +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
  22105. +#endif
  22106. +
  22107. /*
  22108. * Show the state of the grace-period kthreads.
  22109. */
  22110. @@ -1411,7 +1434,7 @@
  22111. !ACCESS_ONCE(rsp->gp_flags) ||
  22112. !rsp->gp_kthread)
  22113. return;
  22114. - wake_up(&rsp->gp_wq);
  22115. + swait_wake(&rsp->gp_wq);
  22116. }
  22117. /*
  22118. @@ -1793,7 +1816,7 @@
  22119. ACCESS_ONCE(rsp->gpnum),
  22120. TPS("reqwait"));
  22121. rsp->gp_state = RCU_GP_WAIT_GPS;
  22122. - wait_event_interruptible(rsp->gp_wq,
  22123. + swait_event_interruptible(rsp->gp_wq,
  22124. ACCESS_ONCE(rsp->gp_flags) &
  22125. RCU_GP_FLAG_INIT);
  22126. /* Locking provides needed memory barrier. */
  22127. @@ -1821,7 +1844,7 @@
  22128. ACCESS_ONCE(rsp->gpnum),
  22129. TPS("fqswait"));
  22130. rsp->gp_state = RCU_GP_WAIT_FQS;
  22131. - ret = wait_event_interruptible_timeout(rsp->gp_wq,
  22132. + ret = swait_event_interruptible_timeout(rsp->gp_wq,
  22133. ((gf = ACCESS_ONCE(rsp->gp_flags)) &
  22134. RCU_GP_FLAG_FQS) ||
  22135. (!ACCESS_ONCE(rnp->qsmask) &&
  22136. @@ -2565,16 +2588,14 @@
  22137. /*
  22138. * Do RCU core processing for the current CPU.
  22139. */
  22140. -static void rcu_process_callbacks(struct softirq_action *unused)
  22141. +static void rcu_process_callbacks(void)
  22142. {
  22143. struct rcu_state *rsp;
  22144. if (cpu_is_offline(smp_processor_id()))
  22145. return;
  22146. - trace_rcu_utilization(TPS("Start RCU core"));
  22147. for_each_rcu_flavor(rsp)
  22148. __rcu_process_callbacks(rsp);
  22149. - trace_rcu_utilization(TPS("End RCU core"));
  22150. }
  22151. /*
  22152. @@ -2588,18 +2609,105 @@
  22153. {
  22154. if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
  22155. return;
  22156. - if (likely(!rsp->boost)) {
  22157. - rcu_do_batch(rsp, rdp);
  22158. + rcu_do_batch(rsp, rdp);
  22159. +}
  22160. +
  22161. +static void rcu_wake_cond(struct task_struct *t, int status)
  22162. +{
  22163. + /*
  22164. + * If the thread is yielding, only wake it when this
  22165. + * is invoked from idle
  22166. + */
  22167. + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
  22168. + wake_up_process(t);
  22169. +}
  22170. +
  22171. +/*
  22172. + * Wake up this CPU's rcuc kthread to do RCU core processing.
  22173. + */
  22174. +static void invoke_rcu_core(void)
  22175. +{
  22176. + unsigned long flags;
  22177. + struct task_struct *t;
  22178. +
  22179. + if (!cpu_online(smp_processor_id()))
  22180. return;
  22181. + local_irq_save(flags);
  22182. + __this_cpu_write(rcu_cpu_has_work, 1);
  22183. + t = __this_cpu_read(rcu_cpu_kthread_task);
  22184. + if (t != NULL && current != t)
  22185. + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
  22186. + local_irq_restore(flags);
  22187. +}
  22188. +
  22189. +static void rcu_cpu_kthread_park(unsigned int cpu)
  22190. +{
  22191. + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
  22192. +}
  22193. +
  22194. +static int rcu_cpu_kthread_should_run(unsigned int cpu)
  22195. +{
  22196. + return __this_cpu_read(rcu_cpu_has_work);
  22197. +}
  22198. +
  22199. +/*
  22200. + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
  22201. + * RCU softirq used in flavors and configurations of RCU that do not
  22202. + * support RCU priority boosting.
  22203. + */
  22204. +static void rcu_cpu_kthread(unsigned int cpu)
  22205. +{
  22206. + unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
  22207. + char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
  22208. + int spincnt;
  22209. +
  22210. + for (spincnt = 0; spincnt < 10; spincnt++) {
  22211. + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
  22212. + local_bh_disable();
  22213. + *statusp = RCU_KTHREAD_RUNNING;
  22214. + this_cpu_inc(rcu_cpu_kthread_loops);
  22215. + local_irq_disable();
  22216. + work = *workp;
  22217. + *workp = 0;
  22218. + local_irq_enable();
  22219. + if (work)
  22220. + rcu_process_callbacks();
  22221. + local_bh_enable();
  22222. + if (*workp == 0) {
  22223. + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
  22224. + *statusp = RCU_KTHREAD_WAITING;
  22225. + return;
  22226. + }
  22227. }
  22228. - invoke_rcu_callbacks_kthread();
  22229. + *statusp = RCU_KTHREAD_YIELDING;
  22230. + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
  22231. + schedule_timeout_interruptible(2);
  22232. + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
  22233. + *statusp = RCU_KTHREAD_WAITING;
  22234. }
  22235. -static void invoke_rcu_core(void)
  22236. +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
  22237. + .store = &rcu_cpu_kthread_task,
  22238. + .thread_should_run = rcu_cpu_kthread_should_run,
  22239. + .thread_fn = rcu_cpu_kthread,
  22240. + .thread_comm = "rcuc/%u",
  22241. + .setup = rcu_cpu_kthread_setup,
  22242. + .park = rcu_cpu_kthread_park,
  22243. +};
  22244. +
  22245. +/*
  22246. + * Spawn per-CPU RCU core processing kthreads.
  22247. + */
  22248. +static int __init rcu_spawn_core_kthreads(void)
  22249. {
  22250. - if (cpu_online(smp_processor_id()))
  22251. - raise_softirq(RCU_SOFTIRQ);
  22252. + int cpu;
  22253. +
  22254. + for_each_possible_cpu(cpu)
  22255. + per_cpu(rcu_cpu_has_work, cpu) = 0;
  22256. + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
  22257. + return 0;
  22258. }
  22259. +early_initcall(rcu_spawn_core_kthreads);
  22260. /*
  22261. * Handle any core-RCU processing required by a call_rcu() invocation.
  22262. @@ -2734,6 +2842,7 @@
  22263. }
  22264. EXPORT_SYMBOL_GPL(call_rcu_sched);
  22265. +#ifndef CONFIG_PREEMPT_RT_FULL
  22266. /*
  22267. * Queue an RCU callback for invocation after a quicker grace period.
  22268. */
  22269. @@ -2742,6 +2851,7 @@
  22270. __call_rcu(head, func, &rcu_bh_state, -1, 0);
  22271. }
  22272. EXPORT_SYMBOL_GPL(call_rcu_bh);
  22273. +#endif
  22274. /*
  22275. * Queue an RCU callback for lazy invocation after a grace period.
  22276. @@ -2833,6 +2943,7 @@
  22277. }
  22278. EXPORT_SYMBOL_GPL(synchronize_sched);
  22279. +#ifndef CONFIG_PREEMPT_RT_FULL
  22280. /**
  22281. * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
  22282. *
  22283. @@ -2859,6 +2970,7 @@
  22284. wait_rcu_gp(call_rcu_bh);
  22285. }
  22286. EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
  22287. +#endif
  22288. /**
  22289. * get_state_synchronize_rcu - Snapshot current RCU state
  22290. @@ -3341,6 +3453,7 @@
  22291. mutex_unlock(&rsp->barrier_mutex);
  22292. }
  22293. +#ifndef CONFIG_PREEMPT_RT_FULL
  22294. /**
  22295. * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
  22296. */
  22297. @@ -3349,6 +3462,7 @@
  22298. _rcu_barrier(&rcu_bh_state);
  22299. }
  22300. EXPORT_SYMBOL_GPL(rcu_barrier_bh);
  22301. +#endif
  22302. /**
  22303. * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
  22304. @@ -3658,7 +3772,7 @@
  22305. }
  22306. rsp->rda = rda;
  22307. - init_waitqueue_head(&rsp->gp_wq);
  22308. + init_swait_head(&rsp->gp_wq);
  22309. rnp = rsp->level[rcu_num_lvls - 1];
  22310. for_each_possible_cpu(i) {
  22311. while (i > rnp->grphi)
  22312. @@ -3755,7 +3869,6 @@
  22313. rcu_init_one(&rcu_bh_state, &rcu_bh_data);
  22314. rcu_init_one(&rcu_sched_state, &rcu_sched_data);
  22315. __rcu_init_preempt();
  22316. - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  22317. /*
  22318. * We don't need protection against CPU-hotplug here because
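
The waitqueue conversions in this file use the simple-wait API from include/linux/wait-simple.h, added elsewhere in this series: the head is protected by a raw lock and wakeups never run custom callbacks, so swait_wake() is safe from the RT-atomic contexts the grace-period machinery runs in. A minimal usage sketch built from the calls seen above; the my_ names are made up:

#include <linux/wait-simple.h>

static struct swait_head my_wq;
static bool my_cond;

static void my_init(void)
{
	init_swait_head(&my_wq);
}

static int my_waiter(void *unused)
{
	/* Sleeps until my_cond is true; -ERESTARTSYS if interrupted. */
	return swait_event_interruptible(my_wq, my_cond);
}

static void my_wake(void)
{
	my_cond = true;
	swait_wake(&my_wq);	/* swait_wake_all() to wake every waiter */
}
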
  22319. diff -Nur linux-3.18.12.orig/kernel/rcu/tree.h linux-3.18.12/kernel/rcu/tree.h
  22320. --- linux-3.18.12.orig/kernel/rcu/tree.h 2015-04-20 14:48:02.000000000 -0500
  22321. +++ linux-3.18.12/kernel/rcu/tree.h 2015-04-26 13:32:22.443684003 -0500
  22322. @@ -28,6 +28,7 @@
  22323. #include <linux/cpumask.h>
  22324. #include <linux/seqlock.h>
  22325. #include <linux/irq_work.h>
  22326. +#include <linux/wait-simple.h>
  22327. /*
  22328. * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
  22329. @@ -172,11 +173,6 @@
  22330. /* queued on this rcu_node structure that */
  22331. /* are blocking the current grace period, */
  22332. /* there can be no such task. */
  22333. - struct completion boost_completion;
  22334. - /* Used to ensure that the rt_mutex used */
  22335. - /* to carry out the boosting is fully */
  22336. - /* released with no future boostee accesses */
  22337. - /* before that rt_mutex is re-initialized. */
  22338. struct rt_mutex boost_mtx;
  22339. /* Used only for the priority-boosting */
  22340. /* side effect, not as a lock. */
  22341. @@ -208,7 +204,7 @@
  22342. /* This can happen due to race conditions. */
  22343. #endif /* #ifdef CONFIG_RCU_BOOST */
  22344. #ifdef CONFIG_RCU_NOCB_CPU
  22345. - wait_queue_head_t nocb_gp_wq[2];
  22346. + struct swait_head nocb_gp_wq[2];
  22347. /* Place for rcu_nocb_kthread() to wait GP. */
  22348. #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
  22349. int need_future_gp[2];
  22350. @@ -348,7 +344,7 @@
  22351. atomic_long_t nocb_follower_count_lazy; /* (approximate). */
  22352. int nocb_p_count; /* # CBs being invoked by kthread */
  22353. int nocb_p_count_lazy; /* (approximate). */
  22354. - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
  22355. + struct swait_head nocb_wq; /* For nocb kthreads to sleep on. */
  22356. struct task_struct *nocb_kthread;
  22357. int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
  22358. @@ -439,7 +435,7 @@
  22359. unsigned long gpnum; /* Current gp number. */
  22360. unsigned long completed; /* # of last completed gp. */
  22361. struct task_struct *gp_kthread; /* Task for grace periods. */
  22362. - wait_queue_head_t gp_wq; /* Where GP task waits. */
  22363. + struct swait_head gp_wq; /* Where GP task waits. */
  22364. short gp_flags; /* Commands for GP task. */
  22365. short gp_state; /* GP kthread sleep state. */
  22366. @@ -570,10 +566,9 @@
  22367. static void __init __rcu_init_preempt(void);
  22368. static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
  22369. static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
  22370. -static void invoke_rcu_callbacks_kthread(void);
  22371. static bool rcu_is_callbacks_kthread(void);
  22372. +static void rcu_cpu_kthread_setup(unsigned int cpu);
  22373. #ifdef CONFIG_RCU_BOOST
  22374. -static void rcu_preempt_do_callbacks(void);
  22375. static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
  22376. struct rcu_node *rnp);
  22377. #endif /* #ifdef CONFIG_RCU_BOOST */
  22378. diff -Nur linux-3.18.12.orig/kernel/rcu/tree_plugin.h linux-3.18.12/kernel/rcu/tree_plugin.h
  22379. --- linux-3.18.12.orig/kernel/rcu/tree_plugin.h 2015-04-20 14:48:02.000000000 -0500
  22380. +++ linux-3.18.12/kernel/rcu/tree_plugin.h 2015-04-26 13:32:22.443684003 -0500
  22381. @@ -24,12 +24,6 @@
  22382. * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  22383. */
  22384. -#include <linux/delay.h>
  22385. -#include <linux/gfp.h>
  22386. -#include <linux/oom.h>
  22387. -#include <linux/smpboot.h>
  22388. -#include "../time/tick-internal.h"
  22389. -
  22390. #define RCU_KTHREAD_PRIO 1
  22391. #ifdef CONFIG_RCU_BOOST
  22392. @@ -335,7 +329,7 @@
  22393. }
  22394. /* Hardware IRQ handlers cannot block, complain if they get here. */
  22395. - if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
  22396. + if (WARN_ON_ONCE(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET))) {
  22397. local_irq_restore(flags);
  22398. return;
  22399. }
  22400. @@ -398,10 +392,8 @@
  22401. #ifdef CONFIG_RCU_BOOST
  22402. /* Unboost if we were boosted. */
  22403. - if (drop_boost_mutex) {
  22404. + if (drop_boost_mutex)
  22405. rt_mutex_unlock(&rnp->boost_mtx);
  22406. - complete(&rnp->boost_completion);
  22407. - }
  22408. #endif /* #ifdef CONFIG_RCU_BOOST */
  22409. /*
  22410. @@ -635,15 +627,6 @@
  22411. t->rcu_read_unlock_special.b.need_qs = true;
  22412. }
  22413. -#ifdef CONFIG_RCU_BOOST
  22414. -
  22415. -static void rcu_preempt_do_callbacks(void)
  22416. -{
  22417. - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
  22418. -}
  22419. -
  22420. -#endif /* #ifdef CONFIG_RCU_BOOST */
  22421. -
  22422. /*
  22423. * Queue a preemptible-RCU callback for invocation after a grace period.
  22424. */
  22425. @@ -1072,6 +1055,19 @@
  22426. #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  22427. +/*
  22428. + * If boosting, set rcuc kthreads to realtime priority.
  22429. + */
  22430. +static void rcu_cpu_kthread_setup(unsigned int cpu)
  22431. +{
  22432. +#ifdef CONFIG_RCU_BOOST
  22433. + struct sched_param sp;
  22434. +
  22435. + sp.sched_priority = RCU_KTHREAD_PRIO;
  22436. + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
  22437. +#endif /* #ifdef CONFIG_RCU_BOOST */
  22438. +}
  22439. +
  22440. #ifdef CONFIG_RCU_BOOST
  22441. #include "../locking/rtmutex_common.h"
  22442. @@ -1103,16 +1099,6 @@
  22443. #endif /* #else #ifdef CONFIG_RCU_TRACE */
  22444. -static void rcu_wake_cond(struct task_struct *t, int status)
  22445. -{
  22446. - /*
  22447. - * If the thread is yielding, only wake it when this
  22448. - * is invoked from idle
  22449. - */
  22450. - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
  22451. - wake_up_process(t);
  22452. -}
  22453. -
  22454. /*
  22455. * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  22456. * or ->boost_tasks, advancing the pointer to the next task in the
  22457. @@ -1175,15 +1161,11 @@
  22458. */
  22459. t = container_of(tb, struct task_struct, rcu_node_entry);
  22460. rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
  22461. - init_completion(&rnp->boost_completion);
  22462. raw_spin_unlock_irqrestore(&rnp->lock, flags);
  22463. /* Lock only for side effect: boosts task t's priority. */
  22464. rt_mutex_lock(&rnp->boost_mtx);
  22465. rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
  22466. - /* Wait for boostee to be done w/boost_mtx before reinitializing. */
  22467. - wait_for_completion(&rnp->boost_completion);
  22468. -
  22469. return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
  22470. ACCESS_ONCE(rnp->boost_tasks) != NULL;
  22471. }
  22472. @@ -1261,23 +1243,6 @@
  22473. }
  22474. /*
  22475. - * Wake up the per-CPU kthread to invoke RCU callbacks.
  22476. - */
  22477. -static void invoke_rcu_callbacks_kthread(void)
  22478. -{
  22479. - unsigned long flags;
  22480. -
  22481. - local_irq_save(flags);
  22482. - __this_cpu_write(rcu_cpu_has_work, 1);
  22483. - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
  22484. - current != __this_cpu_read(rcu_cpu_kthread_task)) {
  22485. - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
  22486. - __this_cpu_read(rcu_cpu_kthread_status));
  22487. - }
  22488. - local_irq_restore(flags);
  22489. -}
  22490. -
  22491. -/*
  22492. * Is the current CPU running the RCU-callbacks kthread?
  22493. * Caller must have preemption disabled.
  22494. */
  22495. @@ -1332,67 +1297,6 @@
  22496. return 0;
  22497. }
  22498. -static void rcu_kthread_do_work(void)
  22499. -{
  22500. - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
  22501. - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
  22502. - rcu_preempt_do_callbacks();
  22503. -}
  22504. -
  22505. -static void rcu_cpu_kthread_setup(unsigned int cpu)
  22506. -{
  22507. - struct sched_param sp;
  22508. -
  22509. - sp.sched_priority = RCU_KTHREAD_PRIO;
  22510. - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
  22511. -}
  22512. -
  22513. -static void rcu_cpu_kthread_park(unsigned int cpu)
  22514. -{
  22515. - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
  22516. -}
  22517. -
  22518. -static int rcu_cpu_kthread_should_run(unsigned int cpu)
  22519. -{
  22520. - return __this_cpu_read(rcu_cpu_has_work);
  22521. -}
  22522. -
  22523. -/*
  22524. - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
  22525. - * RCU softirq used in flavors and configurations of RCU that do not
  22526. - * support RCU priority boosting.
  22527. - */
  22528. -static void rcu_cpu_kthread(unsigned int cpu)
  22529. -{
  22530. - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
  22531. - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
  22532. - int spincnt;
  22533. -
  22534. - for (spincnt = 0; spincnt < 10; spincnt++) {
  22535. - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
  22536. - local_bh_disable();
  22537. - *statusp = RCU_KTHREAD_RUNNING;
  22538. - this_cpu_inc(rcu_cpu_kthread_loops);
  22539. - local_irq_disable();
  22540. - work = *workp;
  22541. - *workp = 0;
  22542. - local_irq_enable();
  22543. - if (work)
  22544. - rcu_kthread_do_work();
  22545. - local_bh_enable();
  22546. - if (*workp == 0) {
  22547. - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
  22548. - *statusp = RCU_KTHREAD_WAITING;
  22549. - return;
  22550. - }
  22551. - }
  22552. - *statusp = RCU_KTHREAD_YIELDING;
  22553. - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
  22554. - schedule_timeout_interruptible(2);
  22555. - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
  22556. - *statusp = RCU_KTHREAD_WAITING;
  22557. -}
  22558. -
  22559. /*
  22560. * Set the per-rcu_node kthread's affinity to cover all CPUs that are
  22561. * served by the rcu_node in question. The CPU hotplug lock is still
  22562. @@ -1426,26 +1330,13 @@
  22563. free_cpumask_var(cm);
  22564. }
  22565. -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
  22566. - .store = &rcu_cpu_kthread_task,
  22567. - .thread_should_run = rcu_cpu_kthread_should_run,
  22568. - .thread_fn = rcu_cpu_kthread,
  22569. - .thread_comm = "rcuc/%u",
  22570. - .setup = rcu_cpu_kthread_setup,
  22571. - .park = rcu_cpu_kthread_park,
  22572. -};
  22573. -
  22574. /*
  22575. * Spawn boost kthreads -- called as soon as the scheduler is running.
  22576. */
  22577. static void __init rcu_spawn_boost_kthreads(void)
  22578. {
  22579. struct rcu_node *rnp;
  22580. - int cpu;
  22581. - for_each_possible_cpu(cpu)
  22582. - per_cpu(rcu_cpu_has_work, cpu) = 0;
  22583. - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
  22584. rnp = rcu_get_root(rcu_state_p);
  22585. (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
  22586. if (NUM_RCU_NODES > 1) {
  22587. @@ -1472,11 +1363,6 @@
  22588. raw_spin_unlock_irqrestore(&rnp->lock, flags);
  22589. }
  22590. -static void invoke_rcu_callbacks_kthread(void)
  22591. -{
  22592. - WARN_ON_ONCE(1);
  22593. -}
  22594. -
  22595. static bool rcu_is_callbacks_kthread(void)
  22596. {
  22597. return false;
  22598. @@ -1500,7 +1386,7 @@
  22599. #endif /* #else #ifdef CONFIG_RCU_BOOST */
  22600. -#if !defined(CONFIG_RCU_FAST_NO_HZ)
  22601. +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
  22602. /*
  22603. * Check to see if any future RCU-related work will need to be done
  22604. @@ -1518,7 +1404,9 @@
  22605. return rcu_cpu_has_callbacks(cpu, NULL);
  22606. }
  22607. #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  22608. +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
  22609. +#if !defined(CONFIG_RCU_FAST_NO_HZ)
  22610. /*
  22611. * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
  22612. * after it.
  22613. @@ -1615,6 +1503,8 @@
  22614. return cbs_ready;
  22615. }
  22616. +#ifndef CONFIG_PREEMPT_RT_FULL
  22617. +
  22618. /*
  22619. * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
  22620. * to invoke. If the CPU has callbacks, try to advance them. Tell the
  22621. @@ -1655,7 +1545,7 @@
  22622. return 0;
  22623. }
  22624. #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
  22625. -
  22626. +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
  22627. /*
  22628. * Prepare a CPU for idle from an RCU perspective. The first major task
  22629. * is to sense whether nohz mode has been enabled or disabled via sysfs.
  22630. @@ -2001,7 +1891,7 @@
  22631. */
  22632. static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
  22633. {
  22634. - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
  22635. + swait_wake_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
  22636. }
  22637. /*
  22638. @@ -2019,8 +1909,8 @@
  22639. static void rcu_init_one_nocb(struct rcu_node *rnp)
  22640. {
  22641. - init_waitqueue_head(&rnp->nocb_gp_wq[0]);
  22642. - init_waitqueue_head(&rnp->nocb_gp_wq[1]);
  22643. + init_swait_head(&rnp->nocb_gp_wq[0]);
  22644. + init_swait_head(&rnp->nocb_gp_wq[1]);
  22645. }
  22646. #ifndef CONFIG_RCU_NOCB_CPU_ALL
  22647. @@ -2045,7 +1935,7 @@
  22648. if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
  22649. /* Prior smp_mb__after_atomic() orders against prior enqueue. */
  22650. ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
  22651. - wake_up(&rdp_leader->nocb_wq);
  22652. + swait_wake(&rdp_leader->nocb_wq);
  22653. }
  22654. }
  22655. @@ -2238,7 +2128,7 @@
  22656. */
  22657. trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
  22658. for (;;) {
  22659. - wait_event_interruptible(
  22660. + swait_event_interruptible(
  22661. rnp->nocb_gp_wq[c & 0x1],
  22662. (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
  22663. if (likely(d))
  22664. @@ -2266,7 +2156,7 @@
  22665. /* Wait for callbacks to appear. */
  22666. if (!rcu_nocb_poll) {
  22667. trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
  22668. - wait_event_interruptible(my_rdp->nocb_wq,
  22669. + swait_event_interruptible(my_rdp->nocb_wq,
  22670. !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
  22671. /* Memory barrier handled by smp_mb() calls below and repoll. */
  22672. } else if (firsttime) {
  22673. @@ -2347,7 +2237,7 @@
  22674. * List was empty, wake up the follower.
  22675. * Memory barriers supplied by atomic_long_add().
  22676. */
  22677. - wake_up(&rdp->nocb_wq);
  22678. + swait_wake(&rdp->nocb_wq);
  22679. }
  22680. }
  22681. @@ -2368,7 +2258,7 @@
  22682. if (!rcu_nocb_poll) {
  22683. trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
  22684. "FollowerSleep");
  22685. - wait_event_interruptible(rdp->nocb_wq,
  22686. + swait_event_interruptible(rdp->nocb_wq,
  22687. ACCESS_ONCE(rdp->nocb_follower_head));
  22688. } else if (firsttime) {
  22689. /* Don't drown trace log with "Poll"! */
  22690. @@ -2539,7 +2429,7 @@
  22691. static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
  22692. {
  22693. rdp->nocb_tail = &rdp->nocb_head;
  22694. - init_waitqueue_head(&rdp->nocb_wq);
  22695. + init_swait_head(&rdp->nocb_wq);
  22696. rdp->nocb_follower_tail = &rdp->nocb_follower_head;
  22697. }
  22698. diff -Nur linux-3.18.12.orig/kernel/rcu/update.c linux-3.18.12/kernel/rcu/update.c
  22699. --- linux-3.18.12.orig/kernel/rcu/update.c 2015-04-20 14:48:02.000000000 -0500
  22700. +++ linux-3.18.12/kernel/rcu/update.c 2015-04-26 13:32:22.443684003 -0500
  22701. @@ -170,6 +170,7 @@
  22702. }
  22703. EXPORT_SYMBOL_GPL(rcu_read_lock_held);
  22704. +#ifndef CONFIG_PREEMPT_RT_FULL
  22705. /**
  22706. * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
  22707. *
  22708. @@ -196,6 +197,7 @@
  22709. return in_softirq() || irqs_disabled();
  22710. }
  22711. EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
  22712. +#endif
  22713. #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  22714. diff -Nur linux-3.18.12.orig/kernel/relay.c linux-3.18.12/kernel/relay.c
  22715. --- linux-3.18.12.orig/kernel/relay.c 2015-04-20 14:48:02.000000000 -0500
  22716. +++ linux-3.18.12/kernel/relay.c 2015-04-26 13:32:22.443684003 -0500
  22717. @@ -339,6 +339,10 @@
  22718. {
  22719. struct rchan_buf *buf = (struct rchan_buf *)data;
  22720. wake_up_interruptible(&buf->read_wait);
  22721. + /*
  22722. + * Stupid polling for now:
  22723. + */
  22724. + mod_timer(&buf->timer, jiffies + 1);
  22725. }
  22726. /**
  22727. @@ -356,6 +360,7 @@
  22728. init_waitqueue_head(&buf->read_wait);
  22729. kref_init(&buf->kref);
  22730. setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
  22731. + mod_timer(&buf->timer, jiffies + 1);
  22732. } else
  22733. del_timer_sync(&buf->timer);
  22734. @@ -739,15 +744,6 @@
  22735. else
  22736. buf->early_bytes += buf->chan->subbuf_size -
  22737. buf->padding[old_subbuf];
  22738. - smp_mb();
  22739. - if (waitqueue_active(&buf->read_wait))
  22740. - /*
  22741. - * Calling wake_up_interruptible() from here
  22742. - * will deadlock if we happen to be logging
  22743. - * from the scheduler (trying to re-grab
  22744. - * rq->lock), so defer it.
  22745. - */
  22746. - mod_timer(&buf->timer, jiffies + 1);
  22747. }
  22748. old = buf->data;
  22749. diff -Nur linux-3.18.12.orig/kernel/res_counter.c linux-3.18.12/kernel/res_counter.c
  22750. --- linux-3.18.12.orig/kernel/res_counter.c 2015-04-20 14:48:02.000000000 -0500
  22751. +++ linux-3.18.12/kernel/res_counter.c 2015-04-26 13:32:22.443684003 -0500
  22752. @@ -59,7 +59,7 @@
  22753. r = ret = 0;
  22754. *limit_fail_at = NULL;
  22755. - local_irq_save(flags);
  22756. + local_irq_save_nort(flags);
  22757. for (c = counter; c != NULL; c = c->parent) {
  22758. spin_lock(&c->lock);
  22759. r = res_counter_charge_locked(c, val, force);
  22760. @@ -79,7 +79,7 @@
  22761. spin_unlock(&u->lock);
  22762. }
  22763. }
  22764. - local_irq_restore(flags);
  22765. + local_irq_restore_nort(flags);
  22766. return ret;
  22767. }
  22768. @@ -104,7 +104,7 @@
  22769. struct res_counter *c;
  22770. u64 ret = 0;
  22771. - local_irq_save(flags);
  22772. + local_irq_save_nort(flags);
  22773. for (c = counter; c != top; c = c->parent) {
  22774. u64 r;
  22775. spin_lock(&c->lock);
  22776. @@ -113,7 +113,7 @@
  22777. ret = r;
  22778. spin_unlock(&c->lock);
  22779. }
  22780. - local_irq_restore(flags);
  22781. + local_irq_restore_nort(flags);
  22782. return ret;
  22783. }
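
local_irq_save_nort()/local_irq_restore_nort() are the usual -rt annotation for "these irqs-off sections only existed to protect the spinlock": on !RT they are the ordinary local_irq_save()/local_irq_restore(), while on PREEMPT_RT_FULL they degrade to saving the flags and nothing more, since spin_lock() is now a sleeping lock that must not be taken with interrupts hard-disabled. Roughly (the real definitions live elsewhere in this series and may differ):

#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	local_save_flags(flags)
# define local_irq_restore_nort(flags)	(void)(flags)
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
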
  22784. diff -Nur linux-3.18.12.orig/kernel/sched/completion.c linux-3.18.12/kernel/sched/completion.c
  22785. --- linux-3.18.12.orig/kernel/sched/completion.c 2015-04-20 14:48:02.000000000 -0500
  22786. +++ linux-3.18.12/kernel/sched/completion.c 2015-04-26 13:32:22.443684003 -0500
  22787. @@ -30,10 +30,10 @@
  22788. {
  22789. unsigned long flags;
  22790. - spin_lock_irqsave(&x->wait.lock, flags);
  22791. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22792. x->done++;
  22793. - __wake_up_locked(&x->wait, TASK_NORMAL, 1);
  22794. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22795. + __swait_wake_locked(&x->wait, TASK_NORMAL, 1);
  22796. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22797. }
  22798. EXPORT_SYMBOL(complete);
  22799. @@ -50,10 +50,10 @@
  22800. {
  22801. unsigned long flags;
  22802. - spin_lock_irqsave(&x->wait.lock, flags);
  22803. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22804. x->done += UINT_MAX/2;
  22805. - __wake_up_locked(&x->wait, TASK_NORMAL, 0);
  22806. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22807. + __swait_wake_locked(&x->wait, TASK_NORMAL, 0);
  22808. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22809. }
  22810. EXPORT_SYMBOL(complete_all);
  22811. @@ -62,20 +62,20 @@
  22812. long (*action)(long), long timeout, int state)
  22813. {
  22814. if (!x->done) {
  22815. - DECLARE_WAITQUEUE(wait, current);
  22816. + DEFINE_SWAITER(wait);
  22817. - __add_wait_queue_tail_exclusive(&x->wait, &wait);
  22818. + swait_prepare_locked(&x->wait, &wait);
  22819. do {
  22820. if (signal_pending_state(state, current)) {
  22821. timeout = -ERESTARTSYS;
  22822. break;
  22823. }
  22824. __set_current_state(state);
  22825. - spin_unlock_irq(&x->wait.lock);
  22826. + raw_spin_unlock_irq(&x->wait.lock);
  22827. timeout = action(timeout);
  22828. - spin_lock_irq(&x->wait.lock);
  22829. + raw_spin_lock_irq(&x->wait.lock);
  22830. } while (!x->done && timeout);
  22831. - __remove_wait_queue(&x->wait, &wait);
  22832. + swait_finish_locked(&x->wait, &wait);
  22833. if (!x->done)
  22834. return timeout;
  22835. }
  22836. @@ -89,9 +89,9 @@
  22837. {
  22838. might_sleep();
  22839. - spin_lock_irq(&x->wait.lock);
  22840. + raw_spin_lock_irq(&x->wait.lock);
  22841. timeout = do_wait_for_common(x, action, timeout, state);
  22842. - spin_unlock_irq(&x->wait.lock);
  22843. + raw_spin_unlock_irq(&x->wait.lock);
  22844. return timeout;
  22845. }
  22846. @@ -267,12 +267,12 @@
  22847. unsigned long flags;
  22848. int ret = 1;
  22849. - spin_lock_irqsave(&x->wait.lock, flags);
  22850. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22851. if (!x->done)
  22852. ret = 0;
  22853. else
  22854. x->done--;
  22855. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22856. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22857. return ret;
  22858. }
  22859. EXPORT_SYMBOL(try_wait_for_completion);
  22860. @@ -290,10 +290,10 @@
  22861. unsigned long flags;
  22862. int ret = 1;
  22863. - spin_lock_irqsave(&x->wait.lock, flags);
  22864. + raw_spin_lock_irqsave(&x->wait.lock, flags);
  22865. if (!x->done)
  22866. ret = 0;
  22867. - spin_unlock_irqrestore(&x->wait.lock, flags);
  22868. + raw_spin_unlock_irqrestore(&x->wait.lock, flags);
  22869. return ret;
  22870. }
  22871. EXPORT_SYMBOL(completion_done);
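
The completion rework above is purely internal: struct completion keeps its interface, only the embedded waitqueue becomes a simple one with a raw lock so that complete() stays callable from contexts that are still truly atomic on RT. Callers keep the stock pattern, e.g.:

#include <linux/completion.h>

static DECLARE_COMPLETION(my_done);	/* illustrative */

static void producer(void)
{
	/* ... publish the result ... */
	complete(&my_done);	/* raw wait.lock underneath, RT-safe */
}

static int consumer(void)
{
	return wait_for_completion_interruptible(&my_done);
}
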
  22872. diff -Nur linux-3.18.12.orig/kernel/sched/core.c linux-3.18.12/kernel/sched/core.c
  22873. --- linux-3.18.12.orig/kernel/sched/core.c 2015-04-20 14:48:02.000000000 -0500
  22874. +++ linux-3.18.12/kernel/sched/core.c 2015-04-26 13:32:22.443684003 -0500
  22875. @@ -280,7 +280,11 @@
  22876. * Number of tasks to iterate in a single balance run.
  22877. * Limited because this is done with IRQs disabled.
  22878. */
  22879. +#ifndef CONFIG_PREEMPT_RT_FULL
  22880. const_debug unsigned int sysctl_sched_nr_migrate = 32;
  22881. +#else
  22882. +const_debug unsigned int sysctl_sched_nr_migrate = 8;
  22883. +#endif
  22884. /*
  22885. * period over which we average the RT time consumption, measured
  22886. @@ -516,6 +520,7 @@
  22887. hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  22888. rq->hrtick_timer.function = hrtick;
  22889. + rq->hrtick_timer.irqsafe = 1;
  22890. }
  22891. #else /* CONFIG_SCHED_HRTICK */
  22892. static inline void hrtick_clear(struct rq *rq)
  22893. @@ -627,6 +632,38 @@
  22894. trace_sched_wake_idle_without_ipi(cpu);
  22895. }
  22896. +#ifdef CONFIG_PREEMPT_LAZY
  22897. +void resched_curr_lazy(struct rq *rq)
  22898. +{
  22899. + struct task_struct *curr = rq->curr;
  22900. + int cpu;
  22901. +
  22902. + if (!sched_feat(PREEMPT_LAZY)) {
  22903. + resched_curr(rq);
  22904. + return;
  22905. + }
  22906. +
  22907. + lockdep_assert_held(&rq->lock);
  22908. +
  22909. + if (test_tsk_need_resched(curr))
  22910. + return;
  22911. +
  22912. + if (test_tsk_need_resched_lazy(curr))
  22913. + return;
  22914. +
  22915. + set_tsk_need_resched_lazy(curr);
  22916. +
  22917. + cpu = cpu_of(rq);
  22918. + if (cpu == smp_processor_id())
  22919. + return;
  22920. +
  22921. + /* NEED_RESCHED_LAZY must be visible before we test polling */
  22922. + smp_mb();
  22923. + if (!tsk_is_polling(curr))
  22924. + smp_send_reschedule(cpu);
  22925. +}
  22926. +#endif
  22927. +
  22928. void resched_cpu(int cpu)
  22929. {
  22930. struct rq *rq = cpu_rq(cpu);
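resched_curr_lazy() only sets the new lazy flag: a pending TIF_NEED_RESCHED still preempts immediately, while the lazy flag is honoured at a later preemption point (for instance on the way back to user space), which is what makes it cheap for SCHED_OTHER tasks. A toy userspace model of that ordering follows; every name in it is invented and it is not the kernel code:

#include <stdbool.h>

struct toy_task {
	bool need_resched;	/* models TIF_NEED_RESCHED      */
	bool need_resched_lazy;	/* models TIF_NEED_RESCHED_LAZY */
};

/* Mirrors the early-out ordering of resched_curr_lazy() above. */
static bool toy_resched_lazy(struct toy_task *curr)
{
	if (curr->need_resched || curr->need_resched_lazy)
		return false;		/* a request is already pending      */
	curr->need_resched_lazy = true;	/* defer: no kernel preemption yet   */
	return true;			/* caller would IPI a remote CPU now */
}

/* What a preemption check honours immediately vs. lazily. */
static bool toy_preempt_now(const struct toy_task *curr, int lazy_count)
{
	if (curr->need_resched)
		return true;		/* hard request always preempts */
	return curr->need_resched_lazy && lazy_count == 0;
}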
  22931. @@ -650,12 +687,14 @@
  22932. */
  22933. int get_nohz_timer_target(int pinned)
  22934. {
  22935. - int cpu = smp_processor_id();
  22936. + int cpu;
  22937. int i;
  22938. struct sched_domain *sd;
  22939. + preempt_disable_rt();
  22940. + cpu = smp_processor_id();
  22941. if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
  22942. - return cpu;
  22943. + goto preempt_en_rt;
  22944. rcu_read_lock();
  22945. for_each_domain(cpu, sd) {
  22946. @@ -668,6 +707,8 @@
  22947. }
  22948. unlock:
  22949. rcu_read_unlock();
  22950. +preempt_en_rt:
  22951. + preempt_enable_rt();
  22952. return cpu;
  22953. }
  22954. /*
  22955. @@ -745,14 +786,29 @@
  22956. #endif /* CONFIG_NO_HZ_COMMON */
  22957. #ifdef CONFIG_NO_HZ_FULL
  22958. +
  22959. +static int ksoftirqd_running(void)
  22960. +{
  22961. + struct task_struct *softirqd;
  22962. +
  22963. + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
  22964. + return 0;
  22965. + softirqd = this_cpu_ksoftirqd();
  22966. + if (softirqd && softirqd->on_rq)
  22967. + return 1;
  22968. + return 0;
  22969. +}
  22970. +
  22971. bool sched_can_stop_tick(void)
  22972. {
  22973. /*
22974. * More than one running task needs preemption.
  22975. * nr_running update is assumed to be visible
  22976. * after IPI is sent from wakers.
  22977. + *
  22978. + * NOTE, RT: if ksoftirqd is awake, subtract it.
  22979. */
  22980. - if (this_rq()->nr_running > 1)
  22981. + if (this_rq()->nr_running - ksoftirqd_running() > 1)
  22982. return false;
  22983. return true;
  22984. @@ -1198,6 +1254,18 @@
  22985. static int migration_cpu_stop(void *data);
  22986. +static bool check_task_state(struct task_struct *p, long match_state)
  22987. +{
  22988. + bool match = false;
  22989. +
  22990. + raw_spin_lock_irq(&p->pi_lock);
  22991. + if (p->state == match_state || p->saved_state == match_state)
  22992. + match = true;
  22993. + raw_spin_unlock_irq(&p->pi_lock);
  22994. +
  22995. + return match;
  22996. +}
  22997. +
  22998. /*
  22999. * wait_task_inactive - wait for a thread to unschedule.
  23000. *
  23001. @@ -1242,7 +1310,7 @@
  23002. * is actually now running somewhere else!
  23003. */
  23004. while (task_running(rq, p)) {
  23005. - if (match_state && unlikely(p->state != match_state))
  23006. + if (match_state && !check_task_state(p, match_state))
  23007. return 0;
  23008. cpu_relax();
  23009. }
  23010. @@ -1257,7 +1325,8 @@
  23011. running = task_running(rq, p);
  23012. queued = task_on_rq_queued(p);
  23013. ncsw = 0;
  23014. - if (!match_state || p->state == match_state)
  23015. + if (!match_state || p->state == match_state ||
  23016. + p->saved_state == match_state)
  23017. ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
  23018. task_rq_unlock(rq, p, &flags);
  23019. @@ -1482,10 +1551,6 @@
  23020. {
  23021. activate_task(rq, p, en_flags);
  23022. p->on_rq = TASK_ON_RQ_QUEUED;
  23023. -
  23024. - /* if a worker is waking up, notify workqueue */
  23025. - if (p->flags & PF_WQ_WORKER)
  23026. - wq_worker_waking_up(p, cpu_of(rq));
  23027. }
  23028. /*
  23029. @@ -1699,8 +1764,27 @@
  23030. */
  23031. smp_mb__before_spinlock();
  23032. raw_spin_lock_irqsave(&p->pi_lock, flags);
  23033. - if (!(p->state & state))
  23034. + if (!(p->state & state)) {
  23035. + /*
  23036. + * The task might be running due to a spinlock sleeper
  23037. + * wakeup. Check the saved state and set it to running
  23038. + * if the wakeup condition is true.
  23039. + */
  23040. + if (!(wake_flags & WF_LOCK_SLEEPER)) {
  23041. + if (p->saved_state & state) {
  23042. + p->saved_state = TASK_RUNNING;
  23043. + success = 1;
  23044. + }
  23045. + }
  23046. goto out;
  23047. + }
  23048. +
  23049. + /*
  23050. + * If this is a regular wakeup, then we can unconditionally
  23051. + * clear the saved state of a "lock sleeper".
  23052. + */
  23053. + if (!(wake_flags & WF_LOCK_SLEEPER))
  23054. + p->saved_state = TASK_RUNNING;
  23055. success = 1; /* we're going to change ->state */
  23056. cpu = task_cpu(p);
  23057. @@ -1743,42 +1827,6 @@
  23058. }
  23059. /**
  23060. - * try_to_wake_up_local - try to wake up a local task with rq lock held
  23061. - * @p: the thread to be awakened
  23062. - *
  23063. - * Put @p on the run-queue if it's not already there. The caller must
  23064. - * ensure that this_rq() is locked, @p is bound to this_rq() and not
  23065. - * the current task.
  23066. - */
  23067. -static void try_to_wake_up_local(struct task_struct *p)
  23068. -{
  23069. - struct rq *rq = task_rq(p);
  23070. -
  23071. - if (WARN_ON_ONCE(rq != this_rq()) ||
  23072. - WARN_ON_ONCE(p == current))
  23073. - return;
  23074. -
  23075. - lockdep_assert_held(&rq->lock);
  23076. -
  23077. - if (!raw_spin_trylock(&p->pi_lock)) {
  23078. - raw_spin_unlock(&rq->lock);
  23079. - raw_spin_lock(&p->pi_lock);
  23080. - raw_spin_lock(&rq->lock);
  23081. - }
  23082. -
  23083. - if (!(p->state & TASK_NORMAL))
  23084. - goto out;
  23085. -
  23086. - if (!task_on_rq_queued(p))
  23087. - ttwu_activate(rq, p, ENQUEUE_WAKEUP);
  23088. -
  23089. - ttwu_do_wakeup(rq, p, 0);
  23090. - ttwu_stat(p, smp_processor_id(), 0);
  23091. -out:
  23092. - raw_spin_unlock(&p->pi_lock);
  23093. -}
  23094. -
  23095. -/**
  23096. * wake_up_process - Wake up a specific process
  23097. * @p: The process to be woken up.
  23098. *
  23099. @@ -1792,11 +1840,23 @@
  23100. */
  23101. int wake_up_process(struct task_struct *p)
  23102. {
  23103. - WARN_ON(task_is_stopped_or_traced(p));
  23104. + WARN_ON(__task_is_stopped_or_traced(p));
  23105. return try_to_wake_up(p, TASK_NORMAL, 0);
  23106. }
  23107. EXPORT_SYMBOL(wake_up_process);
  23108. +/**
  23109. + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
  23110. + * @p: The process to be woken up.
  23111. + *
  23112. + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
  23113. + * the nature of the wakeup.
  23114. + */
  23115. +int wake_up_lock_sleeper(struct task_struct *p)
  23116. +{
  23117. + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
  23118. +}
  23119. +
  23120. int wake_up_state(struct task_struct *p, unsigned int state)
  23121. {
  23122. return try_to_wake_up(p, state, 0);
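The saved_state handling above exists because, on RT, a task that went to sleep in e.g. TASK_INTERRUPTIBLE may additionally block on an rtmutex-backed spinlock, which stashes the original state in p->saved_state. A regular wakeup must therefore also act on saved_state, while wake_up_lock_sleeper() passes WF_LOCK_SLEEPER so that a lock-release wakeup leaves the stashed state alone. A toy userspace model of just that rule (all names invented):

#define TOY_RUNNING		0
#define TOY_INTERRUPTIBLE	1
#define TOY_WF_LOCK_SLEEPER	0x08

struct toy_task {
	long state;		/* models p->state       */
	long saved_state;	/* models p->saved_state */
};

/* Returns 1 if the wakeup "took", mirroring the success logic above. */
static int toy_wake(struct toy_task *p, long wake_state, int wake_flags)
{
	if (!(p->state & wake_state)) {
		/*
		 * Blocked on a sleeping spinlock: a regular wakeup only
		 * promotes the stashed state; the task stays queued on the
		 * lock until the lock wakeup restores it.
		 */
		if (!(wake_flags & TOY_WF_LOCK_SLEEPER) &&
		    (p->saved_state & wake_state)) {
			p->saved_state = TOY_RUNNING;
			return 1;
		}
		return 0;
	}
	if (!(wake_flags & TOY_WF_LOCK_SLEEPER))
		p->saved_state = TOY_RUNNING;	/* regular wakeup clears it */
	p->state = TOY_RUNNING;
	return 1;
}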
  23123. @@ -1987,6 +2047,9 @@
  23124. p->on_cpu = 0;
  23125. #endif
  23126. init_task_preempt_count(p);
  23127. +#ifdef CONFIG_HAVE_PREEMPT_LAZY
  23128. + task_thread_info(p)->preempt_lazy_count = 0;
  23129. +#endif
  23130. #ifdef CONFIG_SMP
  23131. plist_node_init(&p->pushable_tasks, MAX_PRIO);
  23132. RB_CLEAR_NODE(&p->pushable_dl_tasks);
  23133. @@ -2270,8 +2333,12 @@
  23134. finish_arch_post_lock_switch();
  23135. fire_sched_in_preempt_notifiers(current);
  23136. + /*
  23137. + * We use mmdrop_delayed() here so we don't have to do the
  23138. + * full __mmdrop() when we are the last user.
  23139. + */
  23140. if (mm)
  23141. - mmdrop(mm);
  23142. + mmdrop_delayed(mm);
  23143. if (unlikely(prev_state == TASK_DEAD)) {
  23144. if (prev->sched_class->task_dead)
  23145. prev->sched_class->task_dead(prev);
  23146. @@ -2696,6 +2763,133 @@
  23147. schedstat_inc(this_rq(), sched_count);
  23148. }
  23149. +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
  23150. +#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */
  23151. +#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN)
  23152. +#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN)
  23153. +
  23154. +static inline void update_migrate_disable(struct task_struct *p)
  23155. +{
  23156. + const struct cpumask *mask;
  23157. +
  23158. + if (likely(!p->migrate_disable))
  23159. + return;
  23160. +
  23161. + /* Did we already update affinity? */
  23162. + if (unlikely(migrate_disabled_updated(p)))
  23163. + return;
  23164. +
  23165. + /*
  23166. + * Since this is always current we can get away with only locking
  23167. + * rq->lock, the ->cpus_allowed value can normally only be changed
  23168. + * while holding both p->pi_lock and rq->lock, but seeing that this
  23169. + * is current, we cannot actually be waking up, so all code that
  23170. + * relies on serialization against p->pi_lock is out of scope.
  23171. + *
  23172. + * Having rq->lock serializes us against things like
  23173. + * set_cpus_allowed_ptr() that can still happen concurrently.
  23174. + */
  23175. + mask = tsk_cpus_allowed(p);
  23176. +
  23177. + if (p->sched_class->set_cpus_allowed)
  23178. + p->sched_class->set_cpus_allowed(p, mask);
  23179. + /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */
  23180. + p->nr_cpus_allowed = 1;
  23181. +
  23182. + /* Let migrate_enable know to fix things back up */
  23183. + p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN;
  23184. +}
  23185. +
  23186. +void migrate_disable(void)
  23187. +{
  23188. + struct task_struct *p = current;
  23189. +
  23190. + if (in_atomic()) {
  23191. +#ifdef CONFIG_SCHED_DEBUG
  23192. + p->migrate_disable_atomic++;
  23193. +#endif
  23194. + return;
  23195. + }
  23196. +
  23197. +#ifdef CONFIG_SCHED_DEBUG
  23198. + if (unlikely(p->migrate_disable_atomic)) {
  23199. + tracing_off();
  23200. + WARN_ON_ONCE(1);
  23201. + }
  23202. +#endif
  23203. +
  23204. + if (p->migrate_disable) {
  23205. + p->migrate_disable++;
  23206. + return;
  23207. + }
  23208. +
  23209. + preempt_disable();
  23210. + preempt_lazy_disable();
  23211. + pin_current_cpu();
  23212. + p->migrate_disable = 1;
  23213. + preempt_enable();
  23214. +}
  23215. +EXPORT_SYMBOL(migrate_disable);
  23216. +
  23217. +void migrate_enable(void)
  23218. +{
  23219. + struct task_struct *p = current;
  23220. + const struct cpumask *mask;
  23221. + unsigned long flags;
  23222. + struct rq *rq;
  23223. +
  23224. + if (in_atomic()) {
  23225. +#ifdef CONFIG_SCHED_DEBUG
  23226. + p->migrate_disable_atomic--;
  23227. +#endif
  23228. + return;
  23229. + }
  23230. +
  23231. +#ifdef CONFIG_SCHED_DEBUG
  23232. + if (unlikely(p->migrate_disable_atomic)) {
  23233. + tracing_off();
  23234. + WARN_ON_ONCE(1);
  23235. + }
  23236. +#endif
  23237. + WARN_ON_ONCE(p->migrate_disable <= 0);
  23238. +
  23239. + if (migrate_disable_count(p) > 1) {
  23240. + p->migrate_disable--;
  23241. + return;
  23242. + }
  23243. +
  23244. + preempt_disable();
  23245. + if (unlikely(migrate_disabled_updated(p))) {
  23246. + /*
  23247. + * Undo whatever update_migrate_disable() did, also see there
  23248. + * about locking.
  23249. + */
  23250. + rq = this_rq();
  23251. + raw_spin_lock_irqsave(&rq->lock, flags);
  23252. +
  23253. + /*
  23254. + * Clearing migrate_disable causes tsk_cpus_allowed to
  23255. + * show the tasks original cpu affinity.
  23256. + */
  23257. + p->migrate_disable = 0;
  23258. + mask = tsk_cpus_allowed(p);
  23259. + if (p->sched_class->set_cpus_allowed)
  23260. + p->sched_class->set_cpus_allowed(p, mask);
  23261. + p->nr_cpus_allowed = cpumask_weight(mask);
  23262. + raw_spin_unlock_irqrestore(&rq->lock, flags);
  23263. + } else
  23264. + p->migrate_disable = 0;
  23265. +
  23266. + unpin_current_cpu();
  23267. + preempt_enable();
  23268. + preempt_lazy_enable();
  23269. +}
  23270. +EXPORT_SYMBOL(migrate_enable);
  23271. +#else
  23272. +static inline void update_migrate_disable(struct task_struct *p) { }
  23273. +#define migrate_disabled_updated(p) 0
  23274. +#endif
  23275. +
  23276. /*
  23277. * Pick up the highest-prio task:
  23278. */
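migrate_disable()/migrate_enable() replace the old "preempt_disable() keeps us on this CPU" assumption for code that must stay CPU-local but may sleep on RT's converted spinlocks: the count nests, the task remains preemptible, and migrate_enable() undoes the temporary single-CPU affinity. A hedged usage sketch; the per-CPU structure, its boot-time lock initialisation and the function are invented, only migrate_disable()/migrate_enable()/this_cpu_ptr() are real:

#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct demo_pcpu {
	spinlock_t lock;		/* sleeping lock on RT; init at boot (not shown) */
	unsigned long hits;
};
static DEFINE_PER_CPU(struct demo_pcpu, demo_pcpu);

static void demo_account(void)
{
	struct demo_pcpu *d;

	migrate_disable();		/* stay on this CPU, stay preemptible */
	d = this_cpu_ptr(&demo_pcpu);	/* keeps pointing at this CPU's slot  */
	spin_lock(&d->lock);		/* may sleep on RT - allowed here     */
	d->hits++;
	spin_unlock(&d->lock);
	migrate_enable();
}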
  23279. @@ -2799,6 +2993,8 @@
  23280. smp_mb__before_spinlock();
  23281. raw_spin_lock_irq(&rq->lock);
  23282. + update_migrate_disable(prev);
  23283. +
  23284. switch_count = &prev->nivcsw;
  23285. if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
  23286. if (unlikely(signal_pending_state(prev->state, prev))) {
  23287. @@ -2806,19 +3002,6 @@
  23288. } else {
  23289. deactivate_task(rq, prev, DEQUEUE_SLEEP);
  23290. prev->on_rq = 0;
  23291. -
  23292. - /*
  23293. - * If a worker went to sleep, notify and ask workqueue
  23294. - * whether it wants to wake up a task to maintain
  23295. - * concurrency.
  23296. - */
  23297. - if (prev->flags & PF_WQ_WORKER) {
  23298. - struct task_struct *to_wakeup;
  23299. -
  23300. - to_wakeup = wq_worker_sleeping(prev, cpu);
  23301. - if (to_wakeup)
  23302. - try_to_wake_up_local(to_wakeup);
  23303. - }
  23304. }
  23305. switch_count = &prev->nvcsw;
  23306. }
  23307. @@ -2828,6 +3011,7 @@
  23308. next = pick_next_task(rq, prev);
  23309. clear_tsk_need_resched(prev);
  23310. + clear_tsk_need_resched_lazy(prev);
  23311. clear_preempt_need_resched();
  23312. rq->skip_clock_update = 0;
  23313. @@ -2857,9 +3041,20 @@
  23314. static inline void sched_submit_work(struct task_struct *tsk)
  23315. {
  23316. - if (!tsk->state || tsk_is_pi_blocked(tsk))
  23317. + if (!tsk->state)
  23318. return;
  23319. /*
  23320. + * If a worker went to sleep, notify and ask workqueue whether
  23321. + * it wants to wake up a task to maintain concurrency.
  23322. + */
  23323. + if (tsk->flags & PF_WQ_WORKER)
  23324. + wq_worker_sleeping(tsk);
  23325. +
  23326. +
  23327. + if (tsk_is_pi_blocked(tsk))
  23328. + return;
  23329. +
  23330. + /*
  23331. * If we are going to sleep and we have plugged IO queued,
  23332. * make sure to submit it to avoid deadlocks.
  23333. */
  23334. @@ -2867,12 +3062,19 @@
  23335. blk_schedule_flush_plug(tsk);
  23336. }
  23337. +static inline void sched_update_worker(struct task_struct *tsk)
  23338. +{
  23339. + if (tsk->flags & PF_WQ_WORKER)
  23340. + wq_worker_running(tsk);
  23341. +}
  23342. +
  23343. asmlinkage __visible void __sched schedule(void)
  23344. {
  23345. struct task_struct *tsk = current;
  23346. sched_submit_work(tsk);
  23347. __schedule();
  23348. + sched_update_worker(tsk);
  23349. }
  23350. EXPORT_SYMBOL(schedule);
  23351. @@ -2922,9 +3124,26 @@
  23352. if (likely(!preemptible()))
  23353. return;
  23354. +#ifdef CONFIG_PREEMPT_LAZY
  23355. + /*
  23356. + * Check for lazy preemption
  23357. + */
  23358. + if (current_thread_info()->preempt_lazy_count &&
  23359. + !test_thread_flag(TIF_NEED_RESCHED))
  23360. + return;
  23361. +#endif
  23362. do {
  23363. __preempt_count_add(PREEMPT_ACTIVE);
  23364. + /*
  23365. + * The add/subtract must not be traced by the function
  23366. + * tracer. But we still want to account for the
  23367. + * preempt off latency tracer. Since the _notrace versions
  23368. + * of add/subtract skip the accounting for latency tracer
  23369. + * we must force it manually.
  23370. + */
  23371. + start_critical_timings();
  23372. __schedule();
  23373. + stop_critical_timings();
  23374. __preempt_count_sub(PREEMPT_ACTIVE);
  23375. /*
  23376. @@ -3097,6 +3316,8 @@
  23377. } else {
  23378. if (dl_prio(oldprio))
  23379. p->dl.dl_boosted = 0;
  23380. + if (rt_prio(oldprio))
  23381. + p->rt.timeout = 0;
  23382. p->sched_class = &fair_sched_class;
  23383. }
  23384. @@ -4234,9 +4455,16 @@
  23385. static void __cond_resched(void)
  23386. {
  23387. - __preempt_count_add(PREEMPT_ACTIVE);
  23388. - __schedule();
  23389. - __preempt_count_sub(PREEMPT_ACTIVE);
  23390. + do {
  23391. + __preempt_count_add(PREEMPT_ACTIVE);
  23392. + __schedule();
  23393. + __preempt_count_sub(PREEMPT_ACTIVE);
  23394. + /*
  23395. + * Check again in case we missed a preemption
  23396. + * opportunity between schedule and now.
  23397. + */
  23398. + barrier();
  23399. + } while (need_resched());
  23400. }
  23401. int __sched _cond_resched(void)
  23402. @@ -4277,6 +4505,7 @@
  23403. }
  23404. EXPORT_SYMBOL(__cond_resched_lock);
  23405. +#ifndef CONFIG_PREEMPT_RT_FULL
  23406. int __sched __cond_resched_softirq(void)
  23407. {
  23408. BUG_ON(!in_softirq());
  23409. @@ -4290,6 +4519,7 @@
  23410. return 0;
  23411. }
  23412. EXPORT_SYMBOL(__cond_resched_softirq);
  23413. +#endif
  23414. /**
  23415. * yield - yield the current processor to other threads.
  23416. @@ -4651,7 +4881,9 @@
  23417. /* Set the preempt count _outside_ the spinlocks! */
  23418. init_idle_preempt_count(idle, cpu);
  23419. -
  23420. +#ifdef CONFIG_HAVE_PREEMPT_LAZY
  23421. + task_thread_info(idle)->preempt_lazy_count = 0;
  23422. +#endif
  23423. /*
  23424. * The idle tasks have their own, simple scheduling class:
  23425. */
  23426. @@ -4693,11 +4925,91 @@
  23427. void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  23428. {
  23429. - if (p->sched_class && p->sched_class->set_cpus_allowed)
  23430. - p->sched_class->set_cpus_allowed(p, new_mask);
  23431. + if (!migrate_disabled_updated(p)) {
  23432. + if (p->sched_class && p->sched_class->set_cpus_allowed)
  23433. + p->sched_class->set_cpus_allowed(p, new_mask);
  23434. + p->nr_cpus_allowed = cpumask_weight(new_mask);
  23435. + }
  23436. cpumask_copy(&p->cpus_allowed, new_mask);
  23437. - p->nr_cpus_allowed = cpumask_weight(new_mask);
  23438. +}
  23439. +
  23440. +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
  23441. +static DEFINE_MUTEX(sched_down_mutex);
  23442. +static cpumask_t sched_down_cpumask;
  23443. +
  23444. +void tell_sched_cpu_down_begin(int cpu)
  23445. +{
  23446. + mutex_lock(&sched_down_mutex);
  23447. + cpumask_set_cpu(cpu, &sched_down_cpumask);
  23448. + mutex_unlock(&sched_down_mutex);
  23449. +}
  23450. +
  23451. +void tell_sched_cpu_down_done(int cpu)
  23452. +{
  23453. + mutex_lock(&sched_down_mutex);
  23454. + cpumask_clear_cpu(cpu, &sched_down_cpumask);
  23455. + mutex_unlock(&sched_down_mutex);
  23456. +}
  23457. +
  23458. +/**
  23459. + * migrate_me - try to move the current task off this cpu
  23460. + *
  23461. + * Used by the pin_current_cpu() code to try to get tasks
  23462. + * to move off the current CPU as it is going down.
  23463. + * It will only move the task if the task isn't pinned to
  23464. + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
  23465. + * and the task has to be in a RUNNING state. Otherwise the
  23466. + * movement of the task will wake it up (change its state
23467. + * to running) when the task does not expect it.
  23468. + *
23469. + * Returns 1 if it succeeded in moving the current task,
  23470. + * 0 otherwise.
  23471. + */
  23472. +int migrate_me(void)
  23473. +{
  23474. + struct task_struct *p = current;
  23475. + struct migration_arg arg;
  23476. + struct cpumask *cpumask;
  23477. + struct cpumask *mask;
  23478. + unsigned long flags;
  23479. + unsigned int dest_cpu;
  23480. + struct rq *rq;
  23481. +
  23482. + /*
23483. + * We cannot migrate tasks bound to a CPU or tasks that are not
  23484. + * running. The movement of the task will wake it up.
  23485. + */
  23486. + if (p->flags & PF_NO_SETAFFINITY || p->state)
  23487. + return 0;
  23488. +
  23489. + mutex_lock(&sched_down_mutex);
  23490. + rq = task_rq_lock(p, &flags);
  23491. +
  23492. + cpumask = &__get_cpu_var(sched_cpumasks);
  23493. + mask = &p->cpus_allowed;
  23494. +
  23495. + cpumask_andnot(cpumask, mask, &sched_down_cpumask);
  23496. +
  23497. + if (!cpumask_weight(cpumask)) {
  23498. + /* It's only on this CPU? */
  23499. + task_rq_unlock(rq, p, &flags);
  23500. + mutex_unlock(&sched_down_mutex);
  23501. + return 0;
  23502. + }
  23503. +
  23504. + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
  23505. +
  23506. + arg.task = p;
  23507. + arg.dest_cpu = dest_cpu;
  23508. +
  23509. + task_rq_unlock(rq, p, &flags);
  23510. +
  23511. + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
  23512. + tlb_migrate_finish(p->mm);
  23513. + mutex_unlock(&sched_down_mutex);
  23514. +
  23515. + return 1;
  23516. }
  23517. /*
  23518. @@ -4743,7 +5055,7 @@
  23519. do_set_cpus_allowed(p, new_mask);
  23520. /* Can the task run on the task's current CPU? If so, we're done */
  23521. - if (cpumask_test_cpu(task_cpu(p), new_mask))
  23522. + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
  23523. goto out;
  23524. dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
  23525. @@ -4883,6 +5195,8 @@
  23526. #ifdef CONFIG_HOTPLUG_CPU
  23527. +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
  23528. +
  23529. /*
  23530. * Ensures that the idle task is using init_mm right before its cpu goes
  23531. * offline.
  23532. @@ -4897,7 +5211,11 @@
  23533. switch_mm(mm, &init_mm, current);
  23534. finish_arch_post_lock_switch();
  23535. }
  23536. - mmdrop(mm);
  23537. + /*
  23538. + * Defer the cleanup to an alive cpu. On RT we can neither
  23539. + * call mmdrop() nor mmdrop_delayed() from here.
  23540. + */
  23541. + per_cpu(idle_last_mm, smp_processor_id()) = mm;
  23542. }
  23543. /*
  23544. @@ -5240,6 +5558,10 @@
  23545. case CPU_DEAD:
  23546. calc_load_migrate(rq);
  23547. + if (per_cpu(idle_last_mm, cpu)) {
  23548. + mmdrop(per_cpu(idle_last_mm, cpu));
  23549. + per_cpu(idle_last_mm, cpu) = NULL;
  23550. + }
  23551. break;
  23552. #endif
  23553. }
  23554. @@ -7181,7 +7503,8 @@
  23555. #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
  23556. static inline int preempt_count_equals(int preempt_offset)
  23557. {
  23558. - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
  23559. + int nested = (preempt_count() & ~PREEMPT_ACTIVE) +
  23560. + sched_rcu_preempt_depth();
  23561. return (nested == preempt_offset);
  23562. }
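Rounding off the core.c changes: the hotplug helpers added above cooperate as follows - tell_sched_cpu_down_begin() marks a CPU as leaving, migrate_me() lets the current task move itself off such a CPU, and tell_sched_cpu_down_done() clears the mark again. The real call sites live elsewhere in this series (presumably the CPU-hotplug and pin_current_cpu() code); the wrappers below are purely schematic and every demo_* name is invented:

#include <linux/printk.h>

static void demo_cpu_going_down(unsigned int cpu)
{
	tell_sched_cpu_down_begin(cpu);	/* migrate_me() now avoids 'cpu' */
}

static void demo_get_out_of_the_way(void)
{
	/* Roughly what happens while the current CPU is being unplugged. */
	if (!migrate_me())
		pr_debug("cannot move: task is pinned or not running\n");
}

static void demo_cpu_down_finished(unsigned int cpu)
{
	tell_sched_cpu_down_done(cpu);
}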
  23563. diff -Nur linux-3.18.12.orig/kernel/sched/cputime.c linux-3.18.12/kernel/sched/cputime.c
  23564. --- linux-3.18.12.orig/kernel/sched/cputime.c 2015-04-20 14:48:02.000000000 -0500
  23565. +++ linux-3.18.12/kernel/sched/cputime.c 2015-04-26 13:32:22.443684003 -0500
  23566. @@ -675,37 +675,45 @@
  23567. void vtime_account_system(struct task_struct *tsk)
  23568. {
  23569. - write_seqlock(&tsk->vtime_seqlock);
  23570. + raw_spin_lock(&tsk->vtime_lock);
  23571. + write_seqcount_begin(&tsk->vtime_seq);
  23572. __vtime_account_system(tsk);
  23573. - write_sequnlock(&tsk->vtime_seqlock);
  23574. + write_seqcount_end(&tsk->vtime_seq);
  23575. + raw_spin_unlock(&tsk->vtime_lock);
  23576. }
  23577. void vtime_gen_account_irq_exit(struct task_struct *tsk)
  23578. {
  23579. - write_seqlock(&tsk->vtime_seqlock);
  23580. + raw_spin_lock(&tsk->vtime_lock);
  23581. + write_seqcount_begin(&tsk->vtime_seq);
  23582. __vtime_account_system(tsk);
  23583. if (context_tracking_in_user())
  23584. tsk->vtime_snap_whence = VTIME_USER;
  23585. - write_sequnlock(&tsk->vtime_seqlock);
  23586. + write_seqcount_end(&tsk->vtime_seq);
  23587. + raw_spin_unlock(&tsk->vtime_lock);
  23588. }
  23589. void vtime_account_user(struct task_struct *tsk)
  23590. {
  23591. cputime_t delta_cpu;
  23592. - write_seqlock(&tsk->vtime_seqlock);
  23593. + raw_spin_lock(&tsk->vtime_lock);
  23594. + write_seqcount_begin(&tsk->vtime_seq);
  23595. delta_cpu = get_vtime_delta(tsk);
  23596. tsk->vtime_snap_whence = VTIME_SYS;
  23597. account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
  23598. - write_sequnlock(&tsk->vtime_seqlock);
  23599. + write_seqcount_end(&tsk->vtime_seq);
  23600. + raw_spin_unlock(&tsk->vtime_lock);
  23601. }
  23602. void vtime_user_enter(struct task_struct *tsk)
  23603. {
  23604. - write_seqlock(&tsk->vtime_seqlock);
  23605. + raw_spin_lock(&tsk->vtime_lock);
  23606. + write_seqcount_begin(&tsk->vtime_seq);
  23607. __vtime_account_system(tsk);
  23608. tsk->vtime_snap_whence = VTIME_USER;
  23609. - write_sequnlock(&tsk->vtime_seqlock);
  23610. + write_seqcount_end(&tsk->vtime_seq);
  23611. + raw_spin_unlock(&tsk->vtime_lock);
  23612. }
  23613. void vtime_guest_enter(struct task_struct *tsk)
  23614. @@ -717,19 +725,23 @@
  23615. * synchronization against the reader (task_gtime())
  23616. * that can thus safely catch up with a tickless delta.
  23617. */
  23618. - write_seqlock(&tsk->vtime_seqlock);
  23619. + raw_spin_lock(&tsk->vtime_lock);
  23620. + write_seqcount_begin(&tsk->vtime_seq);
  23621. __vtime_account_system(tsk);
  23622. current->flags |= PF_VCPU;
  23623. - write_sequnlock(&tsk->vtime_seqlock);
  23624. + write_seqcount_end(&tsk->vtime_seq);
  23625. + raw_spin_unlock(&tsk->vtime_lock);
  23626. }
  23627. EXPORT_SYMBOL_GPL(vtime_guest_enter);
  23628. void vtime_guest_exit(struct task_struct *tsk)
  23629. {
  23630. - write_seqlock(&tsk->vtime_seqlock);
  23631. + raw_spin_lock(&tsk->vtime_lock);
  23632. + write_seqcount_begin(&tsk->vtime_seq);
  23633. __vtime_account_system(tsk);
  23634. current->flags &= ~PF_VCPU;
  23635. - write_sequnlock(&tsk->vtime_seqlock);
  23636. + write_seqcount_end(&tsk->vtime_seq);
  23637. + raw_spin_unlock(&tsk->vtime_lock);
  23638. }
  23639. EXPORT_SYMBOL_GPL(vtime_guest_exit);
  23640. @@ -742,24 +754,30 @@
  23641. void arch_vtime_task_switch(struct task_struct *prev)
  23642. {
  23643. - write_seqlock(&prev->vtime_seqlock);
  23644. + raw_spin_lock(&prev->vtime_lock);
  23645. + write_seqcount_begin(&prev->vtime_seq);
  23646. prev->vtime_snap_whence = VTIME_SLEEPING;
  23647. - write_sequnlock(&prev->vtime_seqlock);
  23648. + write_seqcount_end(&prev->vtime_seq);
  23649. + raw_spin_unlock(&prev->vtime_lock);
  23650. - write_seqlock(&current->vtime_seqlock);
  23651. + raw_spin_lock(&current->vtime_lock);
  23652. + write_seqcount_begin(&current->vtime_seq);
  23653. current->vtime_snap_whence = VTIME_SYS;
  23654. current->vtime_snap = sched_clock_cpu(smp_processor_id());
  23655. - write_sequnlock(&current->vtime_seqlock);
  23656. + write_seqcount_end(&current->vtime_seq);
  23657. + raw_spin_unlock(&current->vtime_lock);
  23658. }
  23659. void vtime_init_idle(struct task_struct *t, int cpu)
  23660. {
  23661. unsigned long flags;
  23662. - write_seqlock_irqsave(&t->vtime_seqlock, flags);
  23663. + raw_spin_lock_irqsave(&t->vtime_lock, flags);
  23664. + write_seqcount_begin(&t->vtime_seq);
  23665. t->vtime_snap_whence = VTIME_SYS;
  23666. t->vtime_snap = sched_clock_cpu(cpu);
  23667. - write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
  23668. + write_seqcount_end(&t->vtime_seq);
  23669. + raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
  23670. }
  23671. cputime_t task_gtime(struct task_struct *t)
  23672. @@ -768,13 +786,13 @@
  23673. cputime_t gtime;
  23674. do {
  23675. - seq = read_seqbegin(&t->vtime_seqlock);
  23676. + seq = read_seqcount_begin(&t->vtime_seq);
  23677. gtime = t->gtime;
  23678. if (t->flags & PF_VCPU)
  23679. gtime += vtime_delta(t);
  23680. - } while (read_seqretry(&t->vtime_seqlock, seq));
  23681. + } while (read_seqcount_retry(&t->vtime_seq, seq));
  23682. return gtime;
  23683. }
  23684. @@ -797,7 +815,7 @@
  23685. *udelta = 0;
  23686. *sdelta = 0;
  23687. - seq = read_seqbegin(&t->vtime_seqlock);
  23688. + seq = read_seqcount_begin(&t->vtime_seq);
  23689. if (u_dst)
  23690. *u_dst = *u_src;
  23691. @@ -821,7 +839,7 @@
  23692. if (t->vtime_snap_whence == VTIME_SYS)
  23693. *sdelta = delta;
  23694. }
  23695. - } while (read_seqretry(&t->vtime_seqlock, seq));
  23696. + } while (read_seqcount_retry(&t->vtime_seq, seq));
  23697. }
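The vtime conversion above is the standard RT recipe for seqlock_t users that must not sleep: keep the lockless, retrying read side as a bare seqcount_t and serialise writers with a raw_spinlock_t. The same pattern in isolation (structure, fields and functions invented; the locking/seqcount calls are the stock kernel API):

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_stats {
	raw_spinlock_t lock;	/* serialises writers, never sleeps            */
	seqcount_t     seq;	/* lets readers detect a concurrent update     */
	u64            a, b;	/* init: raw_spin_lock_init(), seqcount_init() */
};

static void demo_update(struct demo_stats *s, u64 a, u64 b)
{
	raw_spin_lock(&s->lock);
	write_seqcount_begin(&s->seq);
	s->a = a;
	s->b = b;
	write_seqcount_end(&s->seq);
	raw_spin_unlock(&s->lock);
}

static u64 demo_read_sum(struct demo_stats *s)
{
	unsigned int seq;
	u64 sum;

	do {
		seq = read_seqcount_begin(&s->seq);
		sum = s->a + s->b;
	} while (read_seqcount_retry(&s->seq, seq));
	return sum;
}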
  23698. diff -Nur linux-3.18.12.orig/kernel/sched/deadline.c linux-3.18.12/kernel/sched/deadline.c
  23699. --- linux-3.18.12.orig/kernel/sched/deadline.c 2015-04-20 14:48:02.000000000 -0500
  23700. +++ linux-3.18.12/kernel/sched/deadline.c 2015-04-26 13:32:22.447684003 -0500
  23701. @@ -570,6 +570,7 @@
  23702. hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  23703. timer->function = dl_task_timer;
  23704. + timer->irqsafe = 1;
  23705. }
  23706. static
  23707. diff -Nur linux-3.18.12.orig/kernel/sched/debug.c linux-3.18.12/kernel/sched/debug.c
  23708. --- linux-3.18.12.orig/kernel/sched/debug.c 2015-04-20 14:48:02.000000000 -0500
  23709. +++ linux-3.18.12/kernel/sched/debug.c 2015-04-26 13:32:22.447684003 -0500
  23710. @@ -256,6 +256,9 @@
  23711. P(rt_throttled);
  23712. PN(rt_time);
  23713. PN(rt_runtime);
  23714. +#ifdef CONFIG_SMP
  23715. + P(rt_nr_migratory);
  23716. +#endif
  23717. #undef PN
  23718. #undef P
  23719. @@ -634,6 +637,10 @@
  23720. #endif
  23721. P(policy);
  23722. P(prio);
  23723. +#ifdef CONFIG_PREEMPT_RT_FULL
  23724. + P(migrate_disable);
  23725. +#endif
  23726. + P(nr_cpus_allowed);
  23727. #undef PN
  23728. #undef __PN
  23729. #undef P
  23730. diff -Nur linux-3.18.12.orig/kernel/sched/fair.c linux-3.18.12/kernel/sched/fair.c
  23731. --- linux-3.18.12.orig/kernel/sched/fair.c 2015-04-20 14:48:02.000000000 -0500
  23732. +++ linux-3.18.12/kernel/sched/fair.c 2015-04-26 13:32:22.447684003 -0500
  23733. @@ -2951,7 +2951,7 @@
  23734. ideal_runtime = sched_slice(cfs_rq, curr);
  23735. delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
  23736. if (delta_exec > ideal_runtime) {
  23737. - resched_curr(rq_of(cfs_rq));
  23738. + resched_curr_lazy(rq_of(cfs_rq));
  23739. /*
  23740. * The current task ran long enough, ensure it doesn't get
  23741. * re-elected due to buddy favours.
  23742. @@ -2975,7 +2975,7 @@
  23743. return;
  23744. if (delta > ideal_runtime)
  23745. - resched_curr(rq_of(cfs_rq));
  23746. + resched_curr_lazy(rq_of(cfs_rq));
  23747. }
  23748. static void
  23749. @@ -3115,7 +3115,7 @@
  23750. * validating it and just reschedule.
  23751. */
  23752. if (queued) {
  23753. - resched_curr(rq_of(cfs_rq));
  23754. + resched_curr_lazy(rq_of(cfs_rq));
  23755. return;
  23756. }
  23757. /*
  23758. @@ -3306,7 +3306,7 @@
  23759. * hierarchy can be throttled
  23760. */
  23761. if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
  23762. - resched_curr(rq_of(cfs_rq));
  23763. + resched_curr_lazy(rq_of(cfs_rq));
  23764. }
  23765. static __always_inline
  23766. @@ -3925,7 +3925,7 @@
  23767. if (delta < 0) {
  23768. if (rq->curr == p)
  23769. - resched_curr(rq);
  23770. + resched_curr_lazy(rq);
  23771. return;
  23772. }
  23773. hrtick_start(rq, delta);
  23774. @@ -4792,7 +4792,7 @@
  23775. return;
  23776. preempt:
  23777. - resched_curr(rq);
  23778. + resched_curr_lazy(rq);
  23779. /*
  23780. * Only set the backward buddy when the current task is still
  23781. * on the rq. This can happen when a wakeup gets interleaved
  23782. @@ -7576,7 +7576,7 @@
  23783. * 'current' within the tree based on its new key value.
  23784. */
  23785. swap(curr->vruntime, se->vruntime);
  23786. - resched_curr(rq);
  23787. + resched_curr_lazy(rq);
  23788. }
  23789. se->vruntime -= cfs_rq->min_vruntime;
  23790. @@ -7601,7 +7601,7 @@
  23791. */
  23792. if (rq->curr == p) {
  23793. if (p->prio > oldprio)
  23794. - resched_curr(rq);
  23795. + resched_curr_lazy(rq);
  23796. } else
  23797. check_preempt_curr(rq, p, 0);
  23798. }
  23799. diff -Nur linux-3.18.12.orig/kernel/sched/features.h linux-3.18.12/kernel/sched/features.h
  23800. --- linux-3.18.12.orig/kernel/sched/features.h 2015-04-20 14:48:02.000000000 -0500
  23801. +++ linux-3.18.12/kernel/sched/features.h 2015-04-26 13:32:22.447684003 -0500
  23802. @@ -50,12 +50,18 @@
  23803. */
  23804. SCHED_FEAT(NONTASK_CAPACITY, true)
  23805. +#ifdef CONFIG_PREEMPT_RT_FULL
  23806. +SCHED_FEAT(TTWU_QUEUE, false)
  23807. +# ifdef CONFIG_PREEMPT_LAZY
  23808. +SCHED_FEAT(PREEMPT_LAZY, true)
  23809. +# endif
  23810. +#else
  23811. /*
  23812. * Queue remote wakeups on the target CPU and process them
  23813. * using the scheduler IPI. Reduces rq->lock contention/bounces.
  23814. */
  23815. SCHED_FEAT(TTWU_QUEUE, true)
  23816. -
  23817. +#endif
  23818. SCHED_FEAT(FORCE_SD_OVERLAP, false)
  23819. SCHED_FEAT(RT_RUNTIME_SHARE, true)
  23820. SCHED_FEAT(LB_MIN, false)
  23821. diff -Nur linux-3.18.12.orig/kernel/sched/Makefile linux-3.18.12/kernel/sched/Makefile
  23822. --- linux-3.18.12.orig/kernel/sched/Makefile 2015-04-20 14:48:02.000000000 -0500
  23823. +++ linux-3.18.12/kernel/sched/Makefile 2015-04-26 13:32:22.443684003 -0500
  23824. @@ -13,7 +13,7 @@
  23825. obj-y += core.o proc.o clock.o cputime.o
  23826. obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
  23827. -obj-y += wait.o completion.o idle.o
  23828. +obj-y += wait.o wait-simple.o work-simple.o completion.o idle.o
  23829. obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
  23830. obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
  23831. obj-$(CONFIG_SCHEDSTATS) += stats.o
  23832. diff -Nur linux-3.18.12.orig/kernel/sched/rt.c linux-3.18.12/kernel/sched/rt.c
  23833. --- linux-3.18.12.orig/kernel/sched/rt.c 2015-04-20 14:48:02.000000000 -0500
  23834. +++ linux-3.18.12/kernel/sched/rt.c 2015-04-26 13:32:22.447684003 -0500
  23835. @@ -43,6 +43,7 @@
  23836. hrtimer_init(&rt_b->rt_period_timer,
  23837. CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  23838. + rt_b->rt_period_timer.irqsafe = 1;
  23839. rt_b->rt_period_timer.function = sched_rt_period_timer;
  23840. }
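The one-line hrtimer hunks above (hrtick_timer, dl_task_timer, rt_period_timer) all mark scheduler timers as irqsafe: on RT, ordinary hrtimers are deferred to thread context, while an irqsafe timer is still expired directly from hard interrupt context, so its callback must stick to raw locks. A hedged setup sketch; the timer and function names are invented, the .irqsafe field is the one used by this series:

#include <linux/hrtimer.h>

static struct hrtimer demo_timer;		/* hypothetical timer */

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* Runs in hard irq context on RT: no sleeping locks in here. */
	return HRTIMER_NORESTART;
}

static void demo_timer_setup(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	demo_timer.irqsafe = 1;	/* expire in hard irq, do not defer to thread context */
}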
  23841. diff -Nur linux-3.18.12.orig/kernel/sched/sched.h linux-3.18.12/kernel/sched/sched.h
  23842. --- linux-3.18.12.orig/kernel/sched/sched.h 2015-04-20 14:48:02.000000000 -0500
  23843. +++ linux-3.18.12/kernel/sched/sched.h 2015-04-26 13:32:22.447684003 -0500
  23844. @@ -1018,6 +1018,7 @@
  23845. #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
  23846. #define WF_FORK 0x02 /* child wakeup after fork */
  23847. #define WF_MIGRATED 0x4 /* internal use, task got migrated */
  23848. +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */
  23849. /*
  23850. * To aid in avoiding the subversion of "niceness" due to uneven distribution
  23851. @@ -1210,6 +1211,15 @@
  23852. extern void resched_curr(struct rq *rq);
  23853. extern void resched_cpu(int cpu);
  23854. +#ifdef CONFIG_PREEMPT_LAZY
  23855. +extern void resched_curr_lazy(struct rq *rq);
  23856. +#else
  23857. +static inline void resched_curr_lazy(struct rq *rq)
  23858. +{
  23859. + resched_curr(rq);
  23860. +}
  23861. +#endif
  23862. +
  23863. extern struct rt_bandwidth def_rt_bandwidth;
  23864. extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
  23865. diff -Nur linux-3.18.12.orig/kernel/sched/wait-simple.c linux-3.18.12/kernel/sched/wait-simple.c
  23866. --- linux-3.18.12.orig/kernel/sched/wait-simple.c 1969-12-31 18:00:00.000000000 -0600
  23867. +++ linux-3.18.12/kernel/sched/wait-simple.c 2015-04-26 13:32:22.447684003 -0500
  23868. @@ -0,0 +1,115 @@
  23869. +/*
  23870. + * Simple waitqueues without fancy flags and callbacks
  23871. + *
  23872. + * (C) 2011 Thomas Gleixner <tglx@linutronix.de>
  23873. + *
  23874. + * Based on kernel/wait.c
  23875. + *
  23876. + * For licencing details see kernel-base/COPYING
  23877. + */
  23878. +#include <linux/init.h>
  23879. +#include <linux/export.h>
  23880. +#include <linux/sched.h>
  23881. +#include <linux/wait-simple.h>
  23882. +
  23883. +/* Adds w to head->list. Must be called with head->lock locked. */
  23884. +static inline void __swait_enqueue(struct swait_head *head, struct swaiter *w)
  23885. +{
  23886. + list_add(&w->node, &head->list);
  23887. + /* We can't let the condition leak before the setting of head */
  23888. + smp_mb();
  23889. +}
  23890. +
  23891. +/* Removes w from head->list. Must be called with head->lock locked. */
  23892. +static inline void __swait_dequeue(struct swaiter *w)
  23893. +{
  23894. + list_del_init(&w->node);
  23895. +}
  23896. +
  23897. +void __init_swait_head(struct swait_head *head, struct lock_class_key *key)
  23898. +{
  23899. + raw_spin_lock_init(&head->lock);
  23900. + lockdep_set_class(&head->lock, key);
  23901. + INIT_LIST_HEAD(&head->list);
  23902. +}
  23903. +EXPORT_SYMBOL(__init_swait_head);
  23904. +
  23905. +void swait_prepare_locked(struct swait_head *head, struct swaiter *w)
  23906. +{
  23907. + w->task = current;
  23908. + if (list_empty(&w->node))
  23909. + __swait_enqueue(head, w);
  23910. +}
  23911. +
  23912. +void swait_prepare(struct swait_head *head, struct swaiter *w, int state)
  23913. +{
  23914. + unsigned long flags;
  23915. +
  23916. + raw_spin_lock_irqsave(&head->lock, flags);
  23917. + swait_prepare_locked(head, w);
  23918. + __set_current_state(state);
  23919. + raw_spin_unlock_irqrestore(&head->lock, flags);
  23920. +}
  23921. +EXPORT_SYMBOL(swait_prepare);
  23922. +
  23923. +void swait_finish_locked(struct swait_head *head, struct swaiter *w)
  23924. +{
  23925. + __set_current_state(TASK_RUNNING);
  23926. + if (w->task)
  23927. + __swait_dequeue(w);
  23928. +}
  23929. +
  23930. +void swait_finish(struct swait_head *head, struct swaiter *w)
  23931. +{
  23932. + unsigned long flags;
  23933. +
  23934. + __set_current_state(TASK_RUNNING);
  23935. + if (w->task) {
  23936. + raw_spin_lock_irqsave(&head->lock, flags);
  23937. + __swait_dequeue(w);
  23938. + raw_spin_unlock_irqrestore(&head->lock, flags);
  23939. + }
  23940. +}
  23941. +EXPORT_SYMBOL(swait_finish);
  23942. +
  23943. +unsigned int
  23944. +__swait_wake_locked(struct swait_head *head, unsigned int state, unsigned int num)
  23945. +{
  23946. + struct swaiter *curr, *next;
  23947. + int woken = 0;
  23948. +
  23949. + list_for_each_entry_safe(curr, next, &head->list, node) {
  23950. + if (wake_up_state(curr->task, state)) {
  23951. + __swait_dequeue(curr);
  23952. + /*
  23953. + * The waiting task can free the waiter as
  23954. + * soon as curr->task = NULL is written,
  23955. + * without taking any locks. A memory barrier
  23956. + * is required here to prevent the following
  23957. + * store to curr->task from getting ahead of
  23958. + * the dequeue operation.
  23959. + */
  23960. + smp_wmb();
  23961. + curr->task = NULL;
  23962. + if (++woken == num)
  23963. + break;
  23964. + }
  23965. + }
  23966. + return woken;
  23967. +}
  23968. +
  23969. +unsigned int
  23970. +__swait_wake(struct swait_head *head, unsigned int state, unsigned int num)
  23971. +{
  23972. + unsigned long flags;
  23973. + int woken;
  23974. +
  23975. + if (!swaitqueue_active(head))
  23976. + return 0;
  23977. +
  23978. + raw_spin_lock_irqsave(&head->lock, flags);
  23979. + woken = __swait_wake_locked(head, state, num);
  23980. + raw_spin_unlock_irqrestore(&head->lock, flags);
  23981. + return woken;
  23982. +}
  23983. +EXPORT_SYMBOL(__swait_wake);
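wait-simple.c is a stripped-down waitqueue: no wake callbacks, no exclusive-waiter flags, and a raw spinlock so it stays usable where RT has turned the regular waitqueue lock into a sleeping lock. A hedged usage sketch built only from calls visible in this series; the demo_* names are mine and memory barriers around the flag are left out for brevity:

#include <linux/sched.h>
#include <linux/wait-simple.h>

static struct swait_head demo_head;
static bool demo_ready;

static void demo_init(void)
{
	init_swait_head(&demo_head);
}

static void demo_wait_for_it(void)
{
	DEFINE_SWAITER(w);

	for (;;) {
		swait_prepare(&demo_head, &w, TASK_INTERRUPTIBLE);
		if (demo_ready)
			break;
		schedule();	/* woken by __swait_wake() below */
	}
	swait_finish(&demo_head, &w);
}

static void demo_fire(void)
{
	demo_ready = true;
	__swait_wake(&demo_head, TASK_NORMAL, 1);	/* wake one waiter */
}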
  23984. diff -Nur linux-3.18.12.orig/kernel/sched/work-simple.c linux-3.18.12/kernel/sched/work-simple.c
  23985. --- linux-3.18.12.orig/kernel/sched/work-simple.c 1969-12-31 18:00:00.000000000 -0600
  23986. +++ linux-3.18.12/kernel/sched/work-simple.c 2015-04-26 13:32:22.447684003 -0500
  23987. @@ -0,0 +1,172 @@
  23988. +/*
  23989. + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
  23990. + *
  23991. + * Provides a framework for enqueuing callbacks from irq context
  23992. + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
  23993. + */
  23994. +
  23995. +#include <linux/wait-simple.h>
  23996. +#include <linux/work-simple.h>
  23997. +#include <linux/kthread.h>
  23998. +#include <linux/slab.h>
  23999. +#include <linux/spinlock.h>
  24000. +
  24001. +#define SWORK_EVENT_PENDING (1 << 0)
  24002. +
  24003. +static DEFINE_MUTEX(worker_mutex);
  24004. +static struct sworker *glob_worker;
  24005. +
  24006. +struct sworker {
  24007. + struct list_head events;
  24008. + struct swait_head wq;
  24009. +
  24010. + raw_spinlock_t lock;
  24011. +
  24012. + struct task_struct *task;
  24013. + int refs;
  24014. +};
  24015. +
  24016. +static bool swork_readable(struct sworker *worker)
  24017. +{
  24018. + bool r;
  24019. +
  24020. + if (kthread_should_stop())
  24021. + return true;
  24022. +
  24023. + raw_spin_lock_irq(&worker->lock);
  24024. + r = !list_empty(&worker->events);
  24025. + raw_spin_unlock_irq(&worker->lock);
  24026. +
  24027. + return r;
  24028. +}
  24029. +
  24030. +static int swork_kthread(void *arg)
  24031. +{
  24032. + struct sworker *worker = arg;
  24033. +
  24034. + for (;;) {
  24035. + swait_event_interruptible(worker->wq,
  24036. + swork_readable(worker));
  24037. + if (kthread_should_stop())
  24038. + break;
  24039. +
  24040. + raw_spin_lock_irq(&worker->lock);
  24041. + while (!list_empty(&worker->events)) {
  24042. + struct swork_event *sev;
  24043. +
  24044. + sev = list_first_entry(&worker->events,
  24045. + struct swork_event, item);
  24046. + list_del(&sev->item);
  24047. + raw_spin_unlock_irq(&worker->lock);
  24048. +
  24049. + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
  24050. + &sev->flags));
  24051. + sev->func(sev);
  24052. + raw_spin_lock_irq(&worker->lock);
  24053. + }
  24054. + raw_spin_unlock_irq(&worker->lock);
  24055. + }
  24056. + return 0;
  24057. +}
  24058. +
  24059. +static struct sworker *swork_create(void)
  24060. +{
  24061. + struct sworker *worker;
  24062. +
  24063. + worker = kzalloc(sizeof(*worker), GFP_KERNEL);
  24064. + if (!worker)
  24065. + return ERR_PTR(-ENOMEM);
  24066. +
  24067. + INIT_LIST_HEAD(&worker->events);
  24068. + raw_spin_lock_init(&worker->lock);
  24069. + init_swait_head(&worker->wq);
  24070. +
  24071. + worker->task = kthread_run(swork_kthread, worker, "kswork");
  24072. + if (IS_ERR(worker->task)) {
  24073. + kfree(worker);
  24074. + return ERR_PTR(-ENOMEM);
  24075. + }
  24076. +
  24077. + return worker;
  24078. +}
  24079. +
  24080. +static void swork_destroy(struct sworker *worker)
  24081. +{
  24082. + kthread_stop(worker->task);
  24083. +
  24084. + WARN_ON(!list_empty(&worker->events));
  24085. + kfree(worker);
  24086. +}
  24087. +
  24088. +/**
  24089. + * swork_queue - queue swork
  24090. + *
24091. + * Returns %false if @sev was already queued, %true otherwise.
24092. + *
24093. + * The work is queued and processed on a random CPU.
  24094. + */
  24095. +bool swork_queue(struct swork_event *sev)
  24096. +{
  24097. + unsigned long flags;
  24098. +
  24099. + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
  24100. + return false;
  24101. +
  24102. + raw_spin_lock_irqsave(&glob_worker->lock, flags);
  24103. + list_add_tail(&sev->item, &glob_worker->events);
  24104. + raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
  24105. +
  24106. + swait_wake(&glob_worker->wq);
  24107. + return true;
  24108. +}
  24109. +EXPORT_SYMBOL_GPL(swork_queue);
  24110. +
  24111. +/**
  24112. + * swork_get - get an instance of the sworker
  24113. + *
24114. + * Returns a negative error code if the initialization of the worker did not
  24115. + * work, %0 otherwise.
  24116. + *
  24117. + */
  24118. +int swork_get(void)
  24119. +{
  24120. + struct sworker *worker;
  24121. +
  24122. + mutex_lock(&worker_mutex);
  24123. + if (!glob_worker) {
  24124. + worker = swork_create();
  24125. + if (IS_ERR(worker)) {
  24126. + mutex_unlock(&worker_mutex);
  24127. + return -ENOMEM;
  24128. + }
  24129. +
  24130. + glob_worker = worker;
  24131. + }
  24132. +
  24133. + glob_worker->refs++;
  24134. + mutex_unlock(&worker_mutex);
  24135. +
  24136. + return 0;
  24137. +}
  24138. +EXPORT_SYMBOL_GPL(swork_get);
  24139. +
  24140. +/**
  24141. + * swork_put - puts an instance of the sworker
  24142. + *
  24143. + * Will destroy the sworker thread. This function must not be called until all
  24144. + * queued events have been completed.
  24145. + */
  24146. +void swork_put(void)
  24147. +{
  24148. + mutex_lock(&worker_mutex);
  24149. +
  24150. + glob_worker->refs--;
  24151. + if (glob_worker->refs > 0)
  24152. + goto out;
  24153. +
  24154. + swork_destroy(glob_worker);
  24155. + glob_worker = NULL;
  24156. +out:
  24157. + mutex_unlock(&worker_mutex);
  24158. +}
  24159. +EXPORT_SYMBOL_GPL(swork_put);
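A hedged usage sketch for the swork API above: defer a callback from hard interrupt context to the kswork thread, where sleeping locks are legal. The demo_* names and the direct field initialisation are mine (the work-simple.h header elsewhere in this patch may provide an initialiser); swork_get(), swork_queue() and swork_put() are the functions defined above:

#include <linux/interrupt.h>
#include <linux/work-simple.h>

static struct swork_event demo_event;

static void demo_work_fn(struct swork_event *sev)
{
	/* Runs in kthread context: sleeping locks are fine here. */
}

static int demo_init(void)
{
	int ret = swork_get();		/* create/refcount the kswork thread */

	if (ret)
		return ret;
	INIT_LIST_HEAD(&demo_event.item);
	demo_event.flags = 0;
	demo_event.func = demo_work_fn;
	return 0;
}

static irqreturn_t demo_irq(int irq, void *dev_id)
{
	swork_queue(&demo_event);	/* raw lock + swait: safe from hard irq */
	return IRQ_HANDLED;
}

static void demo_exit(void)
{
	swork_put();			/* destroys the thread on the last put */
}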
  24160. diff -Nur linux-3.18.12.orig/kernel/signal.c linux-3.18.12/kernel/signal.c
  24161. --- linux-3.18.12.orig/kernel/signal.c 2015-04-20 14:48:02.000000000 -0500
  24162. +++ linux-3.18.12/kernel/signal.c 2015-04-26 13:32:22.447684003 -0500
  24163. @@ -14,6 +14,7 @@
  24164. #include <linux/export.h>
  24165. #include <linux/init.h>
  24166. #include <linux/sched.h>
  24167. +#include <linux/sched/rt.h>
  24168. #include <linux/fs.h>
  24169. #include <linux/tty.h>
  24170. #include <linux/binfmts.h>
  24171. @@ -352,13 +353,45 @@
  24172. return false;
  24173. }
  24174. +#ifdef __HAVE_ARCH_CMPXCHG
  24175. +static inline struct sigqueue *get_task_cache(struct task_struct *t)
  24176. +{
  24177. + struct sigqueue *q = t->sigqueue_cache;
  24178. +
  24179. + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
  24180. + return NULL;
  24181. + return q;
  24182. +}
  24183. +
  24184. +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
  24185. +{
  24186. + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
  24187. + return 0;
  24188. + return 1;
  24189. +}
  24190. +
  24191. +#else
  24192. +
  24193. +static inline struct sigqueue *get_task_cache(struct task_struct *t)
  24194. +{
  24195. + return NULL;
  24196. +}
  24197. +
  24198. +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
  24199. +{
  24200. + return 1;
  24201. +}
  24202. +
  24203. +#endif
  24204. +
  24205. /*
  24206. * allocate a new signal queue record
  24207. * - this may be called without locks if and only if t == current, otherwise an
  24208. * appropriate lock must be held to stop the target task from exiting
  24209. */
  24210. static struct sigqueue *
  24211. -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
  24212. +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
  24213. + int override_rlimit, int fromslab)
  24214. {
  24215. struct sigqueue *q = NULL;
  24216. struct user_struct *user;
  24217. @@ -375,7 +408,10 @@
  24218. if (override_rlimit ||
  24219. atomic_read(&user->sigpending) <=
  24220. task_rlimit(t, RLIMIT_SIGPENDING)) {
  24221. - q = kmem_cache_alloc(sigqueue_cachep, flags);
  24222. + if (!fromslab)
  24223. + q = get_task_cache(t);
  24224. + if (!q)
  24225. + q = kmem_cache_alloc(sigqueue_cachep, flags);
  24226. } else {
  24227. print_dropped_signal(sig);
  24228. }
  24229. @@ -392,6 +428,13 @@
  24230. return q;
  24231. }
  24232. +static struct sigqueue *
  24233. +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
  24234. + int override_rlimit)
  24235. +{
  24236. + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
  24237. +}
  24238. +
  24239. static void __sigqueue_free(struct sigqueue *q)
  24240. {
  24241. if (q->flags & SIGQUEUE_PREALLOC)
  24242. @@ -401,6 +444,21 @@
  24243. kmem_cache_free(sigqueue_cachep, q);
  24244. }
  24245. +static void sigqueue_free_current(struct sigqueue *q)
  24246. +{
  24247. + struct user_struct *up;
  24248. +
  24249. + if (q->flags & SIGQUEUE_PREALLOC)
  24250. + return;
  24251. +
  24252. + up = q->user;
  24253. + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
  24254. + atomic_dec(&up->sigpending);
  24255. + free_uid(up);
  24256. + } else
  24257. + __sigqueue_free(q);
  24258. +}
  24259. +
  24260. void flush_sigqueue(struct sigpending *queue)
  24261. {
  24262. struct sigqueue *q;
  24263. @@ -414,6 +472,21 @@
  24264. }
  24265. /*
  24266. + * Called from __exit_signal. Flush tsk->pending and
  24267. + * tsk->sigqueue_cache
  24268. + */
  24269. +void flush_task_sigqueue(struct task_struct *tsk)
  24270. +{
  24271. + struct sigqueue *q;
  24272. +
  24273. + flush_sigqueue(&tsk->pending);
  24274. +
  24275. + q = get_task_cache(tsk);
  24276. + if (q)
  24277. + kmem_cache_free(sigqueue_cachep, q);
  24278. +}
  24279. +
  24280. +/*
  24281. * Flush all pending signals for a task.
  24282. */
  24283. void __flush_signals(struct task_struct *t)
  24284. @@ -565,7 +638,7 @@
  24285. still_pending:
  24286. list_del_init(&first->list);
  24287. copy_siginfo(info, &first->info);
  24288. - __sigqueue_free(first);
  24289. + sigqueue_free_current(first);
  24290. } else {
  24291. /*
  24292. * Ok, it wasn't in the queue. This must be
  24293. @@ -611,6 +684,8 @@
  24294. {
  24295. int signr;
  24296. + WARN_ON_ONCE(tsk != current);
  24297. +
  24298. /* We only dequeue private signals from ourselves, we don't let
  24299. * signalfd steal them
  24300. */
  24301. @@ -1207,8 +1282,8 @@
  24302. * We don't want to have recursive SIGSEGV's etc, for example,
  24303. * that is why we also clear SIGNAL_UNKILLABLE.
  24304. */
  24305. -int
  24306. -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  24307. +static int
  24308. +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  24309. {
  24310. unsigned long int flags;
  24311. int ret, blocked, ignored;
  24312. @@ -1233,6 +1308,39 @@
  24313. return ret;
  24314. }
  24315. +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
  24316. +{
  24317. +/*
  24318. + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
24319. + * since it cannot enable preemption, and the signal code's spin_locks
  24320. + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
  24321. + * send the signal on exit of the trap.
  24322. + */
  24323. +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
  24324. + if (in_atomic()) {
  24325. + if (WARN_ON_ONCE(t != current))
  24326. + return 0;
  24327. + if (WARN_ON_ONCE(t->forced_info.si_signo))
  24328. + return 0;
  24329. +
  24330. + if (is_si_special(info)) {
  24331. + WARN_ON_ONCE(info != SEND_SIG_PRIV);
  24332. + t->forced_info.si_signo = sig;
  24333. + t->forced_info.si_errno = 0;
  24334. + t->forced_info.si_code = SI_KERNEL;
  24335. + t->forced_info.si_pid = 0;
  24336. + t->forced_info.si_uid = 0;
  24337. + } else {
  24338. + t->forced_info = *info;
  24339. + }
  24340. +
  24341. + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
  24342. + return 0;
  24343. + }
  24344. +#endif
  24345. + return do_force_sig_info(sig, info, t);
  24346. +}
  24347. +
  24348. /*
  24349. * Nuke all other threads in the group.
  24350. */
  24351. @@ -1267,12 +1375,12 @@
  24352. * Disable interrupts early to avoid deadlocks.
  24353. * See rcu_read_unlock() comment header for details.
  24354. */
  24355. - local_irq_save(*flags);
  24356. + local_irq_save_nort(*flags);
  24357. rcu_read_lock();
  24358. sighand = rcu_dereference(tsk->sighand);
  24359. if (unlikely(sighand == NULL)) {
  24360. rcu_read_unlock();
  24361. - local_irq_restore(*flags);
  24362. + local_irq_restore_nort(*flags);
  24363. break;
  24364. }
  24365. @@ -1283,7 +1391,7 @@
  24366. }
  24367. spin_unlock(&sighand->siglock);
  24368. rcu_read_unlock();
  24369. - local_irq_restore(*flags);
  24370. + local_irq_restore_nort(*flags);
  24371. }
  24372. return sighand;
  24373. @@ -1528,7 +1636,8 @@
  24374. */
  24375. struct sigqueue *sigqueue_alloc(void)
  24376. {
  24377. - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
24378. + /* Preallocated sigqueue objects always come from the slabcache! */
  24379. + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
  24380. if (q)
  24381. q->flags |= SIGQUEUE_PREALLOC;
  24382. @@ -1889,15 +1998,7 @@
  24383. if (gstop_done && ptrace_reparented(current))
  24384. do_notify_parent_cldstop(current, false, why);
  24385. - /*
  24386. - * Don't want to allow preemption here, because
  24387. - * sys_ptrace() needs this task to be inactive.
  24388. - *
  24389. - * XXX: implement read_unlock_no_resched().
  24390. - */
  24391. - preempt_disable();
  24392. read_unlock(&tasklist_lock);
  24393. - preempt_enable_no_resched();
  24394. freezable_schedule();
  24395. } else {
  24396. /*
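The sigqueue cache added above is a single-slot, lock-free cache: cmpxchg() claims or refills the per-task slot, and any racing user simply falls back to the slab. The same idiom as a stand-alone userspace model (names and the GCC __atomic builtins are mine, used only to illustrate the compare-and-swap pattern):

#include <stddef.h>

static void *slot;	/* one cached object, or NULL */

/* Like get_task_cache(): take the slot only if nobody raced with us. */
static void *cache_get(void)
{
	void *q = __atomic_load_n(&slot, __ATOMIC_RELAXED);

	if (!q || !__atomic_compare_exchange_n(&slot, &q, NULL, false,
					       __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
		return NULL;	/* caller falls back to the allocator */
	return q;
}

/* Like put_task_cache(): park the object only if the slot is empty.
 * Returns 0 if cached, 1 if the caller must free it itself. */
static int cache_put(void *q)
{
	void *expect = NULL;

	return __atomic_compare_exchange_n(&slot, &expect, q, false,
					   __ATOMIC_ACQ_REL, __ATOMIC_RELAXED) ? 0 : 1;
}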
  24397. diff -Nur linux-3.18.12.orig/kernel/softirq.c linux-3.18.12/kernel/softirq.c
  24398. --- linux-3.18.12.orig/kernel/softirq.c 2015-04-20 14:48:02.000000000 -0500
  24399. +++ linux-3.18.12/kernel/softirq.c 2015-04-26 13:32:22.451684003 -0500
  24400. @@ -21,10 +21,12 @@
  24401. #include <linux/freezer.h>
  24402. #include <linux/kthread.h>
  24403. #include <linux/rcupdate.h>
  24404. +#include <linux/delay.h>
  24405. #include <linux/ftrace.h>
  24406. #include <linux/smp.h>
  24407. #include <linux/smpboot.h>
  24408. #include <linux/tick.h>
  24409. +#include <linux/locallock.h>
  24410. #include <linux/irq.h>
  24411. #define CREATE_TRACE_POINTS
  24412. @@ -62,6 +64,98 @@
  24413. "TASKLET", "SCHED", "HRTIMER", "RCU"
  24414. };
  24415. +#ifdef CONFIG_NO_HZ_COMMON
  24416. +# ifdef CONFIG_PREEMPT_RT_FULL
  24417. +
  24418. +struct softirq_runner {
  24419. + struct task_struct *runner[NR_SOFTIRQS];
  24420. +};
  24421. +
  24422. +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
  24423. +
  24424. +static inline void softirq_set_runner(unsigned int sirq)
  24425. +{
  24426. + struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
  24427. +
  24428. + sr->runner[sirq] = current;
  24429. +}
  24430. +
  24431. +static inline void softirq_clr_runner(unsigned int sirq)
  24432. +{
  24433. + struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
  24434. +
  24435. + sr->runner[sirq] = NULL;
  24436. +}
  24437. +
  24438. +/*
  24439. + * On preempt-rt a softirq running context might be blocked on a
  24440. + * lock. There might be no other runnable task on this CPU because the
  24441. + * lock owner runs on some other CPU. So we have to go into idle with
24442. + * the pending bit set. Therefore we need to check this, otherwise we
  24443. + * warn about false positives which confuses users and defeats the
  24444. + * whole purpose of this test.
  24445. + *
  24446. + * This code is called with interrupts disabled.
  24447. + */
  24448. +void softirq_check_pending_idle(void)
  24449. +{
  24450. + static int rate_limit;
  24451. + struct softirq_runner *sr = &__get_cpu_var(softirq_runners);
  24452. + u32 warnpending;
  24453. + int i;
  24454. +
  24455. + if (rate_limit >= 10)
  24456. + return;
  24457. +
  24458. + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
  24459. + for (i = 0; i < NR_SOFTIRQS; i++) {
  24460. + struct task_struct *tsk = sr->runner[i];
  24461. +
  24462. + /*
  24463. + * The wakeup code in rtmutex.c wakes up the task
  24464. + * _before_ it sets pi_blocked_on to NULL under
  24465. + * tsk->pi_lock. So we need to check for both: state
  24466. + * and pi_blocked_on.
  24467. + */
  24468. + if (tsk) {
  24469. + raw_spin_lock(&tsk->pi_lock);
  24470. + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
  24471. + /* Clear all bits pending in that task */
  24472. + warnpending &= ~(tsk->softirqs_raised);
  24473. + warnpending &= ~(1 << i);
  24474. + }
  24475. + raw_spin_unlock(&tsk->pi_lock);
  24476. + }
  24477. + }
  24478. +
  24479. + if (warnpending) {
  24480. + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
  24481. + warnpending);
  24482. + rate_limit++;
  24483. + }
  24484. +}
  24485. +# else
  24486. +/*
  24487. + * On !PREEMPT_RT we just printk rate limited:
  24488. + */
  24489. +void softirq_check_pending_idle(void)
  24490. +{
  24491. + static int rate_limit;
  24492. +
  24493. + if (rate_limit < 10 &&
  24494. + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
  24495. + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
  24496. + local_softirq_pending());
  24497. + rate_limit++;
  24498. + }
  24499. +}
  24500. +# endif
  24501. +
  24502. +#else /* !CONFIG_NO_HZ_COMMON */
  24503. +static inline void softirq_set_runner(unsigned int sirq) { }
  24504. +static inline void softirq_clr_runner(unsigned int sirq) { }
  24505. +#endif
  24506. +
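The filter above boils down to bitmask arithmetic: start from the pending mask and clear every vector whose registered runner is merely blocked on a lock (or already runnable again). The following is a minimal user-space sketch of that masking step only; the vector names and the runner_blocked[] array are made-up stand-ins for the pi_blocked_on/TASK_RUNNING checks done under tsk->pi_lock.

#include <stdio.h>

#define NR_SOFTIRQS 10
enum { HI, TIMER, NET_TX, NET_RX, BLOCK, BLOCK_IOPOLL, TASKLET, SCHED, HRTIMER, RCU };

int main(void)
{
    /* Pretend TIMER and NET_RX are pending as we try to go idle ... */
    unsigned int pending = (1U << TIMER) | (1U << NET_RX);
    /* ... and the NET_RX runner is blocked on a lock. */
    int runner_blocked[NR_SOFTIRQS] = { [NET_RX] = 1 };
    unsigned int warnpending = pending;
    int i;

    for (i = 0; i < NR_SOFTIRQS; i++)
        if (runner_blocked[i])
            warnpending &= ~(1U << i);

    if (warnpending)
        printf("NOHZ: local_softirq_pending %02x\n", warnpending);
    else
        printf("pending %02x, but every owner is blocked - no warning\n", pending);
    return 0;
}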
  24507. /*
  24508. * we cannot loop indefinitely here to avoid userspace starvation,
  24509. * but we also don't want to introduce a worst case 1/HZ latency
  24510. @@ -77,6 +171,70 @@
  24511. wake_up_process(tsk);
  24512. }
  24513. +static void handle_softirq(unsigned int vec_nr)
  24514. +{
  24515. + struct softirq_action *h = softirq_vec + vec_nr;
  24516. + int prev_count;
  24517. +
  24518. + prev_count = preempt_count();
  24519. +
  24520. + kstat_incr_softirqs_this_cpu(vec_nr);
  24521. +
  24522. + trace_softirq_entry(vec_nr);
  24523. + h->action(h);
  24524. + trace_softirq_exit(vec_nr);
  24525. + if (unlikely(prev_count != preempt_count())) {
  24526. + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
  24527. + vec_nr, softirq_to_name[vec_nr], h->action,
  24528. + prev_count, preempt_count());
  24529. + preempt_count_set(prev_count);
  24530. + }
  24531. +}
  24532. +
  24533. +#ifndef CONFIG_PREEMPT_RT_FULL
  24534. +static inline int ksoftirqd_softirq_pending(void)
  24535. +{
  24536. + return local_softirq_pending();
  24537. +}
  24538. +
  24539. +static void handle_pending_softirqs(u32 pending, int need_rcu_bh_qs)
  24540. +{
  24541. + struct softirq_action *h = softirq_vec;
  24542. + int softirq_bit;
  24543. +
  24544. + local_irq_enable();
  24545. +
  24546. + h = softirq_vec;
  24547. +
  24548. + while ((softirq_bit = ffs(pending))) {
  24549. + unsigned int vec_nr;
  24550. +
  24551. + h += softirq_bit - 1;
  24552. + vec_nr = h - softirq_vec;
  24553. + handle_softirq(vec_nr);
  24554. +
  24555. + h++;
  24556. + pending >>= softirq_bit;
  24557. + }
  24558. +
  24559. + if (need_rcu_bh_qs)
  24560. + rcu_bh_qs();
  24561. + local_irq_disable();
  24562. +}
  24563. +
  24564. +static void run_ksoftirqd(unsigned int cpu)
  24565. +{
  24566. + local_irq_disable();
  24567. + if (ksoftirqd_softirq_pending()) {
  24568. + __do_softirq();
  24569. + rcu_note_context_switch(cpu);
  24570. + local_irq_enable();
  24571. + cond_resched();
  24572. + return;
  24573. + }
  24574. + local_irq_enable();
  24575. +}
  24576. +
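handle_pending_softirqs() walks the pending word with ffs(): each pass jumps straight to the lowest set bit, derives the vector number from how far the handler pointer has advanced, and shifts the processed bits away. Below is a compilable user-space sketch of the same walk, using an index offset in place of the softirq_vec pointer arithmetic; the handler is a trivial stand-in.

#include <stdio.h>
#include <strings.h>   /* ffs() */

static void handler(unsigned int vec_nr)
{
    printf("running softirq vector %u\n", vec_nr);
}

static void handle_pending(unsigned int pending)
{
    unsigned int base = 0;   /* vector number that bit 0 of 'pending' maps to */
    int bit;

    while ((bit = ffs(pending))) {
        unsigned int vec_nr = base + bit - 1;

        handler(vec_nr);
        base = vec_nr + 1;
        pending >>= bit;
    }
}

int main(void)
{
    handle_pending((1U << 1) | (1U << 3) | (1U << 9));
    return 0;
}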
  24577. /*
  24578. * preempt_count and SOFTIRQ_OFFSET usage:
  24579. * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
  24580. @@ -228,10 +386,8 @@
  24581. unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
  24582. unsigned long old_flags = current->flags;
  24583. int max_restart = MAX_SOFTIRQ_RESTART;
  24584. - struct softirq_action *h;
  24585. bool in_hardirq;
  24586. __u32 pending;
  24587. - int softirq_bit;
  24588. /*
24589. * Mask out PF_MEMALLOC as current task context is borrowed for the
  24590. @@ -250,36 +406,7 @@
  24591. /* Reset the pending bitmask before enabling irqs */
  24592. set_softirq_pending(0);
  24593. - local_irq_enable();
  24594. -
  24595. - h = softirq_vec;
  24596. -
  24597. - while ((softirq_bit = ffs(pending))) {
  24598. - unsigned int vec_nr;
  24599. - int prev_count;
  24600. -
  24601. - h += softirq_bit - 1;
  24602. -
  24603. - vec_nr = h - softirq_vec;
  24604. - prev_count = preempt_count();
  24605. -
  24606. - kstat_incr_softirqs_this_cpu(vec_nr);
  24607. -
  24608. - trace_softirq_entry(vec_nr);
  24609. - h->action(h);
  24610. - trace_softirq_exit(vec_nr);
  24611. - if (unlikely(prev_count != preempt_count())) {
  24612. - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
  24613. - vec_nr, softirq_to_name[vec_nr], h->action,
  24614. - prev_count, preempt_count());
  24615. - preempt_count_set(prev_count);
  24616. - }
  24617. - h++;
  24618. - pending >>= softirq_bit;
  24619. - }
  24620. -
  24621. - rcu_bh_qs();
  24622. - local_irq_disable();
  24623. + handle_pending_softirqs(pending, 1);
  24624. pending = local_softirq_pending();
  24625. if (pending) {
  24626. @@ -316,6 +443,285 @@
  24627. }
  24628. /*
  24629. + * This function must run with irqs disabled!
  24630. + */
  24631. +void raise_softirq_irqoff(unsigned int nr)
  24632. +{
  24633. + __raise_softirq_irqoff(nr);
  24634. +
  24635. + /*
  24636. + * If we're in an interrupt or softirq, we're done
  24637. + * (this also catches softirq-disabled code). We will
  24638. + * actually run the softirq once we return from
  24639. + * the irq or softirq.
  24640. + *
  24641. + * Otherwise we wake up ksoftirqd to make sure we
  24642. + * schedule the softirq soon.
  24643. + */
  24644. + if (!in_interrupt())
  24645. + wakeup_softirqd();
  24646. +}
  24647. +
  24648. +void __raise_softirq_irqoff(unsigned int nr)
  24649. +{
  24650. + trace_softirq_raise(nr);
  24651. + or_softirq_pending(1UL << nr);
  24652. +}
  24653. +
  24654. +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
  24655. +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
  24656. +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
  24657. +static void ksoftirqd_clr_sched_params(unsigned int cpu, bool online) { }
  24658. +
  24659. +#else /* !PREEMPT_RT_FULL */
  24660. +
  24661. +/*
  24662. + * On RT we serialize softirq execution with a cpu local lock per softirq
  24663. + */
  24664. +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
  24665. +
  24666. +void __init softirq_early_init(void)
  24667. +{
  24668. + int i;
  24669. +
  24670. + for (i = 0; i < NR_SOFTIRQS; i++)
  24671. + local_irq_lock_init(local_softirq_locks[i]);
  24672. +}
  24673. +
  24674. +static void lock_softirq(int which)
  24675. +{
  24676. + local_lock(local_softirq_locks[which]);
  24677. +}
  24678. +
  24679. +static void unlock_softirq(int which)
  24680. +{
  24681. + local_unlock(local_softirq_locks[which]);
  24682. +}
  24683. +
  24684. +static void do_single_softirq(int which, int need_rcu_bh_qs)
  24685. +{
  24686. + unsigned long old_flags = current->flags;
  24687. +
  24688. + current->flags &= ~PF_MEMALLOC;
  24689. + vtime_account_irq_enter(current);
  24690. + current->flags |= PF_IN_SOFTIRQ;
  24691. + lockdep_softirq_enter();
  24692. + local_irq_enable();
  24693. + handle_softirq(which);
  24694. + local_irq_disable();
  24695. + lockdep_softirq_exit();
  24696. + current->flags &= ~PF_IN_SOFTIRQ;
  24697. + vtime_account_irq_enter(current);
  24698. + tsk_restore_flags(current, old_flags, PF_MEMALLOC);
  24699. +}
  24700. +
  24701. +/*
  24702. + * Called with interrupts disabled. Process softirqs which were raised
  24703. + * in current context (or on behalf of ksoftirqd).
  24704. + */
  24705. +static void do_current_softirqs(int need_rcu_bh_qs)
  24706. +{
  24707. + while (current->softirqs_raised) {
  24708. + int i = __ffs(current->softirqs_raised);
  24709. + unsigned int pending, mask = (1U << i);
  24710. +
  24711. + current->softirqs_raised &= ~mask;
  24712. + local_irq_enable();
  24713. +
  24714. + /*
  24715. + * If the lock is contended, we boost the owner to
  24716. + * process the softirq or leave the critical section
  24717. + * now.
  24718. + */
  24719. + lock_softirq(i);
  24720. + local_irq_disable();
  24721. + softirq_set_runner(i);
  24722. + /*
24723. + * Check the local_softirq_pending() bits to see
24724. + * whether we still need to process this or whether
24725. + * someone else already took care of it.
  24726. + */
  24727. + pending = local_softirq_pending();
  24728. + if (pending & mask) {
  24729. + set_softirq_pending(pending & ~mask);
  24730. + do_single_softirq(i, need_rcu_bh_qs);
  24731. + }
  24732. + softirq_clr_runner(i);
  24733. + unlock_softirq(i);
  24734. + WARN_ON(current->softirq_nestcnt != 1);
  24735. + }
  24736. +}
  24737. +
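The serialization above can be pictured as one lock per softirq class: two contexts handling different classes never contend, while two contexts handling the same class are strictly ordered (and on RT the rtmutex-based local lock additionally boosts a preempted owner). Here is a rough pthread sketch of the per-class locking with arbitrary class numbers; it does not model priority inheritance or the per-CPU aspect.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NR_CLASSES 4

/* One lock per softirq class, loosely mirroring local_softirq_locks[]. */
static pthread_mutex_t class_lock[NR_CLASSES] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void run_class(int which, const char *who)
{
    pthread_mutex_lock(&class_lock[which]);
    printf("%s runs class %d\n", who, which);
    usleep(1000);                       /* pretend to do work */
    pthread_mutex_unlock(&class_lock[which]);
}

static void *irq_thread(void *arg)
{
    (void)arg;
    run_class(2, "irq thread");         /* e.g. a networking class */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, irq_thread, NULL);
    run_class(1, "ksoftirqd");          /* different class: no contention */
    run_class(2, "ksoftirqd");          /* same class: serialized with the thread */
    pthread_join(t, NULL);
    return 0;
}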
  24738. +static void __local_bh_disable(void)
  24739. +{
  24740. + if (++current->softirq_nestcnt == 1)
  24741. + migrate_disable();
  24742. +}
  24743. +
  24744. +void local_bh_disable(void)
  24745. +{
  24746. + __local_bh_disable();
  24747. +}
  24748. +EXPORT_SYMBOL(local_bh_disable);
  24749. +
  24750. +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
  24751. +{
  24752. + __local_bh_disable();
  24753. + if (cnt & PREEMPT_CHECK_OFFSET)
  24754. + preempt_disable();
  24755. +}
  24756. +
  24757. +static void __local_bh_enable(void)
  24758. +{
  24759. + if (WARN_ON(current->softirq_nestcnt == 0))
  24760. + return;
  24761. +
  24762. + local_irq_disable();
  24763. + if (current->softirq_nestcnt == 1 && current->softirqs_raised)
  24764. + do_current_softirqs(1);
  24765. + local_irq_enable();
  24766. +
  24767. + if (--current->softirq_nestcnt == 0)
  24768. + migrate_enable();
  24769. +}
  24770. +
  24771. +void local_bh_enable(void)
  24772. +{
  24773. + __local_bh_enable();
  24774. +}
  24775. +EXPORT_SYMBOL(local_bh_enable);
  24776. +
  24777. +extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
  24778. +{
  24779. + __local_bh_enable();
  24780. + if (cnt & PREEMPT_CHECK_OFFSET)
  24781. + preempt_enable();
  24782. +}
  24783. +
  24784. +void local_bh_enable_ip(unsigned long ip)
  24785. +{
  24786. + local_bh_enable();
  24787. +}
  24788. +EXPORT_SYMBOL(local_bh_enable_ip);
  24789. +
  24790. +void _local_bh_enable(void)
  24791. +{
  24792. + if (WARN_ON(current->softirq_nestcnt == 0))
  24793. + return;
  24794. + if (--current->softirq_nestcnt == 0)
  24795. + migrate_enable();
  24796. +}
  24797. +EXPORT_SYMBOL(_local_bh_enable);
  24798. +
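On RT, local_bh_disable()/local_bh_enable() reduce to a per-task nesting counter: only the 0 -> 1 transition disables migration, and only the outermost enable, with the count still at 1, runs whatever was raised inside the section. The sketch below models just that counting rule; migration disabling and the interrupt state handling are left out.

#include <stdio.h>

static int nestcnt;            /* models current->softirq_nestcnt */
static unsigned int raised;    /* models current->softirqs_raised */

static void bh_disable(void) { nestcnt++; }

static void bh_enable(void)
{
    /* Only the outermost enable processes what was raised meanwhile. */
    if (nestcnt == 1 && raised) {
        printf("running raised softirqs: %#x\n", raised);
        raised = 0;
    }
    nestcnt--;
}

int main(void)
{
    bh_disable();
    bh_disable();          /* nested bh-disabled section */
    raised |= 1U << 3;     /* something raised a softirq meanwhile */
    bh_enable();           /* inner enable: nothing runs yet */
    bh_enable();           /* outermost enable: the softirq runs here */
    return 0;
}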
  24799. +int in_serving_softirq(void)
  24800. +{
  24801. + return current->flags & PF_IN_SOFTIRQ;
  24802. +}
  24803. +EXPORT_SYMBOL(in_serving_softirq);
  24804. +
  24805. +/* Called with preemption disabled */
  24806. +static void run_ksoftirqd(unsigned int cpu)
  24807. +{
  24808. + local_irq_disable();
  24809. + current->softirq_nestcnt++;
  24810. +
  24811. + do_current_softirqs(1);
  24812. + current->softirq_nestcnt--;
  24813. + rcu_note_context_switch(cpu);
  24814. + local_irq_enable();
  24815. +}
  24816. +
  24817. +/*
  24818. + * Called from netif_rx_ni(). Preemption enabled, but migration
  24819. + * disabled. So the cpu can't go away under us.
  24820. + */
  24821. +void thread_do_softirq(void)
  24822. +{
  24823. + if (!in_serving_softirq() && current->softirqs_raised) {
  24824. + current->softirq_nestcnt++;
  24825. + do_current_softirqs(0);
  24826. + current->softirq_nestcnt--;
  24827. + }
  24828. +}
  24829. +
  24830. +static void do_raise_softirq_irqoff(unsigned int nr)
  24831. +{
  24832. + trace_softirq_raise(nr);
  24833. + or_softirq_pending(1UL << nr);
  24834. +
  24835. + /*
  24836. + * If we are not in a hard interrupt and inside a bh disabled
  24837. + * region, we simply raise the flag on current. local_bh_enable()
  24838. + * will make sure that the softirq is executed. Otherwise we
  24839. + * delegate it to ksoftirqd.
  24840. + */
  24841. + if (!in_irq() && current->softirq_nestcnt)
  24842. + current->softirqs_raised |= (1U << nr);
  24843. + else if (__this_cpu_read(ksoftirqd))
  24844. + __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr);
  24845. +}
  24846. +
  24847. +void __raise_softirq_irqoff(unsigned int nr)
  24848. +{
  24849. + do_raise_softirq_irqoff(nr);
  24850. + if (!in_irq() && !current->softirq_nestcnt)
  24851. + wakeup_softirqd();
  24852. +}
  24853. +
  24854. +/*
  24855. + * This function must run with irqs disabled!
  24856. + */
  24857. +void raise_softirq_irqoff(unsigned int nr)
  24858. +{
  24859. + do_raise_softirq_irqoff(nr);
  24860. +
  24861. + /*
  24862. + * If we're in an hard interrupt we let irq return code deal
  24863. + * with the wakeup of ksoftirqd.
  24864. + */
  24865. + if (in_irq())
  24866. + return;
  24867. + /*
  24868. + * If we are in thread context but outside of a bh disabled
  24869. + * region, we need to wake ksoftirqd as well.
  24870. + *
  24871. + * CHECKME: Some of the places which do that could be wrapped
  24872. + * into local_bh_disable/enable pairs. Though it's unclear
  24873. + * whether this is worth the effort. To find those places just
  24874. + * raise a WARN() if the condition is met.
  24875. + */
  24876. + if (!current->softirq_nestcnt)
  24877. + wakeup_softirqd();
  24878. +}
  24879. +
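Where a raised softirq ends up on RT therefore depends on two questions only: are we in a hard interrupt, and is current inside a bh-disabled section? The sketch below models that branch with plain structs; the task fields mirror softirq_nestcnt and softirqs_raised, and the actual wakeup of ksoftirqd is left out.

#include <stdio.h>

struct task {
    const char *comm;
    int softirq_nestcnt;
    unsigned int softirqs_raised;
};

/*
 * Mirror of the do_raise_softirq_irqoff() decision: inside a bh-disabled
 * region the bit lands on current and local_bh_enable() will handle it;
 * otherwise it lands on ksoftirqd, which then has to be woken.
 */
static void raise(struct task *current, struct task *ksoftirqd,
                  unsigned int nr, int in_hardirq)
{
    if (!in_hardirq && current->softirq_nestcnt)
        current->softirqs_raised |= 1U << nr;
    else
        ksoftirqd->softirqs_raised |= 1U << nr;
}

int main(void)
{
    struct task me  = { "me", 1, 0 };           /* inside local_bh_disable() */
    struct task ksd = { "ksoftirqd/0", 0, 0 };

    raise(&me, &ksd, 3, 0);
    me.softirq_nestcnt = 0;                     /* now outside any bh section */
    raise(&me, &ksd, 6, 0);
    printf("me: %#x  ksoftirqd: %#x\n", me.softirqs_raised, ksd.softirqs_raised);
    return 0;
}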
  24880. +static inline int ksoftirqd_softirq_pending(void)
  24881. +{
  24882. + return current->softirqs_raised;
  24883. +}
  24884. +
  24885. +static inline void local_bh_disable_nort(void) { }
  24886. +static inline void _local_bh_enable_nort(void) { }
  24887. +
  24888. +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
  24889. +{
  24890. + struct sched_param param = { .sched_priority = 1 };
  24891. +
  24892. + sched_setscheduler(current, SCHED_FIFO, &param);
  24893. + /* Take over all pending softirqs when starting */
  24894. + local_irq_disable();
  24895. + current->softirqs_raised = local_softirq_pending();
  24896. + local_irq_enable();
  24897. +}
  24898. +
  24899. +static inline void ksoftirqd_clr_sched_params(unsigned int cpu, bool online)
  24900. +{
  24901. + struct sched_param param = { .sched_priority = 0 };
  24902. +
  24903. + sched_setscheduler(current, SCHED_NORMAL, &param);
  24904. +}
  24905. +
  24906. +#endif /* PREEMPT_RT_FULL */
  24907. +/*
  24908. * Enter an interrupt context.
  24909. */
  24910. void irq_enter(void)
  24911. @@ -326,9 +732,9 @@
  24912. * Prevent raise_softirq from needlessly waking up ksoftirqd
  24913. * here, as softirq will be serviced on return from interrupt.
  24914. */
  24915. - local_bh_disable();
  24916. + local_bh_disable_nort();
  24917. tick_irq_enter();
  24918. - _local_bh_enable();
  24919. + _local_bh_enable_nort();
  24920. }
  24921. __irq_enter();
  24922. @@ -336,6 +742,7 @@
  24923. static inline void invoke_softirq(void)
  24924. {
  24925. +#ifndef CONFIG_PREEMPT_RT_FULL
  24926. if (!force_irqthreads) {
  24927. #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
  24928. /*
  24929. @@ -355,6 +762,15 @@
  24930. } else {
  24931. wakeup_softirqd();
  24932. }
  24933. +#else /* PREEMPT_RT_FULL */
  24934. + unsigned long flags;
  24935. +
  24936. + local_irq_save(flags);
  24937. + if (__this_cpu_read(ksoftirqd) &&
  24938. + __this_cpu_read(ksoftirqd)->softirqs_raised)
  24939. + wakeup_softirqd();
  24940. + local_irq_restore(flags);
  24941. +#endif
  24942. }
  24943. static inline void tick_irq_exit(void)
  24944. @@ -391,26 +807,6 @@
  24945. trace_hardirq_exit(); /* must be last! */
  24946. }
  24947. -/*
  24948. - * This function must run with irqs disabled!
  24949. - */
  24950. -inline void raise_softirq_irqoff(unsigned int nr)
  24951. -{
  24952. - __raise_softirq_irqoff(nr);
  24953. -
  24954. - /*
  24955. - * If we're in an interrupt or softirq, we're done
  24956. - * (this also catches softirq-disabled code). We will
  24957. - * actually run the softirq once we return from
  24958. - * the irq or softirq.
  24959. - *
  24960. - * Otherwise we wake up ksoftirqd to make sure we
  24961. - * schedule the softirq soon.
  24962. - */
  24963. - if (!in_interrupt())
  24964. - wakeup_softirqd();
  24965. -}
  24966. -
  24967. void raise_softirq(unsigned int nr)
  24968. {
  24969. unsigned long flags;
  24970. @@ -420,12 +816,6 @@
  24971. local_irq_restore(flags);
  24972. }
  24973. -void __raise_softirq_irqoff(unsigned int nr)
  24974. -{
  24975. - trace_softirq_raise(nr);
  24976. - or_softirq_pending(1UL << nr);
  24977. -}
  24978. -
  24979. void open_softirq(int nr, void (*action)(struct softirq_action *))
  24980. {
  24981. softirq_vec[nr].action = action;
  24982. @@ -442,15 +832,45 @@
  24983. static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
  24984. static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
24985. +static inline void
  24986. +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
  24987. +{
  24988. + if (tasklet_trylock(t)) {
  24989. +again:
  24990. + /* We may have been preempted before tasklet_trylock
  24991. + * and __tasklet_action may have already run.
  24992. + * So double check the sched bit while the takslet
  24993. + * is locked before adding it to the list.
  24994. + */
  24995. + if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
  24996. + t->next = NULL;
  24997. + *head->tail = t;
  24998. + head->tail = &(t->next);
  24999. + raise_softirq_irqoff(nr);
  25000. + tasklet_unlock(t);
  25001. + } else {
25002. + /* This is subtle. If we hit the corner case above,
25003. + * it is possible that we get preempted right here,
25004. + * and another task has successfully called
25005. + * tasklet_schedule(), reached this function, and
25006. + * failed on the trylock. Thus we must be sure,
25007. + * before releasing the tasklet lock, that the
25008. + * SCHED_BIT is clear. Otherwise the tasklet
25009. + * may get its SCHED_BIT set but never be added
25010. + * to the list.
  25011. + */
  25012. + if (!tasklet_tryunlock(t))
  25013. + goto again;
  25014. + }
  25015. + }
  25016. +}
  25017. +
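The trylock/tryunlock pair used above is a small state machine on the tasklet's state word: trylock atomically sets the RUN bit and succeeds only if it was clear, while tryunlock releases it only if nothing else (such as SCHED) was set in the meantime. Below is a user-space model with C11 atomics; the real kernel uses test_and_set_bit() and cmpxchg(), and the bit numbers follow the usual SCHED=0/RUN=1 layout assumed here.

#include <stdatomic.h>
#include <stdio.h>

#define STATE_SCHED 0   /* queued on a list */
#define STATE_RUN   1   /* running / locked */

struct tasklet { atomic_ulong state; };

/* RUN bit 0 -> 1: only one context may own the tasklet at a time. */
static int tasklet_trylock(struct tasklet *t)
{
    return !(atomic_fetch_or(&t->state, 1UL << STATE_RUN)
             & (1UL << STATE_RUN));
}

/* RUN -> 0 only if no other bit was set meanwhile. */
static int tasklet_tryunlock(struct tasklet *t)
{
    unsigned long expected = 1UL << STATE_RUN;

    return atomic_compare_exchange_strong(&t->state, &expected, 0UL);
}

int main(void)
{
    struct tasklet t;

    atomic_init(&t.state, 0);
    printf("trylock: %d\n", tasklet_trylock(&t));     /* 1: we own it */
    printf("trylock: %d\n", tasklet_trylock(&t));     /* 0: already locked */
    atomic_fetch_or(&t.state, 1UL << STATE_SCHED);
    printf("tryunlock: %d\n", tasklet_tryunlock(&t)); /* 0: SCHED set meanwhile */
    atomic_fetch_and(&t.state, ~(1UL << STATE_SCHED));
    printf("tryunlock: %d\n", tasklet_tryunlock(&t)); /* 1: clean release */
    return 0;
}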
  25018. void __tasklet_schedule(struct tasklet_struct *t)
  25019. {
  25020. unsigned long flags;
  25021. local_irq_save(flags);
  25022. - t->next = NULL;
  25023. - *__this_cpu_read(tasklet_vec.tail) = t;
  25024. - __this_cpu_write(tasklet_vec.tail, &(t->next));
  25025. - raise_softirq_irqoff(TASKLET_SOFTIRQ);
  25026. + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ);
  25027. local_irq_restore(flags);
  25028. }
  25029. EXPORT_SYMBOL(__tasklet_schedule);
  25030. @@ -460,10 +880,7 @@
  25031. unsigned long flags;
  25032. local_irq_save(flags);
  25033. - t->next = NULL;
  25034. - *__this_cpu_read(tasklet_hi_vec.tail) = t;
  25035. - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
  25036. - raise_softirq_irqoff(HI_SOFTIRQ);
  25037. + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ);
  25038. local_irq_restore(flags);
  25039. }
  25040. EXPORT_SYMBOL(__tasklet_hi_schedule);
  25041. @@ -472,48 +889,116 @@
  25042. {
  25043. BUG_ON(!irqs_disabled());
  25044. - t->next = __this_cpu_read(tasklet_hi_vec.head);
  25045. - __this_cpu_write(tasklet_hi_vec.head, t);
  25046. - __raise_softirq_irqoff(HI_SOFTIRQ);
  25047. + __tasklet_hi_schedule(t);
  25048. }
  25049. EXPORT_SYMBOL(__tasklet_hi_schedule_first);
  25050. -static void tasklet_action(struct softirq_action *a)
  25051. +void tasklet_enable(struct tasklet_struct *t)
  25052. {
  25053. - struct tasklet_struct *list;
  25054. + if (!atomic_dec_and_test(&t->count))
  25055. + return;
  25056. + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
  25057. + tasklet_schedule(t);
  25058. +}
  25059. +EXPORT_SYMBOL(tasklet_enable);
  25060. - local_irq_disable();
  25061. - list = __this_cpu_read(tasklet_vec.head);
  25062. - __this_cpu_write(tasklet_vec.head, NULL);
  25063. - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
  25064. - local_irq_enable();
  25065. +void tasklet_hi_enable(struct tasklet_struct *t)
  25066. +{
  25067. + if (!atomic_dec_and_test(&t->count))
  25068. + return;
  25069. + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
  25070. + tasklet_hi_schedule(t);
  25071. +}
  25072. +EXPORT_SYMBOL(tasklet_hi_enable);
  25073. +
  25074. +static void __tasklet_action(struct softirq_action *a,
  25075. + struct tasklet_struct *list)
  25076. +{
  25077. + int loops = 1000000;
  25078. while (list) {
  25079. struct tasklet_struct *t = list;
  25080. list = list->next;
  25081. - if (tasklet_trylock(t)) {
  25082. - if (!atomic_read(&t->count)) {
  25083. - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
  25084. - &t->state))
  25085. - BUG();
  25086. - t->func(t->data);
  25087. - tasklet_unlock(t);
  25088. - continue;
  25089. - }
  25090. - tasklet_unlock(t);
  25091. + /*
25092. + * Should always succeed - after a tasklet got on the
  25093. + * list (after getting the SCHED bit set from 0 to 1),
  25094. + * nothing but the tasklet softirq it got queued to can
  25095. + * lock it:
  25096. + */
  25097. + if (!tasklet_trylock(t)) {
  25098. + WARN_ON(1);
  25099. + continue;
  25100. }
  25101. - local_irq_disable();
  25102. t->next = NULL;
  25103. - *__this_cpu_read(tasklet_vec.tail) = t;
  25104. - __this_cpu_write(tasklet_vec.tail, &(t->next));
  25105. - __raise_softirq_irqoff(TASKLET_SOFTIRQ);
  25106. - local_irq_enable();
  25107. +
  25108. + /*
  25109. + * If we cannot handle the tasklet because it's disabled,
  25110. + * mark it as pending. tasklet_enable() will later
  25111. + * re-schedule the tasklet.
  25112. + */
  25113. + if (unlikely(atomic_read(&t->count))) {
  25114. +out_disabled:
  25115. + /* implicit unlock: */
  25116. + wmb();
  25117. + t->state = TASKLET_STATEF_PENDING;
  25118. + continue;
  25119. + }
  25120. +
  25121. + /*
25122. + * From this point on the tasklet might be rescheduled
  25123. + * on another CPU, but it can only be added to another
  25124. + * CPU's tasklet list if we unlock the tasklet (which we
25125. + * don't do yet).
  25126. + */
  25127. + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
  25128. + WARN_ON(1);
  25129. +
  25130. +again:
  25131. + t->func(t->data);
  25132. +
  25133. + /*
  25134. + * Try to unlock the tasklet. We must use cmpxchg, because
  25135. + * another CPU might have scheduled or disabled the tasklet.
  25136. + * We only allow the STATE_RUN -> 0 transition here.
  25137. + */
  25138. + while (!tasklet_tryunlock(t)) {
  25139. + /*
  25140. + * If it got disabled meanwhile, bail out:
  25141. + */
  25142. + if (atomic_read(&t->count))
  25143. + goto out_disabled;
  25144. + /*
  25145. + * If it got scheduled meanwhile, re-execute
  25146. + * the tasklet function:
  25147. + */
  25148. + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
  25149. + goto again;
  25150. + if (!--loops) {
  25151. + printk("hm, tasklet state: %08lx\n", t->state);
  25152. + WARN_ON(1);
  25153. + tasklet_unlock(t);
  25154. + break;
  25155. + }
  25156. + }
  25157. }
  25158. }
  25159. +static void tasklet_action(struct softirq_action *a)
  25160. +{
  25161. + struct tasklet_struct *list;
  25162. +
  25163. + local_irq_disable();
  25164. + list = __get_cpu_var(tasklet_vec).head;
  25165. + __get_cpu_var(tasklet_vec).head = NULL;
  25166. + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head;
  25167. + local_irq_enable();
  25168. +
  25169. + __tasklet_action(a, list);
  25170. +}
  25171. +
  25172. static void tasklet_hi_action(struct softirq_action *a)
  25173. {
  25174. struct tasklet_struct *list;
  25175. @@ -524,30 +1009,7 @@
  25176. __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
  25177. local_irq_enable();
  25178. - while (list) {
  25179. - struct tasklet_struct *t = list;
  25180. -
  25181. - list = list->next;
  25182. -
  25183. - if (tasklet_trylock(t)) {
  25184. - if (!atomic_read(&t->count)) {
  25185. - if (!test_and_clear_bit(TASKLET_STATE_SCHED,
  25186. - &t->state))
  25187. - BUG();
  25188. - t->func(t->data);
  25189. - tasklet_unlock(t);
  25190. - continue;
  25191. - }
  25192. - tasklet_unlock(t);
  25193. - }
  25194. -
  25195. - local_irq_disable();
  25196. - t->next = NULL;
  25197. - *__this_cpu_read(tasklet_hi_vec.tail) = t;
  25198. - __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
  25199. - __raise_softirq_irqoff(HI_SOFTIRQ);
  25200. - local_irq_enable();
  25201. - }
  25202. + __tasklet_action(a, list);
  25203. }
  25204. void tasklet_init(struct tasklet_struct *t,
  25205. @@ -568,7 +1030,7 @@
  25206. while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
  25207. do {
  25208. - yield();
  25209. + msleep(1);
  25210. } while (test_bit(TASKLET_STATE_SCHED, &t->state));
  25211. }
  25212. tasklet_unlock_wait(t);
  25213. @@ -642,26 +1104,26 @@
  25214. open_softirq(HI_SOFTIRQ, tasklet_hi_action);
  25215. }
  25216. -static int ksoftirqd_should_run(unsigned int cpu)
  25217. -{
  25218. - return local_softirq_pending();
  25219. -}
  25220. -
  25221. -static void run_ksoftirqd(unsigned int cpu)
  25222. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  25223. +void tasklet_unlock_wait(struct tasklet_struct *t)
  25224. {
  25225. - local_irq_disable();
  25226. - if (local_softirq_pending()) {
  25227. + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
  25228. /*
  25229. - * We can safely run softirq on inline stack, as we are not deep
  25230. - * in the task stack here.
  25231. + * Hack for now to avoid this busy-loop:
  25232. */
  25233. - __do_softirq();
  25234. - rcu_note_context_switch(cpu);
  25235. - local_irq_enable();
  25236. - cond_resched();
  25237. - return;
  25238. +#ifdef CONFIG_PREEMPT_RT_FULL
  25239. + msleep(1);
  25240. +#else
  25241. + barrier();
  25242. +#endif
  25243. }
  25244. - local_irq_enable();
  25245. +}
  25246. +EXPORT_SYMBOL(tasklet_unlock_wait);
  25247. +#endif
  25248. +
  25249. +static int ksoftirqd_should_run(unsigned int cpu)
  25250. +{
  25251. + return ksoftirqd_softirq_pending();
  25252. }
  25253. #ifdef CONFIG_HOTPLUG_CPU
  25254. @@ -743,6 +1205,8 @@
  25255. static struct smp_hotplug_thread softirq_threads = {
  25256. .store = &ksoftirqd,
  25257. + .setup = ksoftirqd_set_sched_params,
  25258. + .cleanup = ksoftirqd_clr_sched_params,
  25259. .thread_should_run = ksoftirqd_should_run,
  25260. .thread_fn = run_ksoftirqd,
  25261. .thread_comm = "ksoftirqd/%u",
  25262. diff -Nur linux-3.18.12.orig/kernel/stop_machine.c linux-3.18.12/kernel/stop_machine.c
  25263. --- linux-3.18.12.orig/kernel/stop_machine.c 2015-04-20 14:48:02.000000000 -0500
  25264. +++ linux-3.18.12/kernel/stop_machine.c 2015-04-26 13:32:22.451684003 -0500
  25265. @@ -30,12 +30,12 @@
  25266. atomic_t nr_todo; /* nr left to execute */
  25267. bool executed; /* actually executed? */
  25268. int ret; /* collected return value */
  25269. - struct completion completion; /* fired if nr_todo reaches 0 */
  25270. + struct task_struct *waiter; /* woken when nr_todo reaches 0 */
  25271. };
  25272. /* the actual stopper, one per every possible cpu, enabled on online cpus */
  25273. struct cpu_stopper {
  25274. - spinlock_t lock;
  25275. + raw_spinlock_t lock;
  25276. bool enabled; /* is this stopper enabled? */
  25277. struct list_head works; /* list of pending works */
  25278. };
  25279. @@ -56,7 +56,7 @@
  25280. {
  25281. memset(done, 0, sizeof(*done));
  25282. atomic_set(&done->nr_todo, nr_todo);
  25283. - init_completion(&done->completion);
  25284. + done->waiter = current;
  25285. }
  25286. /* signal completion unless @done is NULL */
  25287. @@ -65,8 +65,10 @@
  25288. if (done) {
  25289. if (executed)
  25290. done->executed = true;
  25291. - if (atomic_dec_and_test(&done->nr_todo))
  25292. - complete(&done->completion);
  25293. + if (atomic_dec_and_test(&done->nr_todo)) {
  25294. + wake_up_process(done->waiter);
  25295. + done->waiter = NULL;
  25296. + }
  25297. }
  25298. }
  25299. @@ -78,7 +80,7 @@
  25300. unsigned long flags;
  25301. - spin_lock_irqsave(&stopper->lock, flags);
  25302. + raw_spin_lock_irqsave(&stopper->lock, flags);
  25303. if (stopper->enabled) {
  25304. list_add_tail(&work->list, &stopper->works);
  25305. @@ -86,7 +88,23 @@
  25306. } else
  25307. cpu_stop_signal_done(work->done, false);
  25308. - spin_unlock_irqrestore(&stopper->lock, flags);
  25309. + raw_spin_unlock_irqrestore(&stopper->lock, flags);
  25310. +}
  25311. +
  25312. +static void wait_for_stop_done(struct cpu_stop_done *done)
  25313. +{
  25314. + set_current_state(TASK_UNINTERRUPTIBLE);
  25315. + while (atomic_read(&done->nr_todo)) {
  25316. + schedule();
  25317. + set_current_state(TASK_UNINTERRUPTIBLE);
  25318. + }
  25319. + /*
  25320. + * We need to wait until cpu_stop_signal_done() has cleared
  25321. + * done->waiter.
  25322. + */
  25323. + while (done->waiter)
  25324. + cpu_relax();
  25325. + set_current_state(TASK_RUNNING);
  25326. }
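wait_for_stop_done() has to wait twice: first for nr_todo to hit zero, then for the stopper to drop its reference to the on-stack done structure (done->waiter), because the structure may be reused the moment this function returns. The following is a user-space sketch of that two-phase handshake with atomics; the names are invented for the sketch.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the on-stack struct cpu_stop_done. */
struct done {
    atomic_int nr_todo;
    atomic_int waiter_present;   /* models done->waiter being non-NULL */
};

static void *stopper(void *arg)
{
    struct done *d = arg;

    /* ... stopper work would run here ... */
    if (atomic_fetch_sub(&d->nr_todo, 1) == 1) {
        /* last stopper: wake the waiter, then drop the reference */
        atomic_store(&d->waiter_present, 0);
    }
    return NULL;
}

int main(void)
{
    struct done d;
    pthread_t t;

    atomic_store(&d.nr_todo, 1);
    atomic_store(&d.waiter_present, 1);
    pthread_create(&t, NULL, stopper, &d);

    /* Phase 1: wait until every stopper has signalled completion. */
    while (atomic_load(&d.nr_todo))
        sched_yield();              /* the kernel sleeps via schedule() here */
    /* Phase 2: 'd' must not be reused while the stopper still points at it. */
    while (atomic_load(&d.waiter_present))
        sched_yield();              /* the kernel cpu_relax()es here */

    pthread_join(t, NULL);
    printf("all stoppers finished; the done structure can go out of scope\n");
    return 0;
}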
  25327. /**
  25328. @@ -120,7 +138,7 @@
  25329. cpu_stop_init_done(&done, 1);
  25330. cpu_stop_queue_work(cpu, &work);
  25331. - wait_for_completion(&done.completion);
  25332. + wait_for_stop_done(&done);
  25333. return done.executed ? done.ret : -ENOENT;
  25334. }
  25335. @@ -248,7 +266,7 @@
  25336. struct irq_cpu_stop_queue_work_info call_args;
  25337. struct multi_stop_data msdata;
  25338. - preempt_disable();
  25339. + preempt_disable_nort();
  25340. msdata = (struct multi_stop_data){
  25341. .fn = fn,
  25342. .data = arg,
  25343. @@ -281,7 +299,7 @@
  25344. * This relies on the stopper workqueues to be FIFO.
  25345. */
  25346. if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
  25347. - preempt_enable();
  25348. + preempt_enable_nort();
  25349. return -ENOENT;
  25350. }
  25351. @@ -295,9 +313,9 @@
  25352. &irq_cpu_stop_queue_work,
  25353. &call_args, 1);
  25354. lg_local_unlock(&stop_cpus_lock);
  25355. - preempt_enable();
  25356. + preempt_enable_nort();
  25357. - wait_for_completion(&done.completion);
  25358. + wait_for_stop_done(&done);
  25359. return done.executed ? done.ret : -ENOENT;
  25360. }
  25361. @@ -329,7 +347,7 @@
  25362. static void queue_stop_cpus_work(const struct cpumask *cpumask,
  25363. cpu_stop_fn_t fn, void *arg,
  25364. - struct cpu_stop_done *done)
  25365. + struct cpu_stop_done *done, bool inactive)
  25366. {
  25367. struct cpu_stop_work *work;
  25368. unsigned int cpu;
  25369. @@ -343,11 +361,13 @@
  25370. }
  25371. /*
  25372. - * Disable preemption while queueing to avoid getting
  25373. - * preempted by a stopper which might wait for other stoppers
  25374. - * to enter @fn which can lead to deadlock.
  25375. + * Make sure that all work is queued on all cpus before
  25376. + * any of the cpus can execute it.
  25377. */
  25378. - lg_global_lock(&stop_cpus_lock);
  25379. + if (!inactive)
  25380. + lg_global_lock(&stop_cpus_lock);
  25381. + else
  25382. + lg_global_trylock_relax(&stop_cpus_lock);
  25383. for_each_cpu(cpu, cpumask)
  25384. cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
  25385. lg_global_unlock(&stop_cpus_lock);
  25386. @@ -359,8 +379,8 @@
  25387. struct cpu_stop_done done;
  25388. cpu_stop_init_done(&done, cpumask_weight(cpumask));
  25389. - queue_stop_cpus_work(cpumask, fn, arg, &done);
  25390. - wait_for_completion(&done.completion);
  25391. + queue_stop_cpus_work(cpumask, fn, arg, &done, false);
  25392. + wait_for_stop_done(&done);
  25393. return done.executed ? done.ret : -ENOENT;
  25394. }
  25395. @@ -439,9 +459,9 @@
  25396. unsigned long flags;
  25397. int run;
  25398. - spin_lock_irqsave(&stopper->lock, flags);
  25399. + raw_spin_lock_irqsave(&stopper->lock, flags);
  25400. run = !list_empty(&stopper->works);
  25401. - spin_unlock_irqrestore(&stopper->lock, flags);
  25402. + raw_spin_unlock_irqrestore(&stopper->lock, flags);
  25403. return run;
  25404. }
  25405. @@ -453,13 +473,13 @@
  25406. repeat:
  25407. work = NULL;
  25408. - spin_lock_irq(&stopper->lock);
  25409. + raw_spin_lock_irq(&stopper->lock);
  25410. if (!list_empty(&stopper->works)) {
  25411. work = list_first_entry(&stopper->works,
  25412. struct cpu_stop_work, list);
  25413. list_del_init(&work->list);
  25414. }
  25415. - spin_unlock_irq(&stopper->lock);
  25416. + raw_spin_unlock_irq(&stopper->lock);
  25417. if (work) {
  25418. cpu_stop_fn_t fn = work->fn;
  25419. @@ -467,6 +487,16 @@
  25420. struct cpu_stop_done *done = work->done;
  25421. char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
  25422. + /*
25423. + * Wait until the stopper has finished scheduling on all
25424. + * cpus.
  25425. + */
  25426. + lg_global_lock(&stop_cpus_lock);
  25427. + /*
  25428. + * Let other cpu threads continue as well
  25429. + */
  25430. + lg_global_unlock(&stop_cpus_lock);
  25431. +
  25432. /* cpu stop callbacks are not allowed to sleep */
  25433. preempt_disable();
  25434. @@ -481,7 +511,13 @@
  25435. kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
  25436. ksym_buf), arg);
  25437. + /*
25438. + * Make sure that the wakeup and the clearing of
25439. + * done->waiter happen atomically.
  25440. + */
  25441. + local_irq_disable();
  25442. cpu_stop_signal_done(done, true);
  25443. + local_irq_enable();
  25444. goto repeat;
  25445. }
  25446. }
  25447. @@ -500,20 +536,20 @@
  25448. unsigned long flags;
  25449. /* drain remaining works */
  25450. - spin_lock_irqsave(&stopper->lock, flags);
  25451. + raw_spin_lock_irqsave(&stopper->lock, flags);
  25452. list_for_each_entry(work, &stopper->works, list)
  25453. cpu_stop_signal_done(work->done, false);
  25454. stopper->enabled = false;
  25455. - spin_unlock_irqrestore(&stopper->lock, flags);
  25456. + raw_spin_unlock_irqrestore(&stopper->lock, flags);
  25457. }
  25458. static void cpu_stop_unpark(unsigned int cpu)
  25459. {
  25460. struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  25461. - spin_lock_irq(&stopper->lock);
  25462. + raw_spin_lock_irq(&stopper->lock);
  25463. stopper->enabled = true;
  25464. - spin_unlock_irq(&stopper->lock);
  25465. + raw_spin_unlock_irq(&stopper->lock);
  25466. }
  25467. static struct smp_hotplug_thread cpu_stop_threads = {
  25468. @@ -535,10 +571,12 @@
  25469. for_each_possible_cpu(cpu) {
  25470. struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  25471. - spin_lock_init(&stopper->lock);
  25472. + raw_spin_lock_init(&stopper->lock);
  25473. INIT_LIST_HEAD(&stopper->works);
  25474. }
  25475. + lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
  25476. +
  25477. BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
  25478. stop_machine_initialized = true;
  25479. return 0;
  25480. @@ -634,11 +672,11 @@
  25481. set_state(&msdata, MULTI_STOP_PREPARE);
  25482. cpu_stop_init_done(&done, num_active_cpus());
  25483. queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
  25484. - &done);
  25485. + &done, true);
  25486. ret = multi_cpu_stop(&msdata);
  25487. /* Busy wait for completion. */
  25488. - while (!completion_done(&done.completion))
  25489. + while (atomic_read(&done.nr_todo))
  25490. cpu_relax();
  25491. mutex_unlock(&stop_cpus_mutex);
  25492. diff -Nur linux-3.18.12.orig/kernel/time/hrtimer.c linux-3.18.12/kernel/time/hrtimer.c
  25493. --- linux-3.18.12.orig/kernel/time/hrtimer.c 2015-04-20 14:48:02.000000000 -0500
  25494. +++ linux-3.18.12/kernel/time/hrtimer.c 2015-04-26 13:32:22.451684003 -0500
  25495. @@ -48,11 +48,13 @@
  25496. #include <linux/sched/rt.h>
  25497. #include <linux/sched/deadline.h>
  25498. #include <linux/timer.h>
  25499. +#include <linux/kthread.h>
  25500. #include <linux/freezer.h>
  25501. #include <asm/uaccess.h>
  25502. #include <trace/events/timer.h>
  25503. +#include <trace/events/hist.h>
  25504. #include "timekeeping.h"
  25505. @@ -568,8 +570,7 @@
  25506. * When the callback is running, we do not reprogram the clock event
  25507. * device. The timer callback is either running on a different CPU or
  25508. * the callback is executed in the hrtimer_interrupt context. The
  25509. - * reprogramming is handled either by the softirq, which called the
  25510. - * callback or at the end of the hrtimer_interrupt.
  25511. + * reprogramming is handled at the end of the hrtimer_interrupt.
  25512. */
  25513. if (hrtimer_callback_running(timer))
  25514. return 0;
  25515. @@ -604,6 +605,9 @@
  25516. return res;
  25517. }
  25518. +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now);
  25519. +static int hrtimer_rt_defer(struct hrtimer *timer);
  25520. +
  25521. /*
  25522. * Initialize the high resolution related parts of cpu_base
  25523. */
  25524. @@ -613,6 +617,21 @@
  25525. base->hres_active = 0;
  25526. }
  25527. +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
  25528. + struct hrtimer_clock_base *base,
  25529. + int wakeup)
  25530. +{
  25531. + if (!hrtimer_reprogram(timer, base))
  25532. + return 0;
  25533. + if (!wakeup)
  25534. + return -ETIME;
  25535. +#ifdef CONFIG_PREEMPT_RT_BASE
  25536. + if (!hrtimer_rt_defer(timer))
  25537. + return -ETIME;
  25538. +#endif
  25539. + return 1;
  25540. +}
  25541. +
  25542. static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
  25543. {
  25544. ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
  25545. @@ -678,6 +697,44 @@
  25546. static DECLARE_WORK(hrtimer_work, clock_was_set_work);
  25547. +#ifdef CONFIG_PREEMPT_RT_FULL
  25548. +/*
  25549. + * RT can not call schedule_work from real interrupt context.
  25550. + * Need to make a thread to do the real work.
  25551. + */
  25552. +static struct task_struct *clock_set_delay_thread;
  25553. +static bool do_clock_set_delay;
  25554. +
  25555. +static int run_clock_set_delay(void *ignore)
  25556. +{
  25557. + while (!kthread_should_stop()) {
  25558. + set_current_state(TASK_INTERRUPTIBLE);
  25559. + if (do_clock_set_delay) {
  25560. + do_clock_set_delay = false;
  25561. + schedule_work(&hrtimer_work);
  25562. + }
  25563. + schedule();
  25564. + }
  25565. + __set_current_state(TASK_RUNNING);
  25566. + return 0;
  25567. +}
  25568. +
  25569. +void clock_was_set_delayed(void)
  25570. +{
  25571. + do_clock_set_delay = true;
  25572. + /* Make visible before waking up process */
  25573. + smp_wmb();
  25574. + wake_up_process(clock_set_delay_thread);
  25575. +}
  25576. +
  25577. +static __init int create_clock_set_delay_thread(void)
  25578. +{
  25579. + clock_set_delay_thread = kthread_run(run_clock_set_delay, NULL, "kclksetdelayd");
  25580. + BUG_ON(!clock_set_delay_thread);
  25581. + return 0;
  25582. +}
  25583. +early_initcall(create_clock_set_delay_thread);
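The helper thread follows a simple pattern: the caller that must not sleep only sets a flag and wakes the thread, and the thread does the part that may take sleeping locks (schedule_work()). Below is a loose user-space analogue using an atomic flag and a semaphore in place of wake_up_process(); the function and variable names are invented for the sketch.

#include <pthread.h>
#include <semaphore.h>
#include <stdatomic.h>
#include <stdio.h>

static sem_t kick;              /* stands in for wake_up_process() */
static atomic_bool do_delay;    /* stands in for do_clock_set_delay */
static atomic_bool stop;

static void deferred_work(void)
{
    printf("schedule_work() equivalent runs in thread context\n");
}

static void *delay_thread(void *arg)
{
    (void)arg;
    while (!atomic_load(&stop)) {
        sem_wait(&kick);
        if (atomic_exchange(&do_delay, false))
            deferred_work();
    }
    return NULL;
}

/* Safe to call from a context that must not sleep: just flag + wake. */
static void clock_was_set_delayed_model(void)
{
    atomic_store(&do_delay, true);
    sem_post(&kick);
}

int main(void)
{
    pthread_t t;

    sem_init(&kick, 0, 0);
    pthread_create(&t, NULL, delay_thread, NULL);
    clock_was_set_delayed_model();

    atomic_store(&stop, true);
    sem_post(&kick);            /* let the thread notice 'stop' */
    pthread_join(t, NULL);
    return 0;
}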
  25584. +#else /* PREEMPT_RT_FULL */
  25585. /*
25586. * Called from timekeeping and resume code to reprogram the hrtimer
  25587. * interrupt device on all cpus.
  25588. @@ -686,6 +743,7 @@
  25589. {
  25590. schedule_work(&hrtimer_work);
  25591. }
  25592. +#endif
  25593. #else
  25594. @@ -694,6 +752,13 @@
  25595. static inline int hrtimer_switch_to_hres(void) { return 0; }
  25596. static inline void
  25597. hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
  25598. +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
  25599. + struct hrtimer_clock_base *base,
  25600. + int wakeup)
  25601. +{
  25602. + return 0;
  25603. +}
  25604. +
  25605. static inline int hrtimer_reprogram(struct hrtimer *timer,
  25606. struct hrtimer_clock_base *base)
  25607. {
  25608. @@ -701,7 +766,6 @@
  25609. }
  25610. static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
  25611. static inline void retrigger_next_event(void *arg) { }
  25612. -
  25613. #endif /* CONFIG_HIGH_RES_TIMERS */
  25614. /*
  25615. @@ -819,6 +883,32 @@
  25616. }
  25617. EXPORT_SYMBOL_GPL(hrtimer_forward);
  25618. +#ifdef CONFIG_PREEMPT_RT_BASE
  25619. +# define wake_up_timer_waiters(b) wake_up(&(b)->wait)
  25620. +
  25621. +/**
  25622. + * hrtimer_wait_for_timer - Wait for a running timer
  25623. + *
  25624. + * @timer: timer to wait for
  25625. + *
  25626. + * The function waits in case the timers callback function is
  25627. + * currently executed on the waitqueue of the timer base. The
  25628. + * waitqueue is woken up after the timer callback function has
  25629. + * finished execution.
  25630. + */
  25631. +void hrtimer_wait_for_timer(const struct hrtimer *timer)
  25632. +{
  25633. + struct hrtimer_clock_base *base = timer->base;
  25634. +
  25635. + if (base && base->cpu_base && !timer->irqsafe)
  25636. + wait_event(base->cpu_base->wait,
  25637. + !(timer->state & HRTIMER_STATE_CALLBACK));
  25638. +}
  25639. +
  25640. +#else
  25641. +# define wake_up_timer_waiters(b) do { } while (0)
  25642. +#endif
  25643. +
  25644. /*
  25645. * enqueue_hrtimer - internal function to (re)start a timer
  25646. *
  25647. @@ -862,6 +952,11 @@
  25648. if (!(timer->state & HRTIMER_STATE_ENQUEUED))
  25649. goto out;
  25650. + if (unlikely(!list_empty(&timer->cb_entry))) {
  25651. + list_del_init(&timer->cb_entry);
  25652. + goto out;
  25653. + }
  25654. +
  25655. next_timer = timerqueue_getnext(&base->active);
  25656. timerqueue_del(&base->active, &timer->node);
  25657. if (&timer->node == next_timer) {
  25658. @@ -949,7 +1044,16 @@
  25659. new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
  25660. timer_stats_hrtimer_set_start_info(timer);
  25661. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  25662. + {
  25663. + ktime_t now = new_base->get_time();
  25664. + if (ktime_to_ns(tim) < ktime_to_ns(now))
  25665. + timer->praecox = now;
  25666. + else
  25667. + timer->praecox = ktime_set(0, 0);
  25668. + }
  25669. +#endif
  25670. leftmost = enqueue_hrtimer(timer, new_base);
  25671. if (!leftmost) {
  25672. @@ -963,15 +1067,26 @@
  25673. * on dynticks target.
  25674. */
  25675. wake_up_nohz_cpu(new_base->cpu_base->cpu);
  25676. - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
  25677. - hrtimer_reprogram(timer, new_base)) {
  25678. + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases)) {
  25679. +
  25680. + ret = hrtimer_enqueue_reprogram(timer, new_base, wakeup);
  25681. + if (ret < 0) {
  25682. + /*
  25683. + * In case we failed to reprogram the timer (mostly
25684. + * because our current timer has already elapsed),
  25685. + * remove it again and report a failure. This avoids
  25686. + * stale base->first entries.
  25687. + */
  25688. + debug_deactivate(timer);
  25689. + __remove_hrtimer(timer, new_base,
  25690. + timer->state & HRTIMER_STATE_CALLBACK, 0);
  25691. + } else if (ret > 0) {
  25692. /*
  25693. * Only allow reprogramming if the new base is on this CPU.
  25694. * (it might still be on another CPU if the timer was pending)
  25695. *
  25696. * XXX send_remote_softirq() ?
  25697. */
  25698. - if (wakeup) {
  25699. /*
  25700. * We need to drop cpu_base->lock to avoid a
  25701. * lock ordering issue vs. rq->lock.
  25702. @@ -979,9 +1094,7 @@
  25703. raw_spin_unlock(&new_base->cpu_base->lock);
  25704. raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25705. local_irq_restore(flags);
  25706. - return ret;
  25707. - } else {
  25708. - __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25709. + return 0;
  25710. }
  25711. }
  25712. @@ -1072,7 +1185,7 @@
  25713. if (ret >= 0)
  25714. return ret;
  25715. - cpu_relax();
  25716. + hrtimer_wait_for_timer(timer);
  25717. }
  25718. }
  25719. EXPORT_SYMBOL_GPL(hrtimer_cancel);
  25720. @@ -1151,6 +1264,7 @@
  25721. base = hrtimer_clockid_to_base(clock_id);
  25722. timer->base = &cpu_base->clock_base[base];
  25723. + INIT_LIST_HEAD(&timer->cb_entry);
  25724. timerqueue_init(&timer->node);
  25725. #ifdef CONFIG_TIMER_STATS
  25726. @@ -1234,6 +1348,126 @@
  25727. timer->state &= ~HRTIMER_STATE_CALLBACK;
  25728. }
  25729. +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
  25730. +
  25731. +#ifdef CONFIG_PREEMPT_RT_BASE
  25732. +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
  25733. + struct hrtimer_clock_base *base)
  25734. +{
  25735. + /*
  25736. + * Note, we clear the callback flag before we requeue the
  25737. + * timer otherwise we trigger the callback_running() check
  25738. + * in hrtimer_reprogram().
  25739. + */
  25740. + timer->state &= ~HRTIMER_STATE_CALLBACK;
  25741. +
  25742. + if (restart != HRTIMER_NORESTART) {
  25743. + BUG_ON(hrtimer_active(timer));
  25744. + /*
  25745. + * Enqueue the timer, if it's the leftmost timer then
  25746. + * we need to reprogram it.
  25747. + */
  25748. + if (!enqueue_hrtimer(timer, base))
  25749. + return;
  25750. +
  25751. +#ifndef CONFIG_HIGH_RES_TIMERS
  25752. + }
  25753. +#else
  25754. + if (base->cpu_base->hres_active &&
  25755. + hrtimer_reprogram(timer, base))
  25756. + goto requeue;
  25757. +
  25758. + } else if (hrtimer_active(timer)) {
  25759. + /*
  25760. + * If the timer was rearmed on another CPU, reprogram
  25761. + * the event device.
  25762. + */
  25763. + if (&timer->node == base->active.next &&
  25764. + base->cpu_base->hres_active &&
  25765. + hrtimer_reprogram(timer, base))
  25766. + goto requeue;
  25767. + }
  25768. + return;
  25769. +
  25770. +requeue:
  25771. + /*
  25772. + * Timer is expired. Thus move it from tree to pending list
  25773. + * again.
  25774. + */
  25775. + __remove_hrtimer(timer, base, timer->state, 0);
  25776. + list_add_tail(&timer->cb_entry, &base->expired);
  25777. +#endif
  25778. +}
  25779. +
  25780. +/*
  25781. + * The changes in mainline which removed the callback modes from
  25782. + * hrtimer are not yet working with -rt. The non wakeup_process()
  25783. + * based callbacks which involve sleeping locks need to be treated
25784. + * separately.
  25785. + */
  25786. +static void hrtimer_rt_run_pending(void)
  25787. +{
  25788. + enum hrtimer_restart (*fn)(struct hrtimer *);
  25789. + struct hrtimer_cpu_base *cpu_base;
  25790. + struct hrtimer_clock_base *base;
  25791. + struct hrtimer *timer;
  25792. + int index, restart;
  25793. +
  25794. + local_irq_disable();
  25795. + cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
  25796. +
  25797. + raw_spin_lock(&cpu_base->lock);
  25798. +
  25799. + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
  25800. + base = &cpu_base->clock_base[index];
  25801. +
  25802. + while (!list_empty(&base->expired)) {
  25803. + timer = list_first_entry(&base->expired,
  25804. + struct hrtimer, cb_entry);
  25805. +
  25806. + /*
25807. + * Same as the above __run_hrtimer function,
25808. + * except that we run with interrupts enabled.
  25809. + */
  25810. + debug_hrtimer_deactivate(timer);
  25811. + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
  25812. + timer_stats_account_hrtimer(timer);
  25813. + fn = timer->function;
  25814. +
  25815. + raw_spin_unlock_irq(&cpu_base->lock);
  25816. + restart = fn(timer);
  25817. + raw_spin_lock_irq(&cpu_base->lock);
  25818. +
  25819. + hrtimer_rt_reprogram(restart, timer, base);
  25820. + }
  25821. + }
  25822. +
  25823. + raw_spin_unlock_irq(&cpu_base->lock);
  25824. +
  25825. + wake_up_timer_waiters(cpu_base);
  25826. +}
  25827. +
  25828. +static int hrtimer_rt_defer(struct hrtimer *timer)
  25829. +{
  25830. + if (timer->irqsafe)
  25831. + return 0;
  25832. +
  25833. + __remove_hrtimer(timer, timer->base, timer->state, 0);
  25834. + list_add_tail(&timer->cb_entry, &timer->base->expired);
  25835. + return 1;
  25836. +}
  25837. +
  25838. +#else
  25839. +
  25840. +static inline void hrtimer_rt_run_pending(void)
  25841. +{
  25842. + hrtimer_peek_ahead_timers();
  25843. +}
  25844. +
  25845. +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
  25846. +
  25847. +#endif
  25848. +
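The net effect of hrtimer_rt_defer() is a split of the expired timers into two buckets: timers marked irqsafe keep running from the hard interrupt, everything else is parked on the expired list and handled later from HRTIMER_SOFTIRQ in thread context. A toy sketch of that split follows; the timer names and the flag values are invented for illustration.

#include <stdio.h>

struct timer {
    const char *name;
    int irqsafe;    /* may run with interrupts disabled? */
};

int main(void)
{
    struct timer expired[] = {
        { "tick_sched_timer", 1 },  /* irqsafe: run in hard irq */
        { "posix_timer",      0 },  /* needs sleeping locks on RT */
        { "watchdog",         1 },
    };
    int i, raise = 0;

    for (i = 0; i < (int)(sizeof(expired) / sizeof(expired[0])); i++) {
        if (expired[i].irqsafe) {
            printf("hard irq: run %s now\n", expired[i].name);
        } else {
            printf("defer %s to the expired list\n", expired[i].name);
            raise = 1;
        }
    }
    if (raise)
        printf("raise_softirq_irqoff(HRTIMER_SOFTIRQ)\n");
    return 0;
}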
  25849. #ifdef CONFIG_HIGH_RES_TIMERS
  25850. /*
  25851. @@ -1244,7 +1478,7 @@
  25852. {
  25853. struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
  25854. ktime_t expires_next, now, entry_time, delta;
  25855. - int i, retries = 0;
  25856. + int i, retries = 0, raise = 0;
  25857. BUG_ON(!cpu_base->hres_active);
  25858. cpu_base->nr_events++;
  25859. @@ -1279,6 +1513,15 @@
  25860. timer = container_of(node, struct hrtimer, node);
  25861. + trace_hrtimer_interrupt(raw_smp_processor_id(),
  25862. + ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
  25863. + timer->praecox : hrtimer_get_expires(timer),
  25864. + basenow)),
  25865. + current,
  25866. + timer->function == hrtimer_wakeup ?
  25867. + container_of(timer, struct hrtimer_sleeper,
  25868. + timer)->task : NULL);
  25869. +
  25870. /*
  25871. * The immediate goal for using the softexpires is
  25872. * minimizing wakeups, not running timers at the
  25873. @@ -1304,7 +1547,10 @@
  25874. break;
  25875. }
  25876. - __run_hrtimer(timer, &basenow);
  25877. + if (!hrtimer_rt_defer(timer))
  25878. + __run_hrtimer(timer, &basenow);
  25879. + else
  25880. + raise = 1;
  25881. }
  25882. }
  25883. @@ -1319,7 +1565,7 @@
  25884. if (expires_next.tv64 == KTIME_MAX ||
  25885. !tick_program_event(expires_next, 0)) {
  25886. cpu_base->hang_detected = 0;
  25887. - return;
  25888. + goto out;
  25889. }
  25890. /*
  25891. @@ -1363,6 +1609,9 @@
  25892. tick_program_event(expires_next, 1);
  25893. printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
  25894. ktime_to_ns(delta));
  25895. +out:
  25896. + if (raise)
  25897. + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25898. }
  25899. /*
  25900. @@ -1398,18 +1647,18 @@
  25901. __hrtimer_peek_ahead_timers();
  25902. local_irq_restore(flags);
  25903. }
  25904. -
  25905. -static void run_hrtimer_softirq(struct softirq_action *h)
  25906. -{
  25907. - hrtimer_peek_ahead_timers();
  25908. -}
  25909. -
  25910. #else /* CONFIG_HIGH_RES_TIMERS */
  25911. static inline void __hrtimer_peek_ahead_timers(void) { }
  25912. #endif /* !CONFIG_HIGH_RES_TIMERS */
  25913. +
  25914. +static void run_hrtimer_softirq(struct softirq_action *h)
  25915. +{
  25916. + hrtimer_rt_run_pending();
  25917. +}
  25918. +
  25919. /*
  25920. * Called from timer softirq every jiffy, expire hrtimers:
  25921. *
  25922. @@ -1442,7 +1691,7 @@
  25923. struct timerqueue_node *node;
  25924. struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
  25925. struct hrtimer_clock_base *base;
  25926. - int index, gettime = 1;
  25927. + int index, gettime = 1, raise = 0;
  25928. if (hrtimer_hres_active())
  25929. return;
  25930. @@ -1467,10 +1716,16 @@
  25931. hrtimer_get_expires_tv64(timer))
  25932. break;
  25933. - __run_hrtimer(timer, &base->softirq_time);
  25934. + if (!hrtimer_rt_defer(timer))
  25935. + __run_hrtimer(timer, &base->softirq_time);
  25936. + else
  25937. + raise = 1;
  25938. }
  25939. raw_spin_unlock(&cpu_base->lock);
  25940. }
  25941. +
  25942. + if (raise)
  25943. + raise_softirq_irqoff(HRTIMER_SOFTIRQ);
  25944. }
  25945. /*
  25946. @@ -1492,16 +1747,18 @@
  25947. void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
  25948. {
  25949. sl->timer.function = hrtimer_wakeup;
  25950. + sl->timer.irqsafe = 1;
  25951. sl->task = task;
  25952. }
  25953. EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
  25954. -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
  25955. +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
  25956. + unsigned long state)
  25957. {
  25958. hrtimer_init_sleeper(t, current);
  25959. do {
  25960. - set_current_state(TASK_INTERRUPTIBLE);
  25961. + set_current_state(state);
  25962. hrtimer_start_expires(&t->timer, mode);
  25963. if (!hrtimer_active(&t->timer))
  25964. t->task = NULL;
  25965. @@ -1545,7 +1802,8 @@
  25966. HRTIMER_MODE_ABS);
  25967. hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
  25968. - if (do_nanosleep(&t, HRTIMER_MODE_ABS))
  25969. + /* cpu_chill() does not care about restart state. */
  25970. + if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
  25971. goto out;
  25972. rmtp = restart->nanosleep.rmtp;
  25973. @@ -1562,8 +1820,10 @@
  25974. return ret;
  25975. }
  25976. -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
  25977. - const enum hrtimer_mode mode, const clockid_t clockid)
  25978. +static long
  25979. +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
  25980. + const enum hrtimer_mode mode, const clockid_t clockid,
  25981. + unsigned long state)
  25982. {
  25983. struct restart_block *restart;
  25984. struct hrtimer_sleeper t;
  25985. @@ -1576,7 +1836,7 @@
  25986. hrtimer_init_on_stack(&t.timer, clockid, mode);
  25987. hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
  25988. - if (do_nanosleep(&t, mode))
  25989. + if (do_nanosleep(&t, mode, state))
  25990. goto out;
  25991. /* Absolute timers do not update the rmtp value and restart: */
  25992. @@ -1603,6 +1863,12 @@
  25993. return ret;
  25994. }
  25995. +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
  25996. + const enum hrtimer_mode mode, const clockid_t clockid)
  25997. +{
  25998. + return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
  25999. +}
  26000. +
  26001. SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
  26002. struct timespec __user *, rmtp)
  26003. {
  26004. @@ -1617,6 +1883,26 @@
  26005. return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
  26006. }
  26007. +#ifdef CONFIG_PREEMPT_RT_FULL
  26008. +/*
  26009. + * Sleep for 1 ms in hope whoever holds what we want will let it go.
  26010. + */
  26011. +void cpu_chill(void)
  26012. +{
  26013. + struct timespec tu = {
  26014. + .tv_nsec = NSEC_PER_MSEC,
  26015. + };
  26016. + unsigned int freeze_flag = current->flags & PF_NOFREEZE;
  26017. +
  26018. + current->flags |= PF_NOFREEZE;
  26019. + __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
  26020. + TASK_UNINTERRUPTIBLE);
  26021. + if (!freeze_flag)
  26022. + current->flags &= ~PF_NOFREEZE;
  26023. +}
  26024. +EXPORT_SYMBOL(cpu_chill);
  26025. +#endif
  26026. +
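cpu_chill() exists for retry loops where a plain cpu_relax() could spin forever against a preempted lock holder on RT; sleeping for a millisecond gives the holder a chance to run. Below is a user-space analogue of that back-off using nanosleep(); the PF_NOFREEZE juggling is kernel-only and omitted here.

#include <stdio.h>
#include <time.h>

/* Back off for one millisecond instead of spinning. */
static void chill(void)
{
    struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };

    nanosleep(&ts, NULL);
}

int main(void)
{
    int retries;

    /* e.g. the body of a retry loop around a trylock that keeps failing */
    for (retries = 0; retries < 3; retries++) {
        printf("retry %d failed, chilling for 1 ms\n", retries);
        chill();
    }
    return 0;
}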
  26027. /*
  26028. * Functions related to boot-time initialization:
  26029. */
  26030. @@ -1628,10 +1914,14 @@
  26031. for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
  26032. cpu_base->clock_base[i].cpu_base = cpu_base;
  26033. timerqueue_init_head(&cpu_base->clock_base[i].active);
  26034. + INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
  26035. }
  26036. cpu_base->cpu = cpu;
  26037. hrtimer_init_hres(cpu_base);
  26038. +#ifdef CONFIG_PREEMPT_RT_BASE
  26039. + init_waitqueue_head(&cpu_base->wait);
  26040. +#endif
  26041. }
  26042. #ifdef CONFIG_HOTPLUG_CPU
  26043. @@ -1744,9 +2034,7 @@
  26044. hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
  26045. (void *)(long)smp_processor_id());
  26046. register_cpu_notifier(&hrtimers_nb);
  26047. -#ifdef CONFIG_HIGH_RES_TIMERS
  26048. open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
  26049. -#endif
  26050. }
  26051. /**
  26052. diff -Nur linux-3.18.12.orig/kernel/time/itimer.c linux-3.18.12/kernel/time/itimer.c
  26053. --- linux-3.18.12.orig/kernel/time/itimer.c 2015-04-20 14:48:02.000000000 -0500
  26054. +++ linux-3.18.12/kernel/time/itimer.c 2015-04-26 13:32:22.451684003 -0500
  26055. @@ -213,6 +213,7 @@
  26056. /* We are sharing ->siglock with it_real_fn() */
  26057. if (hrtimer_try_to_cancel(timer) < 0) {
  26058. spin_unlock_irq(&tsk->sighand->siglock);
  26059. + hrtimer_wait_for_timer(&tsk->signal->real_timer);
  26060. goto again;
  26061. }
  26062. expires = timeval_to_ktime(value->it_value);
  26063. diff -Nur linux-3.18.12.orig/kernel/time/jiffies.c linux-3.18.12/kernel/time/jiffies.c
  26064. --- linux-3.18.12.orig/kernel/time/jiffies.c 2015-04-20 14:48:02.000000000 -0500
  26065. +++ linux-3.18.12/kernel/time/jiffies.c 2015-04-26 13:32:22.451684003 -0500
  26066. @@ -73,7 +73,8 @@
  26067. .shift = JIFFIES_SHIFT,
  26068. };
  26069. -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
  26070. +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
  26071. +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
  26072. #if (BITS_PER_LONG < 64)
  26073. u64 get_jiffies_64(void)
  26074. @@ -82,9 +83,9 @@
  26075. u64 ret;
  26076. do {
  26077. - seq = read_seqbegin(&jiffies_lock);
  26078. + seq = read_seqcount_begin(&jiffies_seq);
  26079. ret = jiffies_64;
  26080. - } while (read_seqretry(&jiffies_lock, seq));
  26081. + } while (read_seqcount_retry(&jiffies_seq, seq));
  26082. return ret;
  26083. }
  26084. EXPORT_SYMBOL(get_jiffies_64);
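Splitting jiffies_lock into a raw spinlock plus an explicit seqcount keeps the reader side lock-free: readers sample the sequence number, copy the value, and retry if the sequence was odd (update in flight) or changed underneath them. Here is a simplified user-space model of that read loop with C11 atomics; the kernel's seqcount primitives additionally take care of the exact memory barriers and lockdep annotations.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;     /* like jiffies_seq: odd while an update is in flight */
static uint64_t value;      /* the protected 64-bit value, e.g. jiffies_64 */

static void write_value(uint64_t v)   /* the writer would also hold the raw spinlock */
{
    atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);  /* odd: in progress */
    value = v;
    atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* even: stable */
}

static uint64_t read_value(void)
{
    unsigned int s;
    uint64_t v;

    for (;;) {
        s = atomic_load_explicit(&seq, memory_order_acquire);
        if (s & 1)
            continue;       /* update in progress, try again */
        v = value;
        if (atomic_load_explicit(&seq, memory_order_acquire) == s)
            return v;       /* nothing changed while we copied */
    }
}

int main(void)
{
    write_value(123456789ULL);
    printf("read %llu\n", (unsigned long long)read_value());
    return 0;
}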
  26085. diff -Nur linux-3.18.12.orig/kernel/time/ntp.c linux-3.18.12/kernel/time/ntp.c
  26086. --- linux-3.18.12.orig/kernel/time/ntp.c 2015-04-20 14:48:02.000000000 -0500
  26087. +++ linux-3.18.12/kernel/time/ntp.c 2015-04-26 13:32:22.451684003 -0500
  26088. @@ -10,6 +10,7 @@
  26089. #include <linux/workqueue.h>
  26090. #include <linux/hrtimer.h>
  26091. #include <linux/jiffies.h>
  26092. +#include <linux/kthread.h>
  26093. #include <linux/math64.h>
  26094. #include <linux/timex.h>
  26095. #include <linux/time.h>
  26096. @@ -519,10 +520,52 @@
  26097. &sync_cmos_work, timespec_to_jiffies(&next));
  26098. }
  26099. +#ifdef CONFIG_PREEMPT_RT_FULL
  26100. +/*
26101. + * RT cannot call schedule_delayed_work() from hard interrupt context.
26102. + * Use a kernel thread to do the real work instead.
  26103. + */
  26104. +static struct task_struct *cmos_delay_thread;
  26105. +static bool do_cmos_delay;
  26106. +
  26107. +static int run_cmos_delay(void *ignore)
  26108. +{
  26109. + while (!kthread_should_stop()) {
  26110. + set_current_state(TASK_INTERRUPTIBLE);
  26111. + if (do_cmos_delay) {
  26112. + do_cmos_delay = false;
  26113. + queue_delayed_work(system_power_efficient_wq,
  26114. + &sync_cmos_work, 0);
  26115. + }
  26116. + schedule();
  26117. + }
  26118. + __set_current_state(TASK_RUNNING);
  26119. + return 0;
  26120. +}
  26121. +
  26122. +void ntp_notify_cmos_timer(void)
  26123. +{
  26124. + do_cmos_delay = true;
  26125. + /* Make visible before waking up process */
  26126. + smp_wmb();
  26127. + wake_up_process(cmos_delay_thread);
  26128. +}
  26129. +
  26130. +static __init int create_cmos_delay_thread(void)
  26131. +{
  26132. + cmos_delay_thread = kthread_run(run_cmos_delay, NULL, "kcmosdelayd");
  26133. + BUG_ON(!cmos_delay_thread);
  26134. + return 0;
  26135. +}
  26136. +early_initcall(create_cmos_delay_thread);
  26137. +
  26138. +#else
  26139. +
  26140. void ntp_notify_cmos_timer(void)
  26141. {
  26142. queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
  26143. }
  26144. +#endif /* CONFIG_PREEMPT_RT_FULL */
  26145. #else
  26146. void ntp_notify_cmos_timer(void) { }
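
The RT variant above may be called from contexts that must not block, so it only sets a flag and wakes the kcmosdelayd thread, which then queues the delayed work. A userspace sketch of that hand-off pattern, not from the patch (helper(), notify() and work_pending are invented names; the kernel version avoids even the mutex by pairing the flag with smp_wmb() and wake_up_process()):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool work_pending;

static void do_blocking_work(void)
{
        puts("syncing CMOS clock (stand-in for queue_delayed_work())");
}

/* Helper thread, analog of run_cmos_delay(); one-shot for the demo,
 * the kernel thread loops until kthread_should_stop(). */
static void *helper(void *unused)
{
        (void)unused;
        pthread_mutex_lock(&lock);
        while (!work_pending)
                pthread_cond_wait(&wake, &lock);
        work_pending = false;
        pthread_mutex_unlock(&lock);
        do_blocking_work();
        return NULL;
}

/* Analog of ntp_notify_cmos_timer() on RT: set the flag, wake the helper. */
static void notify(void)
{
        pthread_mutex_lock(&lock);
        work_pending = true;
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, helper, NULL);
        notify();
        pthread_join(tid, NULL);
        return 0;
}
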
  26147. diff -Nur linux-3.18.12.orig/kernel/time/posix-cpu-timers.c linux-3.18.12/kernel/time/posix-cpu-timers.c
  26148. --- linux-3.18.12.orig/kernel/time/posix-cpu-timers.c 2015-04-20 14:48:02.000000000 -0500
  26149. +++ linux-3.18.12/kernel/time/posix-cpu-timers.c 2015-04-26 13:32:22.451684003 -0500
  26150. @@ -3,6 +3,7 @@
  26151. */
  26152. #include <linux/sched.h>
  26153. +#include <linux/sched/rt.h>
  26154. #include <linux/posix-timers.h>
  26155. #include <linux/errno.h>
  26156. #include <linux/math64.h>
  26157. @@ -626,7 +627,7 @@
  26158. /*
  26159. * Disarm any old timer after extracting its expiry time.
  26160. */
  26161. - WARN_ON_ONCE(!irqs_disabled());
  26162. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  26163. ret = 0;
  26164. old_incr = timer->it.cpu.incr;
  26165. @@ -1047,7 +1048,7 @@
  26166. /*
  26167. * Now re-arm for the new expiry time.
  26168. */
  26169. - WARN_ON_ONCE(!irqs_disabled());
  26170. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  26171. arm_timer(timer);
  26172. unlock_task_sighand(p, &flags);
  26173. @@ -1113,10 +1114,11 @@
  26174. sig = tsk->signal;
  26175. if (sig->cputimer.running) {
  26176. struct task_cputime group_sample;
  26177. + unsigned long flags;
  26178. - raw_spin_lock(&sig->cputimer.lock);
  26179. + raw_spin_lock_irqsave(&sig->cputimer.lock, flags);
  26180. group_sample = sig->cputimer.cputime;
  26181. - raw_spin_unlock(&sig->cputimer.lock);
  26182. + raw_spin_unlock_irqrestore(&sig->cputimer.lock, flags);
  26183. if (task_cputime_expired(&group_sample, &sig->cputime_expires))
  26184. return 1;
  26185. @@ -1130,13 +1132,13 @@
  26186. * already updated our counts. We need to check if any timers fire now.
  26187. * Interrupts are disabled.
  26188. */
  26189. -void run_posix_cpu_timers(struct task_struct *tsk)
  26190. +static void __run_posix_cpu_timers(struct task_struct *tsk)
  26191. {
  26192. LIST_HEAD(firing);
  26193. struct k_itimer *timer, *next;
  26194. unsigned long flags;
  26195. - WARN_ON_ONCE(!irqs_disabled());
  26196. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  26197. /*
  26198. * The fast path checks that there are no expired thread or thread
  26199. @@ -1194,6 +1196,190 @@
  26200. }
  26201. }
  26202. +#ifdef CONFIG_PREEMPT_RT_BASE
  26203. +#include <linux/kthread.h>
  26204. +#include <linux/cpu.h>
  26205. +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
  26206. +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
  26207. +
  26208. +static int posix_cpu_timers_thread(void *data)
  26209. +{
  26210. + int cpu = (long)data;
  26211. +
  26212. + BUG_ON(per_cpu(posix_timer_task,cpu) != current);
  26213. +
  26214. + while (!kthread_should_stop()) {
  26215. + struct task_struct *tsk = NULL;
  26216. + struct task_struct *next = NULL;
  26217. +
  26218. + if (cpu_is_offline(cpu))
  26219. + goto wait_to_die;
  26220. +
  26221. + /* grab task list */
  26222. + raw_local_irq_disable();
  26223. + tsk = per_cpu(posix_timer_tasklist, cpu);
  26224. + per_cpu(posix_timer_tasklist, cpu) = NULL;
  26225. + raw_local_irq_enable();
  26226. +
26227. + /* it's possible the list is empty; just go back to sleep */
  26228. + if (!tsk) {
  26229. + set_current_state(TASK_INTERRUPTIBLE);
  26230. + schedule();
  26231. + __set_current_state(TASK_RUNNING);
  26232. + continue;
  26233. + }
  26234. +
  26235. + /* Process task list */
  26236. + while (1) {
  26237. + /* save next */
  26238. + next = tsk->posix_timer_list;
  26239. +
  26240. + /* run the task timers, clear its ptr and
  26241. + * unreference it
  26242. + */
  26243. + __run_posix_cpu_timers(tsk);
  26244. + tsk->posix_timer_list = NULL;
  26245. + put_task_struct(tsk);
  26246. +
  26247. + /* check if this is the last on the list */
  26248. + if (next == tsk)
  26249. + break;
  26250. + tsk = next;
  26251. + }
  26252. + }
  26253. + return 0;
  26254. +
  26255. +wait_to_die:
  26256. + /* Wait for kthread_stop */
  26257. + set_current_state(TASK_INTERRUPTIBLE);
  26258. + while (!kthread_should_stop()) {
  26259. + schedule();
  26260. + set_current_state(TASK_INTERRUPTIBLE);
  26261. + }
  26262. + __set_current_state(TASK_RUNNING);
  26263. + return 0;
  26264. +}
  26265. +
  26266. +static inline int __fastpath_timer_check(struct task_struct *tsk)
  26267. +{
  26268. + /* tsk == current, ensure it is safe to use ->signal/sighand */
  26269. + if (unlikely(tsk->exit_state))
  26270. + return 0;
  26271. +
  26272. + if (!task_cputime_zero(&tsk->cputime_expires))
  26273. + return 1;
  26274. +
  26275. + if (!task_cputime_zero(&tsk->signal->cputime_expires))
  26276. + return 1;
  26277. +
  26278. + return 0;
  26279. +}
  26280. +
  26281. +void run_posix_cpu_timers(struct task_struct *tsk)
  26282. +{
  26283. + unsigned long cpu = smp_processor_id();
  26284. + struct task_struct *tasklist;
  26285. +
  26286. + BUG_ON(!irqs_disabled());
  26287. + if(!per_cpu(posix_timer_task, cpu))
  26288. + return;
  26289. + /* get per-cpu references */
  26290. + tasklist = per_cpu(posix_timer_tasklist, cpu);
  26291. +
  26292. + /* check to see if we're already queued */
  26293. + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
  26294. + get_task_struct(tsk);
  26295. + if (tasklist) {
  26296. + tsk->posix_timer_list = tasklist;
  26297. + } else {
  26298. + /*
  26299. + * The list is terminated by a self-pointing
  26300. + * task_struct
  26301. + */
  26302. + tsk->posix_timer_list = tsk;
  26303. + }
  26304. + per_cpu(posix_timer_tasklist, cpu) = tsk;
  26305. +
  26306. + wake_up_process(per_cpu(posix_timer_task, cpu));
  26307. + }
  26308. +}
  26309. +
  26310. +/*
  26311. + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
26312. + * Here we can start up the per-CPU posix timer thread for the new CPU.
  26313. + */
  26314. +static int posix_cpu_thread_call(struct notifier_block *nfb,
  26315. + unsigned long action, void *hcpu)
  26316. +{
  26317. + int cpu = (long)hcpu;
  26318. + struct task_struct *p;
  26319. + struct sched_param param;
  26320. +
  26321. + switch (action) {
  26322. + case CPU_UP_PREPARE:
  26323. + p = kthread_create(posix_cpu_timers_thread, hcpu,
  26324. + "posixcputmr/%d",cpu);
  26325. + if (IS_ERR(p))
  26326. + return NOTIFY_BAD;
  26327. + p->flags |= PF_NOFREEZE;
  26328. + kthread_bind(p, cpu);
  26329. + /* Must be high prio to avoid getting starved */
  26330. + param.sched_priority = MAX_RT_PRIO-1;
  26331. + sched_setscheduler(p, SCHED_FIFO, &param);
  26332. + per_cpu(posix_timer_task,cpu) = p;
  26333. + break;
  26334. + case CPU_ONLINE:
26335. + /* Strictly unnecessary, as the first user will wake it. */
  26336. + wake_up_process(per_cpu(posix_timer_task,cpu));
  26337. + break;
  26338. +#ifdef CONFIG_HOTPLUG_CPU
  26339. + case CPU_UP_CANCELED:
26340. + /* Unbind it from the offline cpu so it can run. Fall through. */
  26341. + kthread_bind(per_cpu(posix_timer_task, cpu),
  26342. + cpumask_any(cpu_online_mask));
  26343. + kthread_stop(per_cpu(posix_timer_task,cpu));
  26344. + per_cpu(posix_timer_task,cpu) = NULL;
  26345. + break;
  26346. + case CPU_DEAD:
  26347. + kthread_stop(per_cpu(posix_timer_task,cpu));
  26348. + per_cpu(posix_timer_task,cpu) = NULL;
  26349. + break;
  26350. +#endif
  26351. + }
  26352. + return NOTIFY_OK;
  26353. +}
  26354. +
26355. +/* Register with elevated priority (10) so the posix timer threads are
26356. + * created before most other CPU notifier callbacks run.
  26357. + */
  26358. +static struct notifier_block posix_cpu_thread_notifier = {
  26359. + .notifier_call = posix_cpu_thread_call,
  26360. + .priority = 10
  26361. +};
  26362. +
  26363. +static int __init posix_cpu_thread_init(void)
  26364. +{
  26365. + void *hcpu = (void *)(long)smp_processor_id();
  26366. + /* Start one for boot CPU. */
  26367. + unsigned long cpu;
  26368. +
26369. + /* init the per-cpu posix_timer_tasklist pointers */
  26370. + for_each_possible_cpu(cpu)
  26371. + per_cpu(posix_timer_tasklist, cpu) = NULL;
  26372. +
  26373. + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
  26374. + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
  26375. + register_cpu_notifier(&posix_cpu_thread_notifier);
  26376. + return 0;
  26377. +}
  26378. +early_initcall(posix_cpu_thread_init);
  26379. +#else /* CONFIG_PREEMPT_RT_BASE */
  26380. +void run_posix_cpu_timers(struct task_struct *tsk)
  26381. +{
  26382. + __run_posix_cpu_timers(tsk);
  26383. +}
  26384. +#endif /* CONFIG_PREEMPT_RT_BASE */
  26385. +
  26386. /*
  26387. * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
  26388. * The tsk->sighand->siglock must be held by the caller.
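
The RT-side run_posix_cpu_timers() above queues the current task on a per-CPU singly linked list whose last element points to itself, and the posixcputmr thread detaches and drains that list. A userspace sketch of the same self-terminated list, not from the patch (struct task, queue_task() and drain_tasklist() are invented; the kernel version disables interrupts around the list manipulation and takes a task reference):

#include <stdio.h>

struct task {
        const char *name;
        struct task *timer_list;   /* analog of tsk->posix_timer_list */
};

static struct task *tasklist;      /* analog of per_cpu(posix_timer_tasklist) */

/* Enqueue: NULL means "not queued"; the tail element points to itself. */
static void queue_task(struct task *t)
{
        if (t->timer_list)
                return;                         /* already queued */
        t->timer_list = tasklist ? tasklist : t;
        tasklist = t;
}

/* Drain: detach the whole list, then walk until an element points to itself. */
static void drain_tasklist(void)
{
        struct task *t = tasklist;

        tasklist = NULL;
        while (t) {
                struct task *next = t->timer_list;

                printf("running CPU timers for %s\n", t->name);
                t->timer_list = NULL;
                if (next == t)
                        break;                  /* self-pointer: end of list */
                t = next;
        }
}

int main(void)
{
        struct task a = { "a", NULL }, b = { "b", NULL };

        queue_task(&a);
        queue_task(&b);
        drain_tasklist();
        return 0;
}
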
  26389. diff -Nur linux-3.18.12.orig/kernel/time/posix-timers.c linux-3.18.12/kernel/time/posix-timers.c
  26390. --- linux-3.18.12.orig/kernel/time/posix-timers.c 2015-04-20 14:48:02.000000000 -0500
  26391. +++ linux-3.18.12/kernel/time/posix-timers.c 2015-04-26 13:32:22.451684003 -0500
  26392. @@ -499,6 +499,7 @@
  26393. static struct pid *good_sigevent(sigevent_t * event)
  26394. {
  26395. struct task_struct *rtn = current->group_leader;
  26396. + int sig = event->sigev_signo;
  26397. if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
  26398. (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
  26399. @@ -507,7 +508,8 @@
  26400. return NULL;
  26401. if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
  26402. - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
  26403. + (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
  26404. + sig_kernel_coredump(sig)))
  26405. return NULL;
  26406. return task_pid(rtn);
  26407. @@ -819,6 +821,20 @@
  26408. return overrun;
  26409. }
  26410. +/*
  26411. + * Protected by RCU!
  26412. + */
  26413. +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
  26414. +{
  26415. +#ifdef CONFIG_PREEMPT_RT_FULL
  26416. + if (kc->timer_set == common_timer_set)
  26417. + hrtimer_wait_for_timer(&timr->it.real.timer);
  26418. + else
  26419. + /* FIXME: Whacky hack for posix-cpu-timers */
  26420. + schedule_timeout(1);
  26421. +#endif
  26422. +}
  26423. +
  26424. /* Set a POSIX.1b interval timer. */
  26425. /* timr->it_lock is taken. */
  26426. static int
  26427. @@ -896,6 +912,7 @@
  26428. if (!timr)
  26429. return -EINVAL;
  26430. + rcu_read_lock();
  26431. kc = clockid_to_kclock(timr->it_clock);
  26432. if (WARN_ON_ONCE(!kc || !kc->timer_set))
  26433. error = -EINVAL;
  26434. @@ -904,9 +921,12 @@
  26435. unlock_timer(timr, flag);
  26436. if (error == TIMER_RETRY) {
  26437. + timer_wait_for_callback(kc, timr);
  26438. rtn = NULL; // We already got the old time...
  26439. + rcu_read_unlock();
  26440. goto retry;
  26441. }
  26442. + rcu_read_unlock();
  26443. if (old_setting && !error &&
  26444. copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
  26445. @@ -944,10 +964,15 @@
  26446. if (!timer)
  26447. return -EINVAL;
  26448. + rcu_read_lock();
  26449. if (timer_delete_hook(timer) == TIMER_RETRY) {
  26450. unlock_timer(timer, flags);
  26451. + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
  26452. + timer);
  26453. + rcu_read_unlock();
  26454. goto retry_delete;
  26455. }
  26456. + rcu_read_unlock();
  26457. spin_lock(&current->sighand->siglock);
  26458. list_del(&timer->list);
  26459. @@ -973,8 +998,18 @@
  26460. retry_delete:
  26461. spin_lock_irqsave(&timer->it_lock, flags);
  26462. + /* On RT we can race with a deletion */
  26463. + if (!timer->it_signal) {
  26464. + unlock_timer(timer, flags);
  26465. + return;
  26466. + }
  26467. +
  26468. if (timer_delete_hook(timer) == TIMER_RETRY) {
  26469. + rcu_read_lock();
  26470. unlock_timer(timer, flags);
  26471. + timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
  26472. + timer);
  26473. + rcu_read_unlock();
  26474. goto retry_delete;
  26475. }
  26476. list_del(&timer->list);
  26477. diff -Nur linux-3.18.12.orig/kernel/time/tick-common.c linux-3.18.12/kernel/time/tick-common.c
  26478. --- linux-3.18.12.orig/kernel/time/tick-common.c 2015-04-20 14:48:02.000000000 -0500
  26479. +++ linux-3.18.12/kernel/time/tick-common.c 2015-04-26 13:32:22.451684003 -0500
  26480. @@ -78,13 +78,15 @@
  26481. static void tick_periodic(int cpu)
  26482. {
  26483. if (tick_do_timer_cpu == cpu) {
  26484. - write_seqlock(&jiffies_lock);
  26485. + raw_spin_lock(&jiffies_lock);
  26486. + write_seqcount_begin(&jiffies_seq);
  26487. /* Keep track of the next tick event */
  26488. tick_next_period = ktime_add(tick_next_period, tick_period);
  26489. do_timer(1);
  26490. - write_sequnlock(&jiffies_lock);
  26491. + write_seqcount_end(&jiffies_seq);
  26492. + raw_spin_unlock(&jiffies_lock);
  26493. update_wall_time();
  26494. }
  26495. @@ -146,9 +148,9 @@
  26496. ktime_t next;
  26497. do {
  26498. - seq = read_seqbegin(&jiffies_lock);
  26499. + seq = read_seqcount_begin(&jiffies_seq);
  26500. next = tick_next_period;
  26501. - } while (read_seqretry(&jiffies_lock, seq));
  26502. + } while (read_seqcount_retry(&jiffies_seq, seq));
  26503. clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
  26504. diff -Nur linux-3.18.12.orig/kernel/time/tick-internal.h linux-3.18.12/kernel/time/tick-internal.h
  26505. --- linux-3.18.12.orig/kernel/time/tick-internal.h 2015-04-20 14:48:02.000000000 -0500
  26506. +++ linux-3.18.12/kernel/time/tick-internal.h 2015-04-26 13:32:22.451684003 -0500
  26507. @@ -6,7 +6,8 @@
  26508. #include "timekeeping.h"
  26509. -extern seqlock_t jiffies_lock;
  26510. +extern raw_spinlock_t jiffies_lock;
  26511. +extern seqcount_t jiffies_seq;
  26512. #define CS_NAME_LEN 32
  26513. diff -Nur linux-3.18.12.orig/kernel/time/tick-sched.c linux-3.18.12/kernel/time/tick-sched.c
  26514. --- linux-3.18.12.orig/kernel/time/tick-sched.c 2015-04-20 14:48:02.000000000 -0500
  26515. +++ linux-3.18.12/kernel/time/tick-sched.c 2015-04-26 13:32:22.451684003 -0500
  26516. @@ -62,7 +62,8 @@
  26517. return;
  26518. /* Reevalute with jiffies_lock held */
  26519. - write_seqlock(&jiffies_lock);
  26520. + raw_spin_lock(&jiffies_lock);
  26521. + write_seqcount_begin(&jiffies_seq);
  26522. delta = ktime_sub(now, last_jiffies_update);
  26523. if (delta.tv64 >= tick_period.tv64) {
  26524. @@ -85,10 +86,12 @@
  26525. /* Keep the tick_next_period variable up to date */
  26526. tick_next_period = ktime_add(last_jiffies_update, tick_period);
  26527. } else {
  26528. - write_sequnlock(&jiffies_lock);
  26529. + write_seqcount_end(&jiffies_seq);
  26530. + raw_spin_unlock(&jiffies_lock);
  26531. return;
  26532. }
  26533. - write_sequnlock(&jiffies_lock);
  26534. + write_seqcount_end(&jiffies_seq);
  26535. + raw_spin_unlock(&jiffies_lock);
  26536. update_wall_time();
  26537. }
  26538. @@ -99,12 +102,14 @@
  26539. {
  26540. ktime_t period;
  26541. - write_seqlock(&jiffies_lock);
  26542. + raw_spin_lock(&jiffies_lock);
  26543. + write_seqcount_begin(&jiffies_seq);
  26544. /* Did we start the jiffies update yet ? */
  26545. if (last_jiffies_update.tv64 == 0)
  26546. last_jiffies_update = tick_next_period;
  26547. period = last_jiffies_update;
  26548. - write_sequnlock(&jiffies_lock);
  26549. + write_seqcount_end(&jiffies_seq);
  26550. + raw_spin_unlock(&jiffies_lock);
  26551. return period;
  26552. }
  26553. @@ -176,6 +181,11 @@
  26554. return false;
  26555. }
  26556. + if (!arch_irq_work_has_interrupt()) {
  26557. + trace_tick_stop(0, "missing irq work interrupt\n");
  26558. + return false;
  26559. + }
  26560. +
  26561. /* sched_clock_tick() needs us? */
  26562. #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  26563. /*
  26564. @@ -217,11 +227,17 @@
  26565. static void nohz_full_kick_work_func(struct irq_work *work)
  26566. {
  26567. + unsigned long flags;
  26568. +
26569. + /* ksoftirqd processes softirqs with interrupts enabled */
  26570. + local_irq_save(flags);
  26571. __tick_nohz_full_check();
  26572. + local_irq_restore(flags);
  26573. }
  26574. static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
  26575. .func = nohz_full_kick_work_func,
  26576. + .flags = IRQ_WORK_HARD_IRQ,
  26577. };
  26578. /*
  26579. @@ -580,10 +596,10 @@
  26580. /* Read jiffies and the time when jiffies were updated last */
  26581. do {
  26582. - seq = read_seqbegin(&jiffies_lock);
  26583. + seq = read_seqcount_begin(&jiffies_seq);
  26584. last_update = last_jiffies_update;
  26585. last_jiffies = jiffies;
  26586. - } while (read_seqretry(&jiffies_lock, seq));
  26587. + } while (read_seqcount_retry(&jiffies_seq, seq));
  26588. if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
  26589. arch_needs_cpu() || irq_work_needs_cpu()) {
  26590. @@ -761,14 +777,7 @@
  26591. return false;
  26592. if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
  26593. - static int ratelimit;
  26594. -
  26595. - if (ratelimit < 10 &&
  26596. - (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
  26597. - pr_warn("NOHZ: local_softirq_pending %02x\n",
  26598. - (unsigned int) local_softirq_pending());
  26599. - ratelimit++;
  26600. - }
  26601. + softirq_check_pending_idle();
  26602. return false;
  26603. }
  26604. @@ -1156,6 +1165,7 @@
  26605. * Emulate tick processing via per-CPU hrtimers:
  26606. */
  26607. hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
  26608. + ts->sched_timer.irqsafe = 1;
  26609. ts->sched_timer.function = tick_sched_timer;
  26610. /* Get the next period (per cpu) */
  26611. diff -Nur linux-3.18.12.orig/kernel/time/timekeeping.c linux-3.18.12/kernel/time/timekeeping.c
  26612. --- linux-3.18.12.orig/kernel/time/timekeeping.c 2015-04-20 14:48:02.000000000 -0500
  26613. +++ linux-3.18.12/kernel/time/timekeeping.c 2015-04-26 13:32:22.451684003 -0500
  26614. @@ -1814,8 +1814,10 @@
  26615. */
  26616. void xtime_update(unsigned long ticks)
  26617. {
  26618. - write_seqlock(&jiffies_lock);
  26619. + raw_spin_lock(&jiffies_lock);
  26620. + write_seqcount_begin(&jiffies_seq);
  26621. do_timer(ticks);
  26622. - write_sequnlock(&jiffies_lock);
  26623. + write_seqcount_end(&jiffies_seq);
  26624. + raw_spin_unlock(&jiffies_lock);
  26625. update_wall_time();
  26626. }
  26627. diff -Nur linux-3.18.12.orig/kernel/time/timer.c linux-3.18.12/kernel/time/timer.c
  26628. --- linux-3.18.12.orig/kernel/time/timer.c 2015-04-20 14:48:02.000000000 -0500
  26629. +++ linux-3.18.12/kernel/time/timer.c 2015-04-26 13:32:22.455684003 -0500
  26630. @@ -78,6 +78,9 @@
  26631. struct tvec_base {
  26632. spinlock_t lock;
  26633. struct timer_list *running_timer;
  26634. +#ifdef CONFIG_PREEMPT_RT_FULL
  26635. + wait_queue_head_t wait_for_running_timer;
  26636. +#endif
  26637. unsigned long timer_jiffies;
  26638. unsigned long next_timer;
  26639. unsigned long active_timers;
  26640. @@ -758,6 +761,36 @@
  26641. }
  26642. }
  26643. +#ifndef CONFIG_PREEMPT_RT_FULL
  26644. +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
  26645. + struct tvec_base *old,
  26646. + struct tvec_base *new)
  26647. +{
  26648. + /* See the comment in lock_timer_base() */
  26649. + timer_set_base(timer, NULL);
  26650. + spin_unlock(&old->lock);
  26651. + spin_lock(&new->lock);
  26652. + timer_set_base(timer, new);
  26653. + return new;
  26654. +}
  26655. +#else
  26656. +static inline struct tvec_base *switch_timer_base(struct timer_list *timer,
  26657. + struct tvec_base *old,
  26658. + struct tvec_base *new)
  26659. +{
  26660. + /*
26661. + * We cannot do the above because we might be preempted and the
26662. + * preempting task would then see a NULL base and spin forever.
  26663. + */
  26664. + if (spin_trylock(&new->lock)) {
  26665. + timer_set_base(timer, new);
  26666. + spin_unlock(&old->lock);
  26667. + return new;
  26668. + }
  26669. + return old;
  26670. +}
  26671. +#endif
  26672. +
  26673. static inline int
  26674. __mod_timer(struct timer_list *timer, unsigned long expires,
  26675. bool pending_only, int pinned)
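
The RT variant of switch_timer_base() above only trylocks the new base while still holding the old one; on failure the timer simply stays on its current base, so there is never a window where the base pointer is NULL and a preempting task could spin forever. A userspace sketch of that trylock hand-over, not from the patch (struct base and switch_base() are invented names):

#include <pthread.h>
#include <stdio.h>

struct base {
        pthread_mutex_t lock;
        const char *name;
};

static struct base base_a = { PTHREAD_MUTEX_INITIALIZER, "A" };
static struct base base_b = { PTHREAD_MUTEX_INITIALIZER, "B" };

/* Returns the base whose lock the caller ends up holding. */
static struct base *switch_base(struct base *old, struct base *new)
{
        if (pthread_mutex_trylock(&new->lock) == 0) {
                /* Got the new lock: migrate and release the old one. */
                pthread_mutex_unlock(&old->lock);
                return new;
        }
        /* Contended: keep using the old base for now. */
        return old;
}

int main(void)
{
        struct base *cur = &base_a;

        pthread_mutex_lock(&cur->lock);
        cur = switch_base(cur, &base_b);
        printf("now on base %s\n", cur->name);
        pthread_mutex_unlock(&cur->lock);
        return 0;
}
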
  26676. @@ -788,14 +821,8 @@
  26677. * handler yet has not finished. This also guarantees that
  26678. * the timer is serialized wrt itself.
  26679. */
  26680. - if (likely(base->running_timer != timer)) {
  26681. - /* See the comment in lock_timer_base() */
  26682. - timer_set_base(timer, NULL);
  26683. - spin_unlock(&base->lock);
  26684. - base = new_base;
  26685. - spin_lock(&base->lock);
  26686. - timer_set_base(timer, base);
  26687. - }
  26688. + if (likely(base->running_timer != timer))
  26689. + base = switch_timer_base(timer, base, new_base);
  26690. }
  26691. timer->expires = expires;
  26692. @@ -969,6 +996,29 @@
  26693. }
  26694. EXPORT_SYMBOL_GPL(add_timer_on);
  26695. +#ifdef CONFIG_PREEMPT_RT_FULL
  26696. +/*
  26697. + * Wait for a running timer
  26698. + */
  26699. +static void wait_for_running_timer(struct timer_list *timer)
  26700. +{
  26701. + struct tvec_base *base = timer->base;
  26702. +
  26703. + if (base->running_timer == timer)
  26704. + wait_event(base->wait_for_running_timer,
  26705. + base->running_timer != timer);
  26706. +}
  26707. +
  26708. +# define wakeup_timer_waiters(b) wake_up(&(b)->wait_for_running_timer)
  26709. +#else
  26710. +static inline void wait_for_running_timer(struct timer_list *timer)
  26711. +{
  26712. + cpu_relax();
  26713. +}
  26714. +
  26715. +# define wakeup_timer_waiters(b) do { } while (0)
  26716. +#endif
  26717. +
  26718. /**
  26719. * del_timer - deactive a timer.
  26720. * @timer: the timer to be deactivated
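
With the hunk above, del_timer_sync() on RT sleeps on the base's wait queue until the running callback clears ->running_timer, instead of spinning with cpu_relax(). A userspace sketch of that wait/wake pairing, not from the patch (base_lock, callback_done and the function names are invented; the demo is single-threaded, so the wait never actually blocks):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  callback_done = PTHREAD_COND_INITIALIZER;
static bool callback_running;

/* Callback side: analog of __run_timers() clearing ->running_timer and
 * then calling wakeup_timer_waiters(). */
static void run_callback(void)
{
        pthread_mutex_lock(&base_lock);
        callback_running = true;
        pthread_mutex_unlock(&base_lock);

        puts("timer callback body");

        pthread_mutex_lock(&base_lock);
        callback_running = false;
        pthread_cond_broadcast(&callback_done);
        pthread_mutex_unlock(&base_lock);
}

/* Cancel side: analog of del_timer_sync() using wait_for_running_timer(). */
static void cancel_sync(void)
{
        pthread_mutex_lock(&base_lock);
        while (callback_running)
                pthread_cond_wait(&callback_done, &base_lock);
        pthread_mutex_unlock(&base_lock);
        puts("timer cancelled; callback is not running");
}

int main(void)
{
        run_callback();
        cancel_sync();
        return 0;
}
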
  26721. @@ -1026,7 +1076,7 @@
  26722. }
  26723. EXPORT_SYMBOL(try_to_del_timer_sync);
  26724. -#ifdef CONFIG_SMP
  26725. +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
  26726. /**
  26727. * del_timer_sync - deactivate a timer and wait for the handler to finish.
  26728. * @timer: the timer to be deactivated
  26729. @@ -1086,7 +1136,7 @@
  26730. int ret = try_to_del_timer_sync(timer);
  26731. if (ret >= 0)
  26732. return ret;
  26733. - cpu_relax();
  26734. + wait_for_running_timer(timer);
  26735. }
  26736. }
  26737. EXPORT_SYMBOL(del_timer_sync);
  26738. @@ -1207,15 +1257,17 @@
  26739. if (irqsafe) {
  26740. spin_unlock(&base->lock);
  26741. call_timer_fn(timer, fn, data);
  26742. + base->running_timer = NULL;
  26743. spin_lock(&base->lock);
  26744. } else {
  26745. spin_unlock_irq(&base->lock);
  26746. call_timer_fn(timer, fn, data);
  26747. + base->running_timer = NULL;
  26748. spin_lock_irq(&base->lock);
  26749. }
  26750. }
  26751. }
  26752. - base->running_timer = NULL;
  26753. + wakeup_timer_waiters(base);
  26754. spin_unlock_irq(&base->lock);
  26755. }
  26756. @@ -1355,17 +1407,31 @@
  26757. if (cpu_is_offline(smp_processor_id()))
  26758. return expires;
  26759. +#ifdef CONFIG_PREEMPT_RT_FULL
  26760. + /*
  26761. + * On PREEMPT_RT we cannot sleep here. If the trylock does not
  26762. + * succeed then we return the worst-case 'expires in 1 tick'
  26763. + * value. We use the rt functions here directly to avoid a
  26764. + * migrate_disable() call.
  26765. + */
  26766. + if (!spin_do_trylock(&base->lock))
  26767. + return now + 1;
  26768. +#else
  26769. spin_lock(&base->lock);
  26770. +#endif
  26771. if (base->active_timers) {
  26772. if (time_before_eq(base->next_timer, base->timer_jiffies))
  26773. base->next_timer = __next_timer_interrupt(base);
  26774. expires = base->next_timer;
  26775. }
  26776. +#ifdef CONFIG_PREEMPT_RT_FULL
  26777. + rt_spin_unlock_after_trylock_in_irq(&base->lock);
  26778. +#else
  26779. spin_unlock(&base->lock);
  26780. +#endif
  26781. if (time_before_eq(expires, now))
  26782. return now;
  26783. -
  26784. return cmp_next_hrtimer_event(now, expires);
  26785. }
  26786. #endif
  26787. @@ -1381,13 +1447,13 @@
  26788. /* Note: this timer irq context must be accounted for as well. */
  26789. account_process_tick(p, user_tick);
  26790. + scheduler_tick();
  26791. run_local_timers();
  26792. rcu_check_callbacks(cpu, user_tick);
  26793. -#ifdef CONFIG_IRQ_WORK
  26794. - if (in_irq())
  26795. - irq_work_tick();
  26796. +
  26797. +#if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
  26798. + irq_work_tick();
  26799. #endif
  26800. - scheduler_tick();
  26801. run_posix_cpu_timers(p);
  26802. }
  26803. @@ -1400,6 +1466,10 @@
  26804. hrtimer_run_pending();
  26805. +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
  26806. + irq_work_tick();
  26807. +#endif
  26808. +
  26809. if (time_after_eq(jiffies, base->timer_jiffies))
  26810. __run_timers(base);
  26811. }
  26812. @@ -1574,6 +1644,9 @@
  26813. base = per_cpu(tvec_bases, cpu);
  26814. }
  26815. +#ifdef CONFIG_PREEMPT_RT_FULL
  26816. + init_waitqueue_head(&base->wait_for_running_timer);
  26817. +#endif
  26818. for (j = 0; j < TVN_SIZE; j++) {
  26819. INIT_LIST_HEAD(base->tv5.vec + j);
  26820. @@ -1613,7 +1686,7 @@
  26821. BUG_ON(cpu_online(cpu));
  26822. old_base = per_cpu(tvec_bases, cpu);
  26823. - new_base = get_cpu_var(tvec_bases);
  26824. + new_base = get_local_var(tvec_bases);
  26825. /*
  26826. * The caller is globally serialized and nobody else
  26827. * takes two locks at once, deadlock is not possible.
  26828. @@ -1634,7 +1707,7 @@
  26829. spin_unlock(&old_base->lock);
  26830. spin_unlock_irq(&new_base->lock);
  26831. - put_cpu_var(tvec_bases);
  26832. + put_local_var(tvec_bases);
  26833. }
  26834. #endif /* CONFIG_HOTPLUG_CPU */
  26835. diff -Nur linux-3.18.12.orig/kernel/trace/Kconfig linux-3.18.12/kernel/trace/Kconfig
  26836. --- linux-3.18.12.orig/kernel/trace/Kconfig 2015-04-20 14:48:02.000000000 -0500
  26837. +++ linux-3.18.12/kernel/trace/Kconfig 2015-04-26 13:32:22.455684003 -0500
  26838. @@ -187,6 +187,24 @@
  26839. enabled. This option and the preempt-off timing option can be
  26840. used together or separately.)
  26841. +config INTERRUPT_OFF_HIST
  26842. + bool "Interrupts-off Latency Histogram"
  26843. + depends on IRQSOFF_TRACER
  26844. + help
  26845. + This option generates continuously updated histograms (one per cpu)
  26846. + of the duration of time periods with interrupts disabled. The
  26847. + histograms are disabled by default. To enable them, write a non-zero
  26848. + number to
  26849. +
  26850. + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
  26851. +
  26852. + If PREEMPT_OFF_HIST is also selected, additional histograms (one
  26853. + per cpu) are generated that accumulate the duration of time periods
  26854. + when both interrupts and preemption are disabled. The histogram data
  26855. + will be located in the debug file system at
  26856. +
  26857. + /sys/kernel/debug/tracing/latency_hist/irqsoff
  26858. +
  26859. config PREEMPT_TRACER
  26860. bool "Preemption-off Latency Tracer"
  26861. default n
  26862. @@ -211,6 +229,24 @@
  26863. enabled. This option and the irqs-off timing option can be
  26864. used together or separately.)
  26865. +config PREEMPT_OFF_HIST
  26866. + bool "Preemption-off Latency Histogram"
  26867. + depends on PREEMPT_TRACER
  26868. + help
  26869. + This option generates continuously updated histograms (one per cpu)
  26870. + of the duration of time periods with preemption disabled. The
  26871. + histograms are disabled by default. To enable them, write a non-zero
  26872. + number to
  26873. +
  26874. + /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
  26875. +
  26876. + If INTERRUPT_OFF_HIST is also selected, additional histograms (one
  26877. + per cpu) are generated that accumulate the duration of time periods
  26878. + when both interrupts and preemption are disabled. The histogram data
  26879. + will be located in the debug file system at
  26880. +
  26881. + /sys/kernel/debug/tracing/latency_hist/preemptoff
  26882. +
  26883. config SCHED_TRACER
  26884. bool "Scheduling Latency Tracer"
  26885. select GENERIC_TRACER
  26886. @@ -221,6 +257,74 @@
  26887. This tracer tracks the latency of the highest priority task
  26888. to be scheduled in, starting from the point it has woken up.
  26889. +config WAKEUP_LATENCY_HIST
  26890. + bool "Scheduling Latency Histogram"
  26891. + depends on SCHED_TRACER
  26892. + help
  26893. + This option generates continuously updated histograms (one per cpu)
  26894. + of the scheduling latency of the highest priority task.
  26895. + The histograms are disabled by default. To enable them, write a
  26896. + non-zero number to
  26897. +
  26898. + /sys/kernel/debug/tracing/latency_hist/enable/wakeup
  26899. +
  26900. + Two different algorithms are used, one to determine the latency of
  26901. + processes that exclusively use the highest priority of the system and
  26902. + another one to determine the latency of processes that share the
  26903. + highest system priority with other processes. The former is used to
  26904. + improve hardware and system software, the latter to optimize the
  26905. + priority design of a given system. The histogram data will be
  26906. + located in the debug file system at
  26907. +
  26908. + /sys/kernel/debug/tracing/latency_hist/wakeup
  26909. +
  26910. + and
  26911. +
  26912. + /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
  26913. +
  26914. + If both Scheduling Latency Histogram and Missed Timer Offsets
  26915. + Histogram are selected, additional histogram data will be collected
  26916. + that contain, in addition to the wakeup latency, the timer latency, in
  26917. + case the wakeup was triggered by an expired timer. These histograms
  26918. + are available in the
  26919. +
  26920. + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
  26921. +
  26922. + directory. They reflect the apparent interrupt and scheduling latency
26923. + and are best suited to determining the worst-case latency of a given
  26924. + system. To enable these histograms, write a non-zero number to
  26925. +
  26926. + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
  26927. +
  26928. +config MISSED_TIMER_OFFSETS_HIST
  26929. + depends on HIGH_RES_TIMERS
  26930. + select GENERIC_TRACER
  26931. + bool "Missed Timer Offsets Histogram"
  26932. + help
  26933. + Generate a histogram of missed timer offsets in microseconds. The
  26934. + histograms are disabled by default. To enable them, write a non-zero
  26935. + number to
  26936. +
  26937. + /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
  26938. +
  26939. + The histogram data will be located in the debug file system at
  26940. +
  26941. + /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
  26942. +
  26943. + If both Scheduling Latency Histogram and Missed Timer Offsets
  26944. + Histogram are selected, additional histogram data will be collected
  26945. + that contain, in addition to the wakeup latency, the timer latency, in
  26946. + case the wakeup was triggered by an expired timer. These histograms
  26947. + are available in the
  26948. +
  26949. + /sys/kernel/debug/tracing/latency_hist/timerandwakeup
  26950. +
  26951. + directory. They reflect the apparent interrupt and scheduling latency
26952. + and are best suited to determining the worst-case latency of a given
  26953. + system. To enable these histograms, write a non-zero number to
  26954. +
  26955. + /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
  26956. +
  26957. config ENABLE_DEFAULT_TRACERS
  26958. bool "Trace process context switches and events"
  26959. depends on !GENERIC_TRACER
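
The help texts above all follow the same usage model: write a non-zero number to a file under latency_hist/enable/ and read the per-CPU histograms back from debugfs. A small userspace sketch of that flow, assuming debugfs is mounted at /sys/kernel/debug and the wakeup histogram option is built in; the per-CPU file name ("CPU0") is an assumption, check the directory listing on a real system:

#include <stdio.h>

int main(void)
{
        const char *enable_path =
                "/sys/kernel/debug/tracing/latency_hist/enable/wakeup";
        char line[256];
        FILE *f;

        f = fopen(enable_path, "w");
        if (!f) {
                perror(enable_path);
                return 1;
        }
        fputs("1\n", f);        /* any non-zero number enables logging */
        fclose(f);

        /* Later: dump one per-CPU histogram (file name assumed). */
        f = fopen("/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0", "r");
        if (!f) {
                perror("wakeup/CPU0");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
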
  26960. diff -Nur linux-3.18.12.orig/kernel/trace/latency_hist.c linux-3.18.12/kernel/trace/latency_hist.c
  26961. --- linux-3.18.12.orig/kernel/trace/latency_hist.c 1969-12-31 18:00:00.000000000 -0600
  26962. +++ linux-3.18.12/kernel/trace/latency_hist.c 2015-04-26 13:32:22.455684003 -0500
  26963. @@ -0,0 +1,1178 @@
  26964. +/*
  26965. + * kernel/trace/latency_hist.c
  26966. + *
  26967. + * Add support for histograms of preemption-off latency and
26968. + * interrupt-off latency and wakeup latency; it depends on
  26969. + * Real-Time Preemption Support.
  26970. + *
  26971. + * Copyright (C) 2005 MontaVista Software, Inc.
  26972. + * Yi Yang <yyang@ch.mvista.com>
  26973. + *
  26974. + * Converted to work with the new latency tracer.
  26975. + * Copyright (C) 2008 Red Hat, Inc.
  26976. + * Steven Rostedt <srostedt@redhat.com>
  26977. + *
  26978. + */
  26979. +#include <linux/module.h>
  26980. +#include <linux/debugfs.h>
  26981. +#include <linux/seq_file.h>
  26982. +#include <linux/percpu.h>
  26983. +#include <linux/kallsyms.h>
  26984. +#include <linux/uaccess.h>
  26985. +#include <linux/sched.h>
  26986. +#include <linux/sched/rt.h>
  26987. +#include <linux/slab.h>
  26988. +#include <linux/atomic.h>
  26989. +#include <asm/div64.h>
  26990. +
  26991. +#include "trace.h"
  26992. +#include <trace/events/sched.h>
  26993. +
  26994. +#define NSECS_PER_USECS 1000L
  26995. +
  26996. +#define CREATE_TRACE_POINTS
  26997. +#include <trace/events/hist.h>
  26998. +
  26999. +enum {
  27000. + IRQSOFF_LATENCY = 0,
  27001. + PREEMPTOFF_LATENCY,
  27002. + PREEMPTIRQSOFF_LATENCY,
  27003. + WAKEUP_LATENCY,
  27004. + WAKEUP_LATENCY_SHAREDPRIO,
  27005. + MISSED_TIMER_OFFSETS,
  27006. + TIMERANDWAKEUP_LATENCY,
  27007. + MAX_LATENCY_TYPE,
  27008. +};
  27009. +
  27010. +#define MAX_ENTRY_NUM 10240
  27011. +
  27012. +struct hist_data {
  27013. + atomic_t hist_mode; /* 0 log, 1 don't log */
  27014. + long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
  27015. + long min_lat;
  27016. + long max_lat;
  27017. + unsigned long long below_hist_bound_samples;
  27018. + unsigned long long above_hist_bound_samples;
  27019. + long long accumulate_lat;
  27020. + unsigned long long total_samples;
  27021. + unsigned long long hist_array[MAX_ENTRY_NUM];
  27022. +};
  27023. +
  27024. +struct enable_data {
  27025. + int latency_type;
  27026. + int enabled;
  27027. +};
  27028. +
  27029. +static char *latency_hist_dir_root = "latency_hist";
  27030. +
  27031. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27032. +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
  27033. +static char *irqsoff_hist_dir = "irqsoff";
  27034. +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
  27035. +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
  27036. +#endif
  27037. +
  27038. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27039. +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
  27040. +static char *preemptoff_hist_dir = "preemptoff";
  27041. +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
  27042. +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
  27043. +#endif
  27044. +
  27045. +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
  27046. +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
  27047. +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
  27048. +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
  27049. +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
  27050. +#endif
  27051. +
  27052. +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
  27053. +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
  27054. +static struct enable_data preemptirqsoff_enabled_data = {
  27055. + .latency_type = PREEMPTIRQSOFF_LATENCY,
  27056. + .enabled = 0,
  27057. +};
  27058. +#endif
  27059. +
  27060. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27061. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27062. +struct maxlatproc_data {
  27063. + char comm[FIELD_SIZEOF(struct task_struct, comm)];
  27064. + char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
  27065. + int pid;
  27066. + int current_pid;
  27067. + int prio;
  27068. + int current_prio;
  27069. + long latency;
  27070. + long timeroffset;
  27071. + cycle_t timestamp;
  27072. +};
  27073. +#endif
  27074. +
  27075. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27076. +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
  27077. +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
  27078. +static char *wakeup_latency_hist_dir = "wakeup";
  27079. +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
  27080. +static notrace void probe_wakeup_latency_hist_start(void *v,
  27081. + struct task_struct *p, int success);
  27082. +static notrace void probe_wakeup_latency_hist_stop(void *v,
  27083. + struct task_struct *prev, struct task_struct *next);
  27084. +static notrace void probe_sched_migrate_task(void *,
  27085. + struct task_struct *task, int cpu);
  27086. +static struct enable_data wakeup_latency_enabled_data = {
  27087. + .latency_type = WAKEUP_LATENCY,
  27088. + .enabled = 0,
  27089. +};
  27090. +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
  27091. +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
  27092. +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
  27093. +static DEFINE_PER_CPU(int, wakeup_sharedprio);
  27094. +static unsigned long wakeup_pid;
  27095. +#endif
  27096. +
  27097. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27098. +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
  27099. +static char *missed_timer_offsets_dir = "missed_timer_offsets";
  27100. +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
  27101. + long long offset, struct task_struct *curr, struct task_struct *task);
  27102. +static struct enable_data missed_timer_offsets_enabled_data = {
  27103. + .latency_type = MISSED_TIMER_OFFSETS,
  27104. + .enabled = 0,
  27105. +};
  27106. +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
  27107. +static unsigned long missed_timer_offsets_pid;
  27108. +#endif
  27109. +
  27110. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27111. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27112. +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
  27113. +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
  27114. +static struct enable_data timerandwakeup_enabled_data = {
  27115. + .latency_type = TIMERANDWAKEUP_LATENCY,
  27116. + .enabled = 0,
  27117. +};
  27118. +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
  27119. +#endif
  27120. +
  27121. +void notrace latency_hist(int latency_type, int cpu, long latency,
  27122. + long timeroffset, cycle_t stop,
  27123. + struct task_struct *p)
  27124. +{
  27125. + struct hist_data *my_hist;
  27126. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27127. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27128. + struct maxlatproc_data *mp = NULL;
  27129. +#endif
  27130. +
  27131. + if (!cpu_possible(cpu) || latency_type < 0 ||
  27132. + latency_type >= MAX_LATENCY_TYPE)
  27133. + return;
  27134. +
  27135. + switch (latency_type) {
  27136. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27137. + case IRQSOFF_LATENCY:
  27138. + my_hist = &per_cpu(irqsoff_hist, cpu);
  27139. + break;
  27140. +#endif
  27141. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27142. + case PREEMPTOFF_LATENCY:
  27143. + my_hist = &per_cpu(preemptoff_hist, cpu);
  27144. + break;
  27145. +#endif
  27146. +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
  27147. + case PREEMPTIRQSOFF_LATENCY:
  27148. + my_hist = &per_cpu(preemptirqsoff_hist, cpu);
  27149. + break;
  27150. +#endif
  27151. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27152. + case WAKEUP_LATENCY:
  27153. + my_hist = &per_cpu(wakeup_latency_hist, cpu);
  27154. + mp = &per_cpu(wakeup_maxlatproc, cpu);
  27155. + break;
  27156. + case WAKEUP_LATENCY_SHAREDPRIO:
  27157. + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
  27158. + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
  27159. + break;
  27160. +#endif
  27161. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27162. + case MISSED_TIMER_OFFSETS:
  27163. + my_hist = &per_cpu(missed_timer_offsets, cpu);
  27164. + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
  27165. + break;
  27166. +#endif
  27167. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27168. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27169. + case TIMERANDWAKEUP_LATENCY:
  27170. + my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
  27171. + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
  27172. + break;
  27173. +#endif
  27174. +
  27175. + default:
  27176. + return;
  27177. + }
  27178. +
  27179. + latency += my_hist->offset;
  27180. +
  27181. + if (atomic_read(&my_hist->hist_mode) == 0)
  27182. + return;
  27183. +
  27184. + if (latency < 0 || latency >= MAX_ENTRY_NUM) {
  27185. + if (latency < 0)
  27186. + my_hist->below_hist_bound_samples++;
  27187. + else
  27188. + my_hist->above_hist_bound_samples++;
  27189. + } else
  27190. + my_hist->hist_array[latency]++;
  27191. +
  27192. + if (unlikely(latency > my_hist->max_lat ||
  27193. + my_hist->min_lat == LONG_MAX)) {
  27194. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27195. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27196. + if (latency_type == WAKEUP_LATENCY ||
  27197. + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
  27198. + latency_type == MISSED_TIMER_OFFSETS ||
  27199. + latency_type == TIMERANDWAKEUP_LATENCY) {
  27200. + strncpy(mp->comm, p->comm, sizeof(mp->comm));
  27201. + strncpy(mp->current_comm, current->comm,
  27202. + sizeof(mp->current_comm));
  27203. + mp->pid = task_pid_nr(p);
  27204. + mp->current_pid = task_pid_nr(current);
  27205. + mp->prio = p->prio;
  27206. + mp->current_prio = current->prio;
  27207. + mp->latency = latency;
  27208. + mp->timeroffset = timeroffset;
  27209. + mp->timestamp = stop;
  27210. + }
  27211. +#endif
  27212. + my_hist->max_lat = latency;
  27213. + }
  27214. + if (unlikely(latency < my_hist->min_lat))
  27215. + my_hist->min_lat = latency;
  27216. + my_hist->total_samples++;
  27217. + my_hist->accumulate_lat += latency;
  27218. +}
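
The bookkeeping in latency_hist() is easy to lose in the per-type switch, so here is a stripped-down, standalone sketch of just the accounting step: apply the offset, bucket or count out-of-range samples, and track min/max/average. Only MAX_ENTRY_NUM is taken from the patch; struct hist and hist_account() are simplified stand-ins:

#include <limits.h>
#include <stdio.h>

#define MAX_ENTRY_NUM 10240

struct hist {
        long offset;                       /* MAX_ENTRY_NUM/2 for a bipolar scale */
        long min_lat, max_lat;
        unsigned long long below, above;   /* samples outside the array */
        unsigned long long total;
        long long accumulated;
        unsigned long long bucket[MAX_ENTRY_NUM];
};

static void hist_account(struct hist *h, long latency)
{
        latency += h->offset;

        if (latency < 0)
                h->below++;
        else if (latency >= MAX_ENTRY_NUM)
                h->above++;
        else
                h->bucket[latency]++;

        if (latency > h->max_lat || h->min_lat == LONG_MAX)
                h->max_lat = latency;
        if (latency < h->min_lat)
                h->min_lat = latency;
        h->total++;
        h->accumulated += latency;
}

int main(void)
{
        static struct hist h = { .min_lat = LONG_MAX, .max_lat = LONG_MIN };
        long samples[] = { 3, 12, 12, 7, 20000 };   /* microseconds */
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                hist_account(&h, samples[i]);

        printf("samples=%llu min=%ld max=%ld avg=%lld out-of-range=%llu\n",
               h.total, h.min_lat - h.offset, h.max_lat - h.offset,
               h.accumulated / (long long)h.total - h.offset,
               h.below + h.above);
        return 0;
}
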
  27219. +
  27220. +static void *l_start(struct seq_file *m, loff_t *pos)
  27221. +{
  27222. + loff_t *index_ptr = NULL;
  27223. + loff_t index = *pos;
  27224. + struct hist_data *my_hist = m->private;
  27225. +
  27226. + if (index == 0) {
  27227. + char minstr[32], avgstr[32], maxstr[32];
  27228. +
  27229. + atomic_dec(&my_hist->hist_mode);
  27230. +
  27231. + if (likely(my_hist->total_samples)) {
  27232. + long avg = (long) div64_s64(my_hist->accumulate_lat,
  27233. + my_hist->total_samples);
  27234. + snprintf(minstr, sizeof(minstr), "%ld",
  27235. + my_hist->min_lat - my_hist->offset);
  27236. + snprintf(avgstr, sizeof(avgstr), "%ld",
  27237. + avg - my_hist->offset);
  27238. + snprintf(maxstr, sizeof(maxstr), "%ld",
  27239. + my_hist->max_lat - my_hist->offset);
  27240. + } else {
  27241. + strcpy(minstr, "<undef>");
  27242. + strcpy(avgstr, minstr);
  27243. + strcpy(maxstr, minstr);
  27244. + }
  27245. +
  27246. + seq_printf(m, "#Minimum latency: %s microseconds\n"
  27247. + "#Average latency: %s microseconds\n"
  27248. + "#Maximum latency: %s microseconds\n"
  27249. + "#Total samples: %llu\n"
  27250. + "#There are %llu samples lower than %ld"
  27251. + " microseconds.\n"
  27252. + "#There are %llu samples greater or equal"
  27253. + " than %ld microseconds.\n"
  27254. + "#usecs\t%16s\n",
  27255. + minstr, avgstr, maxstr,
  27256. + my_hist->total_samples,
  27257. + my_hist->below_hist_bound_samples,
  27258. + -my_hist->offset,
  27259. + my_hist->above_hist_bound_samples,
  27260. + MAX_ENTRY_NUM - my_hist->offset,
  27261. + "samples");
  27262. + }
  27263. + if (index < MAX_ENTRY_NUM) {
  27264. + index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
  27265. + if (index_ptr)
  27266. + *index_ptr = index;
  27267. + }
  27268. +
  27269. + return index_ptr;
  27270. +}
  27271. +
  27272. +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
  27273. +{
  27274. + loff_t *index_ptr = p;
  27275. + struct hist_data *my_hist = m->private;
  27276. +
  27277. + if (++*pos >= MAX_ENTRY_NUM) {
  27278. + atomic_inc(&my_hist->hist_mode);
  27279. + return NULL;
  27280. + }
  27281. + *index_ptr = *pos;
  27282. + return index_ptr;
  27283. +}
  27284. +
  27285. +static void l_stop(struct seq_file *m, void *p)
  27286. +{
  27287. + kfree(p);
  27288. +}
  27289. +
  27290. +static int l_show(struct seq_file *m, void *p)
  27291. +{
  27292. + int index = *(loff_t *) p;
  27293. + struct hist_data *my_hist = m->private;
  27294. +
  27295. + seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
  27296. + my_hist->hist_array[index]);
  27297. + return 0;
  27298. +}
  27299. +
  27300. +static const struct seq_operations latency_hist_seq_op = {
  27301. + .start = l_start,
  27302. + .next = l_next,
  27303. + .stop = l_stop,
  27304. + .show = l_show
  27305. +};
  27306. +
  27307. +static int latency_hist_open(struct inode *inode, struct file *file)
  27308. +{
  27309. + int ret;
  27310. +
  27311. + ret = seq_open(file, &latency_hist_seq_op);
  27312. + if (!ret) {
  27313. + struct seq_file *seq = file->private_data;
  27314. + seq->private = inode->i_private;
  27315. + }
  27316. + return ret;
  27317. +}
  27318. +
  27319. +static const struct file_operations latency_hist_fops = {
  27320. + .open = latency_hist_open,
  27321. + .read = seq_read,
  27322. + .llseek = seq_lseek,
  27323. + .release = seq_release,
  27324. +};
  27325. +
  27326. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27327. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27328. +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
  27329. +{
  27330. + mp->comm[0] = mp->current_comm[0] = '\0';
  27331. + mp->prio = mp->current_prio = mp->pid = mp->current_pid =
  27332. + mp->latency = mp->timeroffset = -1;
  27333. + mp->timestamp = 0;
  27334. +}
  27335. +#endif
  27336. +
  27337. +static void hist_reset(struct hist_data *hist)
  27338. +{
  27339. + atomic_dec(&hist->hist_mode);
  27340. +
  27341. + memset(hist->hist_array, 0, sizeof(hist->hist_array));
  27342. + hist->below_hist_bound_samples = 0ULL;
  27343. + hist->above_hist_bound_samples = 0ULL;
  27344. + hist->min_lat = LONG_MAX;
  27345. + hist->max_lat = LONG_MIN;
  27346. + hist->total_samples = 0ULL;
  27347. + hist->accumulate_lat = 0LL;
  27348. +
  27349. + atomic_inc(&hist->hist_mode);
  27350. +}
  27351. +
  27352. +static ssize_t
  27353. +latency_hist_reset(struct file *file, const char __user *a,
  27354. + size_t size, loff_t *off)
  27355. +{
  27356. + int cpu;
  27357. + struct hist_data *hist = NULL;
  27358. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27359. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27360. + struct maxlatproc_data *mp = NULL;
  27361. +#endif
  27362. + off_t latency_type = (off_t) file->private_data;
  27363. +
  27364. + for_each_online_cpu(cpu) {
  27365. +
  27366. + switch (latency_type) {
  27367. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27368. + case PREEMPTOFF_LATENCY:
  27369. + hist = &per_cpu(preemptoff_hist, cpu);
  27370. + break;
  27371. +#endif
  27372. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27373. + case IRQSOFF_LATENCY:
  27374. + hist = &per_cpu(irqsoff_hist, cpu);
  27375. + break;
  27376. +#endif
  27377. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27378. + case PREEMPTIRQSOFF_LATENCY:
  27379. + hist = &per_cpu(preemptirqsoff_hist, cpu);
  27380. + break;
  27381. +#endif
  27382. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27383. + case WAKEUP_LATENCY:
  27384. + hist = &per_cpu(wakeup_latency_hist, cpu);
  27385. + mp = &per_cpu(wakeup_maxlatproc, cpu);
  27386. + break;
  27387. + case WAKEUP_LATENCY_SHAREDPRIO:
  27388. + hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
  27389. + mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
  27390. + break;
  27391. +#endif
  27392. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27393. + case MISSED_TIMER_OFFSETS:
  27394. + hist = &per_cpu(missed_timer_offsets, cpu);
  27395. + mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
  27396. + break;
  27397. +#endif
  27398. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27399. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27400. + case TIMERANDWAKEUP_LATENCY:
  27401. + hist = &per_cpu(timerandwakeup_latency_hist, cpu);
  27402. + mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
  27403. + break;
  27404. +#endif
  27405. + }
  27406. +
  27407. + hist_reset(hist);
  27408. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27409. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27410. + if (latency_type == WAKEUP_LATENCY ||
  27411. + latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
  27412. + latency_type == MISSED_TIMER_OFFSETS ||
  27413. + latency_type == TIMERANDWAKEUP_LATENCY)
  27414. + clear_maxlatprocdata(mp);
  27415. +#endif
  27416. + }
  27417. +
  27418. + return size;
  27419. +}
  27420. +
  27421. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27422. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27423. +static ssize_t
  27424. +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
  27425. +{
  27426. + char buf[64];
  27427. + int r;
  27428. + unsigned long *this_pid = file->private_data;
  27429. +
  27430. + r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
  27431. + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  27432. +}
  27433. +
  27434. +static ssize_t do_pid(struct file *file, const char __user *ubuf,
  27435. + size_t cnt, loff_t *ppos)
  27436. +{
  27437. + char buf[64];
  27438. + unsigned long pid;
  27439. + unsigned long *this_pid = file->private_data;
  27440. +
  27441. + if (cnt >= sizeof(buf))
  27442. + return -EINVAL;
  27443. +
  27444. + if (copy_from_user(&buf, ubuf, cnt))
  27445. + return -EFAULT;
  27446. +
  27447. + buf[cnt] = '\0';
  27448. +
  27449. + if (kstrtoul(buf, 10, &pid))
  27450. + return -EINVAL;
  27451. +
  27452. + *this_pid = pid;
  27453. +
  27454. + return cnt;
  27455. +}
  27456. +#endif
  27457. +
  27458. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27459. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27460. +static ssize_t
  27461. +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
  27462. +{
  27463. + int r;
  27464. + struct maxlatproc_data *mp = file->private_data;
  27465. + int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
  27466. + unsigned long long t;
  27467. + unsigned long usecs, secs;
  27468. + char *buf;
  27469. +
  27470. + if (mp->pid == -1 || mp->current_pid == -1) {
  27471. + buf = "(none)\n";
  27472. + return simple_read_from_buffer(ubuf, cnt, ppos, buf,
  27473. + strlen(buf));
  27474. + }
  27475. +
  27476. + buf = kmalloc(strmaxlen, GFP_KERNEL);
  27477. + if (buf == NULL)
  27478. + return -ENOMEM;
  27479. +
  27480. + t = ns2usecs(mp->timestamp);
  27481. + usecs = do_div(t, USEC_PER_SEC);
  27482. + secs = (unsigned long) t;
  27483. + r = snprintf(buf, strmaxlen,
  27484. + "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
  27485. + MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
  27486. + mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
  27487. + secs, usecs);
  27488. + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  27489. + kfree(buf);
  27490. + return r;
  27491. +}
  27492. +#endif
  27493. +
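
show_maxlatproc() above prints the timestamp of the worst sample as "seconds.microseconds" by converting nanoseconds to microseconds and then splitting off the seconds with do_div(). A tiny standalone sketch of the same conversion, with an invented example value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t timestamp_ns = 12345678901234ULL;   /* example timestamp */
        uint64_t usecs_total = timestamp_ns / 1000;  /* ns2usecs() analog */
        unsigned long secs = (unsigned long)(usecs_total / 1000000);
        unsigned long usecs = (unsigned long)(usecs_total % 1000000);

        printf("%lu.%06lu\n", secs, usecs);          /* prints 12345.678901 */
        return 0;
}
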
  27494. +static ssize_t
  27495. +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
  27496. +{
  27497. + char buf[64];
  27498. + struct enable_data *ed = file->private_data;
  27499. + int r;
  27500. +
  27501. + r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
  27502. + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
  27503. +}
  27504. +
  27505. +static ssize_t
  27506. +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
  27507. +{
  27508. + char buf[64];
  27509. + long enable;
  27510. + struct enable_data *ed = file->private_data;
  27511. +
  27512. + if (cnt >= sizeof(buf))
  27513. + return -EINVAL;
  27514. +
  27515. + if (copy_from_user(&buf, ubuf, cnt))
  27516. + return -EFAULT;
  27517. +
  27518. + buf[cnt] = 0;
  27519. +
  27520. + if (kstrtoul(buf, 10, &enable))
  27521. + return -EINVAL;
  27522. +
  27523. + if ((enable && ed->enabled) || (!enable && !ed->enabled))
  27524. + return cnt;
  27525. +
  27526. + if (enable) {
  27527. + int ret;
  27528. +
  27529. + switch (ed->latency_type) {
  27530. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27531. + case PREEMPTIRQSOFF_LATENCY:
  27532. + ret = register_trace_preemptirqsoff_hist(
  27533. + probe_preemptirqsoff_hist, NULL);
  27534. + if (ret) {
  27535. + pr_info("wakeup trace: Couldn't assign "
  27536. + "probe_preemptirqsoff_hist "
  27537. + "to trace_preemptirqsoff_hist\n");
  27538. + return ret;
  27539. + }
  27540. + break;
  27541. +#endif
  27542. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27543. + case WAKEUP_LATENCY:
  27544. + ret = register_trace_sched_wakeup(
  27545. + probe_wakeup_latency_hist_start, NULL);
  27546. + if (ret) {
  27547. + pr_info("wakeup trace: Couldn't assign "
  27548. + "probe_wakeup_latency_hist_start "
  27549. + "to trace_sched_wakeup\n");
  27550. + return ret;
  27551. + }
  27552. + ret = register_trace_sched_wakeup_new(
  27553. + probe_wakeup_latency_hist_start, NULL);
  27554. + if (ret) {
  27555. + pr_info("wakeup trace: Couldn't assign "
  27556. + "probe_wakeup_latency_hist_start "
  27557. + "to trace_sched_wakeup_new\n");
  27558. + unregister_trace_sched_wakeup(
  27559. + probe_wakeup_latency_hist_start, NULL);
  27560. + return ret;
  27561. + }
  27562. + ret = register_trace_sched_switch(
  27563. + probe_wakeup_latency_hist_stop, NULL);
  27564. + if (ret) {
  27565. + pr_info("wakeup trace: Couldn't assign "
  27566. + "probe_wakeup_latency_hist_stop "
  27567. + "to trace_sched_switch\n");
  27568. + unregister_trace_sched_wakeup(
  27569. + probe_wakeup_latency_hist_start, NULL);
  27570. + unregister_trace_sched_wakeup_new(
  27571. + probe_wakeup_latency_hist_start, NULL);
  27572. + return ret;
  27573. + }
  27574. + ret = register_trace_sched_migrate_task(
  27575. + probe_sched_migrate_task, NULL);
  27576. + if (ret) {
  27577. + pr_info("wakeup trace: Couldn't assign "
  27578. + "probe_sched_migrate_task "
  27579. + "to trace_sched_migrate_task\n");
  27580. + unregister_trace_sched_wakeup(
  27581. + probe_wakeup_latency_hist_start, NULL);
  27582. + unregister_trace_sched_wakeup_new(
  27583. + probe_wakeup_latency_hist_start, NULL);
  27584. + unregister_trace_sched_switch(
  27585. + probe_wakeup_latency_hist_stop, NULL);
  27586. + return ret;
  27587. + }
  27588. + break;
  27589. +#endif
  27590. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27591. + case MISSED_TIMER_OFFSETS:
  27592. + ret = register_trace_hrtimer_interrupt(
  27593. + probe_hrtimer_interrupt, NULL);
  27594. + if (ret) {
  27595. + pr_info("wakeup trace: Couldn't assign "
  27596. + "probe_hrtimer_interrupt "
  27597. + "to trace_hrtimer_interrupt\n");
  27598. + return ret;
  27599. + }
  27600. + break;
  27601. +#endif
  27602. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  27603. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27604. + case TIMERANDWAKEUP_LATENCY:
  27605. + if (!wakeup_latency_enabled_data.enabled ||
  27606. + !missed_timer_offsets_enabled_data.enabled)
  27607. + return -EINVAL;
  27608. + break;
  27609. +#endif
  27610. + default:
  27611. + break;
  27612. + }
  27613. + } else {
  27614. + switch (ed->latency_type) {
  27615. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27616. + case PREEMPTIRQSOFF_LATENCY:
  27617. + {
  27618. + int cpu;
  27619. +
  27620. + unregister_trace_preemptirqsoff_hist(
  27621. + probe_preemptirqsoff_hist, NULL);
  27622. + for_each_online_cpu(cpu) {
  27623. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27624. + per_cpu(hist_irqsoff_counting,
  27625. + cpu) = 0;
  27626. +#endif
  27627. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27628. + per_cpu(hist_preemptoff_counting,
  27629. + cpu) = 0;
  27630. +#endif
  27631. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27632. + per_cpu(hist_preemptirqsoff_counting,
  27633. + cpu) = 0;
  27634. +#endif
  27635. + }
  27636. + }
  27637. + break;
  27638. +#endif
  27639. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27640. + case WAKEUP_LATENCY:
  27641. + {
  27642. + int cpu;
  27643. +
  27644. + unregister_trace_sched_wakeup(
  27645. + probe_wakeup_latency_hist_start, NULL);
  27646. + unregister_trace_sched_wakeup_new(
  27647. + probe_wakeup_latency_hist_start, NULL);
  27648. + unregister_trace_sched_switch(
  27649. + probe_wakeup_latency_hist_stop, NULL);
  27650. + unregister_trace_sched_migrate_task(
  27651. + probe_sched_migrate_task, NULL);
  27652. +
  27653. + for_each_online_cpu(cpu) {
  27654. + per_cpu(wakeup_task, cpu) = NULL;
  27655. + per_cpu(wakeup_sharedprio, cpu) = 0;
  27656. + }
  27657. + }
  27658. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27659. + timerandwakeup_enabled_data.enabled = 0;
  27660. +#endif
  27661. + break;
  27662. +#endif
  27663. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27664. + case MISSED_TIMER_OFFSETS:
  27665. + unregister_trace_hrtimer_interrupt(
  27666. + probe_hrtimer_interrupt, NULL);
  27667. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27668. + timerandwakeup_enabled_data.enabled = 0;
  27669. +#endif
  27670. + break;
  27671. +#endif
  27672. + default:
  27673. + break;
  27674. + }
  27675. + }
  27676. + ed->enabled = enable;
  27677. + return cnt;
  27678. +}
  27679. +
  27680. +static const struct file_operations latency_hist_reset_fops = {
  27681. + .open = tracing_open_generic,
  27682. + .write = latency_hist_reset,
  27683. +};
  27684. +
  27685. +static const struct file_operations enable_fops = {
  27686. + .open = tracing_open_generic,
  27687. + .read = show_enable,
  27688. + .write = do_enable,
  27689. +};
  27690. +
  27691. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27692. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27693. +static const struct file_operations pid_fops = {
  27694. + .open = tracing_open_generic,
  27695. + .read = show_pid,
  27696. + .write = do_pid,
  27697. +};
  27698. +
  27699. +static const struct file_operations maxlatproc_fops = {
  27700. + .open = tracing_open_generic,
  27701. + .read = show_maxlatproc,
  27702. +};
  27703. +#endif
  27704. +
  27705. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  27706. +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
  27707. + int starthist)
  27708. +{
  27709. + int cpu = raw_smp_processor_id();
  27710. + int time_set = 0;
  27711. +
  27712. + if (starthist) {
  27713. + cycle_t uninitialized_var(start);
  27714. +
  27715. + if (!preempt_count() && !irqs_disabled())
  27716. + return;
  27717. +
  27718. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27719. + if ((reason == IRQS_OFF || reason == TRACE_START) &&
  27720. + !per_cpu(hist_irqsoff_counting, cpu)) {
  27721. + per_cpu(hist_irqsoff_counting, cpu) = 1;
  27722. + start = ftrace_now(cpu);
  27723. + time_set++;
  27724. + per_cpu(hist_irqsoff_start, cpu) = start;
  27725. + }
  27726. +#endif
  27727. +
  27728. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27729. + if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
  27730. + !per_cpu(hist_preemptoff_counting, cpu)) {
  27731. + per_cpu(hist_preemptoff_counting, cpu) = 1;
  27732. + if (!(time_set++))
  27733. + start = ftrace_now(cpu);
  27734. + per_cpu(hist_preemptoff_start, cpu) = start;
  27735. + }
  27736. +#endif
  27737. +
  27738. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27739. + if (per_cpu(hist_irqsoff_counting, cpu) &&
  27740. + per_cpu(hist_preemptoff_counting, cpu) &&
  27741. + !per_cpu(hist_preemptirqsoff_counting, cpu)) {
  27742. + per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
  27743. + if (!time_set)
  27744. + start = ftrace_now(cpu);
  27745. + per_cpu(hist_preemptirqsoff_start, cpu) = start;
  27746. + }
  27747. +#endif
  27748. + } else {
  27749. + cycle_t uninitialized_var(stop);
  27750. +
  27751. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27752. + if ((reason == IRQS_ON || reason == TRACE_STOP) &&
  27753. + per_cpu(hist_irqsoff_counting, cpu)) {
  27754. + cycle_t start = per_cpu(hist_irqsoff_start, cpu);
  27755. +
  27756. + stop = ftrace_now(cpu);
  27757. + time_set++;
  27758. + if (start) {
  27759. + long latency = ((long) (stop - start)) /
  27760. + NSECS_PER_USECS;
  27761. +
  27762. + latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
  27763. + stop, NULL);
  27764. + }
  27765. + per_cpu(hist_irqsoff_counting, cpu) = 0;
  27766. + }
  27767. +#endif
  27768. +
  27769. +#ifdef CONFIG_PREEMPT_OFF_HIST
  27770. + if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
  27771. + per_cpu(hist_preemptoff_counting, cpu)) {
  27772. + cycle_t start = per_cpu(hist_preemptoff_start, cpu);
  27773. +
  27774. + if (!(time_set++))
  27775. + stop = ftrace_now(cpu);
  27776. + if (start) {
  27777. + long latency = ((long) (stop - start)) /
  27778. + NSECS_PER_USECS;
  27779. +
  27780. + latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
  27781. + 0, stop, NULL);
  27782. + }
  27783. + per_cpu(hist_preemptoff_counting, cpu) = 0;
  27784. + }
  27785. +#endif
  27786. +
  27787. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  27788. + if ((!per_cpu(hist_irqsoff_counting, cpu) ||
  27789. + !per_cpu(hist_preemptoff_counting, cpu)) &&
  27790. + per_cpu(hist_preemptirqsoff_counting, cpu)) {
  27791. + cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
  27792. +
  27793. + if (!time_set)
  27794. + stop = ftrace_now(cpu);
  27795. + if (start) {
  27796. + long latency = ((long) (stop - start)) /
  27797. + NSECS_PER_USECS;
  27798. +
  27799. + latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
  27800. + latency, 0, stop, NULL);
  27801. + }
  27802. + per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
  27803. + }
  27804. +#endif
  27805. + }
  27806. +}
  27807. +#endif
  27808. +
  27809. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27810. +static DEFINE_RAW_SPINLOCK(wakeup_lock);
  27811. +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
  27812. + int cpu)
  27813. +{
  27814. + int old_cpu = task_cpu(task);
  27815. +
  27816. + if (cpu != old_cpu) {
  27817. + unsigned long flags;
  27818. + struct task_struct *cpu_wakeup_task;
  27819. +
  27820. + raw_spin_lock_irqsave(&wakeup_lock, flags);
  27821. +
  27822. + cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
  27823. + if (task == cpu_wakeup_task) {
  27824. + put_task_struct(cpu_wakeup_task);
  27825. + per_cpu(wakeup_task, old_cpu) = NULL;
  27826. + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
  27827. + get_task_struct(cpu_wakeup_task);
  27828. + }
  27829. +
  27830. + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
  27831. + }
  27832. +}
  27833. +
  27834. +static notrace void probe_wakeup_latency_hist_start(void *v,
  27835. + struct task_struct *p, int success)
  27836. +{
  27837. + unsigned long flags;
  27838. + struct task_struct *curr = current;
  27839. + int cpu = task_cpu(p);
  27840. + struct task_struct *cpu_wakeup_task;
  27841. +
  27842. + raw_spin_lock_irqsave(&wakeup_lock, flags);
  27843. +
  27844. + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
  27845. +
  27846. + if (wakeup_pid) {
  27847. + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
  27848. + p->prio == curr->prio)
  27849. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27850. + if (likely(wakeup_pid != task_pid_nr(p)))
  27851. + goto out;
  27852. + } else {
  27853. + if (likely(!rt_task(p)) ||
  27854. + (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
  27855. + p->prio > curr->prio)
  27856. + goto out;
  27857. + if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
  27858. + p->prio == curr->prio)
  27859. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27860. + }
  27861. +
  27862. + if (cpu_wakeup_task)
  27863. + put_task_struct(cpu_wakeup_task);
  27864. + cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
  27865. + get_task_struct(cpu_wakeup_task);
  27866. + cpu_wakeup_task->preempt_timestamp_hist =
  27867. + ftrace_now(raw_smp_processor_id());
  27868. +out:
  27869. + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
  27870. +}
  27871. +
  27872. +static notrace void probe_wakeup_latency_hist_stop(void *v,
  27873. + struct task_struct *prev, struct task_struct *next)
  27874. +{
  27875. + unsigned long flags;
  27876. + int cpu = task_cpu(next);
  27877. + long latency;
  27878. + cycle_t stop;
  27879. + struct task_struct *cpu_wakeup_task;
  27880. +
  27881. + raw_spin_lock_irqsave(&wakeup_lock, flags);
  27882. +
  27883. + cpu_wakeup_task = per_cpu(wakeup_task, cpu);
  27884. +
  27885. + if (cpu_wakeup_task == NULL)
  27886. + goto out;
  27887. +
  27888. + /* Already running? */
  27889. + if (unlikely(current == cpu_wakeup_task))
  27890. + goto out_reset;
  27891. +
  27892. + if (next != cpu_wakeup_task) {
  27893. + if (next->prio < cpu_wakeup_task->prio)
  27894. + goto out_reset;
  27895. +
  27896. + if (next->prio == cpu_wakeup_task->prio)
  27897. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27898. +
  27899. + goto out;
  27900. + }
  27901. +
  27902. + if (current->prio == cpu_wakeup_task->prio)
  27903. + per_cpu(wakeup_sharedprio, cpu) = 1;
  27904. +
  27905. + /*
  27906. + * The task we are waiting for is about to be switched to.
  27907. + * Calculate latency and store it in histogram.
  27908. + */
  27909. + stop = ftrace_now(raw_smp_processor_id());
  27910. +
  27911. + latency = ((long) (stop - next->preempt_timestamp_hist)) /
  27912. + NSECS_PER_USECS;
  27913. +
  27914. + if (per_cpu(wakeup_sharedprio, cpu)) {
  27915. + latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
  27916. + next);
  27917. + per_cpu(wakeup_sharedprio, cpu) = 0;
  27918. + } else {
  27919. + latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
  27920. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27921. + if (timerandwakeup_enabled_data.enabled) {
  27922. + latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
  27923. + next->timer_offset + latency, next->timer_offset,
  27924. + stop, next);
  27925. + }
  27926. +#endif
  27927. + }
  27928. +
  27929. +out_reset:
  27930. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27931. + next->timer_offset = 0;
  27932. +#endif
  27933. + put_task_struct(cpu_wakeup_task);
  27934. + per_cpu(wakeup_task, cpu) = NULL;
  27935. +out:
  27936. + raw_spin_unlock_irqrestore(&wakeup_lock, flags);
  27937. +}
  27938. +#endif
  27939. +
  27940. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  27941. +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
  27942. + long long latency_ns, struct task_struct *curr,
  27943. + struct task_struct *task)
  27944. +{
  27945. + if (latency_ns <= 0 && task != NULL && rt_task(task) &&
  27946. + (task->prio < curr->prio ||
  27947. + (task->prio == curr->prio &&
  27948. + !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
  27949. + long latency;
  27950. + cycle_t now;
  27951. +
  27952. + if (missed_timer_offsets_pid) {
  27953. + if (likely(missed_timer_offsets_pid !=
  27954. + task_pid_nr(task)))
  27955. + return;
  27956. + }
  27957. +
  27958. + now = ftrace_now(cpu);
  27959. + latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
  27960. + latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
  27961. + task);
  27962. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27963. + task->timer_offset = latency;
  27964. +#endif
  27965. + }
  27966. +}
  27967. +#endif
  27968. +
  27969. +static __init int latency_hist_init(void)
  27970. +{
  27971. + struct dentry *latency_hist_root = NULL;
  27972. + struct dentry *dentry;
  27973. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  27974. + struct dentry *dentry_sharedprio;
  27975. +#endif
  27976. + struct dentry *entry;
  27977. + struct dentry *enable_root;
  27978. + int i = 0;
  27979. + struct hist_data *my_hist;
  27980. + char name[64];
  27981. + char *cpufmt = "CPU%d";
  27982. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
  27983. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  27984. + char *cpufmt_maxlatproc = "max_latency-CPU%d";
  27985. + struct maxlatproc_data *mp = NULL;
  27986. +#endif
  27987. +
  27988. + dentry = tracing_init_dentry();
  27989. + latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
  27990. + enable_root = debugfs_create_dir("enable", latency_hist_root);
  27991. +
  27992. +#ifdef CONFIG_INTERRUPT_OFF_HIST
  27993. + dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
  27994. + for_each_possible_cpu(i) {
  27995. + sprintf(name, cpufmt, i);
  27996. + entry = debugfs_create_file(name, 0444, dentry,
  27997. + &per_cpu(irqsoff_hist, i), &latency_hist_fops);
  27998. + my_hist = &per_cpu(irqsoff_hist, i);
  27999. + atomic_set(&my_hist->hist_mode, 1);
  28000. + my_hist->min_lat = LONG_MAX;
  28001. + }
  28002. + entry = debugfs_create_file("reset", 0644, dentry,
  28003. + (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
  28004. +#endif
  28005. +
  28006. +#ifdef CONFIG_PREEMPT_OFF_HIST
  28007. + dentry = debugfs_create_dir(preemptoff_hist_dir,
  28008. + latency_hist_root);
  28009. + for_each_possible_cpu(i) {
  28010. + sprintf(name, cpufmt, i);
  28011. + entry = debugfs_create_file(name, 0444, dentry,
  28012. + &per_cpu(preemptoff_hist, i), &latency_hist_fops);
  28013. + my_hist = &per_cpu(preemptoff_hist, i);
  28014. + atomic_set(&my_hist->hist_mode, 1);
  28015. + my_hist->min_lat = LONG_MAX;
  28016. + }
  28017. + entry = debugfs_create_file("reset", 0644, dentry,
  28018. + (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
  28019. +#endif
  28020. +
  28021. +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
  28022. + dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
  28023. + latency_hist_root);
  28024. + for_each_possible_cpu(i) {
  28025. + sprintf(name, cpufmt, i);
  28026. + entry = debugfs_create_file(name, 0444, dentry,
  28027. + &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
  28028. + my_hist = &per_cpu(preemptirqsoff_hist, i);
  28029. + atomic_set(&my_hist->hist_mode, 1);
  28030. + my_hist->min_lat = LONG_MAX;
  28031. + }
  28032. + entry = debugfs_create_file("reset", 0644, dentry,
  28033. + (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
  28034. +#endif
  28035. +
  28036. +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
  28037. + entry = debugfs_create_file("preemptirqsoff", 0644,
  28038. + enable_root, (void *)&preemptirqsoff_enabled_data,
  28039. + &enable_fops);
  28040. +#endif
  28041. +
  28042. +#ifdef CONFIG_WAKEUP_LATENCY_HIST
  28043. + dentry = debugfs_create_dir(wakeup_latency_hist_dir,
  28044. + latency_hist_root);
  28045. + dentry_sharedprio = debugfs_create_dir(
  28046. + wakeup_latency_hist_dir_sharedprio, dentry);
  28047. + for_each_possible_cpu(i) {
  28048. + sprintf(name, cpufmt, i);
  28049. +
  28050. + entry = debugfs_create_file(name, 0444, dentry,
  28051. + &per_cpu(wakeup_latency_hist, i),
  28052. + &latency_hist_fops);
  28053. + my_hist = &per_cpu(wakeup_latency_hist, i);
  28054. + atomic_set(&my_hist->hist_mode, 1);
  28055. + my_hist->min_lat = LONG_MAX;
  28056. +
  28057. + entry = debugfs_create_file(name, 0444, dentry_sharedprio,
  28058. + &per_cpu(wakeup_latency_hist_sharedprio, i),
  28059. + &latency_hist_fops);
  28060. + my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
  28061. + atomic_set(&my_hist->hist_mode, 1);
  28062. + my_hist->min_lat = LONG_MAX;
  28063. +
  28064. + sprintf(name, cpufmt_maxlatproc, i);
  28065. +
  28066. + mp = &per_cpu(wakeup_maxlatproc, i);
  28067. + entry = debugfs_create_file(name, 0444, dentry, mp,
  28068. + &maxlatproc_fops);
  28069. + clear_maxlatprocdata(mp);
  28070. +
  28071. + mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
  28072. + entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
  28073. + &maxlatproc_fops);
  28074. + clear_maxlatprocdata(mp);
  28075. + }
  28076. + entry = debugfs_create_file("pid", 0644, dentry,
  28077. + (void *)&wakeup_pid, &pid_fops);
  28078. + entry = debugfs_create_file("reset", 0644, dentry,
  28079. + (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
  28080. + entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
  28081. + (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
  28082. + entry = debugfs_create_file("wakeup", 0644,
  28083. + enable_root, (void *)&wakeup_latency_enabled_data,
  28084. + &enable_fops);
  28085. +#endif
  28086. +
  28087. +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
  28088. + dentry = debugfs_create_dir(missed_timer_offsets_dir,
  28089. + latency_hist_root);
  28090. + for_each_possible_cpu(i) {
  28091. + sprintf(name, cpufmt, i);
  28092. + entry = debugfs_create_file(name, 0444, dentry,
  28093. + &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
  28094. + my_hist = &per_cpu(missed_timer_offsets, i);
  28095. + atomic_set(&my_hist->hist_mode, 1);
  28096. + my_hist->min_lat = LONG_MAX;
  28097. +
  28098. + sprintf(name, cpufmt_maxlatproc, i);
  28099. + mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
  28100. + entry = debugfs_create_file(name, 0444, dentry, mp,
  28101. + &maxlatproc_fops);
  28102. + clear_maxlatprocdata(mp);
  28103. + }
  28104. + entry = debugfs_create_file("pid", 0644, dentry,
  28105. + (void *)&missed_timer_offsets_pid, &pid_fops);
  28106. + entry = debugfs_create_file("reset", 0644, dentry,
  28107. + (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
  28108. + entry = debugfs_create_file("missed_timer_offsets", 0644,
  28109. + enable_root, (void *)&missed_timer_offsets_enabled_data,
  28110. + &enable_fops);
  28111. +#endif
  28112. +
  28113. +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
  28114. + defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
  28115. + dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
  28116. + latency_hist_root);
  28117. + for_each_possible_cpu(i) {
  28118. + sprintf(name, cpufmt, i);
  28119. + entry = debugfs_create_file(name, 0444, dentry,
  28120. + &per_cpu(timerandwakeup_latency_hist, i),
  28121. + &latency_hist_fops);
  28122. + my_hist = &per_cpu(timerandwakeup_latency_hist, i);
  28123. + atomic_set(&my_hist->hist_mode, 1);
  28124. + my_hist->min_lat = LONG_MAX;
  28125. +
  28126. + sprintf(name, cpufmt_maxlatproc, i);
  28127. + mp = &per_cpu(timerandwakeup_maxlatproc, i);
  28128. + entry = debugfs_create_file(name, 0444, dentry, mp,
  28129. + &maxlatproc_fops);
  28130. + clear_maxlatprocdata(mp);
  28131. + }
  28132. + entry = debugfs_create_file("reset", 0644, dentry,
  28133. + (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
  28134. + entry = debugfs_create_file("timerandwakeup", 0644,
  28135. + enable_root, (void *)&timerandwakeup_enabled_data,
  28136. + &enable_fops);
  28137. +#endif
  28138. + return 0;
  28139. +}
  28140. +
  28141. +device_initcall(latency_hist_init);
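
The file above ends with latency_hist_init() wiring everything under debugfs: per-CPU histogram files served by latency_hist_fops, a reset file per directory, pid filters for the wakeup and missed-timer histograms, and on/off switches under an enable/ directory handled by show_enable()/do_enable(). As a usage sketch (not part of the patch): assuming debugfs is mounted at /sys/kernel/debug and the directory strings defined earlier in this file are "latency_hist", "enable" and "wakeup", a histogram is turned on by writing 1 to the matching enable file and read back per CPU. The paths below are therefore assumptions to be checked against the *_dir strings in the patch.

/* Minimal user-space sketch; the debugfs paths are assumptions, see note above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* enable the wakeup latency histogram (handled by do_enable() above) */
	fd = open("/sys/kernel/debug/latency_hist/enable/wakeup", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "1", 1) != 1)
			perror("enable wakeup histogram");
		close(fd);
	}

	/* let the system run, then dump the per-CPU histogram for CPU 0 */
	fd = open("/sys/kernel/debug/latency_hist/wakeup/CPU0", O_RDONLY);
	if (fd >= 0) {
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(fd);
	}
	return 0;
}
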
  28142. diff -Nur linux-3.18.12.orig/kernel/trace/Makefile linux-3.18.12/kernel/trace/Makefile
  28143. --- linux-3.18.12.orig/kernel/trace/Makefile 2015-04-20 14:48:02.000000000 -0500
  28144. +++ linux-3.18.12/kernel/trace/Makefile 2015-04-26 13:32:22.455684003 -0500
  28145. @@ -36,6 +36,10 @@
  28146. obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
  28147. obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
  28148. obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
  28149. +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
  28150. +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
  28151. +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
  28152. +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
  28153. obj-$(CONFIG_NOP_TRACER) += trace_nop.o
  28154. obj-$(CONFIG_STACK_TRACER) += trace_stack.o
  28155. obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
  28156. diff -Nur linux-3.18.12.orig/kernel/trace/trace.c linux-3.18.12/kernel/trace/trace.c
  28157. --- linux-3.18.12.orig/kernel/trace/trace.c 2015-04-20 14:48:02.000000000 -0500
  28158. +++ linux-3.18.12/kernel/trace/trace.c 2015-04-26 13:32:22.455684003 -0500
  28159. @@ -1579,6 +1579,7 @@
  28160. struct task_struct *tsk = current;
  28161. entry->preempt_count = pc & 0xff;
  28162. + entry->preempt_lazy_count = preempt_lazy_count();
  28163. entry->pid = (tsk) ? tsk->pid : 0;
  28164. entry->flags =
  28165. #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
  28166. @@ -1588,8 +1589,11 @@
  28167. #endif
  28168. ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
  28169. ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
  28170. - (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
  28171. + (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
  28172. + (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
  28173. (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
  28174. +
  28175. + entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
  28176. }
  28177. EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
  28178. @@ -2509,14 +2513,17 @@
  28179. static void print_lat_help_header(struct seq_file *m)
  28180. {
  28181. - seq_puts(m, "# _------=> CPU# \n");
  28182. - seq_puts(m, "# / _-----=> irqs-off \n");
  28183. - seq_puts(m, "# | / _----=> need-resched \n");
  28184. - seq_puts(m, "# || / _---=> hardirq/softirq \n");
  28185. - seq_puts(m, "# ||| / _--=> preempt-depth \n");
  28186. - seq_puts(m, "# |||| / delay \n");
  28187. - seq_puts(m, "# cmd pid ||||| time | caller \n");
  28188. - seq_puts(m, "# \\ / ||||| \\ | / \n");
  28189. + seq_puts(m, "# _--------=> CPU# \n");
  28190. + seq_puts(m, "# / _-------=> irqs-off \n");
  28191. + seq_puts(m, "# | / _------=> need-resched \n");
  28192. + seq_puts(m, "# || / _-----=> need-resched_lazy \n");
  28193. + seq_puts(m, "# ||| / _----=> hardirq/softirq \n");
  28194. + seq_puts(m, "# |||| / _---=> preempt-depth \n");
  28195. + seq_puts(m, "# ||||| / _--=> preempt-lazy-depth\n");
  28196. + seq_puts(m, "# |||||| / _-=> migrate-disable \n");
  28197. + seq_puts(m, "# ||||||| / delay \n");
  28198. + seq_puts(m, "# cmd pid |||||||| time | caller \n");
  28199. + seq_puts(m, "# \\ / |||||||| \\ | / \n");
  28200. }
  28201. static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
  28202. @@ -2540,13 +2547,16 @@
  28203. static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
  28204. {
  28205. print_event_info(buf, m);
  28206. - seq_puts(m, "# _-----=> irqs-off\n");
  28207. - seq_puts(m, "# / _----=> need-resched\n");
  28208. - seq_puts(m, "# | / _---=> hardirq/softirq\n");
  28209. - seq_puts(m, "# || / _--=> preempt-depth\n");
  28210. - seq_puts(m, "# ||| / delay\n");
  28211. - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
  28212. - seq_puts(m, "# | | | |||| | |\n");
  28213. + seq_puts(m, "# _-------=> irqs-off \n");
  28214. + seq_puts(m, "# / _------=> need-resched \n");
  28215. + seq_puts(m, "# |/ _-----=> need-resched_lazy \n");
  28216. + seq_puts(m, "# ||/ _----=> hardirq/softirq \n");
  28217. + seq_puts(m, "# |||/ _---=> preempt-depth \n");
  28218. + seq_puts(m, "# ||||/ _--=> preempt-lazy-depth\n");
  28219. + seq_puts(m, "# ||||| / _-=> migrate-disable \n");
  28220. + seq_puts(m, "# |||||| / delay\n");
  28221. + seq_puts(m, "# TASK-PID CPU# |||||| TIMESTAMP FUNCTION\n");
  28222. + seq_puts(m, "# | | | |||||| | |\n");
  28223. }
  28224. void
  28225. diff -Nur linux-3.18.12.orig/kernel/trace/trace_events.c linux-3.18.12/kernel/trace/trace_events.c
  28226. --- linux-3.18.12.orig/kernel/trace/trace_events.c 2015-04-20 14:48:02.000000000 -0500
  28227. +++ linux-3.18.12/kernel/trace/trace_events.c 2015-04-26 13:32:22.455684003 -0500
  28228. @@ -162,6 +162,8 @@
  28229. __common_field(unsigned char, flags);
  28230. __common_field(unsigned char, preempt_count);
  28231. __common_field(int, pid);
  28232. + __common_field(unsigned short, migrate_disable);
  28233. + __common_field(unsigned short, padding);
  28234. return ret;
  28235. }
  28236. diff -Nur linux-3.18.12.orig/kernel/trace/trace.h linux-3.18.12/kernel/trace/trace.h
  28237. --- linux-3.18.12.orig/kernel/trace/trace.h 2015-04-20 14:48:02.000000000 -0500
  28238. +++ linux-3.18.12/kernel/trace/trace.h 2015-04-26 13:32:22.455684003 -0500
  28239. @@ -119,6 +119,7 @@
  28240. * NEED_RESCHED - reschedule is requested
  28241. * HARDIRQ - inside an interrupt handler
  28242. * SOFTIRQ - inside a softirq handler
  28243. + * NEED_RESCHED_LAZY - lazy reschedule is requested
  28244. */
  28245. enum trace_flag_type {
  28246. TRACE_FLAG_IRQS_OFF = 0x01,
  28247. @@ -127,6 +128,7 @@
  28248. TRACE_FLAG_HARDIRQ = 0x08,
  28249. TRACE_FLAG_SOFTIRQ = 0x10,
  28250. TRACE_FLAG_PREEMPT_RESCHED = 0x20,
  28251. + TRACE_FLAG_NEED_RESCHED_LAZY = 0x40,
  28252. };
  28253. #define TRACE_BUF_SIZE 1024
  28254. diff -Nur linux-3.18.12.orig/kernel/trace/trace_irqsoff.c linux-3.18.12/kernel/trace/trace_irqsoff.c
  28255. --- linux-3.18.12.orig/kernel/trace/trace_irqsoff.c 2015-04-20 14:48:02.000000000 -0500
  28256. +++ linux-3.18.12/kernel/trace/trace_irqsoff.c 2015-04-26 13:32:22.455684003 -0500
  28257. @@ -17,6 +17,7 @@
  28258. #include <linux/fs.h>
  28259. #include "trace.h"
  28260. +#include <trace/events/hist.h>
  28261. static struct trace_array *irqsoff_trace __read_mostly;
  28262. static int tracer_enabled __read_mostly;
  28263. @@ -435,11 +436,13 @@
  28264. {
  28265. if (preempt_trace() || irq_trace())
  28266. start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28267. + trace_preemptirqsoff_hist(TRACE_START, 1);
  28268. }
  28269. EXPORT_SYMBOL_GPL(start_critical_timings);
  28270. void stop_critical_timings(void)
  28271. {
  28272. + trace_preemptirqsoff_hist(TRACE_STOP, 0);
  28273. if (preempt_trace() || irq_trace())
  28274. stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28275. }
  28276. @@ -449,6 +452,7 @@
  28277. #ifdef CONFIG_PROVE_LOCKING
  28278. void time_hardirqs_on(unsigned long a0, unsigned long a1)
  28279. {
  28280. + trace_preemptirqsoff_hist(IRQS_ON, 0);
  28281. if (!preempt_trace() && irq_trace())
  28282. stop_critical_timing(a0, a1);
  28283. }
  28284. @@ -457,6 +461,7 @@
  28285. {
  28286. if (!preempt_trace() && irq_trace())
  28287. start_critical_timing(a0, a1);
  28288. + trace_preemptirqsoff_hist(IRQS_OFF, 1);
  28289. }
  28290. #else /* !CONFIG_PROVE_LOCKING */
  28291. @@ -482,6 +487,7 @@
  28292. */
  28293. void trace_hardirqs_on(void)
  28294. {
  28295. + trace_preemptirqsoff_hist(IRQS_ON, 0);
  28296. if (!preempt_trace() && irq_trace())
  28297. stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28298. }
  28299. @@ -491,11 +497,13 @@
  28300. {
  28301. if (!preempt_trace() && irq_trace())
  28302. start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
  28303. + trace_preemptirqsoff_hist(IRQS_OFF, 1);
  28304. }
  28305. EXPORT_SYMBOL(trace_hardirqs_off);
  28306. __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
  28307. {
  28308. + trace_preemptirqsoff_hist(IRQS_ON, 0);
  28309. if (!preempt_trace() && irq_trace())
  28310. stop_critical_timing(CALLER_ADDR0, caller_addr);
  28311. }
  28312. @@ -505,6 +513,7 @@
  28313. {
  28314. if (!preempt_trace() && irq_trace())
  28315. start_critical_timing(CALLER_ADDR0, caller_addr);
  28316. + trace_preemptirqsoff_hist(IRQS_OFF, 1);
  28317. }
  28318. EXPORT_SYMBOL(trace_hardirqs_off_caller);
  28319. @@ -514,12 +523,14 @@
  28320. #ifdef CONFIG_PREEMPT_TRACER
  28321. void trace_preempt_on(unsigned long a0, unsigned long a1)
  28322. {
  28323. + trace_preemptirqsoff_hist(PREEMPT_ON, 0);
  28324. if (preempt_trace() && !irq_trace())
  28325. stop_critical_timing(a0, a1);
  28326. }
  28327. void trace_preempt_off(unsigned long a0, unsigned long a1)
  28328. {
28329. + trace_preemptirqsoff_hist(PREEMPT_OFF, 1);
  28330. if (preempt_trace() && !irq_trace())
  28331. start_critical_timing(a0, a1);
  28332. }
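
These hooks are the producers for probe_preemptirqsoff_hist() further up: each call passes a reason code plus a start/stop flag, the probe timestamps the opening event and books the delta into the per-CPU histogram on the closing one. As a sketch (not taken from the patch), with irq-flags tracing enabled an ordinary interrupts-off region is bracketed like this:

/* Illustrative only: shows which hook fires at each end of the region. */
#include <linux/irqflags.h>

static void example_irqs_off_region(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* trace_hardirqs_off() -> trace_preemptirqsoff_hist(IRQS_OFF, 1) */
	/* work done here is what the irqsoff histogram measures */
	local_irq_restore(flags);	/* trace_hardirqs_on()  -> trace_preemptirqsoff_hist(IRQS_ON, 0) */
}
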
  28333. diff -Nur linux-3.18.12.orig/kernel/trace/trace_output.c linux-3.18.12/kernel/trace/trace_output.c
  28334. --- linux-3.18.12.orig/kernel/trace/trace_output.c 2015-04-20 14:48:02.000000000 -0500
  28335. +++ linux-3.18.12/kernel/trace/trace_output.c 2015-04-26 13:32:22.455684003 -0500
  28336. @@ -410,6 +410,7 @@
  28337. {
  28338. char hardsoft_irq;
  28339. char need_resched;
  28340. + char need_resched_lazy;
  28341. char irqs_off;
  28342. int hardirq;
  28343. int softirq;
  28344. @@ -438,6 +439,8 @@
  28345. need_resched = '.';
  28346. break;
  28347. }
  28348. + need_resched_lazy =
  28349. + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
  28350. hardsoft_irq =
  28351. (hardirq && softirq) ? 'H' :
  28352. @@ -445,8 +448,9 @@
  28353. softirq ? 's' :
  28354. '.';
  28355. - if (!trace_seq_printf(s, "%c%c%c",
  28356. - irqs_off, need_resched, hardsoft_irq))
  28357. + if (!trace_seq_printf(s, "%c%c%c%c",
  28358. + irqs_off, need_resched, need_resched_lazy,
  28359. + hardsoft_irq))
  28360. return 0;
  28361. if (entry->preempt_count)
  28362. @@ -454,6 +458,16 @@
  28363. else
  28364. ret = trace_seq_putc(s, '.');
  28365. + if (entry->preempt_lazy_count)
  28366. + ret = trace_seq_printf(s, "%x", entry->preempt_lazy_count);
  28367. + else
  28368. + ret = trace_seq_putc(s, '.');
  28369. +
  28370. + if (entry->migrate_disable)
  28371. + ret = trace_seq_printf(s, "%x", entry->migrate_disable);
  28372. + else
  28373. + ret = trace_seq_putc(s, '.');
  28374. +
  28375. return ret;
  28376. }
  28377. diff -Nur linux-3.18.12.orig/kernel/user.c linux-3.18.12/kernel/user.c
  28378. --- linux-3.18.12.orig/kernel/user.c 2015-04-20 14:48:02.000000000 -0500
  28379. +++ linux-3.18.12/kernel/user.c 2015-04-26 13:32:22.455684003 -0500
  28380. @@ -158,11 +158,11 @@
  28381. if (!up)
  28382. return;
  28383. - local_irq_save(flags);
  28384. + local_irq_save_nort(flags);
  28385. if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
  28386. free_user(up, flags);
  28387. else
  28388. - local_irq_restore(flags);
  28389. + local_irq_restore_nort(flags);
  28390. }
  28391. struct user_struct *alloc_uid(kuid_t uid)
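
free_uid() switches to the _nort() variants because on PREEMPT_RT_FULL the uidhash_lock spinlock becomes a sleeping lock and must not be taken with hard interrupts disabled; on a non-RT build the macros fall back to the real IRQ-disable. The authoritative definitions are added elsewhere in this patch; the shape below is an assumption about what they look like, not a quote from them.

/* Sketch of the _nort() idea; the macros added earlier in this patch are authoritative. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	local_save_flags(flags)	/* interrupts stay enabled */
# define local_irq_restore_nort(flags)	(void)(flags)
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
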
  28392. diff -Nur linux-3.18.12.orig/kernel/watchdog.c linux-3.18.12/kernel/watchdog.c
  28393. --- linux-3.18.12.orig/kernel/watchdog.c 2015-04-20 14:48:02.000000000 -0500
  28394. +++ linux-3.18.12/kernel/watchdog.c 2015-04-26 13:32:22.459684003 -0500
  28395. @@ -248,6 +248,8 @@
  28396. #ifdef CONFIG_HARDLOCKUP_DETECTOR
  28397. +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
  28398. +
  28399. static struct perf_event_attr wd_hw_attr = {
  28400. .type = PERF_TYPE_HARDWARE,
  28401. .config = PERF_COUNT_HW_CPU_CYCLES,
  28402. @@ -281,13 +283,21 @@
  28403. /* only print hardlockups once */
  28404. if (__this_cpu_read(hard_watchdog_warn) == true)
  28405. return;
  28406. + /*
  28407. + * If early-printk is enabled then make sure we do not
  28408. + * lock up in printk() and kill console logging:
  28409. + */
  28410. + printk_kill();
  28411. - if (hardlockup_panic)
  28412. + if (hardlockup_panic) {
  28413. panic("Watchdog detected hard LOCKUP on cpu %d",
  28414. this_cpu);
  28415. - else
  28416. + } else {
  28417. + raw_spin_lock(&watchdog_output_lock);
  28418. WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
  28419. this_cpu);
  28420. + raw_spin_unlock(&watchdog_output_lock);
  28421. + }
  28422. __this_cpu_write(hard_watchdog_warn, true);
  28423. return;
  28424. @@ -430,6 +440,7 @@
  28425. /* kick off the timer for the hardlockup detector */
  28426. hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  28427. hrtimer->function = watchdog_timer_fn;
  28428. + hrtimer->irqsafe = 1;
  28429. /* Enable the perf event */
  28430. watchdog_nmi_enable(cpu);
  28431. diff -Nur linux-3.18.12.orig/kernel/workqueue.c linux-3.18.12/kernel/workqueue.c
  28432. --- linux-3.18.12.orig/kernel/workqueue.c 2015-04-20 14:48:02.000000000 -0500
  28433. +++ linux-3.18.12/kernel/workqueue.c 2015-04-26 13:32:22.459684003 -0500
  28434. @@ -48,6 +48,8 @@
  28435. #include <linux/nodemask.h>
  28436. #include <linux/moduleparam.h>
  28437. #include <linux/uaccess.h>
  28438. +#include <linux/locallock.h>
  28439. +#include <linux/delay.h>
  28440. #include "workqueue_internal.h"
  28441. @@ -121,15 +123,20 @@
  28442. * cpu or grabbing pool->lock is enough for read access. If
  28443. * POOL_DISASSOCIATED is set, it's identical to L.
  28444. *
  28445. + * On RT we need the extra protection via rt_lock_idle_list() for
  28446. + * the list manipulations against read access from
  28447. + * wq_worker_sleeping(). All other places are nicely serialized via
  28448. + * pool->lock.
  28449. + *
  28450. * A: pool->attach_mutex protected.
  28451. *
  28452. * PL: wq_pool_mutex protected.
  28453. *
  28454. - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
  28455. + * PR: wq_pool_mutex protected for writes. RCU protected for reads.
  28456. *
  28457. * WQ: wq->mutex protected.
  28458. *
  28459. - * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
  28460. + * WR: wq->mutex protected for writes. RCU protected for reads.
  28461. *
  28462. * MD: wq_mayday_lock protected.
  28463. */
  28464. @@ -177,7 +184,7 @@
  28465. atomic_t nr_running ____cacheline_aligned_in_smp;
  28466. /*
  28467. - * Destruction of pool is sched-RCU protected to allow dereferences
  28468. + * Destruction of pool is RCU protected to allow dereferences
  28469. * from get_work_pool().
  28470. */
  28471. struct rcu_head rcu;
  28472. @@ -206,7 +213,7 @@
  28473. /*
  28474. * Release of unbound pwq is punted to system_wq. See put_pwq()
  28475. * and pwq_unbound_release_workfn() for details. pool_workqueue
  28476. - * itself is also sched-RCU protected so that the first pwq can be
  28477. + * itself is also RCU protected so that the first pwq can be
  28478. * determined without grabbing wq->mutex.
  28479. */
  28480. struct work_struct unbound_release_work;
  28481. @@ -321,6 +328,8 @@
  28482. struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
  28483. EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
  28484. +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
  28485. +
  28486. static int worker_thread(void *__worker);
  28487. static void copy_workqueue_attrs(struct workqueue_attrs *to,
  28488. const struct workqueue_attrs *from);
  28489. @@ -329,14 +338,14 @@
  28490. #include <trace/events/workqueue.h>
  28491. #define assert_rcu_or_pool_mutex() \
  28492. - rcu_lockdep_assert(rcu_read_lock_sched_held() || \
  28493. + rcu_lockdep_assert(rcu_read_lock_held() || \
  28494. lockdep_is_held(&wq_pool_mutex), \
  28495. - "sched RCU or wq_pool_mutex should be held")
  28496. + "RCU or wq_pool_mutex should be held")
  28497. #define assert_rcu_or_wq_mutex(wq) \
  28498. - rcu_lockdep_assert(rcu_read_lock_sched_held() || \
  28499. + rcu_lockdep_assert(rcu_read_lock_held() || \
  28500. lockdep_is_held(&wq->mutex), \
  28501. - "sched RCU or wq->mutex should be held")
  28502. + "RCU or wq->mutex should be held")
  28503. #define for_each_cpu_worker_pool(pool, cpu) \
  28504. for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
  28505. @@ -348,7 +357,7 @@
  28506. * @pool: iteration cursor
  28507. * @pi: integer used for iteration
  28508. *
  28509. - * This must be called either with wq_pool_mutex held or sched RCU read
  28510. + * This must be called either with wq_pool_mutex held or RCU read
  28511. * locked. If the pool needs to be used beyond the locking in effect, the
  28512. * caller is responsible for guaranteeing that the pool stays online.
  28513. *
  28514. @@ -380,7 +389,7 @@
  28515. * @pwq: iteration cursor
  28516. * @wq: the target workqueue
  28517. *
  28518. - * This must be called either with wq->mutex held or sched RCU read locked.
  28519. + * This must be called either with wq->mutex held or RCU read locked.
  28520. * If the pwq needs to be used beyond the locking in effect, the caller is
  28521. * responsible for guaranteeing that the pwq stays online.
  28522. *
  28523. @@ -392,6 +401,31 @@
  28524. if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
  28525. else
  28526. +#ifdef CONFIG_PREEMPT_RT_BASE
  28527. +static inline void rt_lock_idle_list(struct worker_pool *pool)
  28528. +{
  28529. + preempt_disable();
  28530. +}
  28531. +static inline void rt_unlock_idle_list(struct worker_pool *pool)
  28532. +{
  28533. + preempt_enable();
  28534. +}
  28535. +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
  28536. +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
  28537. +#else
  28538. +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
  28539. +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
  28540. +static inline void sched_lock_idle_list(struct worker_pool *pool)
  28541. +{
  28542. + spin_lock_irq(&pool->lock);
  28543. +}
  28544. +static inline void sched_unlock_idle_list(struct worker_pool *pool)
  28545. +{
  28546. + spin_unlock_irq(&pool->lock);
  28547. +}
  28548. +#endif
  28549. +
  28550. +
  28551. #ifdef CONFIG_DEBUG_OBJECTS_WORK
  28552. static struct debug_obj_descr work_debug_descr;
  28553. @@ -542,7 +576,7 @@
  28554. * @wq: the target workqueue
  28555. * @node: the node ID
  28556. *
  28557. - * This must be called either with pwq_lock held or sched RCU read locked.
  28558. + * This must be called either with pwq_lock held or RCU read locked.
  28559. * If the pwq needs to be used beyond the locking in effect, the caller is
  28560. * responsible for guaranteeing that the pwq stays online.
  28561. *
  28562. @@ -646,8 +680,8 @@
  28563. * @work: the work item of interest
  28564. *
  28565. * Pools are created and destroyed under wq_pool_mutex, and allows read
  28566. - * access under sched-RCU read lock. As such, this function should be
  28567. - * called under wq_pool_mutex or with preemption disabled.
  28568. + * access under RCU read lock. As such, this function should be
  28569. + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
  28570. *
  28571. * All fields of the returned pool are accessible as long as the above
  28572. * mentioned locking is in effect. If the returned pool needs to be used
  28573. @@ -784,51 +818,44 @@
  28574. */
  28575. static void wake_up_worker(struct worker_pool *pool)
  28576. {
  28577. - struct worker *worker = first_idle_worker(pool);
  28578. + struct worker *worker;
  28579. +
  28580. + rt_lock_idle_list(pool);
  28581. +
  28582. + worker = first_idle_worker(pool);
  28583. if (likely(worker))
  28584. wake_up_process(worker->task);
  28585. +
  28586. + rt_unlock_idle_list(pool);
  28587. }
  28588. /**
  28589. - * wq_worker_waking_up - a worker is waking up
  28590. - * @task: task waking up
  28591. - * @cpu: CPU @task is waking up to
  28592. - *
  28593. - * This function is called during try_to_wake_up() when a worker is
  28594. - * being awoken.
  28595. + * wq_worker_running - a worker is running again
  28596. + * @task: task returning from sleep
  28597. *
  28598. - * CONTEXT:
  28599. - * spin_lock_irq(rq->lock)
  28600. + * This function is called when a worker returns from schedule()
  28601. */
  28602. -void wq_worker_waking_up(struct task_struct *task, int cpu)
  28603. +void wq_worker_running(struct task_struct *task)
  28604. {
  28605. struct worker *worker = kthread_data(task);
  28606. - if (!(worker->flags & WORKER_NOT_RUNNING)) {
  28607. - WARN_ON_ONCE(worker->pool->cpu != cpu);
  28608. + if (!worker->sleeping)
  28609. + return;
  28610. + if (!(worker->flags & WORKER_NOT_RUNNING))
  28611. atomic_inc(&worker->pool->nr_running);
  28612. - }
  28613. + worker->sleeping = 0;
  28614. }
  28615. /**
  28616. * wq_worker_sleeping - a worker is going to sleep
  28617. * @task: task going to sleep
  28618. - * @cpu: CPU in question, must be the current CPU number
  28619. - *
  28620. - * This function is called during schedule() when a busy worker is
  28621. - * going to sleep. Worker on the same cpu can be woken up by
  28622. - * returning pointer to its task.
  28623. - *
  28624. - * CONTEXT:
  28625. - * spin_lock_irq(rq->lock)
  28626. - *
  28627. - * Return:
  28628. - * Worker task on @cpu to wake up, %NULL if none.
  28629. + * This function is called from schedule() when a busy worker is
  28630. + * going to sleep.
  28631. */
  28632. -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
  28633. +void wq_worker_sleeping(struct task_struct *task)
  28634. {
  28635. - struct worker *worker = kthread_data(task), *to_wakeup = NULL;
  28636. + struct worker *worker = kthread_data(task);
  28637. struct worker_pool *pool;
  28638. /*
  28639. @@ -837,29 +864,26 @@
  28640. * checking NOT_RUNNING.
  28641. */
  28642. if (worker->flags & WORKER_NOT_RUNNING)
  28643. - return NULL;
  28644. + return;
  28645. pool = worker->pool;
  28646. - /* this can only happen on the local cpu */
  28647. - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
  28648. - return NULL;
  28649. + if (WARN_ON_ONCE(worker->sleeping))
  28650. + return;
  28651. +
  28652. + worker->sleeping = 1;
  28653. /*
  28654. * The counterpart of the following dec_and_test, implied mb,
  28655. * worklist not empty test sequence is in insert_work().
  28656. * Please read comment there.
  28657. - *
  28658. - * NOT_RUNNING is clear. This means that we're bound to and
  28659. - * running on the local cpu w/ rq lock held and preemption
  28660. - * disabled, which in turn means that none else could be
  28661. - * manipulating idle_list, so dereferencing idle_list without pool
  28662. - * lock is safe.
  28663. */
  28664. if (atomic_dec_and_test(&pool->nr_running) &&
  28665. - !list_empty(&pool->worklist))
  28666. - to_wakeup = first_idle_worker(pool);
  28667. - return to_wakeup ? to_wakeup->task : NULL;
  28668. + !list_empty(&pool->worklist)) {
  28669. + sched_lock_idle_list(pool);
  28670. + wake_up_worker(pool);
  28671. + sched_unlock_idle_list(pool);
  28672. + }
  28673. }
  28674. /**
  28675. @@ -1053,12 +1077,12 @@
  28676. {
  28677. if (pwq) {
  28678. /*
  28679. - * As both pwqs and pools are sched-RCU protected, the
  28680. + * As both pwqs and pools are RCU protected, the
  28681. * following lock operations are safe.
  28682. */
  28683. - spin_lock_irq(&pwq->pool->lock);
  28684. + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
  28685. put_pwq(pwq);
  28686. - spin_unlock_irq(&pwq->pool->lock);
  28687. + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
  28688. }
  28689. }
  28690. @@ -1160,7 +1184,7 @@
  28691. struct worker_pool *pool;
  28692. struct pool_workqueue *pwq;
  28693. - local_irq_save(*flags);
  28694. + local_lock_irqsave(pendingb_lock, *flags);
  28695. /* try to steal the timer if it exists */
  28696. if (is_dwork) {
  28697. @@ -1179,6 +1203,7 @@
  28698. if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
  28699. return 0;
  28700. + rcu_read_lock();
  28701. /*
  28702. * The queueing is in progress, or it is already queued. Try to
  28703. * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
  28704. @@ -1217,14 +1242,16 @@
  28705. set_work_pool_and_keep_pending(work, pool->id);
  28706. spin_unlock(&pool->lock);
  28707. + rcu_read_unlock();
  28708. return 1;
  28709. }
  28710. spin_unlock(&pool->lock);
  28711. fail:
  28712. - local_irq_restore(*flags);
  28713. + rcu_read_unlock();
  28714. + local_unlock_irqrestore(pendingb_lock, *flags);
  28715. if (work_is_canceling(work))
  28716. return -ENOENT;
  28717. - cpu_relax();
  28718. + cpu_chill();
  28719. return -EAGAIN;
  28720. }
  28721. @@ -1293,7 +1320,7 @@
  28722. * queued or lose PENDING. Grabbing PENDING and queueing should
  28723. * happen with IRQ disabled.
  28724. */
  28725. - WARN_ON_ONCE(!irqs_disabled());
  28726. + WARN_ON_ONCE_NONRT(!irqs_disabled());
  28727. debug_work_activate(work);
  28728. @@ -1301,6 +1328,8 @@
  28729. if (unlikely(wq->flags & __WQ_DRAINING) &&
  28730. WARN_ON_ONCE(!is_chained_work(wq)))
  28731. return;
  28732. +
  28733. + rcu_read_lock();
  28734. retry:
  28735. if (req_cpu == WORK_CPU_UNBOUND)
  28736. cpu = raw_smp_processor_id();
  28737. @@ -1357,10 +1386,8 @@
  28738. /* pwq determined, queue */
  28739. trace_workqueue_queue_work(req_cpu, pwq, work);
  28740. - if (WARN_ON(!list_empty(&work->entry))) {
  28741. - spin_unlock(&pwq->pool->lock);
  28742. - return;
  28743. - }
  28744. + if (WARN_ON(!list_empty(&work->entry)))
  28745. + goto out;
  28746. pwq->nr_in_flight[pwq->work_color]++;
  28747. work_flags = work_color_to_flags(pwq->work_color);
  28748. @@ -1376,7 +1403,9 @@
  28749. insert_work(pwq, work, worklist, work_flags);
  28750. +out:
  28751. spin_unlock(&pwq->pool->lock);
  28752. + rcu_read_unlock();
  28753. }
  28754. /**
  28755. @@ -1396,14 +1425,14 @@
  28756. bool ret = false;
  28757. unsigned long flags;
  28758. - local_irq_save(flags);
  28759. + local_lock_irqsave(pendingb_lock,flags);
  28760. if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
  28761. __queue_work(cpu, wq, work);
  28762. ret = true;
  28763. }
  28764. - local_irq_restore(flags);
  28765. + local_unlock_irqrestore(pendingb_lock, flags);
  28766. return ret;
  28767. }
  28768. EXPORT_SYMBOL(queue_work_on);
  28769. @@ -1470,14 +1499,14 @@
  28770. unsigned long flags;
  28771. /* read the comment in __queue_work() */
  28772. - local_irq_save(flags);
  28773. + local_lock_irqsave(pendingb_lock, flags);
  28774. if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
  28775. __queue_delayed_work(cpu, wq, dwork, delay);
  28776. ret = true;
  28777. }
  28778. - local_irq_restore(flags);
  28779. + local_unlock_irqrestore(pendingb_lock, flags);
  28780. return ret;
  28781. }
  28782. EXPORT_SYMBOL(queue_delayed_work_on);
  28783. @@ -1512,7 +1541,7 @@
  28784. if (likely(ret >= 0)) {
  28785. __queue_delayed_work(cpu, wq, dwork, delay);
  28786. - local_irq_restore(flags);
  28787. + local_unlock_irqrestore(pendingb_lock, flags);
  28788. }
  28789. /* -ENOENT from try_to_grab_pending() becomes %true */
  28790. @@ -1545,7 +1574,9 @@
  28791. worker->last_active = jiffies;
  28792. /* idle_list is LIFO */
  28793. + rt_lock_idle_list(pool);
  28794. list_add(&worker->entry, &pool->idle_list);
  28795. + rt_unlock_idle_list(pool);
  28796. if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
  28797. mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
  28798. @@ -1578,7 +1609,9 @@
  28799. return;
  28800. worker_clr_flags(worker, WORKER_IDLE);
  28801. pool->nr_idle--;
  28802. + rt_lock_idle_list(pool);
  28803. list_del_init(&worker->entry);
  28804. + rt_unlock_idle_list(pool);
  28805. }
  28806. static struct worker *alloc_worker(int node)
  28807. @@ -1746,7 +1779,9 @@
  28808. pool->nr_workers--;
  28809. pool->nr_idle--;
  28810. + rt_lock_idle_list(pool);
  28811. list_del_init(&worker->entry);
  28812. + rt_unlock_idle_list(pool);
  28813. worker->flags |= WORKER_DIE;
  28814. wake_up_process(worker->task);
  28815. }
  28816. @@ -2641,14 +2676,14 @@
  28817. might_sleep();
  28818. - local_irq_disable();
  28819. + rcu_read_lock();
  28820. pool = get_work_pool(work);
  28821. if (!pool) {
  28822. - local_irq_enable();
  28823. + rcu_read_unlock();
  28824. return false;
  28825. }
  28826. - spin_lock(&pool->lock);
  28827. + spin_lock_irq(&pool->lock);
  28828. /* see the comment in try_to_grab_pending() with the same code */
  28829. pwq = get_work_pwq(work);
  28830. if (pwq) {
  28831. @@ -2675,10 +2710,11 @@
  28832. else
  28833. lock_map_acquire_read(&pwq->wq->lockdep_map);
  28834. lock_map_release(&pwq->wq->lockdep_map);
  28835. -
  28836. + rcu_read_unlock();
  28837. return true;
  28838. already_gone:
  28839. spin_unlock_irq(&pool->lock);
  28840. + rcu_read_unlock();
  28841. return false;
  28842. }
  28843. @@ -2765,7 +2801,7 @@
  28844. /* tell other tasks trying to grab @work to back off */
  28845. mark_work_canceling(work);
  28846. - local_irq_restore(flags);
  28847. + local_unlock_irqrestore(pendingb_lock, flags);
  28848. flush_work(work);
  28849. clear_work_data(work);
  28850. @@ -2820,10 +2856,10 @@
  28851. */
  28852. bool flush_delayed_work(struct delayed_work *dwork)
  28853. {
  28854. - local_irq_disable();
  28855. + local_lock_irq(pendingb_lock);
  28856. if (del_timer_sync(&dwork->timer))
  28857. __queue_work(dwork->cpu, dwork->wq, &dwork->work);
  28858. - local_irq_enable();
  28859. + local_unlock_irq(pendingb_lock);
  28860. return flush_work(&dwork->work);
  28861. }
  28862. EXPORT_SYMBOL(flush_delayed_work);
  28863. @@ -2858,7 +2894,7 @@
  28864. set_work_pool_and_clear_pending(&dwork->work,
  28865. get_work_pool_id(&dwork->work));
  28866. - local_irq_restore(flags);
  28867. + local_unlock_irqrestore(pendingb_lock, flags);
  28868. return ret;
  28869. }
  28870. EXPORT_SYMBOL(cancel_delayed_work);
  28871. @@ -3044,7 +3080,8 @@
  28872. const char *delim = "";
  28873. int node, written = 0;
  28874. - rcu_read_lock_sched();
  28875. + get_online_cpus();
  28876. + rcu_read_lock();
  28877. for_each_node(node) {
  28878. written += scnprintf(buf + written, PAGE_SIZE - written,
  28879. "%s%d:%d", delim, node,
  28880. @@ -3052,7 +3089,8 @@
  28881. delim = " ";
  28882. }
  28883. written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
  28884. - rcu_read_unlock_sched();
  28885. + rcu_read_unlock();
  28886. + put_online_cpus();
  28887. return written;
  28888. }
  28889. @@ -3420,7 +3458,7 @@
  28890. * put_unbound_pool - put a worker_pool
  28891. * @pool: worker_pool to put
  28892. *
  28893. - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
  28894. + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
  28895. * safe manner. get_unbound_pool() calls this function on its failure path
  28896. * and this function should be able to release pools which went through,
  28897. * successfully or not, init_worker_pool().
  28898. @@ -3474,8 +3512,8 @@
  28899. del_timer_sync(&pool->idle_timer);
  28900. del_timer_sync(&pool->mayday_timer);
  28901. - /* sched-RCU protected to allow dereferences from get_work_pool() */
  28902. - call_rcu_sched(&pool->rcu, rcu_free_pool);
  28903. + /* RCU protected to allow dereferences from get_work_pool() */
  28904. + call_rcu(&pool->rcu, rcu_free_pool);
  28905. }
  28906. /**
  28907. @@ -3580,7 +3618,7 @@
  28908. put_unbound_pool(pool);
  28909. mutex_unlock(&wq_pool_mutex);
  28910. - call_rcu_sched(&pwq->rcu, rcu_free_pwq);
  28911. + call_rcu(&pwq->rcu, rcu_free_pwq);
  28912. /*
  28913. * If we're the last pwq going away, @wq is already dead and no one
  28914. @@ -4292,7 +4330,8 @@
  28915. struct pool_workqueue *pwq;
  28916. bool ret;
  28917. - rcu_read_lock_sched();
  28918. + rcu_read_lock();
  28919. + preempt_disable();
  28920. if (cpu == WORK_CPU_UNBOUND)
  28921. cpu = smp_processor_id();
  28922. @@ -4303,7 +4342,8 @@
  28923. pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
  28924. ret = !list_empty(&pwq->delayed_works);
  28925. - rcu_read_unlock_sched();
  28926. + preempt_enable();
  28927. + rcu_read_unlock();
  28928. return ret;
  28929. }
  28930. @@ -4329,16 +4369,15 @@
  28931. if (work_pending(work))
  28932. ret |= WORK_BUSY_PENDING;
  28933. - local_irq_save(flags);
  28934. + rcu_read_lock();
  28935. pool = get_work_pool(work);
  28936. if (pool) {
  28937. - spin_lock(&pool->lock);
  28938. + spin_lock_irqsave(&pool->lock, flags);
  28939. if (find_worker_executing_work(pool, work))
  28940. ret |= WORK_BUSY_RUNNING;
  28941. - spin_unlock(&pool->lock);
  28942. + spin_unlock_irqrestore(&pool->lock, flags);
  28943. }
  28944. - local_irq_restore(flags);
  28945. -
  28946. + rcu_read_unlock();
  28947. return ret;
  28948. }
  28949. EXPORT_SYMBOL_GPL(work_busy);
  28950. @@ -4767,16 +4806,16 @@
  28951. * nr_active is monotonically decreasing. It's safe
  28952. * to peek without lock.
  28953. */
  28954. - rcu_read_lock_sched();
  28955. + rcu_read_lock();
  28956. for_each_pwq(pwq, wq) {
  28957. WARN_ON_ONCE(pwq->nr_active < 0);
  28958. if (pwq->nr_active) {
  28959. busy = true;
  28960. - rcu_read_unlock_sched();
  28961. + rcu_read_unlock();
  28962. goto out_unlock;
  28963. }
  28964. }
  28965. - rcu_read_unlock_sched();
  28966. + rcu_read_unlock();
  28967. }
  28968. out_unlock:
  28969. mutex_unlock(&wq_pool_mutex);
  28970. diff -Nur linux-3.18.12.orig/kernel/workqueue_internal.h linux-3.18.12/kernel/workqueue_internal.h
  28971. --- linux-3.18.12.orig/kernel/workqueue_internal.h 2015-04-20 14:48:02.000000000 -0500
  28972. +++ linux-3.18.12/kernel/workqueue_internal.h 2015-04-26 13:32:22.459684003 -0500
  28973. @@ -43,6 +43,7 @@
  28974. unsigned long last_active; /* L: last active timestamp */
  28975. unsigned int flags; /* X: flags */
  28976. int id; /* I: worker id */
  28977. + int sleeping; /* None */
  28978. /*
  28979. * Opaque string set with work_set_desc(). Printed out with task
  28980. @@ -68,7 +69,7 @@
  28981. * Scheduler hooks for concurrency managed workqueue. Only to be used from
  28982. * sched/core.c and workqueue.c.
  28983. */
  28984. -void wq_worker_waking_up(struct task_struct *task, int cpu);
  28985. -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
  28986. +void wq_worker_running(struct task_struct *task);
  28987. +void wq_worker_sleeping(struct task_struct *task);
  28988. #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
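
The workqueue rework above replaces the wq_worker_waking_up()/wq_worker_sleeping() pair that ran under the runqueue lock with wq_worker_running()/wq_worker_sleeping() hooks that run in plain task context, so waking an idle worker may take pool->lock, which is a sleeping lock on RT. The callers live in kernel/sched/core.c and are changed in a separate hunk of this patch; the following is a rough sketch of that side, not the exact hunk:

/* Sketch of the scheduler side; the real change is in kernel/sched/core.c. */
static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state)
		return;
	/* a busy worker going to sleep may need an idle one woken up */
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);
}

static inline void sched_update_worker(struct task_struct *tsk)
{
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_running(tsk);
}

asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
	sched_update_worker(tsk);
}
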
  28989. diff -Nur linux-3.18.12.orig/lib/debugobjects.c linux-3.18.12/lib/debugobjects.c
  28990. --- linux-3.18.12.orig/lib/debugobjects.c 2015-04-20 14:48:02.000000000 -0500
  28991. +++ linux-3.18.12/lib/debugobjects.c 2015-04-26 13:32:22.459684003 -0500
  28992. @@ -309,7 +309,10 @@
  28993. struct debug_obj *obj;
  28994. unsigned long flags;
  28995. - fill_pool();
  28996. +#ifdef CONFIG_PREEMPT_RT_FULL
  28997. + if (preempt_count() == 0 && !irqs_disabled())
  28998. +#endif
  28999. + fill_pool();
  29000. db = get_bucket((unsigned long) addr);
  29001. diff -Nur linux-3.18.12.orig/lib/idr.c linux-3.18.12/lib/idr.c
  29002. --- linux-3.18.12.orig/lib/idr.c 2015-04-20 14:48:02.000000000 -0500
  29003. +++ linux-3.18.12/lib/idr.c 2015-04-26 13:32:22.459684003 -0500
  29004. @@ -31,6 +31,7 @@
  29005. #include <linux/spinlock.h>
  29006. #include <linux/percpu.h>
  29007. #include <linux/hardirq.h>
  29008. +#include <linux/locallock.h>
  29009. #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1)
  29010. #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
  29011. @@ -367,6 +368,35 @@
  29012. idr_mark_full(pa, id);
  29013. }
  29014. +#ifdef CONFIG_PREEMPT_RT_FULL
  29015. +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
  29016. +
  29017. +static inline void idr_preload_lock(void)
  29018. +{
  29019. + local_lock(idr_lock);
  29020. +}
  29021. +
  29022. +static inline void idr_preload_unlock(void)
  29023. +{
  29024. + local_unlock(idr_lock);
  29025. +}
  29026. +
  29027. +void idr_preload_end(void)
  29028. +{
  29029. + idr_preload_unlock();
  29030. +}
  29031. +EXPORT_SYMBOL(idr_preload_end);
  29032. +#else
  29033. +static inline void idr_preload_lock(void)
  29034. +{
  29035. + preempt_disable();
  29036. +}
  29037. +
  29038. +static inline void idr_preload_unlock(void)
  29039. +{
  29040. + preempt_enable();
  29041. +}
  29042. +#endif
  29043. /**
  29044. * idr_preload - preload for idr_alloc()
  29045. @@ -402,7 +432,7 @@
  29046. WARN_ON_ONCE(in_interrupt());
  29047. might_sleep_if(gfp_mask & __GFP_WAIT);
  29048. - preempt_disable();
  29049. + idr_preload_lock();
  29050. /*
  29051. * idr_alloc() is likely to succeed w/o full idr_layer buffer and
  29052. @@ -414,9 +444,9 @@
  29053. while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
  29054. struct idr_layer *new;
  29055. - preempt_enable();
  29056. + idr_preload_unlock();
  29057. new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
  29058. - preempt_disable();
  29059. + idr_preload_lock();
  29060. if (!new)
  29061. break;
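[Annotation, illustration only, not part of the patch] The idr.c hunks above replace preempt_disable()/preempt_enable() around the preload cache with a local lock, so the section stays preemptible on PREEMPT_RT_FULL while keeping its "same CPU" guarantee on mainline. A minimal sketch of the same pattern in a hypothetical stand-alone user (my_lock and my_count are made-up names; locallock.h is introduced elsewhere in this patch series):

	#include <linux/locallock.h>	/* added by this patch series */
	#include <linux/percpu.h>

	static DEFINE_LOCAL_IRQ_LOCK(my_lock);
	static DEFINE_PER_CPU(int, my_count);

	static void my_count_inc(void)
	{
		/*
		 * On PREEMPT_RT_FULL this takes a per-CPU sleeping lock and
		 * disables migration; on !RT it collapses to preempt_disable().
		 */
		local_lock(my_lock);
		this_cpu_inc(my_count);
		local_unlock(my_lock);
	}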
  29062. diff -Nur linux-3.18.12.orig/lib/Kconfig linux-3.18.12/lib/Kconfig
  29063. --- linux-3.18.12.orig/lib/Kconfig 2015-04-20 14:48:02.000000000 -0500
  29064. +++ linux-3.18.12/lib/Kconfig 2015-04-26 13:32:22.459684003 -0500
  29065. @@ -383,6 +383,7 @@
  29066. config CPUMASK_OFFSTACK
  29067. bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
  29068. + depends on !PREEMPT_RT_FULL
  29069. help
  29070. Use dynamic allocation for cpumask_var_t, instead of putting
  29071. them on the stack. This is a bit more expensive, but avoids
  29072. diff -Nur linux-3.18.12.orig/lib/Kconfig.debug linux-3.18.12/lib/Kconfig.debug
  29073. --- linux-3.18.12.orig/lib/Kconfig.debug 2015-04-20 14:48:02.000000000 -0500
  29074. +++ linux-3.18.12/lib/Kconfig.debug 2015-04-26 13:32:22.459684003 -0500
  29075. @@ -639,7 +639,7 @@
  29076. config DEBUG_SHIRQ
  29077. bool "Debug shared IRQ handlers"
  29078. - depends on DEBUG_KERNEL
  29079. + depends on DEBUG_KERNEL && !PREEMPT_RT_BASE
  29080. help
  29081. Enable this to generate a spurious interrupt as soon as a shared
  29082. interrupt handler is registered, and just before one is deregistered.
  29083. diff -Nur linux-3.18.12.orig/lib/locking-selftest.c linux-3.18.12/lib/locking-selftest.c
  29084. --- linux-3.18.12.orig/lib/locking-selftest.c 2015-04-20 14:48:02.000000000 -0500
  29085. +++ linux-3.18.12/lib/locking-selftest.c 2015-04-26 13:32:22.459684003 -0500
  29086. @@ -590,6 +590,8 @@
  29087. #include "locking-selftest-spin-hardirq.h"
  29088. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
  29089. +#ifndef CONFIG_PREEMPT_RT_FULL
  29090. +
  29091. #include "locking-selftest-rlock-hardirq.h"
  29092. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
  29093. @@ -605,9 +607,12 @@
  29094. #include "locking-selftest-wlock-softirq.h"
  29095. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
  29096. +#endif
  29097. +
  29098. #undef E1
  29099. #undef E2
  29100. +#ifndef CONFIG_PREEMPT_RT_FULL
  29101. /*
  29102. * Enabling hardirqs with a softirq-safe lock held:
  29103. */
  29104. @@ -640,6 +645,8 @@
  29105. #undef E1
  29106. #undef E2
  29107. +#endif
  29108. +
  29109. /*
  29110. * Enabling irqs with an irq-safe lock held:
  29111. */
  29112. @@ -663,6 +670,8 @@
  29113. #include "locking-selftest-spin-hardirq.h"
  29114. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
  29115. +#ifndef CONFIG_PREEMPT_RT_FULL
  29116. +
  29117. #include "locking-selftest-rlock-hardirq.h"
  29118. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
  29119. @@ -678,6 +687,8 @@
  29120. #include "locking-selftest-wlock-softirq.h"
  29121. GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
  29122. +#endif
  29123. +
  29124. #undef E1
  29125. #undef E2
  29126. @@ -709,6 +720,8 @@
  29127. #include "locking-selftest-spin-hardirq.h"
  29128. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
  29129. +#ifndef CONFIG_PREEMPT_RT_FULL
  29130. +
  29131. #include "locking-selftest-rlock-hardirq.h"
  29132. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
  29133. @@ -724,6 +737,8 @@
  29134. #include "locking-selftest-wlock-softirq.h"
  29135. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
  29136. +#endif
  29137. +
  29138. #undef E1
  29139. #undef E2
  29140. #undef E3
  29141. @@ -757,6 +772,8 @@
  29142. #include "locking-selftest-spin-hardirq.h"
  29143. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
  29144. +#ifndef CONFIG_PREEMPT_RT_FULL
  29145. +
  29146. #include "locking-selftest-rlock-hardirq.h"
  29147. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
  29148. @@ -772,10 +789,14 @@
  29149. #include "locking-selftest-wlock-softirq.h"
  29150. GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
  29151. +#endif
  29152. +
  29153. #undef E1
  29154. #undef E2
  29155. #undef E3
  29156. +#ifndef CONFIG_PREEMPT_RT_FULL
  29157. +
  29158. /*
  29159. * read-lock / write-lock irq inversion.
  29160. *
  29161. @@ -838,6 +859,10 @@
  29162. #undef E2
  29163. #undef E3
  29164. +#endif
  29165. +
  29166. +#ifndef CONFIG_PREEMPT_RT_FULL
  29167. +
  29168. /*
  29169. * read-lock / write-lock recursion that is actually safe.
  29170. */
  29171. @@ -876,6 +901,8 @@
  29172. #undef E2
  29173. #undef E3
  29174. +#endif
  29175. +
  29176. /*
  29177. * read-lock / write-lock recursion that is unsafe.
  29178. */
  29179. @@ -1858,6 +1885,7 @@
  29180. printk(" --------------------------------------------------------------------------\n");
  29181. +#ifndef CONFIG_PREEMPT_RT_FULL
  29182. /*
  29183. * irq-context testcases:
  29184. */
  29185. @@ -1870,6 +1898,28 @@
  29186. DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
  29187. // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
  29188. +#else
  29189. + /* On -rt, we only do hardirq context test for raw spinlock */
  29190. + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
  29191. + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
  29192. +
  29193. + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
  29194. + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
  29195. +
  29196. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
  29197. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
  29198. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
  29199. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
  29200. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
  29201. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
  29202. +
  29203. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
  29204. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
  29205. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
  29206. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
  29207. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
  29208. + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
  29209. +#endif
  29210. ww_tests();
  29211. diff -Nur linux-3.18.12.orig/lib/percpu_ida.c linux-3.18.12/lib/percpu_ida.c
  29212. --- linux-3.18.12.orig/lib/percpu_ida.c 2015-04-20 14:48:02.000000000 -0500
  29213. +++ linux-3.18.12/lib/percpu_ida.c 2015-04-26 13:32:22.459684003 -0500
  29214. @@ -29,6 +29,9 @@
  29215. #include <linux/string.h>
  29216. #include <linux/spinlock.h>
  29217. #include <linux/percpu_ida.h>
  29218. +#include <linux/locallock.h>
  29219. +
  29220. +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
  29221. struct percpu_ida_cpu {
  29222. /*
  29223. @@ -151,13 +154,13 @@
  29224. unsigned long flags;
  29225. int tag;
  29226. - local_irq_save(flags);
  29227. + local_lock_irqsave(irq_off_lock, flags);
  29228. tags = this_cpu_ptr(pool->tag_cpu);
  29229. /* Fastpath */
  29230. tag = alloc_local_tag(tags);
  29231. if (likely(tag >= 0)) {
  29232. - local_irq_restore(flags);
  29233. + local_unlock_irqrestore(irq_off_lock, flags);
  29234. return tag;
  29235. }
  29236. @@ -176,6 +179,7 @@
  29237. if (!tags->nr_free)
  29238. alloc_global_tags(pool, tags);
  29239. +
  29240. if (!tags->nr_free)
  29241. steal_tags(pool, tags);
  29242. @@ -187,7 +191,7 @@
  29243. }
  29244. spin_unlock(&pool->lock);
  29245. - local_irq_restore(flags);
  29246. + local_unlock_irqrestore(irq_off_lock, flags);
  29247. if (tag >= 0 || state == TASK_RUNNING)
  29248. break;
  29249. @@ -199,7 +203,7 @@
  29250. schedule();
  29251. - local_irq_save(flags);
  29252. + local_lock_irqsave(irq_off_lock, flags);
  29253. tags = this_cpu_ptr(pool->tag_cpu);
  29254. }
  29255. if (state != TASK_RUNNING)
  29256. @@ -224,7 +228,7 @@
  29257. BUG_ON(tag >= pool->nr_tags);
  29258. - local_irq_save(flags);
  29259. + local_lock_irqsave(irq_off_lock, flags);
  29260. tags = this_cpu_ptr(pool->tag_cpu);
  29261. spin_lock(&tags->lock);
  29262. @@ -256,7 +260,7 @@
  29263. spin_unlock(&pool->lock);
  29264. }
  29265. - local_irq_restore(flags);
  29266. + local_unlock_irqrestore(irq_off_lock, flags);
  29267. }
  29268. EXPORT_SYMBOL_GPL(percpu_ida_free);
  29269. @@ -348,7 +352,7 @@
  29270. struct percpu_ida_cpu *remote;
  29271. unsigned cpu, i, err = 0;
  29272. - local_irq_save(flags);
  29273. + local_lock_irqsave(irq_off_lock, flags);
  29274. for_each_possible_cpu(cpu) {
  29275. remote = per_cpu_ptr(pool->tag_cpu, cpu);
  29276. spin_lock(&remote->lock);
  29277. @@ -370,7 +374,7 @@
  29278. }
  29279. spin_unlock(&pool->lock);
  29280. out:
  29281. - local_irq_restore(flags);
  29282. + local_unlock_irqrestore(irq_off_lock, flags);
  29283. return err;
  29284. }
  29285. EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
  29286. diff -Nur linux-3.18.12.orig/lib/radix-tree.c linux-3.18.12/lib/radix-tree.c
  29287. --- linux-3.18.12.orig/lib/radix-tree.c 2015-04-20 14:48:02.000000000 -0500
  29288. +++ linux-3.18.12/lib/radix-tree.c 2015-04-26 13:32:22.459684003 -0500
  29289. @@ -195,12 +195,13 @@
  29290. * succeed in getting a node here (and never reach
  29291. * kmem_cache_alloc)
  29292. */
  29293. - rtp = this_cpu_ptr(&radix_tree_preloads);
  29294. + rtp = &get_cpu_var(radix_tree_preloads);
  29295. if (rtp->nr) {
  29296. ret = rtp->nodes[rtp->nr - 1];
  29297. rtp->nodes[rtp->nr - 1] = NULL;
  29298. rtp->nr--;
  29299. }
  29300. + put_cpu_var(radix_tree_preloads);
  29301. /*
  29302. * Update the allocation stack trace as this is more useful
  29303. * for debugging.
  29304. @@ -240,6 +241,7 @@
  29305. call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
  29306. }
  29307. +#ifndef CONFIG_PREEMPT_RT_FULL
  29308. /*
  29309. * Load up this CPU's radix_tree_node buffer with sufficient objects to
  29310. * ensure that the addition of a single element in the tree cannot fail. On
  29311. @@ -305,6 +307,7 @@
  29312. return 0;
  29313. }
  29314. EXPORT_SYMBOL(radix_tree_maybe_preload);
  29315. +#endif
  29316. /*
  29317. * Return the maximum key which can be store into a
  29318. diff -Nur linux-3.18.12.orig/lib/scatterlist.c linux-3.18.12/lib/scatterlist.c
  29319. --- linux-3.18.12.orig/lib/scatterlist.c 2015-04-20 14:48:02.000000000 -0500
  29320. +++ linux-3.18.12/lib/scatterlist.c 2015-04-26 13:32:22.459684003 -0500
  29321. @@ -592,7 +592,7 @@
  29322. flush_kernel_dcache_page(miter->page);
  29323. if (miter->__flags & SG_MITER_ATOMIC) {
  29324. - WARN_ON_ONCE(preemptible());
  29325. + WARN_ON_ONCE(!pagefault_disabled());
  29326. kunmap_atomic(miter->addr);
  29327. } else
  29328. kunmap(miter->page);
  29329. @@ -637,7 +637,7 @@
  29330. if (!sg_miter_skip(&miter, skip))
  29331. return false;
  29332. - local_irq_save(flags);
  29333. + local_irq_save_nort(flags);
  29334. while (sg_miter_next(&miter) && offset < buflen) {
  29335. unsigned int len;
  29336. @@ -654,7 +654,7 @@
  29337. sg_miter_stop(&miter);
  29338. - local_irq_restore(flags);
  29339. + local_irq_restore_nort(flags);
  29340. return offset;
  29341. }
  29342. diff -Nur linux-3.18.12.orig/lib/smp_processor_id.c linux-3.18.12/lib/smp_processor_id.c
  29343. --- linux-3.18.12.orig/lib/smp_processor_id.c 2015-04-20 14:48:02.000000000 -0500
  29344. +++ linux-3.18.12/lib/smp_processor_id.c 2015-04-26 13:32:22.459684003 -0500
  29345. @@ -39,8 +39,9 @@
  29346. if (!printk_ratelimit())
  29347. goto out_enable;
  29348. - printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
  29349. - what1, what2, preempt_count() - 1, current->comm, current->pid);
  29350. + printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
  29351. + what1, what2, preempt_count() - 1, __migrate_disabled(current),
  29352. + current->comm, current->pid);
  29353. print_symbol("caller is %s\n", (long)__builtin_return_address(0));
  29354. dump_stack();
  29355. diff -Nur linux-3.18.12.orig/mm/filemap.c linux-3.18.12/mm/filemap.c
  29356. --- linux-3.18.12.orig/mm/filemap.c 2015-04-20 14:48:02.000000000 -0500
  29357. +++ linux-3.18.12/mm/filemap.c 2015-04-26 13:32:22.463684003 -0500
  29358. @@ -168,7 +168,9 @@
  29359. if (!workingset_node_pages(node) &&
  29360. list_empty(&node->private_list)) {
  29361. node->private_data = mapping;
  29362. - list_lru_add(&workingset_shadow_nodes, &node->private_list);
  29363. + local_lock(workingset_shadow_lock);
  29364. + list_lru_add(&__workingset_shadow_nodes, &node->private_list);
  29365. + local_unlock(workingset_shadow_lock);
  29366. }
  29367. }
  29368. @@ -535,9 +537,12 @@
  29369. * node->private_list is protected by
  29370. * mapping->tree_lock.
  29371. */
  29372. - if (!list_empty(&node->private_list))
  29373. - list_lru_del(&workingset_shadow_nodes,
  29374. + if (!list_empty(&node->private_list)) {
  29375. + local_lock(workingset_shadow_lock);
  29376. + list_lru_del(&__workingset_shadow_nodes,
  29377. &node->private_list);
  29378. + local_unlock(workingset_shadow_lock);
  29379. + }
  29380. }
  29381. return 0;
  29382. }
  29383. diff -Nur linux-3.18.12.orig/mm/highmem.c linux-3.18.12/mm/highmem.c
  29384. --- linux-3.18.12.orig/mm/highmem.c 2015-04-20 14:48:02.000000000 -0500
  29385. +++ linux-3.18.12/mm/highmem.c 2015-04-26 13:32:22.463684003 -0500
  29386. @@ -29,10 +29,11 @@
  29387. #include <linux/kgdb.h>
  29388. #include <asm/tlbflush.h>
  29389. -
  29390. +#ifndef CONFIG_PREEMPT_RT_FULL
  29391. #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
  29392. DEFINE_PER_CPU(int, __kmap_atomic_idx);
  29393. #endif
  29394. +#endif
  29395. /*
  29396. * Virtual_count is not a pure "count".
  29397. @@ -107,8 +108,9 @@
  29398. unsigned long totalhigh_pages __read_mostly;
  29399. EXPORT_SYMBOL(totalhigh_pages);
  29400. -
  29401. +#ifndef CONFIG_PREEMPT_RT_FULL
  29402. EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
  29403. +#endif
  29404. unsigned int nr_free_highpages (void)
  29405. {
  29406. diff -Nur linux-3.18.12.orig/mm/Kconfig linux-3.18.12/mm/Kconfig
  29407. --- linux-3.18.12.orig/mm/Kconfig 2015-04-20 14:48:02.000000000 -0500
  29408. +++ linux-3.18.12/mm/Kconfig 2015-04-26 13:32:22.463684003 -0500
  29409. @@ -408,7 +408,7 @@
  29410. config TRANSPARENT_HUGEPAGE
  29411. bool "Transparent Hugepage Support"
  29412. - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
  29413. + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
  29414. select COMPACTION
  29415. help
  29416. Transparent Hugepages allows the kernel to use huge pages and
  29417. diff -Nur linux-3.18.12.orig/mm/memcontrol.c linux-3.18.12/mm/memcontrol.c
  29418. --- linux-3.18.12.orig/mm/memcontrol.c 2015-04-20 14:48:02.000000000 -0500
  29419. +++ linux-3.18.12/mm/memcontrol.c 2015-04-26 13:32:22.463684003 -0500
  29420. @@ -60,6 +60,8 @@
  29421. #include <net/sock.h>
  29422. #include <net/ip.h>
  29423. #include <net/tcp_memcontrol.h>
  29424. +#include <linux/locallock.h>
  29425. +
  29426. #include "slab.h"
  29427. #include <asm/uaccess.h>
  29428. @@ -87,6 +89,7 @@
  29429. #define do_swap_account 0
  29430. #endif
  29431. +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
  29432. static const char * const mem_cgroup_stat_names[] = {
  29433. "cache",
  29434. @@ -2376,14 +2379,17 @@
  29435. */
  29436. static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
  29437. {
  29438. - struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
  29439. + struct memcg_stock_pcp *stock;
  29440. + int cpu = get_cpu_light();
  29441. +
  29442. + stock = &per_cpu(memcg_stock, cpu);
  29443. if (stock->cached != memcg) { /* reset if necessary */
  29444. drain_stock(stock);
  29445. stock->cached = memcg;
  29446. }
  29447. stock->nr_pages += nr_pages;
  29448. - put_cpu_var(memcg_stock);
  29449. + put_cpu_light();
  29450. }
  29451. /*
  29452. @@ -2397,7 +2403,7 @@
  29453. /* Notify other cpus that system-wide "drain" is running */
  29454. get_online_cpus();
  29455. - curcpu = get_cpu();
  29456. + curcpu = get_cpu_light();
  29457. for_each_online_cpu(cpu) {
  29458. struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
  29459. struct mem_cgroup *memcg;
  29460. @@ -2414,7 +2420,7 @@
  29461. schedule_work_on(cpu, &stock->work);
  29462. }
  29463. }
  29464. - put_cpu();
  29465. + put_cpu_light();
  29466. if (!sync)
  29467. goto out;
  29468. @@ -3419,12 +3425,12 @@
  29469. move_unlock_mem_cgroup(from, &flags);
  29470. ret = 0;
  29471. - local_irq_disable();
  29472. + local_lock_irq(event_lock);
  29473. mem_cgroup_charge_statistics(to, page, nr_pages);
  29474. memcg_check_events(to, page);
  29475. mem_cgroup_charge_statistics(from, page, -nr_pages);
  29476. memcg_check_events(from, page);
  29477. - local_irq_enable();
  29478. + local_unlock_irq(event_lock);
  29479. out_unlock:
  29480. unlock_page(page);
  29481. out:
  29482. @@ -6406,10 +6412,10 @@
  29483. VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  29484. }
  29485. - local_irq_disable();
  29486. + local_lock_irq(event_lock);
  29487. mem_cgroup_charge_statistics(memcg, page, nr_pages);
  29488. memcg_check_events(memcg, page);
  29489. - local_irq_enable();
  29490. + local_unlock_irq(event_lock);
  29491. if (do_swap_account && PageSwapCache(page)) {
  29492. swp_entry_t entry = { .val = page_private(page) };
  29493. @@ -6468,14 +6474,14 @@
  29494. memcg_oom_recover(memcg);
  29495. }
  29496. - local_irq_save(flags);
  29497. + local_lock_irqsave(event_lock, flags);
  29498. __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
  29499. __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
  29500. __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
  29501. __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
  29502. __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
  29503. memcg_check_events(memcg, dummy_page);
  29504. - local_irq_restore(flags);
  29505. + local_unlock_irqrestore(event_lock, flags);
  29506. }
  29507. static void uncharge_list(struct list_head *page_list)
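[Annotation, illustration only, not part of the patch] refill_stock() and drain_all_stock() above swap get_cpu()/put_cpu() for get_cpu_light()/put_cpu_light(), which on -rt pin the task to its CPU via migrate_disable() instead of disabling preemption, so the protected region may take sleeping locks. A hypothetical user of the same primitive (struct my_cache, my_caches and refill() are made-up names):

	#include <linux/percpu.h>
	#include <linux/smp.h>

	struct my_cache { int nr_free; };
	static DEFINE_PER_CPU(struct my_cache, my_caches);

	static void refill(struct my_cache *c) { c->nr_free = 16; }

	static void touch_my_cache(void)
	{
		/* -rt: migrate_disable(); !rt: plain get_cpu() */
		int cpu = get_cpu_light();
		struct my_cache *c = &per_cpu(my_caches, cpu);

		/* this region is preemptible on -rt but stays on "cpu" */
		refill(c);
		put_cpu_light();
	}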
  29508. diff -Nur linux-3.18.12.orig/mm/memory.c linux-3.18.12/mm/memory.c
  29509. --- linux-3.18.12.orig/mm/memory.c 2015-04-20 14:48:02.000000000 -0500
  29510. +++ linux-3.18.12/mm/memory.c 2015-04-26 13:32:22.463684003 -0500
  29511. @@ -3244,6 +3244,32 @@
  29512. return 0;
  29513. }
  29514. +#ifdef CONFIG_PREEMPT_RT_FULL
  29515. +void pagefault_disable(void)
  29516. +{
  29517. + migrate_disable();
  29518. + current->pagefault_disabled++;
  29519. + /*
  29520. + * make sure to have issued the store before a pagefault
  29521. + * can hit.
  29522. + */
  29523. + barrier();
  29524. +}
  29525. +EXPORT_SYMBOL(pagefault_disable);
  29526. +
  29527. +void pagefault_enable(void)
  29528. +{
  29529. + /*
  29530. + * make sure to issue those last loads/stores before enabling
  29531. + * the pagefault handler again.
  29532. + */
  29533. + barrier();
  29534. + current->pagefault_disabled--;
  29535. + migrate_enable();
  29536. +}
  29537. +EXPORT_SYMBOL(pagefault_enable);
  29538. +#endif
  29539. +
  29540. /*
  29541. * By the time we get here, we already hold the mm semaphore
  29542. *
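[Annotation, illustration only, not part of the patch] With the -rt variant above, pagefault_disable() no longer implies preempt_disable(); it disables migration and bumps current->pagefault_disabled, which the fault paths can test instead of relying on the preempt count. A hypothetical caller that probes user memory without risking a sleeping fault (peek_user_word is a made-up helper):

	#include <linux/errno.h>
	#include <linux/uaccess.h>

	/* returns 0 on success, -EFAULT if the user page was not resident */
	static int peek_user_word(const void __user *uaddr, unsigned long *val)
	{
		unsigned long left;

		pagefault_disable();	/* -rt: migrate_disable() + counter */
		left = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
		pagefault_enable();

		return left ? -EFAULT : 0;
	}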
  29543. diff -Nur linux-3.18.12.orig/mm/mmu_context.c linux-3.18.12/mm/mmu_context.c
  29544. --- linux-3.18.12.orig/mm/mmu_context.c 2015-04-20 14:48:02.000000000 -0500
  29545. +++ linux-3.18.12/mm/mmu_context.c 2015-04-26 13:32:22.463684003 -0500
  29546. @@ -23,6 +23,7 @@
  29547. struct task_struct *tsk = current;
  29548. task_lock(tsk);
  29549. + preempt_disable_rt();
  29550. active_mm = tsk->active_mm;
  29551. if (active_mm != mm) {
  29552. atomic_inc(&mm->mm_count);
  29553. @@ -30,6 +31,7 @@
  29554. }
  29555. tsk->mm = mm;
  29556. switch_mm(active_mm, mm, tsk);
  29557. + preempt_enable_rt();
  29558. task_unlock(tsk);
  29559. #ifdef finish_arch_post_lock_switch
  29560. finish_arch_post_lock_switch();
  29561. diff -Nur linux-3.18.12.orig/mm/page_alloc.c linux-3.18.12/mm/page_alloc.c
  29562. --- linux-3.18.12.orig/mm/page_alloc.c 2015-04-20 14:48:02.000000000 -0500
  29563. +++ linux-3.18.12/mm/page_alloc.c 2015-04-26 13:32:22.463684003 -0500
  29564. @@ -59,6 +59,7 @@
  29565. #include <linux/page-debug-flags.h>
  29566. #include <linux/hugetlb.h>
  29567. #include <linux/sched/rt.h>
  29568. +#include <linux/locallock.h>
  29569. #include <asm/sections.h>
  29570. #include <asm/tlbflush.h>
  29571. @@ -230,6 +231,18 @@
  29572. EXPORT_SYMBOL(nr_online_nodes);
  29573. #endif
  29574. +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
  29575. +
  29576. +#ifdef CONFIG_PREEMPT_RT_BASE
  29577. +# define cpu_lock_irqsave(cpu, flags) \
  29578. + local_lock_irqsave_on(pa_lock, flags, cpu)
  29579. +# define cpu_unlock_irqrestore(cpu, flags) \
  29580. + local_unlock_irqrestore_on(pa_lock, flags, cpu)
  29581. +#else
  29582. +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags)
  29583. +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags)
  29584. +#endif
  29585. +
  29586. int page_group_by_mobility_disabled __read_mostly;
  29587. void set_pageblock_migratetype(struct page *page, int migratetype)
  29588. @@ -654,7 +667,7 @@
  29589. }
  29590. /*
  29591. - * Frees a number of pages from the PCP lists
  29592. + * Frees a number of pages which have been collected from the pcp lists.
  29593. * Assumes all pages on list are in same zone, and of same order.
  29594. * count is the number of pages to free.
  29595. *
  29596. @@ -665,18 +678,51 @@
  29597. * pinned" detection logic.
  29598. */
  29599. static void free_pcppages_bulk(struct zone *zone, int count,
  29600. - struct per_cpu_pages *pcp)
  29601. + struct list_head *list)
  29602. {
  29603. - int migratetype = 0;
  29604. - int batch_free = 0;
  29605. int to_free = count;
  29606. unsigned long nr_scanned;
  29607. + unsigned long flags;
  29608. +
  29609. + spin_lock_irqsave(&zone->lock, flags);
  29610. - spin_lock(&zone->lock);
  29611. nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
  29612. if (nr_scanned)
  29613. __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  29614. + while (!list_empty(list)) {
  29615. + struct page *page = list_first_entry(list, struct page, lru);
  29616. + int mt; /* migratetype of the to-be-freed page */
  29617. +
  29618. + /* must delete as __free_one_page list manipulates */
  29619. + list_del(&page->lru);
  29620. +
  29621. + mt = get_freepage_migratetype(page);
  29622. + if (unlikely(has_isolate_pageblock(zone)))
  29623. + mt = get_pageblock_migratetype(page);
  29624. +
  29625. + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  29626. + __free_one_page(page, page_to_pfn(page), zone, 0, mt);
  29627. + trace_mm_page_pcpu_drain(page, 0, mt);
  29628. + to_free--;
  29629. + }
  29630. + WARN_ON(to_free != 0);
  29631. + spin_unlock_irqrestore(&zone->lock, flags);
  29632. +}
  29633. +
  29634. +/*
  29635. + * Moves a number of pages from the PCP lists to free list which
  29636. + * is freed outside of the locked region.
  29637. + *
  29638. + * Assumes all pages on list are in same zone, and of same order.
  29639. + * count is the number of pages to free.
  29640. + */
  29641. +static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
  29642. + struct list_head *dst)
  29643. +{
  29644. + int migratetype = 0;
  29645. + int batch_free = 0;
  29646. +
  29647. while (to_free) {
  29648. struct page *page;
  29649. struct list_head *list;
  29650. @@ -692,7 +738,7 @@
  29651. batch_free++;
  29652. if (++migratetype == MIGRATE_PCPTYPES)
  29653. migratetype = 0;
  29654. - list = &pcp->lists[migratetype];
  29655. + list = &src->lists[migratetype];
  29656. } while (list_empty(list));
  29657. /* This is the only non-empty list. Free them all. */
  29658. @@ -700,21 +746,11 @@
  29659. batch_free = to_free;
  29660. do {
  29661. - int mt; /* migratetype of the to-be-freed page */
  29662. -
  29663. - page = list_entry(list->prev, struct page, lru);
  29664. - /* must delete as __free_one_page list manipulates */
  29665. + page = list_last_entry(list, struct page, lru);
  29666. list_del(&page->lru);
  29667. - mt = get_freepage_migratetype(page);
  29668. - if (unlikely(has_isolate_pageblock(zone)))
  29669. - mt = get_pageblock_migratetype(page);
  29670. -
  29671. - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
  29672. - __free_one_page(page, page_to_pfn(page), zone, 0, mt);
  29673. - trace_mm_page_pcpu_drain(page, 0, mt);
  29674. + list_add(&page->lru, dst);
  29675. } while (--to_free && --batch_free && !list_empty(list));
  29676. }
  29677. - spin_unlock(&zone->lock);
  29678. }
  29679. static void free_one_page(struct zone *zone,
  29680. @@ -723,7 +759,9 @@
  29681. int migratetype)
  29682. {
  29683. unsigned long nr_scanned;
  29684. - spin_lock(&zone->lock);
  29685. + unsigned long flags;
  29686. +
  29687. + spin_lock_irqsave(&zone->lock, flags);
  29688. nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
  29689. if (nr_scanned)
  29690. __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
  29691. @@ -733,7 +771,7 @@
  29692. migratetype = get_pfnblock_migratetype(page, pfn);
  29693. }
  29694. __free_one_page(page, pfn, zone, order, migratetype);
  29695. - spin_unlock(&zone->lock);
  29696. + spin_unlock_irqrestore(&zone->lock, flags);
  29697. }
  29698. static bool free_pages_prepare(struct page *page, unsigned int order)
  29699. @@ -773,11 +811,11 @@
  29700. return;
  29701. migratetype = get_pfnblock_migratetype(page, pfn);
  29702. - local_irq_save(flags);
  29703. + local_lock_irqsave(pa_lock, flags);
  29704. __count_vm_events(PGFREE, 1 << order);
  29705. set_freepage_migratetype(page, migratetype);
  29706. free_one_page(page_zone(page), page, pfn, order, migratetype);
  29707. - local_irq_restore(flags);
  29708. + local_unlock_irqrestore(pa_lock, flags);
  29709. }
  29710. void __init __free_pages_bootmem(struct page *page, unsigned int order)
  29711. @@ -1251,16 +1289,18 @@
  29712. void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  29713. {
  29714. unsigned long flags;
  29715. + LIST_HEAD(dst);
  29716. int to_drain, batch;
  29717. - local_irq_save(flags);
  29718. + local_lock_irqsave(pa_lock, flags);
  29719. batch = ACCESS_ONCE(pcp->batch);
  29720. to_drain = min(pcp->count, batch);
  29721. if (to_drain > 0) {
  29722. - free_pcppages_bulk(zone, to_drain, pcp);
  29723. + isolate_pcp_pages(to_drain, pcp, &dst);
  29724. pcp->count -= to_drain;
  29725. }
  29726. - local_irq_restore(flags);
  29727. + local_unlock_irqrestore(pa_lock, flags);
  29728. + free_pcppages_bulk(zone, to_drain, &dst);
  29729. }
  29730. #endif
  29731. @@ -1279,16 +1319,21 @@
  29732. for_each_populated_zone(zone) {
  29733. struct per_cpu_pageset *pset;
  29734. struct per_cpu_pages *pcp;
  29735. + LIST_HEAD(dst);
  29736. + int count;
  29737. - local_irq_save(flags);
  29738. + cpu_lock_irqsave(cpu, flags);
  29739. pset = per_cpu_ptr(zone->pageset, cpu);
  29740. pcp = &pset->pcp;
  29741. - if (pcp->count) {
  29742. - free_pcppages_bulk(zone, pcp->count, pcp);
  29743. + count = pcp->count;
  29744. + if (count) {
  29745. + isolate_pcp_pages(count, pcp, &dst);
  29746. pcp->count = 0;
  29747. }
  29748. - local_irq_restore(flags);
  29749. + cpu_unlock_irqrestore(cpu, flags);
  29750. + if (count)
  29751. + free_pcppages_bulk(zone, count, &dst);
  29752. }
  29753. }
  29754. @@ -1341,7 +1386,12 @@
  29755. else
  29756. cpumask_clear_cpu(cpu, &cpus_with_pcps);
  29757. }
  29758. +#ifndef CONFIG_PREEMPT_RT_BASE
  29759. on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
  29760. +#else
  29761. + for_each_cpu(cpu, &cpus_with_pcps)
  29762. + drain_pages(cpu);
  29763. +#endif
  29764. }
  29765. #ifdef CONFIG_HIBERNATION
  29766. @@ -1397,7 +1447,7 @@
  29767. migratetype = get_pfnblock_migratetype(page, pfn);
  29768. set_freepage_migratetype(page, migratetype);
  29769. - local_irq_save(flags);
  29770. + local_lock_irqsave(pa_lock, flags);
  29771. __count_vm_event(PGFREE);
  29772. /*
  29773. @@ -1423,12 +1473,17 @@
  29774. pcp->count++;
  29775. if (pcp->count >= pcp->high) {
  29776. unsigned long batch = ACCESS_ONCE(pcp->batch);
  29777. - free_pcppages_bulk(zone, batch, pcp);
  29778. + LIST_HEAD(dst);
  29779. +
  29780. + isolate_pcp_pages(batch, pcp, &dst);
  29781. pcp->count -= batch;
  29782. + local_unlock_irqrestore(pa_lock, flags);
  29783. + free_pcppages_bulk(zone, batch, &dst);
  29784. + return;
  29785. }
  29786. out:
  29787. - local_irq_restore(flags);
  29788. + local_unlock_irqrestore(pa_lock, flags);
  29789. }
  29790. /*
  29791. @@ -1558,7 +1613,7 @@
  29792. struct per_cpu_pages *pcp;
  29793. struct list_head *list;
  29794. - local_irq_save(flags);
  29795. + local_lock_irqsave(pa_lock, flags);
  29796. pcp = &this_cpu_ptr(zone->pageset)->pcp;
  29797. list = &pcp->lists[migratetype];
  29798. if (list_empty(list)) {
  29799. @@ -1590,13 +1645,15 @@
  29800. */
  29801. WARN_ON_ONCE(order > 1);
  29802. }
  29803. - spin_lock_irqsave(&zone->lock, flags);
  29804. + local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
  29805. page = __rmqueue(zone, order, migratetype);
  29806. - spin_unlock(&zone->lock);
  29807. - if (!page)
  29808. + if (!page) {
  29809. + spin_unlock(&zone->lock);
  29810. goto failed;
  29811. + }
  29812. __mod_zone_freepage_state(zone, -(1 << order),
  29813. get_freepage_migratetype(page));
  29814. + spin_unlock(&zone->lock);
  29815. }
  29816. __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
  29817. @@ -1606,7 +1663,7 @@
  29818. __count_zone_vm_events(PGALLOC, zone, 1 << order);
  29819. zone_statistics(preferred_zone, zone, gfp_flags);
  29820. - local_irq_restore(flags);
  29821. + local_unlock_irqrestore(pa_lock, flags);
  29822. VM_BUG_ON_PAGE(bad_range(zone, page), page);
  29823. if (prep_new_page(page, order, gfp_flags))
  29824. @@ -1614,7 +1671,7 @@
  29825. return page;
  29826. failed:
  29827. - local_irq_restore(flags);
  29828. + local_unlock_irqrestore(pa_lock, flags);
  29829. return NULL;
  29830. }
  29831. @@ -2325,8 +2382,8 @@
  29832. count_vm_event(COMPACTSTALL);
  29833. /* Page migration frees to the PCP lists but we want merging */
  29834. - drain_pages(get_cpu());
  29835. - put_cpu();
  29836. + drain_pages(get_cpu_light());
  29837. + put_cpu_light();
  29838. page = get_page_from_freelist(gfp_mask, nodemask,
  29839. order, zonelist, high_zoneidx,
  29840. @@ -5565,6 +5622,7 @@
  29841. void __init page_alloc_init(void)
  29842. {
  29843. hotcpu_notifier(page_alloc_cpu_notify, 0);
  29844. + local_irq_lock_init(pa_lock);
  29845. }
  29846. /*
  29847. @@ -6459,7 +6517,7 @@
  29848. struct per_cpu_pageset *pset;
  29849. /* avoid races with drain_pages() */
  29850. - local_irq_save(flags);
  29851. + local_lock_irqsave(pa_lock, flags);
  29852. if (zone->pageset != &boot_pageset) {
  29853. for_each_online_cpu(cpu) {
  29854. pset = per_cpu_ptr(zone->pageset, cpu);
  29855. @@ -6468,7 +6526,7 @@
  29856. free_percpu(zone->pageset);
  29857. zone->pageset = &boot_pageset;
  29858. }
  29859. - local_irq_restore(flags);
  29860. + local_unlock_irqrestore(pa_lock, flags);
  29861. }
  29862. #ifdef CONFIG_MEMORY_HOTREMOVE
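[Annotation, illustration only, not part of the patch] The free_pcppages_bulk()/isolate_pcp_pages() split above is the recurring -rt shape "detach under the hot lock, do the heavy work after dropping it": pages leave the per-CPU lists under pa_lock, and zone->lock (now taken with spin_lock_irqsave()) is only acquired afterwards for the actual freeing. The same shape in generic form, with hypothetical types and helpers:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct my_item { struct list_head node; };
	struct my_pool { spinlock_t lock; struct list_head items; };

	static void process_item(struct my_item *it) { /* expensive part */ }

	static void drain_pool(struct my_pool *pool)
	{
		LIST_HEAD(todo);
		unsigned long flags;
		struct my_item *it, *tmp;

		/* cheap part: only unlink entries while the lock is held */
		spin_lock_irqsave(&pool->lock, flags);
		list_splice_init(&pool->items, &todo);
		spin_unlock_irqrestore(&pool->lock, flags);

		/* heavy part: runs with the lock dropped, fully preemptible */
		list_for_each_entry_safe(it, tmp, &todo, node) {
			list_del(&it->node);
			process_item(it);
		}
	}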
  29863. diff -Nur linux-3.18.12.orig/mm/slab.h linux-3.18.12/mm/slab.h
  29864. --- linux-3.18.12.orig/mm/slab.h 2015-04-20 14:48:02.000000000 -0500
  29865. +++ linux-3.18.12/mm/slab.h 2015-04-26 13:32:22.467684003 -0500
  29866. @@ -315,7 +315,11 @@
  29867. * The slab lists for all objects.
  29868. */
  29869. struct kmem_cache_node {
  29870. +#ifdef CONFIG_SLUB
  29871. + raw_spinlock_t list_lock;
  29872. +#else
  29873. spinlock_t list_lock;
  29874. +#endif
  29875. #ifdef CONFIG_SLAB
  29876. struct list_head slabs_partial; /* partial list first, better asm code */
  29877. diff -Nur linux-3.18.12.orig/mm/slub.c linux-3.18.12/mm/slub.c
  29878. --- linux-3.18.12.orig/mm/slub.c 2015-04-20 14:48:02.000000000 -0500
  29879. +++ linux-3.18.12/mm/slub.c 2015-04-26 13:32:22.467684003 -0500
  29880. @@ -1044,7 +1044,7 @@
  29881. {
  29882. struct kmem_cache_node *n = get_node(s, page_to_nid(page));
  29883. - spin_lock_irqsave(&n->list_lock, *flags);
  29884. + raw_spin_lock_irqsave(&n->list_lock, *flags);
  29885. slab_lock(page);
  29886. if (!check_slab(s, page))
  29887. @@ -1091,7 +1091,7 @@
  29888. fail:
  29889. slab_unlock(page);
  29890. - spin_unlock_irqrestore(&n->list_lock, *flags);
  29891. + raw_spin_unlock_irqrestore(&n->list_lock, *flags);
  29892. slab_fix(s, "Object at 0x%p not freed", object);
  29893. return NULL;
  29894. }
  29895. @@ -1219,6 +1219,12 @@
  29896. #endif /* CONFIG_SLUB_DEBUG */
  29897. +struct slub_free_list {
  29898. + raw_spinlock_t lock;
  29899. + struct list_head list;
  29900. +};
  29901. +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
  29902. +
  29903. /*
  29904. * Hooks for other subsystems that check memory allocations. In a typical
  29905. * production configuration these hooks all should produce no code at all.
  29906. @@ -1303,10 +1309,15 @@
  29907. struct page *page;
  29908. struct kmem_cache_order_objects oo = s->oo;
  29909. gfp_t alloc_gfp;
  29910. + bool enableirqs;
  29911. flags &= gfp_allowed_mask;
  29912. - if (flags & __GFP_WAIT)
  29913. + enableirqs = (flags & __GFP_WAIT) != 0;
  29914. +#ifdef CONFIG_PREEMPT_RT_FULL
  29915. + enableirqs |= system_state == SYSTEM_RUNNING;
  29916. +#endif
  29917. + if (enableirqs)
  29918. local_irq_enable();
  29919. flags |= s->allocflags;
  29920. @@ -1347,7 +1358,7 @@
  29921. kmemcheck_mark_unallocated_pages(page, pages);
  29922. }
  29923. - if (flags & __GFP_WAIT)
  29924. + if (enableirqs)
  29925. local_irq_disable();
  29926. if (!page)
  29927. return NULL;
  29928. @@ -1365,8 +1376,10 @@
  29929. void *object)
  29930. {
  29931. setup_object_debug(s, page, object);
  29932. +#ifndef CONFIG_PREEMPT_RT_FULL
  29933. if (unlikely(s->ctor))
  29934. s->ctor(object);
  29935. +#endif
  29936. }
  29937. static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
  29938. @@ -1442,6 +1455,16 @@
  29939. memcg_uncharge_slab(s, order);
  29940. }
  29941. +static void free_delayed(struct list_head *h)
  29942. +{
  29943. + while(!list_empty(h)) {
  29944. + struct page *page = list_first_entry(h, struct page, lru);
  29945. +
  29946. + list_del(&page->lru);
  29947. + __free_slab(page->slab_cache, page);
  29948. + }
  29949. +}
  29950. +
  29951. #define need_reserve_slab_rcu \
  29952. (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
  29953. @@ -1476,6 +1499,12 @@
  29954. }
  29955. call_rcu(head, rcu_free_slab);
  29956. + } else if (irqs_disabled()) {
  29957. + struct slub_free_list *f = &__get_cpu_var(slub_free_list);
  29958. +
  29959. + raw_spin_lock(&f->lock);
  29960. + list_add(&page->lru, &f->list);
  29961. + raw_spin_unlock(&f->lock);
  29962. } else
  29963. __free_slab(s, page);
  29964. }
  29965. @@ -1589,7 +1618,7 @@
  29966. if (!n || !n->nr_partial)
  29967. return NULL;
  29968. - spin_lock(&n->list_lock);
  29969. + raw_spin_lock(&n->list_lock);
  29970. list_for_each_entry_safe(page, page2, &n->partial, lru) {
  29971. void *t;
  29972. @@ -1614,7 +1643,7 @@
  29973. break;
  29974. }
  29975. - spin_unlock(&n->list_lock);
  29976. + raw_spin_unlock(&n->list_lock);
  29977. return object;
  29978. }
  29979. @@ -1860,7 +1889,7 @@
  29980. * that acquire_slab() will see a slab page that
  29981. * is frozen
  29982. */
  29983. - spin_lock(&n->list_lock);
  29984. + raw_spin_lock(&n->list_lock);
  29985. }
  29986. } else {
  29987. m = M_FULL;
  29988. @@ -1871,7 +1900,7 @@
  29989. * slabs from diagnostic functions will not see
  29990. * any frozen slabs.
  29991. */
  29992. - spin_lock(&n->list_lock);
  29993. + raw_spin_lock(&n->list_lock);
  29994. }
  29995. }
  29996. @@ -1906,7 +1935,7 @@
  29997. goto redo;
  29998. if (lock)
  29999. - spin_unlock(&n->list_lock);
  30000. + raw_spin_unlock(&n->list_lock);
  30001. if (m == M_FREE) {
  30002. stat(s, DEACTIVATE_EMPTY);
  30003. @@ -1938,10 +1967,10 @@
  30004. n2 = get_node(s, page_to_nid(page));
  30005. if (n != n2) {
  30006. if (n)
  30007. - spin_unlock(&n->list_lock);
  30008. + raw_spin_unlock(&n->list_lock);
  30009. n = n2;
  30010. - spin_lock(&n->list_lock);
  30011. + raw_spin_lock(&n->list_lock);
  30012. }
  30013. do {
  30014. @@ -1970,7 +1999,7 @@
  30015. }
  30016. if (n)
  30017. - spin_unlock(&n->list_lock);
  30018. + raw_spin_unlock(&n->list_lock);
  30019. while (discard_page) {
  30020. page = discard_page;
  30021. @@ -2008,14 +2037,21 @@
  30022. pobjects = oldpage->pobjects;
  30023. pages = oldpage->pages;
  30024. if (drain && pobjects > s->cpu_partial) {
  30025. + struct slub_free_list *f;
  30026. unsigned long flags;
  30027. + LIST_HEAD(tofree);
  30028. /*
  30029. * partial array is full. Move the existing
  30030. * set to the per node partial list.
  30031. */
  30032. local_irq_save(flags);
  30033. unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
  30034. + f = &__get_cpu_var(slub_free_list);
  30035. + raw_spin_lock(&f->lock);
  30036. + list_splice_init(&f->list, &tofree);
  30037. + raw_spin_unlock(&f->lock);
  30038. local_irq_restore(flags);
  30039. + free_delayed(&tofree);
  30040. oldpage = NULL;
  30041. pobjects = 0;
  30042. pages = 0;
  30043. @@ -2079,7 +2115,22 @@
  30044. static void flush_all(struct kmem_cache *s)
  30045. {
  30046. + LIST_HEAD(tofree);
  30047. + int cpu;
  30048. +
  30049. on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
  30050. + for_each_online_cpu(cpu) {
  30051. + struct slub_free_list *f;
  30052. +
  30053. + if (!has_cpu_slab(cpu, s))
  30054. + continue;
  30055. +
  30056. + f = &per_cpu(slub_free_list, cpu);
  30057. + raw_spin_lock_irq(&f->lock);
  30058. + list_splice_init(&f->list, &tofree);
  30059. + raw_spin_unlock_irq(&f->lock);
  30060. + free_delayed(&tofree);
  30061. + }
  30062. }
  30063. /*
  30064. @@ -2115,10 +2166,10 @@
  30065. unsigned long x = 0;
  30066. struct page *page;
  30067. - spin_lock_irqsave(&n->list_lock, flags);
  30068. + raw_spin_lock_irqsave(&n->list_lock, flags);
  30069. list_for_each_entry(page, &n->partial, lru)
  30070. x += get_count(page);
  30071. - spin_unlock_irqrestore(&n->list_lock, flags);
  30072. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30073. return x;
  30074. }
  30075. #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
  30076. @@ -2255,9 +2306,11 @@
  30077. static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  30078. unsigned long addr, struct kmem_cache_cpu *c)
  30079. {
  30080. + struct slub_free_list *f;
  30081. void *freelist;
  30082. struct page *page;
  30083. unsigned long flags;
  30084. + LIST_HEAD(tofree);
  30085. local_irq_save(flags);
  30086. #ifdef CONFIG_PREEMPT
  30087. @@ -2325,7 +2378,13 @@
  30088. VM_BUG_ON(!c->page->frozen);
  30089. c->freelist = get_freepointer(s, freelist);
  30090. c->tid = next_tid(c->tid);
  30091. +out:
  30092. + f = &__get_cpu_var(slub_free_list);
  30093. + raw_spin_lock(&f->lock);
  30094. + list_splice_init(&f->list, &tofree);
  30095. + raw_spin_unlock(&f->lock);
  30096. local_irq_restore(flags);
  30097. + free_delayed(&tofree);
  30098. return freelist;
  30099. new_slab:
  30100. @@ -2342,8 +2401,7 @@
  30101. if (unlikely(!freelist)) {
  30102. slab_out_of_memory(s, gfpflags, node);
  30103. - local_irq_restore(flags);
  30104. - return NULL;
  30105. + goto out;
  30106. }
  30107. page = c->page;
  30108. @@ -2358,8 +2416,7 @@
  30109. deactivate_slab(s, page, get_freepointer(s, freelist));
  30110. c->page = NULL;
  30111. c->freelist = NULL;
  30112. - local_irq_restore(flags);
  30113. - return freelist;
  30114. + goto out;
  30115. }
  30116. /*
  30117. @@ -2444,6 +2501,10 @@
  30118. if (unlikely(gfpflags & __GFP_ZERO) && object)
  30119. memset(object, 0, s->object_size);
  30120. +#ifdef CONFIG_PREEMPT_RT_FULL
  30121. + if (unlikely(s->ctor) && object)
  30122. + s->ctor(object);
  30123. +#endif
  30124. slab_post_alloc_hook(s, gfpflags, object);
  30125. @@ -2531,7 +2592,7 @@
  30126. do {
  30127. if (unlikely(n)) {
  30128. - spin_unlock_irqrestore(&n->list_lock, flags);
  30129. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30130. n = NULL;
  30131. }
  30132. prior = page->freelist;
  30133. @@ -2563,7 +2624,7 @@
  30134. * Otherwise the list_lock will synchronize with
  30135. * other processors updating the list of slabs.
  30136. */
  30137. - spin_lock_irqsave(&n->list_lock, flags);
  30138. + raw_spin_lock_irqsave(&n->list_lock, flags);
  30139. }
  30140. }
  30141. @@ -2605,7 +2666,7 @@
  30142. add_partial(n, page, DEACTIVATE_TO_TAIL);
  30143. stat(s, FREE_ADD_PARTIAL);
  30144. }
  30145. - spin_unlock_irqrestore(&n->list_lock, flags);
  30146. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30147. return;
  30148. slab_empty:
  30149. @@ -2620,7 +2681,7 @@
  30150. remove_full(s, n, page);
  30151. }
  30152. - spin_unlock_irqrestore(&n->list_lock, flags);
  30153. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30154. stat(s, FREE_SLAB);
  30155. discard_slab(s, page);
  30156. }
  30157. @@ -2816,7 +2877,7 @@
  30158. init_kmem_cache_node(struct kmem_cache_node *n)
  30159. {
  30160. n->nr_partial = 0;
  30161. - spin_lock_init(&n->list_lock);
  30162. + raw_spin_lock_init(&n->list_lock);
  30163. INIT_LIST_HEAD(&n->partial);
  30164. #ifdef CONFIG_SLUB_DEBUG
  30165. atomic_long_set(&n->nr_slabs, 0);
  30166. @@ -3373,7 +3434,7 @@
  30167. for (i = 0; i < objects; i++)
  30168. INIT_LIST_HEAD(slabs_by_inuse + i);
  30169. - spin_lock_irqsave(&n->list_lock, flags);
  30170. + raw_spin_lock_irqsave(&n->list_lock, flags);
  30171. /*
  30172. * Build lists indexed by the items in use in each slab.
  30173. @@ -3394,7 +3455,7 @@
  30174. for (i = objects - 1; i > 0; i--)
  30175. list_splice(slabs_by_inuse + i, n->partial.prev);
  30176. - spin_unlock_irqrestore(&n->list_lock, flags);
  30177. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30178. /* Release empty slabs */
  30179. list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
  30180. @@ -3567,6 +3628,12 @@
  30181. {
  30182. static __initdata struct kmem_cache boot_kmem_cache,
  30183. boot_kmem_cache_node;
  30184. + int cpu;
  30185. +
  30186. + for_each_possible_cpu(cpu) {
  30187. + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
  30188. + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
  30189. + }
  30190. if (debug_guardpage_minorder())
  30191. slub_max_order = 0;
  30192. @@ -3815,7 +3882,7 @@
  30193. struct page *page;
  30194. unsigned long flags;
  30195. - spin_lock_irqsave(&n->list_lock, flags);
  30196. + raw_spin_lock_irqsave(&n->list_lock, flags);
  30197. list_for_each_entry(page, &n->partial, lru) {
  30198. validate_slab_slab(s, page, map);
  30199. @@ -3837,7 +3904,7 @@
  30200. s->name, count, atomic_long_read(&n->nr_slabs));
  30201. out:
  30202. - spin_unlock_irqrestore(&n->list_lock, flags);
  30203. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30204. return count;
  30205. }
  30206. @@ -4025,12 +4092,12 @@
  30207. if (!atomic_long_read(&n->nr_slabs))
  30208. continue;
  30209. - spin_lock_irqsave(&n->list_lock, flags);
  30210. + raw_spin_lock_irqsave(&n->list_lock, flags);
  30211. list_for_each_entry(page, &n->partial, lru)
  30212. process_slab(&t, s, page, alloc, map);
  30213. list_for_each_entry(page, &n->full, lru)
  30214. process_slab(&t, s, page, alloc, map);
  30215. - spin_unlock_irqrestore(&n->list_lock, flags);
  30216. + raw_spin_unlock_irqrestore(&n->list_lock, flags);
  30217. }
  30218. for (i = 0; i < t.count; i++) {
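[Annotation, illustration only, not part of the patch] The slub_free_list machinery above parks slab pages on a per-CPU list whenever __free_slab() would otherwise run with interrupts disabled, and free_delayed() drains that list once the caller is back in a context where freeing may sleep. Its skeleton, with hypothetical names (the per-CPU raw locks would still need the init loop the patch adds to kmem_cache_init()):

	#include <linux/list.h>
	#include <linux/mm.h>
	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	struct deferred_pages {
		raw_spinlock_t lock;
		struct list_head list;
	};
	static DEFINE_PER_CPU(struct deferred_pages, deferred_pages);

	static void free_page_maybe_deferred(struct page *page)
	{
		if (irqs_disabled()) {
			struct deferred_pages *d = this_cpu_ptr(&deferred_pages);

			raw_spin_lock(&d->lock);
			list_add(&page->lru, &d->list);	/* freed later */
			raw_spin_unlock(&d->lock);
		} else {
			__free_pages(page, 0);
		}
	}

	static void flush_deferred_pages(int cpu)
	{
		LIST_HEAD(tofree);
		struct deferred_pages *d = &per_cpu(deferred_pages, cpu);

		raw_spin_lock_irq(&d->lock);
		list_splice_init(&d->list, &tofree);
		raw_spin_unlock_irq(&d->lock);

		while (!list_empty(&tofree)) {
			struct page *page = list_first_entry(&tofree,
							     struct page, lru);

			list_del(&page->lru);
			__free_pages(page, 0);
		}
	}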
  30219. diff -Nur linux-3.18.12.orig/mm/swap.c linux-3.18.12/mm/swap.c
  30220. --- linux-3.18.12.orig/mm/swap.c 2015-04-20 14:48:02.000000000 -0500
  30221. +++ linux-3.18.12/mm/swap.c 2015-04-26 13:32:22.467684003 -0500
  30222. @@ -31,6 +31,7 @@
  30223. #include <linux/memcontrol.h>
  30224. #include <linux/gfp.h>
  30225. #include <linux/uio.h>
  30226. +#include <linux/locallock.h>
  30227. #include "internal.h"
  30228. @@ -44,6 +45,9 @@
  30229. static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
  30230. static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
  30231. +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
  30232. +static DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
  30233. +
  30234. /*
  30235. * This path almost never happens for VM activity - pages are normally
  30236. * freed via pagevecs. But it gets used by networking.
  30237. @@ -473,11 +477,11 @@
  30238. unsigned long flags;
  30239. page_cache_get(page);
  30240. - local_irq_save(flags);
  30241. + local_lock_irqsave(rotate_lock, flags);
  30242. pvec = this_cpu_ptr(&lru_rotate_pvecs);
  30243. if (!pagevec_add(pvec, page))
  30244. pagevec_move_tail(pvec);
  30245. - local_irq_restore(flags);
  30246. + local_unlock_irqrestore(rotate_lock, flags);
  30247. }
  30248. }
  30249. @@ -528,12 +532,13 @@
  30250. void activate_page(struct page *page)
  30251. {
  30252. if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
  30253. - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
  30254. + struct pagevec *pvec = &get_locked_var(swapvec_lock,
  30255. + activate_page_pvecs);
  30256. page_cache_get(page);
  30257. if (!pagevec_add(pvec, page))
  30258. pagevec_lru_move_fn(pvec, __activate_page, NULL);
  30259. - put_cpu_var(activate_page_pvecs);
  30260. + put_locked_var(swapvec_lock, activate_page_pvecs);
  30261. }
  30262. }
  30263. @@ -559,7 +564,7 @@
  30264. static void __lru_cache_activate_page(struct page *page)
  30265. {
  30266. - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
  30267. + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
  30268. int i;
  30269. /*
  30270. @@ -581,7 +586,7 @@
  30271. }
  30272. }
  30273. - put_cpu_var(lru_add_pvec);
  30274. + put_locked_var(swapvec_lock, lru_add_pvec);
  30275. }
  30276. /*
  30277. @@ -620,13 +625,13 @@
  30278. static void __lru_cache_add(struct page *page)
  30279. {
  30280. - struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
  30281. + struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
  30282. page_cache_get(page);
  30283. if (!pagevec_space(pvec))
  30284. __pagevec_lru_add(pvec);
  30285. pagevec_add(pvec, page);
  30286. - put_cpu_var(lru_add_pvec);
  30287. + put_locked_var(swapvec_lock, lru_add_pvec);
  30288. }
  30289. /**
  30290. @@ -806,9 +811,9 @@
  30291. unsigned long flags;
  30292. /* No harm done if a racing interrupt already did this */
  30293. - local_irq_save(flags);
  30294. + local_lock_irqsave(rotate_lock, flags);
  30295. pagevec_move_tail(pvec);
  30296. - local_irq_restore(flags);
  30297. + local_unlock_irqrestore(rotate_lock, flags);
  30298. }
  30299. pvec = &per_cpu(lru_deactivate_pvecs, cpu);
  30300. @@ -836,18 +841,19 @@
  30301. return;
  30302. if (likely(get_page_unless_zero(page))) {
  30303. - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
  30304. + struct pagevec *pvec = &get_locked_var(swapvec_lock,
  30305. + lru_deactivate_pvecs);
  30306. if (!pagevec_add(pvec, page))
  30307. pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
  30308. - put_cpu_var(lru_deactivate_pvecs);
  30309. + put_locked_var(swapvec_lock, lru_deactivate_pvecs);
  30310. }
  30311. }
  30312. void lru_add_drain(void)
  30313. {
  30314. - lru_add_drain_cpu(get_cpu());
  30315. - put_cpu();
  30316. + lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
  30317. + local_unlock_cpu(swapvec_lock);
  30318. }
  30319. static void lru_add_drain_per_cpu(struct work_struct *dummy)
  30320. diff -Nur linux-3.18.12.orig/mm/truncate.c linux-3.18.12/mm/truncate.c
  30321. --- linux-3.18.12.orig/mm/truncate.c 2015-04-20 14:48:02.000000000 -0500
  30322. +++ linux-3.18.12/mm/truncate.c 2015-04-26 13:32:22.467684003 -0500
  30323. @@ -56,8 +56,11 @@
  30324. * protected by mapping->tree_lock.
  30325. */
  30326. if (!workingset_node_shadows(node) &&
  30327. - !list_empty(&node->private_list))
  30328. - list_lru_del(&workingset_shadow_nodes, &node->private_list);
  30329. + !list_empty(&node->private_list)) {
  30330. + local_lock(workingset_shadow_lock);
  30331. + list_lru_del(&__workingset_shadow_nodes, &node->private_list);
  30332. + local_unlock(workingset_shadow_lock);
  30333. + }
  30334. __radix_tree_delete_node(&mapping->page_tree, node);
  30335. unlock:
  30336. spin_unlock_irq(&mapping->tree_lock);
  30337. diff -Nur linux-3.18.12.orig/mm/vmalloc.c linux-3.18.12/mm/vmalloc.c
  30338. --- linux-3.18.12.orig/mm/vmalloc.c 2015-04-20 14:48:02.000000000 -0500
  30339. +++ linux-3.18.12/mm/vmalloc.c 2015-04-26 13:32:22.467684003 -0500
  30340. @@ -798,7 +798,7 @@
  30341. struct vmap_block *vb;
  30342. struct vmap_area *va;
  30343. unsigned long vb_idx;
  30344. - int node, err;
  30345. + int node, err, cpu;
  30346. node = numa_node_id();
  30347. @@ -836,11 +836,12 @@
  30348. BUG_ON(err);
  30349. radix_tree_preload_end();
  30350. - vbq = &get_cpu_var(vmap_block_queue);
  30351. + cpu = get_cpu_light();
  30352. + vbq = &__get_cpu_var(vmap_block_queue);
  30353. spin_lock(&vbq->lock);
  30354. list_add_rcu(&vb->free_list, &vbq->free);
  30355. spin_unlock(&vbq->lock);
  30356. - put_cpu_var(vmap_block_queue);
  30357. + put_cpu_light();
  30358. return vb;
  30359. }
  30360. @@ -908,6 +909,7 @@
  30361. struct vmap_block *vb;
  30362. unsigned long addr = 0;
  30363. unsigned int order;
  30364. + int cpu = 0;
  30365. BUG_ON(size & ~PAGE_MASK);
  30366. BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
  30367. @@ -923,7 +925,8 @@
  30368. again:
  30369. rcu_read_lock();
  30370. - vbq = &get_cpu_var(vmap_block_queue);
  30371. + cpu = get_cpu_light();
  30372. + vbq = &__get_cpu_var(vmap_block_queue);
  30373. list_for_each_entry_rcu(vb, &vbq->free, free_list) {
  30374. int i;
  30375. @@ -947,7 +950,7 @@
  30376. spin_unlock(&vb->lock);
  30377. }
  30378. - put_cpu_var(vmap_block_queue);
  30379. + put_cpu_light();
  30380. rcu_read_unlock();
  30381. if (!addr) {
  30382. diff -Nur linux-3.18.12.orig/mm/vmstat.c linux-3.18.12/mm/vmstat.c
  30383. --- linux-3.18.12.orig/mm/vmstat.c 2015-04-20 14:48:02.000000000 -0500
  30384. +++ linux-3.18.12/mm/vmstat.c 2015-04-26 13:32:22.467684003 -0500
  30385. @@ -221,6 +221,7 @@
  30386. long x;
  30387. long t;
  30388. + preempt_disable_rt();
  30389. x = delta + __this_cpu_read(*p);
  30390. t = __this_cpu_read(pcp->stat_threshold);
  30391. @@ -230,6 +231,7 @@
  30392. x = 0;
  30393. }
  30394. __this_cpu_write(*p, x);
  30395. + preempt_enable_rt();
  30396. }
  30397. EXPORT_SYMBOL(__mod_zone_page_state);
  30398. @@ -262,6 +264,7 @@
  30399. s8 __percpu *p = pcp->vm_stat_diff + item;
  30400. s8 v, t;
  30401. + preempt_disable_rt();
  30402. v = __this_cpu_inc_return(*p);
  30403. t = __this_cpu_read(pcp->stat_threshold);
  30404. if (unlikely(v > t)) {
  30405. @@ -270,6 +273,7 @@
  30406. zone_page_state_add(v + overstep, zone, item);
  30407. __this_cpu_write(*p, -overstep);
  30408. }
  30409. + preempt_enable_rt();
  30410. }
  30411. void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
  30412. @@ -284,6 +288,7 @@
  30413. s8 __percpu *p = pcp->vm_stat_diff + item;
  30414. s8 v, t;
  30415. + preempt_disable_rt();
  30416. v = __this_cpu_dec_return(*p);
  30417. t = __this_cpu_read(pcp->stat_threshold);
  30418. if (unlikely(v < - t)) {
  30419. @@ -292,6 +297,7 @@
  30420. zone_page_state_add(v - overstep, zone, item);
  30421. __this_cpu_write(*p, overstep);
  30422. }
  30423. + preempt_enable_rt();
  30424. }
  30425. void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
  30426. diff -Nur linux-3.18.12.orig/mm/workingset.c linux-3.18.12/mm/workingset.c
  30427. --- linux-3.18.12.orig/mm/workingset.c 2015-04-20 14:48:02.000000000 -0500
  30428. +++ linux-3.18.12/mm/workingset.c 2015-04-26 13:32:22.467684003 -0500
  30429. @@ -264,7 +264,8 @@
  30430. * point where they would still be useful.
  30431. */
  30432. -struct list_lru workingset_shadow_nodes;
  30433. +struct list_lru __workingset_shadow_nodes;
  30434. +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
  30435. static unsigned long count_shadow_nodes(struct shrinker *shrinker,
  30436. struct shrink_control *sc)
  30437. @@ -274,9 +275,9 @@
  30438. unsigned long pages;
  30439. /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
  30440. - local_irq_disable();
  30441. - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
  30442. - local_irq_enable();
  30443. + local_lock_irq(workingset_shadow_lock);
  30444. + shadow_nodes = list_lru_count_node(&__workingset_shadow_nodes, sc->nid);
  30445. + local_unlock_irq(workingset_shadow_lock);
  30446. pages = node_present_pages(sc->nid);
  30447. /*
  30448. @@ -362,9 +363,9 @@
  30449. spin_unlock(&mapping->tree_lock);
  30450. ret = LRU_REMOVED_RETRY;
  30451. out:
  30452. - local_irq_enable();
  30453. + local_unlock_irq(workingset_shadow_lock);
  30454. cond_resched();
  30455. - local_irq_disable();
  30456. + local_lock_irq(workingset_shadow_lock);
  30457. spin_lock(lru_lock);
  30458. return ret;
  30459. }
  30460. @@ -375,10 +376,10 @@
  30461. unsigned long ret;
  30462. /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
  30463. - local_irq_disable();
  30464. - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
  30465. + local_lock_irq(workingset_shadow_lock);
  30466. + ret = list_lru_walk_node(&__workingset_shadow_nodes, sc->nid,
  30467. shadow_lru_isolate, NULL, &sc->nr_to_scan);
  30468. - local_irq_enable();
  30469. + local_unlock_irq(workingset_shadow_lock);
  30470. return ret;
  30471. }
  30472. @@ -399,7 +400,7 @@
  30473. {
  30474. int ret;
  30475. - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
  30476. + ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
  30477. if (ret)
  30478. goto err;
  30479. ret = register_shrinker(&workingset_shadow_shrinker);
  30480. @@ -407,7 +408,7 @@
  30481. goto err_list_lru;
  30482. return 0;
  30483. err_list_lru:
  30484. - list_lru_destroy(&workingset_shadow_nodes);
  30485. + list_lru_destroy(&__workingset_shadow_nodes);
  30486. err:
  30487. return ret;
  30488. }
  30489. diff -Nur linux-3.18.12.orig/net/core/dev.c linux-3.18.12/net/core/dev.c
  30490. --- linux-3.18.12.orig/net/core/dev.c 2015-04-20 14:48:02.000000000 -0500
  30491. +++ linux-3.18.12/net/core/dev.c 2015-04-26 13:32:22.471684003 -0500
  30492. @@ -182,6 +182,7 @@
  30493. static DEFINE_HASHTABLE(napi_hash, 8);
  30494. static seqcount_t devnet_rename_seq;
  30495. +static DEFINE_MUTEX(devnet_rename_mutex);
  30496. static inline void dev_base_seq_inc(struct net *net)
  30497. {
  30498. @@ -203,14 +204,14 @@
  30499. static inline void rps_lock(struct softnet_data *sd)
  30500. {
  30501. #ifdef CONFIG_RPS
  30502. - spin_lock(&sd->input_pkt_queue.lock);
  30503. + raw_spin_lock(&sd->input_pkt_queue.raw_lock);
  30504. #endif
  30505. }
  30506. static inline void rps_unlock(struct softnet_data *sd)
  30507. {
  30508. #ifdef CONFIG_RPS
  30509. - spin_unlock(&sd->input_pkt_queue.lock);
  30510. + raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
  30511. #endif
  30512. }
  30513. @@ -832,7 +833,8 @@
  30514. strcpy(name, dev->name);
  30515. rcu_read_unlock();
  30516. if (read_seqcount_retry(&devnet_rename_seq, seq)) {
  30517. - cond_resched();
  30518. + mutex_lock(&devnet_rename_mutex);
  30519. + mutex_unlock(&devnet_rename_mutex);
  30520. goto retry;
  30521. }
  30522. @@ -1101,20 +1103,17 @@
  30523. if (dev->flags & IFF_UP)
  30524. return -EBUSY;
  30525. - write_seqcount_begin(&devnet_rename_seq);
  30526. + mutex_lock(&devnet_rename_mutex);
  30527. + __raw_write_seqcount_begin(&devnet_rename_seq);
  30528. - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
  30529. - write_seqcount_end(&devnet_rename_seq);
  30530. - return 0;
  30531. - }
  30532. + if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
  30533. + goto outunlock;
  30534. memcpy(oldname, dev->name, IFNAMSIZ);
  30535. err = dev_get_valid_name(net, dev, newname);
  30536. - if (err < 0) {
  30537. - write_seqcount_end(&devnet_rename_seq);
  30538. - return err;
  30539. - }
  30540. + if (err < 0)
  30541. + goto outunlock;
  30542. if (oldname[0] && !strchr(oldname, '%'))
  30543. netdev_info(dev, "renamed from %s\n", oldname);
  30544. @@ -1127,11 +1126,12 @@
  30545. if (ret) {
  30546. memcpy(dev->name, oldname, IFNAMSIZ);
  30547. dev->name_assign_type = old_assign_type;
  30548. - write_seqcount_end(&devnet_rename_seq);
  30549. - return ret;
  30550. + err = ret;
  30551. + goto outunlock;
  30552. }
  30553. - write_seqcount_end(&devnet_rename_seq);
  30554. + __raw_write_seqcount_end(&devnet_rename_seq);
  30555. + mutex_unlock(&devnet_rename_mutex);
  30556. netdev_adjacent_rename_links(dev, oldname);
  30557. @@ -1152,7 +1152,8 @@
  30558. /* err >= 0 after dev_alloc_name() or stores the first errno */
  30559. if (err >= 0) {
  30560. err = ret;
  30561. - write_seqcount_begin(&devnet_rename_seq);
  30562. + mutex_lock(&devnet_rename_mutex);
  30563. + __raw_write_seqcount_begin(&devnet_rename_seq);
  30564. memcpy(dev->name, oldname, IFNAMSIZ);
  30565. memcpy(oldname, newname, IFNAMSIZ);
  30566. dev->name_assign_type = old_assign_type;
  30567. @@ -1165,6 +1166,11 @@
  30568. }
  30569. return err;
  30570. +
  30571. +outunlock:
  30572. + __raw_write_seqcount_end(&devnet_rename_seq);
  30573. + mutex_unlock(&devnet_rename_mutex);
  30574. + return err;
  30575. }
  30576. /**
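
Mainline dev_change_name() relies on write_seqcount_begin() keeping the writer non-preemptible so that readers never spin against a stalled writer; on RT that assumption no longer holds. The hunks above therefore serialize writers with devnet_rename_mutex, switch to the __raw_write_seqcount_begin()/__raw_write_seqcount_end() helpers used elsewhere in this patch, and make a retrying reader block briefly on the mutex instead of calling cond_resched(). A minimal sketch of the same pattern, with invented names (example_seq, example_mutex, example_buf) and assuming those __raw_ helpers are available as in the hunk above:

#include <linux/mutex.h>
#include <linux/seqlock.h>
#include <linux/string.h>

static seqcount_t example_seq;
static DEFINE_MUTEX(example_mutex);
static char example_buf[32];

static void example_write(const char *new)
{
	mutex_lock(&example_mutex);		/* serializes writers, sleepable on RT */
	__raw_write_seqcount_begin(&example_seq);
	strlcpy(example_buf, new, sizeof(example_buf));
	__raw_write_seqcount_end(&example_seq);
	mutex_unlock(&example_mutex);
}

static void example_read(char *out, size_t len)
{
	unsigned int seq;

retry:
	seq = read_seqcount_begin(&example_seq);
	strlcpy(out, example_buf, len);
	if (read_seqcount_retry(&example_seq, seq)) {
		/* A writer is active: wait for it by taking and dropping
		 * its mutex rather than busy-retrying, so a preempted
		 * writer can make progress on RT. */
		mutex_lock(&example_mutex);
		mutex_unlock(&example_mutex);
		goto retry;
	}
}
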
  30577. @@ -2160,6 +2166,7 @@
  30578. sd->output_queue_tailp = &q->next_sched;
  30579. raise_softirq_irqoff(NET_TX_SOFTIRQ);
  30580. local_irq_restore(flags);
  30581. + preempt_check_resched_rt();
  30582. }
  30583. void __netif_schedule(struct Qdisc *q)
  30584. @@ -2241,6 +2248,7 @@
  30585. __this_cpu_write(softnet_data.completion_queue, skb);
  30586. raise_softirq_irqoff(NET_TX_SOFTIRQ);
  30587. local_irq_restore(flags);
  30588. + preempt_check_resched_rt();
  30589. }
  30590. EXPORT_SYMBOL(__dev_kfree_skb_irq);
  30591. @@ -3336,6 +3344,7 @@
  30592. rps_unlock(sd);
  30593. local_irq_restore(flags);
  30594. + preempt_check_resched_rt();
  30595. atomic_long_inc(&skb->dev->rx_dropped);
  30596. kfree_skb(skb);
  30597. @@ -3354,7 +3363,7 @@
  30598. struct rps_dev_flow voidflow, *rflow = &voidflow;
  30599. int cpu;
  30600. - preempt_disable();
  30601. + migrate_disable();
  30602. rcu_read_lock();
  30603. cpu = get_rps_cpu(skb->dev, skb, &rflow);
  30604. @@ -3364,13 +3373,13 @@
  30605. ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
  30606. rcu_read_unlock();
  30607. - preempt_enable();
  30608. + migrate_enable();
  30609. } else
  30610. #endif
  30611. {
  30612. unsigned int qtail;
  30613. - ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
  30614. - put_cpu();
  30615. + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
  30616. + put_cpu_light();
  30617. }
  30618. return ret;
  30619. }
  30620. @@ -3404,16 +3413,44 @@
  30621. trace_netif_rx_ni_entry(skb);
  30622. - preempt_disable();
  30623. + local_bh_disable();
  30624. err = netif_rx_internal(skb);
  30625. - if (local_softirq_pending())
  30626. - do_softirq();
  30627. - preempt_enable();
  30628. + local_bh_enable();
  30629. return err;
  30630. }
  30631. EXPORT_SYMBOL(netif_rx_ni);
  30632. +#ifdef CONFIG_PREEMPT_RT_FULL
  30633. +/*
  30634. + * RT runs ksoftirqd as a real time thread and the root_lock is a
  30635. + * "sleeping spinlock". If the trylock fails then we can go into an
  30636. + * infinite loop when ksoftirqd preempted the task which actually
  30637. + * holds the lock, because we requeue q and raise NET_TX softirq
  30638. + * causing ksoftirqd to loop forever.
  30639. + *
  30640. + * It's safe to use spin_lock on RT here as softirqs run in thread
  30641. + * context and cannot deadlock against the thread which is holding
  30642. + * root_lock.
  30643. + *
  30644. + * On !RT the trylock might fail, but there we bail out from the
  30645. + * softirq loop after 10 attempts which we can't do on RT. And the
  30646. + * task holding root_lock cannot be preempted, so the only downside of
  30647. + * that trylock is that we need 10 loops to decide that we should have
  30648. + * given up in the first one :)
  30649. + */
  30650. +static inline int take_root_lock(spinlock_t *lock)
  30651. +{
  30652. + spin_lock(lock);
  30653. + return 1;
  30654. +}
  30655. +#else
  30656. +static inline int take_root_lock(spinlock_t *lock)
  30657. +{
  30658. + return spin_trylock(lock);
  30659. +}
  30660. +#endif
  30661. +
  30662. static void net_tx_action(struct softirq_action *h)
  30663. {
  30664. struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  30665. @@ -3455,7 +3492,7 @@
  30666. head = head->next_sched;
  30667. root_lock = qdisc_lock(q);
  30668. - if (spin_trylock(root_lock)) {
  30669. + if (take_root_lock(root_lock)) {
  30670. smp_mb__before_atomic();
  30671. clear_bit(__QDISC_STATE_SCHED,
  30672. &q->state);
  30673. @@ -3848,7 +3885,7 @@
  30674. skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
  30675. if (skb->dev == dev) {
  30676. __skb_unlink(skb, &sd->input_pkt_queue);
  30677. - kfree_skb(skb);
  30678. + __skb_queue_tail(&sd->tofree_queue, skb);
  30679. input_queue_head_incr(sd);
  30680. }
  30681. }
  30682. @@ -3857,10 +3894,13 @@
  30683. skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
  30684. if (skb->dev == dev) {
  30685. __skb_unlink(skb, &sd->process_queue);
  30686. - kfree_skb(skb);
  30687. + __skb_queue_tail(&sd->tofree_queue, skb);
  30688. input_queue_head_incr(sd);
  30689. }
  30690. }
  30691. +
  30692. + if (!skb_queue_empty(&sd->tofree_queue))
  30693. + raise_softirq_irqoff(NET_RX_SOFTIRQ);
  30694. }
  30695. static int napi_gro_complete(struct sk_buff *skb)
  30696. @@ -4323,6 +4363,7 @@
  30697. } else
  30698. #endif
  30699. local_irq_enable();
  30700. + preempt_check_resched_rt();
  30701. }
  30702. static int process_backlog(struct napi_struct *napi, int quota)
  30703. @@ -4394,6 +4435,7 @@
  30704. local_irq_save(flags);
  30705. ____napi_schedule(this_cpu_ptr(&softnet_data), n);
  30706. local_irq_restore(flags);
  30707. + preempt_check_resched_rt();
  30708. }
  30709. EXPORT_SYMBOL(__napi_schedule);
  30710. @@ -4516,10 +4558,17 @@
  30711. struct softnet_data *sd = this_cpu_ptr(&softnet_data);
  30712. unsigned long time_limit = jiffies + 2;
  30713. int budget = netdev_budget;
  30714. + struct sk_buff *skb;
  30715. void *have;
  30716. local_irq_disable();
  30717. + while ((skb = __skb_dequeue(&sd->tofree_queue))) {
  30718. + local_irq_enable();
  30719. + kfree_skb(skb);
  30720. + local_irq_disable();
  30721. + }
  30722. +
  30723. while (!list_empty(&sd->poll_list)) {
  30724. struct napi_struct *n;
  30725. int work, weight;
  30726. @@ -7008,6 +7057,7 @@
  30727. raise_softirq_irqoff(NET_TX_SOFTIRQ);
  30728. local_irq_enable();
  30729. + preempt_check_resched_rt();
  30730. /* Process offline CPU's input_pkt_queue */
  30731. while ((skb = __skb_dequeue(&oldsd->process_queue))) {
  30732. @@ -7018,6 +7068,9 @@
  30733. netif_rx_internal(skb);
  30734. input_queue_head_incr(oldsd);
  30735. }
  30736. + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
  30737. + kfree_skb(skb);
  30738. + }
  30739. return NOTIFY_OK;
  30740. }
  30741. @@ -7319,8 +7372,9 @@
  30742. for_each_possible_cpu(i) {
  30743. struct softnet_data *sd = &per_cpu(softnet_data, i);
  30744. - skb_queue_head_init(&sd->input_pkt_queue);
  30745. - skb_queue_head_init(&sd->process_queue);
  30746. + skb_queue_head_init_raw(&sd->input_pkt_queue);
  30747. + skb_queue_head_init_raw(&sd->process_queue);
  30748. + skb_queue_head_init_raw(&sd->tofree_queue);
  30749. INIT_LIST_HEAD(&sd->poll_list);
  30750. sd->output_queue_tailp = &sd->output_queue;
  30751. #ifdef CONFIG_RPS
  30752. diff -Nur linux-3.18.12.orig/net/core/skbuff.c linux-3.18.12/net/core/skbuff.c
  30753. --- linux-3.18.12.orig/net/core/skbuff.c 2015-04-20 14:48:02.000000000 -0500
  30754. +++ linux-3.18.12/net/core/skbuff.c 2015-04-26 13:32:22.471684003 -0500
  30755. @@ -63,6 +63,7 @@
  30756. #include <linux/errqueue.h>
  30757. #include <linux/prefetch.h>
  30758. #include <linux/if_vlan.h>
  30759. +#include <linux/locallock.h>
  30760. #include <net/protocol.h>
  30761. #include <net/dst.h>
  30762. @@ -336,6 +337,7 @@
  30763. unsigned int pagecnt_bias;
  30764. };
  30765. static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
  30766. +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
  30767. static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  30768. {
  30769. @@ -344,7 +346,7 @@
  30770. int order;
  30771. unsigned long flags;
  30772. - local_irq_save(flags);
  30773. + local_lock_irqsave(netdev_alloc_lock, flags);
  30774. nc = this_cpu_ptr(&netdev_alloc_cache);
  30775. if (unlikely(!nc->frag.page)) {
  30776. refill:
  30777. @@ -389,7 +391,7 @@
  30778. nc->frag.offset += fragsz;
  30779. nc->pagecnt_bias--;
  30780. end:
  30781. - local_irq_restore(flags);
  30782. + local_unlock_irqrestore(netdev_alloc_lock, flags);
  30783. return data;
  30784. }
  30785. diff -Nur linux-3.18.12.orig/net/core/sock.c linux-3.18.12/net/core/sock.c
  30786. --- linux-3.18.12.orig/net/core/sock.c 2015-04-20 14:48:02.000000000 -0500
  30787. +++ linux-3.18.12/net/core/sock.c 2015-04-26 13:32:22.471684003 -0500
  30788. @@ -2326,12 +2326,11 @@
  30789. if (sk->sk_lock.owned)
  30790. __lock_sock(sk);
  30791. sk->sk_lock.owned = 1;
  30792. - spin_unlock(&sk->sk_lock.slock);
  30793. + spin_unlock_bh(&sk->sk_lock.slock);
  30794. /*
  30795. * The sk_lock has mutex_lock() semantics here:
  30796. */
  30797. mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
  30798. - local_bh_enable();
  30799. }
  30800. EXPORT_SYMBOL(lock_sock_nested);
  30801. diff -Nur linux-3.18.12.orig/net/ipv4/icmp.c linux-3.18.12/net/ipv4/icmp.c
  30802. --- linux-3.18.12.orig/net/ipv4/icmp.c 2015-04-20 14:48:02.000000000 -0500
  30803. +++ linux-3.18.12/net/ipv4/icmp.c 2015-04-26 13:32:22.471684003 -0500
  30804. @@ -69,6 +69,7 @@
  30805. #include <linux/jiffies.h>
  30806. #include <linux/kernel.h>
  30807. #include <linux/fcntl.h>
  30808. +#include <linux/sysrq.h>
  30809. #include <linux/socket.h>
  30810. #include <linux/in.h>
  30811. #include <linux/inet.h>
  30812. @@ -864,6 +865,30 @@
  30813. }
  30814. /*
30815. + * 32bit and 64bit have different timestamp lengths, so we check for
  30816. + * the cookie at offset 20 and verify it is repeated at offset 50
  30817. + */
  30818. +#define CO_POS0 20
  30819. +#define CO_POS1 50
  30820. +#define CO_SIZE sizeof(int)
  30821. +#define ICMP_SYSRQ_SIZE 57
  30822. +
  30823. +/*
30824. + * We got an ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
  30825. + * pattern and if it matches send the next byte as a trigger to sysrq.
  30826. + */
  30827. +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
  30828. +{
  30829. + int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
  30830. + char *p = skb->data;
  30831. +
  30832. + if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
  30833. + !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
  30834. + p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
  30835. + handle_sysrq(p[CO_POS0 + CO_SIZE]);
  30836. +}
  30837. +
  30838. +/*
  30839. * Handle ICMP_ECHO ("ping") requests.
  30840. *
  30841. * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
  30842. @@ -890,6 +915,11 @@
  30843. icmp_param.data_len = skb->len;
  30844. icmp_param.head_len = sizeof(struct icmphdr);
  30845. icmp_reply(&icmp_param, skb);
  30846. +
  30847. + if (skb->len == ICMP_SYSRQ_SIZE &&
  30848. + net->ipv4.sysctl_icmp_echo_sysrq) {
  30849. + icmp_check_sysrq(net, skb);
  30850. + }
  30851. }
  30852. }
  30853. diff -Nur linux-3.18.12.orig/net/ipv4/sysctl_net_ipv4.c linux-3.18.12/net/ipv4/sysctl_net_ipv4.c
  30854. --- linux-3.18.12.orig/net/ipv4/sysctl_net_ipv4.c 2015-04-20 14:48:02.000000000 -0500
  30855. +++ linux-3.18.12/net/ipv4/sysctl_net_ipv4.c 2015-04-26 13:32:22.471684003 -0500
  30856. @@ -779,6 +779,13 @@
  30857. .proc_handler = proc_dointvec
  30858. },
  30859. {
  30860. + .procname = "icmp_echo_sysrq",
  30861. + .data = &init_net.ipv4.sysctl_icmp_echo_sysrq,
  30862. + .maxlen = sizeof(int),
  30863. + .mode = 0644,
  30864. + .proc_handler = proc_dointvec
  30865. + },
  30866. + {
  30867. .procname = "icmp_ignore_bogus_error_responses",
  30868. .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
  30869. .maxlen = sizeof(int),
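
The icmp.c and sysctl hunks above add a remote sysrq trigger: when the new icmp_echo_sysrq sysctl holds a non-zero cookie, a 57-byte ICMP echo payload carrying that cookie at offsets 20 and 50, each immediately followed by the sysrq command byte, makes icmp_check_sysrq() call handle_sysrq(). Below is a hedged userspace sketch that only builds such a payload and prints it as hex; sending it (for example over a raw ICMP socket or a ping tool that accepts an arbitrary payload) is left out, and the cookie 0x01020304 and key 'h' are arbitrary example values that would have to match the value written to /proc/sys/net/ipv4/icmp_echo_sysrq on the target.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define CO_POS0         20
#define CO_POS1         50
#define CO_SIZE         sizeof(int)
#define ICMP_SYSRQ_SIZE 57

int main(void)
{
	unsigned char payload[ICMP_SYSRQ_SIZE] = { 0 };
	int cookie = htonl(0x01020304);	/* must match icmp_echo_sysrq */
	char key = 'h';			/* sysrq command byte */
	int i;

	/* Cookie at both offsets, trigger byte right after each copy,
	 * exactly what icmp_check_sysrq() above verifies. */
	memcpy(payload + CO_POS0, &cookie, CO_SIZE);
	memcpy(payload + CO_POS1, &cookie, CO_SIZE);
	payload[CO_POS0 + CO_SIZE] = key;
	payload[CO_POS1 + CO_SIZE] = key;

	for (i = 0; i < ICMP_SYSRQ_SIZE; i++)
		printf("%02x", payload[i]);
	putchar('\n');
	return 0;
}
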
  30870. diff -Nur linux-3.18.12.orig/net/mac80211/rx.c linux-3.18.12/net/mac80211/rx.c
  30871. --- linux-3.18.12.orig/net/mac80211/rx.c 2015-04-20 14:48:02.000000000 -0500
  30872. +++ linux-3.18.12/net/mac80211/rx.c 2015-04-26 13:32:22.471684003 -0500
  30873. @@ -3359,7 +3359,7 @@
  30874. struct ieee80211_supported_band *sband;
  30875. struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  30876. - WARN_ON_ONCE(softirq_count() == 0);
  30877. + WARN_ON_ONCE_NONRT(softirq_count() == 0);
  30878. if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
  30879. goto drop;
  30880. diff -Nur linux-3.18.12.orig/net/mac80211/rx.c.orig linux-3.18.12/net/mac80211/rx.c.orig
  30881. --- linux-3.18.12.orig/net/mac80211/rx.c.orig 1969-12-31 18:00:00.000000000 -0600
  30882. +++ linux-3.18.12/net/mac80211/rx.c.orig 2015-04-20 14:48:02.000000000 -0500
  30883. @@ -0,0 +1,3476 @@
  30884. +/*
  30885. + * Copyright 2002-2005, Instant802 Networks, Inc.
  30886. + * Copyright 2005-2006, Devicescape Software, Inc.
  30887. + * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
  30888. + * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
  30889. + * Copyright 2013-2014 Intel Mobile Communications GmbH
  30890. + *
  30891. + * This program is free software; you can redistribute it and/or modify
  30892. + * it under the terms of the GNU General Public License version 2 as
  30893. + * published by the Free Software Foundation.
  30894. + */
  30895. +
  30896. +#include <linux/jiffies.h>
  30897. +#include <linux/slab.h>
  30898. +#include <linux/kernel.h>
  30899. +#include <linux/skbuff.h>
  30900. +#include <linux/netdevice.h>
  30901. +#include <linux/etherdevice.h>
  30902. +#include <linux/rcupdate.h>
  30903. +#include <linux/export.h>
  30904. +#include <net/mac80211.h>
  30905. +#include <net/ieee80211_radiotap.h>
  30906. +#include <asm/unaligned.h>
  30907. +
  30908. +#include "ieee80211_i.h"
  30909. +#include "driver-ops.h"
  30910. +#include "led.h"
  30911. +#include "mesh.h"
  30912. +#include "wep.h"
  30913. +#include "wpa.h"
  30914. +#include "tkip.h"
  30915. +#include "wme.h"
  30916. +#include "rate.h"
  30917. +
  30918. +/*
  30919. + * monitor mode reception
  30920. + *
  30921. + * This function cleans up the SKB, i.e. it removes all the stuff
  30922. + * only useful for monitoring.
  30923. + */
  30924. +static struct sk_buff *remove_monitor_info(struct ieee80211_local *local,
  30925. + struct sk_buff *skb)
  30926. +{
  30927. + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS) {
  30928. + if (likely(skb->len > FCS_LEN))
  30929. + __pskb_trim(skb, skb->len - FCS_LEN);
  30930. + else {
  30931. + /* driver bug */
  30932. + WARN_ON(1);
  30933. + dev_kfree_skb(skb);
  30934. + return NULL;
  30935. + }
  30936. + }
  30937. +
  30938. + return skb;
  30939. +}
  30940. +
  30941. +static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len)
  30942. +{
  30943. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  30944. + struct ieee80211_hdr *hdr = (void *)skb->data;
  30945. +
  30946. + if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
  30947. + RX_FLAG_FAILED_PLCP_CRC |
  30948. + RX_FLAG_AMPDU_IS_ZEROLEN))
  30949. + return true;
  30950. +
  30951. + if (unlikely(skb->len < 16 + present_fcs_len))
  30952. + return true;
  30953. +
  30954. + if (ieee80211_is_ctl(hdr->frame_control) &&
  30955. + !ieee80211_is_pspoll(hdr->frame_control) &&
  30956. + !ieee80211_is_back_req(hdr->frame_control))
  30957. + return true;
  30958. +
  30959. + return false;
  30960. +}
  30961. +
  30962. +static int
  30963. +ieee80211_rx_radiotap_space(struct ieee80211_local *local,
  30964. + struct ieee80211_rx_status *status)
  30965. +{
  30966. + int len;
  30967. +
  30968. + /* always present fields */
  30969. + len = sizeof(struct ieee80211_radiotap_header) + 8;
  30970. +
  30971. + /* allocate extra bitmaps */
  30972. + if (status->chains)
  30973. + len += 4 * hweight8(status->chains);
  30974. +
  30975. + if (ieee80211_have_rx_timestamp(status)) {
  30976. + len = ALIGN(len, 8);
  30977. + len += 8;
  30978. + }
  30979. + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
  30980. + len += 1;
  30981. +
  30982. + /* antenna field, if we don't have per-chain info */
  30983. + if (!status->chains)
  30984. + len += 1;
  30985. +
  30986. + /* padding for RX_FLAGS if necessary */
  30987. + len = ALIGN(len, 2);
  30988. +
  30989. + if (status->flag & RX_FLAG_HT) /* HT info */
  30990. + len += 3;
  30991. +
  30992. + if (status->flag & RX_FLAG_AMPDU_DETAILS) {
  30993. + len = ALIGN(len, 4);
  30994. + len += 8;
  30995. + }
  30996. +
  30997. + if (status->flag & RX_FLAG_VHT) {
  30998. + len = ALIGN(len, 2);
  30999. + len += 12;
  31000. + }
  31001. +
  31002. + if (status->chains) {
  31003. + /* antenna and antenna signal fields */
  31004. + len += 2 * hweight8(status->chains);
  31005. + }
  31006. +
  31007. + return len;
  31008. +}
  31009. +
  31010. +/*
  31011. + * ieee80211_add_rx_radiotap_header - add radiotap header
  31012. + *
  31013. + * add a radiotap header containing all the fields which the hardware provided.
  31014. + */
  31015. +static void
  31016. +ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
  31017. + struct sk_buff *skb,
  31018. + struct ieee80211_rate *rate,
  31019. + int rtap_len, bool has_fcs)
  31020. +{
  31021. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  31022. + struct ieee80211_radiotap_header *rthdr;
  31023. + unsigned char *pos;
  31024. + __le32 *it_present;
  31025. + u32 it_present_val;
  31026. + u16 rx_flags = 0;
  31027. + u16 channel_flags = 0;
  31028. + int mpdulen, chain;
  31029. + unsigned long chains = status->chains;
  31030. +
  31031. + mpdulen = skb->len;
  31032. + if (!(has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)))
  31033. + mpdulen += FCS_LEN;
  31034. +
  31035. + rthdr = (struct ieee80211_radiotap_header *)skb_push(skb, rtap_len);
  31036. + memset(rthdr, 0, rtap_len);
  31037. + it_present = &rthdr->it_present;
  31038. +
  31039. + /* radiotap header, set always present flags */
  31040. + rthdr->it_len = cpu_to_le16(rtap_len);
  31041. + it_present_val = BIT(IEEE80211_RADIOTAP_FLAGS) |
  31042. + BIT(IEEE80211_RADIOTAP_CHANNEL) |
  31043. + BIT(IEEE80211_RADIOTAP_RX_FLAGS);
  31044. +
  31045. + if (!status->chains)
  31046. + it_present_val |= BIT(IEEE80211_RADIOTAP_ANTENNA);
  31047. +
  31048. + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
  31049. + it_present_val |=
  31050. + BIT(IEEE80211_RADIOTAP_EXT) |
  31051. + BIT(IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE);
  31052. + put_unaligned_le32(it_present_val, it_present);
  31053. + it_present++;
  31054. + it_present_val = BIT(IEEE80211_RADIOTAP_ANTENNA) |
  31055. + BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
  31056. + }
  31057. +
  31058. + put_unaligned_le32(it_present_val, it_present);
  31059. +
  31060. + pos = (void *)(it_present + 1);
  31061. +
  31062. + /* the order of the following fields is important */
  31063. +
  31064. + /* IEEE80211_RADIOTAP_TSFT */
  31065. + if (ieee80211_have_rx_timestamp(status)) {
  31066. + /* padding */
  31067. + while ((pos - (u8 *)rthdr) & 7)
  31068. + *pos++ = 0;
  31069. + put_unaligned_le64(
  31070. + ieee80211_calculate_rx_timestamp(local, status,
  31071. + mpdulen, 0),
  31072. + pos);
  31073. + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT);
  31074. + pos += 8;
  31075. + }
  31076. +
  31077. + /* IEEE80211_RADIOTAP_FLAGS */
  31078. + if (has_fcs && (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS))
  31079. + *pos |= IEEE80211_RADIOTAP_F_FCS;
  31080. + if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC))
  31081. + *pos |= IEEE80211_RADIOTAP_F_BADFCS;
  31082. + if (status->flag & RX_FLAG_SHORTPRE)
  31083. + *pos |= IEEE80211_RADIOTAP_F_SHORTPRE;
  31084. + pos++;
  31085. +
  31086. + /* IEEE80211_RADIOTAP_RATE */
  31087. + if (!rate || status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) {
  31088. + /*
  31089. + * Without rate information don't add it. If we have,
  31090. + * MCS information is a separate field in radiotap,
  31091. + * added below. The byte here is needed as padding
  31092. + * for the channel though, so initialise it to 0.
  31093. + */
  31094. + *pos = 0;
  31095. + } else {
  31096. + int shift = 0;
  31097. + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE);
  31098. + if (status->flag & RX_FLAG_10MHZ)
  31099. + shift = 1;
  31100. + else if (status->flag & RX_FLAG_5MHZ)
  31101. + shift = 2;
  31102. + *pos = DIV_ROUND_UP(rate->bitrate, 5 * (1 << shift));
  31103. + }
  31104. + pos++;
  31105. +
  31106. + /* IEEE80211_RADIOTAP_CHANNEL */
  31107. + put_unaligned_le16(status->freq, pos);
  31108. + pos += 2;
  31109. + if (status->flag & RX_FLAG_10MHZ)
  31110. + channel_flags |= IEEE80211_CHAN_HALF;
  31111. + else if (status->flag & RX_FLAG_5MHZ)
  31112. + channel_flags |= IEEE80211_CHAN_QUARTER;
  31113. +
  31114. + if (status->band == IEEE80211_BAND_5GHZ)
  31115. + channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ;
  31116. + else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
  31117. + channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ;
  31118. + else if (rate && rate->flags & IEEE80211_RATE_ERP_G)
  31119. + channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ;
  31120. + else if (rate)
  31121. + channel_flags |= IEEE80211_CHAN_CCK | IEEE80211_CHAN_2GHZ;
  31122. + else
  31123. + channel_flags |= IEEE80211_CHAN_2GHZ;
  31124. + put_unaligned_le16(channel_flags, pos);
  31125. + pos += 2;
  31126. +
  31127. + /* IEEE80211_RADIOTAP_DBM_ANTSIGNAL */
  31128. + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM &&
  31129. + !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
  31130. + *pos = status->signal;
  31131. + rthdr->it_present |=
  31132. + cpu_to_le32(1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
  31133. + pos++;
  31134. + }
  31135. +
  31136. + /* IEEE80211_RADIOTAP_LOCK_QUALITY is missing */
  31137. +
  31138. + if (!status->chains) {
  31139. + /* IEEE80211_RADIOTAP_ANTENNA */
  31140. + *pos = status->antenna;
  31141. + pos++;
  31142. + }
  31143. +
  31144. + /* IEEE80211_RADIOTAP_DB_ANTNOISE is not used */
  31145. +
  31146. + /* IEEE80211_RADIOTAP_RX_FLAGS */
  31147. + /* ensure 2 byte alignment for the 2 byte field as required */
  31148. + if ((pos - (u8 *)rthdr) & 1)
  31149. + *pos++ = 0;
  31150. + if (status->flag & RX_FLAG_FAILED_PLCP_CRC)
  31151. + rx_flags |= IEEE80211_RADIOTAP_F_RX_BADPLCP;
  31152. + put_unaligned_le16(rx_flags, pos);
  31153. + pos += 2;
  31154. +
  31155. + if (status->flag & RX_FLAG_HT) {
  31156. + unsigned int stbc;
  31157. +
  31158. + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS);
  31159. + *pos++ = local->hw.radiotap_mcs_details;
  31160. + *pos = 0;
  31161. + if (status->flag & RX_FLAG_SHORT_GI)
  31162. + *pos |= IEEE80211_RADIOTAP_MCS_SGI;
  31163. + if (status->flag & RX_FLAG_40MHZ)
  31164. + *pos |= IEEE80211_RADIOTAP_MCS_BW_40;
  31165. + if (status->flag & RX_FLAG_HT_GF)
  31166. + *pos |= IEEE80211_RADIOTAP_MCS_FMT_GF;
  31167. + if (status->flag & RX_FLAG_LDPC)
  31168. + *pos |= IEEE80211_RADIOTAP_MCS_FEC_LDPC;
  31169. + stbc = (status->flag & RX_FLAG_STBC_MASK) >> RX_FLAG_STBC_SHIFT;
  31170. + *pos |= stbc << IEEE80211_RADIOTAP_MCS_STBC_SHIFT;
  31171. + pos++;
  31172. + *pos++ = status->rate_idx;
  31173. + }
  31174. +
  31175. + if (status->flag & RX_FLAG_AMPDU_DETAILS) {
  31176. + u16 flags = 0;
  31177. +
  31178. + /* ensure 4 byte alignment */
  31179. + while ((pos - (u8 *)rthdr) & 3)
  31180. + pos++;
  31181. + rthdr->it_present |=
  31182. + cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS);
  31183. + put_unaligned_le32(status->ampdu_reference, pos);
  31184. + pos += 4;
  31185. + if (status->flag & RX_FLAG_AMPDU_REPORT_ZEROLEN)
  31186. + flags |= IEEE80211_RADIOTAP_AMPDU_REPORT_ZEROLEN;
  31187. + if (status->flag & RX_FLAG_AMPDU_IS_ZEROLEN)
  31188. + flags |= IEEE80211_RADIOTAP_AMPDU_IS_ZEROLEN;
  31189. + if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN)
  31190. + flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN;
  31191. + if (status->flag & RX_FLAG_AMPDU_IS_LAST)
  31192. + flags |= IEEE80211_RADIOTAP_AMPDU_IS_LAST;
  31193. + if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_ERROR)
  31194. + flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR;
  31195. + if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
  31196. + flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN;
  31197. + put_unaligned_le16(flags, pos);
  31198. + pos += 2;
  31199. + if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
  31200. + *pos++ = status->ampdu_delimiter_crc;
  31201. + else
  31202. + *pos++ = 0;
  31203. + *pos++ = 0;
  31204. + }
  31205. +
  31206. + if (status->flag & RX_FLAG_VHT) {
  31207. + u16 known = local->hw.radiotap_vht_details;
  31208. +
  31209. + rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT);
  31210. + /* known field - how to handle 80+80? */
  31211. + if (status->vht_flag & RX_VHT_FLAG_80P80MHZ)
  31212. + known &= ~IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH;
  31213. + put_unaligned_le16(known, pos);
  31214. + pos += 2;
  31215. + /* flags */
  31216. + if (status->flag & RX_FLAG_SHORT_GI)
  31217. + *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI;
  31218. + /* in VHT, STBC is binary */
  31219. + if (status->flag & RX_FLAG_STBC_MASK)
  31220. + *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC;
  31221. + if (status->vht_flag & RX_VHT_FLAG_BF)
  31222. + *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED;
  31223. + pos++;
  31224. + /* bandwidth */
  31225. + if (status->vht_flag & RX_VHT_FLAG_80MHZ)
  31226. + *pos++ = 4;
  31227. + else if (status->vht_flag & RX_VHT_FLAG_80P80MHZ)
  31228. + *pos++ = 0; /* marked not known above */
  31229. + else if (status->vht_flag & RX_VHT_FLAG_160MHZ)
  31230. + *pos++ = 11;
  31231. + else if (status->flag & RX_FLAG_40MHZ)
  31232. + *pos++ = 1;
  31233. + else /* 20 MHz */
  31234. + *pos++ = 0;
  31235. + /* MCS/NSS */
  31236. + *pos = (status->rate_idx << 4) | status->vht_nss;
  31237. + pos += 4;
  31238. + /* coding field */
  31239. + if (status->flag & RX_FLAG_LDPC)
  31240. + *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0;
  31241. + pos++;
  31242. + /* group ID */
  31243. + pos++;
  31244. + /* partial_aid */
  31245. + pos += 2;
  31246. + }
  31247. +
  31248. + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) {
  31249. + *pos++ = status->chain_signal[chain];
  31250. + *pos++ = chain;
  31251. + }
  31252. +}
  31253. +
  31254. +/*
  31255. + * This function copies a received frame to all monitor interfaces and
  31256. + * returns a cleaned-up SKB that no longer includes the FCS nor the
  31257. + * radiotap header the driver might have added.
  31258. + */
  31259. +static struct sk_buff *
  31260. +ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
  31261. + struct ieee80211_rate *rate)
  31262. +{
  31263. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb);
  31264. + struct ieee80211_sub_if_data *sdata;
  31265. + int needed_headroom;
  31266. + struct sk_buff *skb, *skb2;
  31267. + struct net_device *prev_dev = NULL;
  31268. + int present_fcs_len = 0;
  31269. +
  31270. + /*
  31271. + * First, we may need to make a copy of the skb because
  31272. + * (1) we need to modify it for radiotap (if not present), and
  31273. + * (2) the other RX handlers will modify the skb we got.
  31274. + *
  31275. + * We don't need to, of course, if we aren't going to return
  31276. + * the SKB because it has a bad FCS/PLCP checksum.
  31277. + */
  31278. +
  31279. + if (local->hw.flags & IEEE80211_HW_RX_INCLUDES_FCS)
  31280. + present_fcs_len = FCS_LEN;
  31281. +
  31282. + /* ensure hdr->frame_control is in skb head */
  31283. + if (!pskb_may_pull(origskb, 2)) {
  31284. + dev_kfree_skb(origskb);
  31285. + return NULL;
  31286. + }
  31287. +
  31288. + if (!local->monitors) {
  31289. + if (should_drop_frame(origskb, present_fcs_len)) {
  31290. + dev_kfree_skb(origskb);
  31291. + return NULL;
  31292. + }
  31293. +
  31294. + return remove_monitor_info(local, origskb);
  31295. + }
  31296. +
  31297. + /* room for the radiotap header based on driver features */
  31298. + needed_headroom = ieee80211_rx_radiotap_space(local, status);
  31299. +
  31300. + if (should_drop_frame(origskb, present_fcs_len)) {
  31301. + /* only need to expand headroom if necessary */
  31302. + skb = origskb;
  31303. + origskb = NULL;
  31304. +
  31305. + /*
  31306. + * This shouldn't trigger often because most devices have an
  31307. + * RX header they pull before we get here, and that should
  31308. + * be big enough for our radiotap information. We should
  31309. + * probably export the length to drivers so that we can have
  31310. + * them allocate enough headroom to start with.
  31311. + */
  31312. + if (skb_headroom(skb) < needed_headroom &&
  31313. + pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) {
  31314. + dev_kfree_skb(skb);
  31315. + return NULL;
  31316. + }
  31317. + } else {
  31318. + /*
  31319. + * Need to make a copy and possibly remove radiotap header
  31320. + * and FCS from the original.
  31321. + */
  31322. + skb = skb_copy_expand(origskb, needed_headroom, 0, GFP_ATOMIC);
  31323. +
  31324. + origskb = remove_monitor_info(local, origskb);
  31325. +
  31326. + if (!skb)
  31327. + return origskb;
  31328. + }
  31329. +
  31330. + /* prepend radiotap information */
  31331. + ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom,
  31332. + true);
  31333. +
  31334. + skb_reset_mac_header(skb);
  31335. + skb->ip_summed = CHECKSUM_UNNECESSARY;
  31336. + skb->pkt_type = PACKET_OTHERHOST;
  31337. + skb->protocol = htons(ETH_P_802_2);
  31338. +
  31339. + list_for_each_entry_rcu(sdata, &local->interfaces, list) {
  31340. + if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
  31341. + continue;
  31342. +
  31343. + if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)
  31344. + continue;
  31345. +
  31346. + if (!ieee80211_sdata_running(sdata))
  31347. + continue;
  31348. +
  31349. + if (prev_dev) {
  31350. + skb2 = skb_clone(skb, GFP_ATOMIC);
  31351. + if (skb2) {
  31352. + skb2->dev = prev_dev;
  31353. + netif_receive_skb(skb2);
  31354. + }
  31355. + }
  31356. +
  31357. + prev_dev = sdata->dev;
  31358. + sdata->dev->stats.rx_packets++;
  31359. + sdata->dev->stats.rx_bytes += skb->len;
  31360. + }
  31361. +
  31362. + if (prev_dev) {
  31363. + skb->dev = prev_dev;
  31364. + netif_receive_skb(skb);
  31365. + } else
  31366. + dev_kfree_skb(skb);
  31367. +
  31368. + return origskb;
  31369. +}
  31370. +
  31371. +static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
  31372. +{
  31373. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
  31374. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  31375. + int tid, seqno_idx, security_idx;
  31376. +
  31377. + /* does the frame have a qos control field? */
  31378. + if (ieee80211_is_data_qos(hdr->frame_control)) {
  31379. + u8 *qc = ieee80211_get_qos_ctl(hdr);
  31380. + /* frame has qos control */
  31381. + tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
  31382. + if (*qc & IEEE80211_QOS_CTL_A_MSDU_PRESENT)
  31383. + status->rx_flags |= IEEE80211_RX_AMSDU;
  31384. +
  31385. + seqno_idx = tid;
  31386. + security_idx = tid;
  31387. + } else {
  31388. + /*
  31389. + * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"):
  31390. + *
  31391. + * Sequence numbers for management frames, QoS data
  31392. + * frames with a broadcast/multicast address in the
  31393. + * Address 1 field, and all non-QoS data frames sent
  31394. + * by QoS STAs are assigned using an additional single
  31395. + * modulo-4096 counter, [...]
  31396. + *
  31397. + * We also use that counter for non-QoS STAs.
  31398. + */
  31399. + seqno_idx = IEEE80211_NUM_TIDS;
  31400. + security_idx = 0;
  31401. + if (ieee80211_is_mgmt(hdr->frame_control))
  31402. + security_idx = IEEE80211_NUM_TIDS;
  31403. + tid = 0;
  31404. + }
  31405. +
  31406. + rx->seqno_idx = seqno_idx;
  31407. + rx->security_idx = security_idx;
  31408. + /* Set skb->priority to 1d tag if highest order bit of TID is not set.
  31409. + * For now, set skb->priority to 0 for other cases. */
  31410. + rx->skb->priority = (tid > 7) ? 0 : tid;
  31411. +}
  31412. +
  31413. +/**
  31414. + * DOC: Packet alignment
  31415. + *
  31416. + * Drivers always need to pass packets that are aligned to two-byte boundaries
  31417. + * to the stack.
  31418. + *
31419. + * Additionally, drivers should, if possible, align the payload data in a way that
  31420. + * guarantees that the contained IP header is aligned to a four-byte
  31421. + * boundary. In the case of regular frames, this simply means aligning the
  31422. + * payload to a four-byte boundary (because either the IP header is directly
  31423. + * contained, or IV/RFC1042 headers that have a length divisible by four are
  31424. + * in front of it). If the payload data is not properly aligned and the
  31425. + * architecture doesn't support efficient unaligned operations, mac80211
  31426. + * will align the data.
  31427. + *
  31428. + * With A-MSDU frames, however, the payload data address must yield two modulo
  31429. + * four because there are 14-byte 802.3 headers within the A-MSDU frames that
  31430. + * push the IP header further back to a multiple of four again. Thankfully, the
  31431. + * specs were sane enough this time around to require padding each A-MSDU
  31432. + * subframe to a length that is a multiple of four.
  31433. + *
  31434. + * Padding like Atheros hardware adds which is between the 802.11 header and
  31435. + * the payload is not supported, the driver is required to move the 802.11
  31436. + * header to be directly in front of the payload in that case.
  31437. + */
  31438. +static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx)
  31439. +{
  31440. +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
  31441. + WARN_ONCE((unsigned long)rx->skb->data & 1,
  31442. + "unaligned packet at 0x%p\n", rx->skb->data);
  31443. +#endif
  31444. +}
  31445. +
  31446. +
  31447. +/* rx handlers */
  31448. +
  31449. +static int ieee80211_is_unicast_robust_mgmt_frame(struct sk_buff *skb)
  31450. +{
  31451. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
  31452. +
  31453. + if (is_multicast_ether_addr(hdr->addr1))
  31454. + return 0;
  31455. +
  31456. + return ieee80211_is_robust_mgmt_frame(skb);
  31457. +}
  31458. +
  31459. +
  31460. +static int ieee80211_is_multicast_robust_mgmt_frame(struct sk_buff *skb)
  31461. +{
  31462. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
  31463. +
  31464. + if (!is_multicast_ether_addr(hdr->addr1))
  31465. + return 0;
  31466. +
  31467. + return ieee80211_is_robust_mgmt_frame(skb);
  31468. +}
  31469. +
  31470. +
  31471. +/* Get the BIP key index from MMIE; return -1 if this is not a BIP frame */
  31472. +static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
  31473. +{
  31474. + struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data;
  31475. + struct ieee80211_mmie *mmie;
  31476. +
  31477. + if (skb->len < 24 + sizeof(*mmie) || !is_multicast_ether_addr(hdr->da))
  31478. + return -1;
  31479. +
  31480. + if (!ieee80211_is_robust_mgmt_frame(skb))
  31481. + return -1; /* not a robust management frame */
  31482. +
  31483. + mmie = (struct ieee80211_mmie *)
  31484. + (skb->data + skb->len - sizeof(*mmie));
  31485. + if (mmie->element_id != WLAN_EID_MMIE ||
  31486. + mmie->length != sizeof(*mmie) - 2)
  31487. + return -1;
  31488. +
  31489. + return le16_to_cpu(mmie->key_id);
  31490. +}
  31491. +
  31492. +static int iwl80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs,
  31493. + struct sk_buff *skb)
  31494. +{
  31495. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
  31496. + __le16 fc;
  31497. + int hdrlen;
  31498. + u8 keyid;
  31499. +
  31500. + fc = hdr->frame_control;
  31501. + hdrlen = ieee80211_hdrlen(fc);
  31502. +
  31503. + if (skb->len < hdrlen + cs->hdr_len)
  31504. + return -EINVAL;
  31505. +
  31506. + skb_copy_bits(skb, hdrlen + cs->key_idx_off, &keyid, 1);
  31507. + keyid &= cs->key_idx_mask;
  31508. + keyid >>= cs->key_idx_shift;
  31509. +
  31510. + return keyid;
  31511. +}
  31512. +
  31513. +static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
  31514. +{
  31515. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
  31516. + char *dev_addr = rx->sdata->vif.addr;
  31517. +
  31518. + if (ieee80211_is_data(hdr->frame_control)) {
  31519. + if (is_multicast_ether_addr(hdr->addr1)) {
  31520. + if (ieee80211_has_tods(hdr->frame_control) ||
  31521. + !ieee80211_has_fromds(hdr->frame_control))
  31522. + return RX_DROP_MONITOR;
  31523. + if (ether_addr_equal(hdr->addr3, dev_addr))
  31524. + return RX_DROP_MONITOR;
  31525. + } else {
  31526. + if (!ieee80211_has_a4(hdr->frame_control))
  31527. + return RX_DROP_MONITOR;
  31528. + if (ether_addr_equal(hdr->addr4, dev_addr))
  31529. + return RX_DROP_MONITOR;
  31530. + }
  31531. + }
  31532. +
  31533. + /* If there is not an established peer link and this is not a peer link
31534. + * establishment frame, beacon or probe, drop the frame.
  31535. + */
  31536. +
  31537. + if (!rx->sta || sta_plink_state(rx->sta) != NL80211_PLINK_ESTAB) {
  31538. + struct ieee80211_mgmt *mgmt;
  31539. +
  31540. + if (!ieee80211_is_mgmt(hdr->frame_control))
  31541. + return RX_DROP_MONITOR;
  31542. +
  31543. + if (ieee80211_is_action(hdr->frame_control)) {
  31544. + u8 category;
  31545. +
  31546. + /* make sure category field is present */
  31547. + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE)
  31548. + return RX_DROP_MONITOR;
  31549. +
  31550. + mgmt = (struct ieee80211_mgmt *)hdr;
  31551. + category = mgmt->u.action.category;
  31552. + if (category != WLAN_CATEGORY_MESH_ACTION &&
  31553. + category != WLAN_CATEGORY_SELF_PROTECTED)
  31554. + return RX_DROP_MONITOR;
  31555. + return RX_CONTINUE;
  31556. + }
  31557. +
  31558. + if (ieee80211_is_probe_req(hdr->frame_control) ||
  31559. + ieee80211_is_probe_resp(hdr->frame_control) ||
  31560. + ieee80211_is_beacon(hdr->frame_control) ||
  31561. + ieee80211_is_auth(hdr->frame_control))
  31562. + return RX_CONTINUE;
  31563. +
  31564. + return RX_DROP_MONITOR;
  31565. + }
  31566. +
  31567. + return RX_CONTINUE;
  31568. +}
  31569. +
  31570. +static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
  31571. + struct tid_ampdu_rx *tid_agg_rx,
  31572. + int index,
  31573. + struct sk_buff_head *frames)
  31574. +{
  31575. + struct sk_buff_head *skb_list = &tid_agg_rx->reorder_buf[index];
  31576. + struct sk_buff *skb;
  31577. + struct ieee80211_rx_status *status;
  31578. +
  31579. + lockdep_assert_held(&tid_agg_rx->reorder_lock);
  31580. +
  31581. + if (skb_queue_empty(skb_list))
  31582. + goto no_frame;
  31583. +
  31584. + if (!ieee80211_rx_reorder_ready(skb_list)) {
  31585. + __skb_queue_purge(skb_list);
  31586. + goto no_frame;
  31587. + }
  31588. +
  31589. + /* release frames from the reorder ring buffer */
  31590. + tid_agg_rx->stored_mpdu_num--;
  31591. + while ((skb = __skb_dequeue(skb_list))) {
  31592. + status = IEEE80211_SKB_RXCB(skb);
  31593. + status->rx_flags |= IEEE80211_RX_DEFERRED_RELEASE;
  31594. + __skb_queue_tail(frames, skb);
  31595. + }
  31596. +
  31597. +no_frame:
  31598. + tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
  31599. +}
  31600. +
  31601. +static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata,
  31602. + struct tid_ampdu_rx *tid_agg_rx,
  31603. + u16 head_seq_num,
  31604. + struct sk_buff_head *frames)
  31605. +{
  31606. + int index;
  31607. +
  31608. + lockdep_assert_held(&tid_agg_rx->reorder_lock);
  31609. +
  31610. + while (ieee80211_sn_less(tid_agg_rx->head_seq_num, head_seq_num)) {
  31611. + index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
  31612. + ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
  31613. + frames);
  31614. + }
  31615. +}
  31616. +
  31617. +/*
  31618. + * Timeout (in jiffies) for skb's that are waiting in the RX reorder buffer. If
  31619. + * the skb was added to the buffer longer than this time ago, the earlier
  31620. + * frames that have not yet been received are assumed to be lost and the skb
  31621. + * can be released for processing. This may also release other skb's from the
  31622. + * reorder buffer if there are no additional gaps between the frames.
  31623. + *
  31624. + * Callers must hold tid_agg_rx->reorder_lock.
  31625. + */
  31626. +#define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10)
  31627. +
  31628. +static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
  31629. + struct tid_ampdu_rx *tid_agg_rx,
  31630. + struct sk_buff_head *frames)
  31631. +{
  31632. + int index, i, j;
  31633. +
  31634. + lockdep_assert_held(&tid_agg_rx->reorder_lock);
  31635. +
  31636. + /* release the buffer until next missing frame */
  31637. + index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
  31638. + if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) &&
  31639. + tid_agg_rx->stored_mpdu_num) {
  31640. + /*
  31641. + * No buffers ready to be released, but check whether any
  31642. + * frames in the reorder buffer have timed out.
  31643. + */
  31644. + int skipped = 1;
  31645. + for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
  31646. + j = (j + 1) % tid_agg_rx->buf_size) {
  31647. + if (!ieee80211_rx_reorder_ready(
  31648. + &tid_agg_rx->reorder_buf[j])) {
  31649. + skipped++;
  31650. + continue;
  31651. + }
  31652. + if (skipped &&
  31653. + !time_after(jiffies, tid_agg_rx->reorder_time[j] +
  31654. + HT_RX_REORDER_BUF_TIMEOUT))
  31655. + goto set_release_timer;
  31656. +
  31657. + /* don't leave incomplete A-MSDUs around */
  31658. + for (i = (index + 1) % tid_agg_rx->buf_size; i != j;
  31659. + i = (i + 1) % tid_agg_rx->buf_size)
  31660. + __skb_queue_purge(&tid_agg_rx->reorder_buf[i]);
  31661. +
  31662. + ht_dbg_ratelimited(sdata,
  31663. + "release an RX reorder frame due to timeout on earlier frames\n");
  31664. + ieee80211_release_reorder_frame(sdata, tid_agg_rx, j,
  31665. + frames);
  31666. +
  31667. + /*
  31668. + * Increment the head seq# also for the skipped slots.
  31669. + */
  31670. + tid_agg_rx->head_seq_num =
  31671. + (tid_agg_rx->head_seq_num +
  31672. + skipped) & IEEE80211_SN_MASK;
  31673. + skipped = 0;
  31674. + }
  31675. + } else while (ieee80211_rx_reorder_ready(
  31676. + &tid_agg_rx->reorder_buf[index])) {
  31677. + ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
  31678. + frames);
  31679. + index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
  31680. + }
  31681. +
  31682. + if (tid_agg_rx->stored_mpdu_num) {
  31683. + j = index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
  31684. +
  31685. + for (; j != (index - 1) % tid_agg_rx->buf_size;
  31686. + j = (j + 1) % tid_agg_rx->buf_size) {
  31687. + if (ieee80211_rx_reorder_ready(
  31688. + &tid_agg_rx->reorder_buf[j]))
  31689. + break;
  31690. + }
  31691. +
  31692. + set_release_timer:
  31693. +
  31694. + mod_timer(&tid_agg_rx->reorder_timer,
  31695. + tid_agg_rx->reorder_time[j] + 1 +
  31696. + HT_RX_REORDER_BUF_TIMEOUT);
  31697. + } else {
  31698. + del_timer(&tid_agg_rx->reorder_timer);
  31699. + }
  31700. +}
  31701. +
  31702. +/*
  31703. + * As this function belongs to the RX path it must be under
  31704. + * rcu_read_lock protection. It returns false if the frame
  31705. + * can be processed immediately, true if it was consumed.
  31706. + */
  31707. +static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata,
  31708. + struct tid_ampdu_rx *tid_agg_rx,
  31709. + struct sk_buff *skb,
  31710. + struct sk_buff_head *frames)
  31711. +{
  31712. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
  31713. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  31714. + u16 sc = le16_to_cpu(hdr->seq_ctrl);
  31715. + u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4;
  31716. + u16 head_seq_num, buf_size;
  31717. + int index;
  31718. + bool ret = true;
  31719. +
  31720. + spin_lock(&tid_agg_rx->reorder_lock);
  31721. +
  31722. + /*
  31723. + * Offloaded BA sessions have no known starting sequence number so pick
  31724. + * one from first Rxed frame for this tid after BA was started.
  31725. + */
  31726. + if (unlikely(tid_agg_rx->auto_seq)) {
  31727. + tid_agg_rx->auto_seq = false;
  31728. + tid_agg_rx->ssn = mpdu_seq_num;
  31729. + tid_agg_rx->head_seq_num = mpdu_seq_num;
  31730. + }
  31731. +
  31732. + buf_size = tid_agg_rx->buf_size;
  31733. + head_seq_num = tid_agg_rx->head_seq_num;
  31734. +
  31735. + /* frame with out of date sequence number */
  31736. + if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) {
  31737. + dev_kfree_skb(skb);
  31738. + goto out;
  31739. + }
  31740. +
  31741. + /*
31742. + * If the frame sequence number exceeds our buffering window
  31743. + * size release some previous frames to make room for this one.
  31744. + */
  31745. + if (!ieee80211_sn_less(mpdu_seq_num, head_seq_num + buf_size)) {
  31746. + head_seq_num = ieee80211_sn_inc(
  31747. + ieee80211_sn_sub(mpdu_seq_num, buf_size));
  31748. + /* release stored frames up to new head to stack */
  31749. + ieee80211_release_reorder_frames(sdata, tid_agg_rx,
  31750. + head_seq_num, frames);
  31751. + }
  31752. +
  31753. + /* Now the new frame is always in the range of the reordering buffer */
  31754. +
  31755. + index = mpdu_seq_num % tid_agg_rx->buf_size;
  31756. +
  31757. + /* check if we already stored this frame */
  31758. + if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) {
  31759. + dev_kfree_skb(skb);
  31760. + goto out;
  31761. + }
  31762. +
  31763. + /*
  31764. + * If the current MPDU is in the right order and nothing else
  31765. + * is stored we can process it directly, no need to buffer it.
  31766. + * If it is first but there's something stored, we may be able
  31767. + * to release frames after this one.
  31768. + */
  31769. + if (mpdu_seq_num == tid_agg_rx->head_seq_num &&
  31770. + tid_agg_rx->stored_mpdu_num == 0) {
  31771. + if (!(status->flag & RX_FLAG_AMSDU_MORE))
  31772. + tid_agg_rx->head_seq_num =
  31773. + ieee80211_sn_inc(tid_agg_rx->head_seq_num);
  31774. + ret = false;
  31775. + goto out;
  31776. + }
  31777. +
  31778. + /* put the frame in the reordering buffer */
  31779. + __skb_queue_tail(&tid_agg_rx->reorder_buf[index], skb);
  31780. + if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
  31781. + tid_agg_rx->reorder_time[index] = jiffies;
  31782. + tid_agg_rx->stored_mpdu_num++;
  31783. + ieee80211_sta_reorder_release(sdata, tid_agg_rx, frames);
  31784. + }
  31785. +
  31786. + out:
  31787. + spin_unlock(&tid_agg_rx->reorder_lock);
  31788. + return ret;
  31789. +}
  31790. +
  31791. +/*
  31792. + * Reorder MPDUs from A-MPDUs, keeping them on a buffer. Returns
  31793. + * true if the MPDU was buffered, false if it should be processed.
  31794. + */
  31795. +static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
  31796. + struct sk_buff_head *frames)
  31797. +{
  31798. + struct sk_buff *skb = rx->skb;
  31799. + struct ieee80211_local *local = rx->local;
  31800. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
  31801. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  31802. + struct sta_info *sta = rx->sta;
  31803. + struct tid_ampdu_rx *tid_agg_rx;
  31804. + u16 sc;
  31805. + u8 tid, ack_policy;
  31806. +
  31807. + if (!ieee80211_is_data_qos(hdr->frame_control) ||
  31808. + is_multicast_ether_addr(hdr->addr1))
  31809. + goto dont_reorder;
  31810. +
  31811. + /*
  31812. + * filter the QoS data rx stream according to
  31813. + * STA/TID and check if this STA/TID is on aggregation
  31814. + */
  31815. +
  31816. + if (!sta)
  31817. + goto dont_reorder;
  31818. +
  31819. + ack_policy = *ieee80211_get_qos_ctl(hdr) &
  31820. + IEEE80211_QOS_CTL_ACK_POLICY_MASK;
  31821. + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
  31822. +
  31823. + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
  31824. + if (!tid_agg_rx)
  31825. + goto dont_reorder;
  31826. +
  31827. + /* qos null data frames are excluded */
  31828. + if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC)))
  31829. + goto dont_reorder;
  31830. +
  31831. + /* not part of a BA session */
  31832. + if (ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK &&
  31833. + ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL)
  31834. + goto dont_reorder;
  31835. +
  31836. + /* not actually part of this BA session */
  31837. + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
  31838. + goto dont_reorder;
  31839. +
  31840. + /* new, potentially un-ordered, ampdu frame - process it */
  31841. +
  31842. + /* reset session timer */
  31843. + if (tid_agg_rx->timeout)
  31844. + tid_agg_rx->last_rx = jiffies;
  31845. +
  31846. + /* if this mpdu is fragmented - terminate rx aggregation session */
  31847. + sc = le16_to_cpu(hdr->seq_ctrl);
  31848. + if (sc & IEEE80211_SCTL_FRAG) {
  31849. + skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
  31850. + skb_queue_tail(&rx->sdata->skb_queue, skb);
  31851. + ieee80211_queue_work(&local->hw, &rx->sdata->work);
  31852. + return;
  31853. + }
  31854. +
  31855. + /*
  31856. + * No locking needed -- we will only ever process one
  31857. + * RX packet at a time, and thus own tid_agg_rx. All
  31858. + * other code manipulating it needs to (and does) make
  31859. + * sure that we cannot get to it any more before doing
  31860. + * anything with it.
  31861. + */
  31862. + if (ieee80211_sta_manage_reorder_buf(rx->sdata, tid_agg_rx, skb,
  31863. + frames))
  31864. + return;
  31865. +
  31866. + dont_reorder:
  31867. + __skb_queue_tail(frames, skb);
  31868. +}
  31869. +
  31870. +static ieee80211_rx_result debug_noinline
  31871. +ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
  31872. +{
  31873. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
  31874. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  31875. +
  31876. + /*
  31877. + * Drop duplicate 802.11 retransmissions
  31878. + * (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery")
  31879. + */
  31880. + if (rx->skb->len >= 24 && rx->sta &&
  31881. + !ieee80211_is_ctl(hdr->frame_control) &&
  31882. + !ieee80211_is_qos_nullfunc(hdr->frame_control) &&
  31883. + !is_multicast_ether_addr(hdr->addr1)) {
  31884. + if (unlikely(ieee80211_has_retry(hdr->frame_control) &&
  31885. + rx->sta->last_seq_ctrl[rx->seqno_idx] ==
  31886. + hdr->seq_ctrl)) {
  31887. + if (status->rx_flags & IEEE80211_RX_RA_MATCH) {
  31888. + rx->local->dot11FrameDuplicateCount++;
  31889. + rx->sta->num_duplicates++;
  31890. + }
  31891. + return RX_DROP_UNUSABLE;
  31892. + } else if (!(status->flag & RX_FLAG_AMSDU_MORE)) {
  31893. + rx->sta->last_seq_ctrl[rx->seqno_idx] = hdr->seq_ctrl;
  31894. + }
  31895. + }
  31896. +
  31897. + if (unlikely(rx->skb->len < 16)) {
  31898. + I802_DEBUG_INC(rx->local->rx_handlers_drop_short);
  31899. + return RX_DROP_MONITOR;
  31900. + }
  31901. +
  31902. + /* Drop disallowed frame classes based on STA auth/assoc state;
  31903. + * IEEE 802.11, Chap 5.5.
  31904. + *
  31905. + * mac80211 filters only based on association state, i.e. it drops
  31906. + * Class 3 frames from not associated stations. hostapd sends
  31907. + * deauth/disassoc frames when needed. In addition, hostapd is
  31908. + * responsible for filtering on both auth and assoc states.
  31909. + */
  31910. +
  31911. + if (ieee80211_vif_is_mesh(&rx->sdata->vif))
  31912. + return ieee80211_rx_mesh_check(rx);
  31913. +
  31914. + if (unlikely((ieee80211_is_data(hdr->frame_control) ||
  31915. + ieee80211_is_pspoll(hdr->frame_control)) &&
  31916. + rx->sdata->vif.type != NL80211_IFTYPE_ADHOC &&
  31917. + rx->sdata->vif.type != NL80211_IFTYPE_WDS &&
  31918. + (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))) {
  31919. + /*
  31920. + * accept port control frames from the AP even when it's not
  31921. + * yet marked ASSOC to prevent a race where we don't set the
  31922. + * assoc bit quickly enough before it sends the first frame
  31923. + */
  31924. + if (rx->sta && rx->sdata->vif.type == NL80211_IFTYPE_STATION &&
  31925. + ieee80211_is_data_present(hdr->frame_control)) {
  31926. + unsigned int hdrlen;
  31927. + __be16 ethertype;
  31928. +
  31929. + hdrlen = ieee80211_hdrlen(hdr->frame_control);
  31930. +
  31931. + if (rx->skb->len < hdrlen + 8)
  31932. + return RX_DROP_MONITOR;
  31933. +
  31934. + skb_copy_bits(rx->skb, hdrlen + 6, &ethertype, 2);
  31935. + if (ethertype == rx->sdata->control_port_protocol)
  31936. + return RX_CONTINUE;
  31937. + }
  31938. +
  31939. + if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
  31940. + cfg80211_rx_spurious_frame(rx->sdata->dev,
  31941. + hdr->addr2,
  31942. + GFP_ATOMIC))
  31943. + return RX_DROP_UNUSABLE;
  31944. +
  31945. + return RX_DROP_MONITOR;
  31946. + }
  31947. +
  31948. + return RX_CONTINUE;
  31949. +}
  31950. +
  31951. +
  31952. +static ieee80211_rx_result debug_noinline
  31953. +ieee80211_rx_h_check_more_data(struct ieee80211_rx_data *rx)
  31954. +{
  31955. + struct ieee80211_local *local;
  31956. + struct ieee80211_hdr *hdr;
  31957. + struct sk_buff *skb;
  31958. +
  31959. + local = rx->local;
  31960. + skb = rx->skb;
  31961. + hdr = (struct ieee80211_hdr *) skb->data;
  31962. +
  31963. + if (!local->pspolling)
  31964. + return RX_CONTINUE;
  31965. +
  31966. + if (!ieee80211_has_fromds(hdr->frame_control))
  31967. + /* this is not from AP */
  31968. + return RX_CONTINUE;
  31969. +
  31970. + if (!ieee80211_is_data(hdr->frame_control))
  31971. + return RX_CONTINUE;
  31972. +
  31973. + if (!ieee80211_has_moredata(hdr->frame_control)) {
  31974. + /* AP has no more frames buffered for us */
  31975. + local->pspolling = false;
  31976. + return RX_CONTINUE;
  31977. + }
  31978. +
  31979. + /* more data bit is set, let's request a new frame from the AP */
  31980. + ieee80211_send_pspoll(local, rx->sdata);
  31981. +
  31982. + return RX_CONTINUE;
  31983. +}
  31984. +
  31985. +static void sta_ps_start(struct sta_info *sta)
  31986. +{
  31987. + struct ieee80211_sub_if_data *sdata = sta->sdata;
  31988. + struct ieee80211_local *local = sdata->local;
  31989. + struct ps_data *ps;
  31990. +
  31991. + if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
  31992. + sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
  31993. + ps = &sdata->bss->ps;
  31994. + else
  31995. + return;
  31996. +
  31997. + atomic_inc(&ps->num_sta_ps);
  31998. + set_sta_flag(sta, WLAN_STA_PS_STA);
  31999. + if (!(local->hw.flags & IEEE80211_HW_AP_LINK_PS))
  32000. + drv_sta_notify(local, sdata, STA_NOTIFY_SLEEP, &sta->sta);
  32001. + ps_dbg(sdata, "STA %pM aid %d enters power save mode\n",
  32002. + sta->sta.addr, sta->sta.aid);
  32003. +}
  32004. +
  32005. +static void sta_ps_end(struct sta_info *sta)
  32006. +{
  32007. + ps_dbg(sta->sdata, "STA %pM aid %d exits power save mode\n",
  32008. + sta->sta.addr, sta->sta.aid);
  32009. +
  32010. + if (test_sta_flag(sta, WLAN_STA_PS_DRIVER)) {
  32011. + /*
  32012. + * Clear the flag only if the other one is still set
  32013. + * so that the TX path won't start TX'ing new frames
32014. + * directly ... In case the driver flag isn't set,
32015. + * ieee80211_sta_ps_deliver_wakeup() will clear it.
  32016. + */
  32017. + clear_sta_flag(sta, WLAN_STA_PS_STA);
  32018. + ps_dbg(sta->sdata, "STA %pM aid %d driver-ps-blocked\n",
  32019. + sta->sta.addr, sta->sta.aid);
  32020. + return;
  32021. + }
  32022. +
  32023. + set_sta_flag(sta, WLAN_STA_PS_DELIVER);
  32024. + clear_sta_flag(sta, WLAN_STA_PS_STA);
  32025. + ieee80211_sta_ps_deliver_wakeup(sta);
  32026. +}
  32027. +
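+ /*
+ * Entry point for drivers that handle AP-side link powersave
+ * themselves (IEEE80211_HW_AP_LINK_PS): they report each station's
+ * PS transitions here instead of mac80211 acting on the PM bit of
+ * received frames.
+ */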
  32028. +int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start)
  32029. +{
  32030. + struct sta_info *sta_inf = container_of(sta, struct sta_info, sta);
  32031. + bool in_ps;
  32032. +
  32033. + WARN_ON(!(sta_inf->local->hw.flags & IEEE80211_HW_AP_LINK_PS));
  32034. +
  32035. + /* Don't let the same PS state be set twice */
  32036. + in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA);
  32037. + if ((start && in_ps) || (!start && !in_ps))
  32038. + return -EINVAL;
  32039. +
  32040. + if (start)
  32041. + sta_ps_start(sta_inf);
  32042. + else
  32043. + sta_ps_end(sta_inf);
  32044. +
  32045. + return 0;
  32046. +}
  32047. +EXPORT_SYMBOL(ieee80211_sta_ps_transition);
  32048. +
  32049. +static ieee80211_rx_result debug_noinline
  32050. +ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
  32051. +{
  32052. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  32053. + struct ieee80211_hdr *hdr = (void *)rx->skb->data;
  32054. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  32055. + int tid, ac;
  32056. +
  32057. + if (!rx->sta || !(status->rx_flags & IEEE80211_RX_RA_MATCH))
  32058. + return RX_CONTINUE;
  32059. +
  32060. + if (sdata->vif.type != NL80211_IFTYPE_AP &&
  32061. + sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
  32062. + return RX_CONTINUE;
  32063. +
  32064. + /*
  32065. + * The device handles station powersave, so don't do anything about
32066. + * uAPSD and PS-Poll frames (the latter shouldn't even be passed up
32067. + * to mac80211, since the device already handles them.)
  32068. + */
  32069. + if (sdata->local->hw.flags & IEEE80211_HW_AP_LINK_PS)
  32070. + return RX_CONTINUE;
  32071. +
  32072. + /*
  32073. + * Don't do anything if the station isn't already asleep. In
  32074. + * the uAPSD case, the station will probably be marked asleep,
  32075. + * in the PS-Poll case the station must be confused ...
  32076. + */
  32077. + if (!test_sta_flag(rx->sta, WLAN_STA_PS_STA))
  32078. + return RX_CONTINUE;
  32079. +
  32080. + if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) {
  32081. + if (!test_sta_flag(rx->sta, WLAN_STA_SP)) {
  32082. + if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER))
  32083. + ieee80211_sta_ps_deliver_poll_response(rx->sta);
  32084. + else
  32085. + set_sta_flag(rx->sta, WLAN_STA_PSPOLL);
  32086. + }
  32087. +
  32088. + /* Free PS Poll skb here instead of returning RX_DROP that would
32089. + * count as a dropped frame. */
  32090. + dev_kfree_skb(rx->skb);
  32091. +
  32092. + return RX_QUEUED;
  32093. + } else if (!ieee80211_has_morefrags(hdr->frame_control) &&
  32094. + !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
  32095. + ieee80211_has_pm(hdr->frame_control) &&
  32096. + (ieee80211_is_data_qos(hdr->frame_control) ||
  32097. + ieee80211_is_qos_nullfunc(hdr->frame_control))) {
  32098. + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
  32099. + ac = ieee802_1d_to_ac[tid & 7];
  32100. +
  32101. + /*
  32102. + * If this AC is not trigger-enabled do nothing.
  32103. + *
  32104. + * NB: This could/should check a separate bitmap of trigger-
  32105. + * enabled queues, but for now we only implement uAPSD w/o
  32106. + * TSPEC changes to the ACs, so they're always the same.
  32107. + */
  32108. + if (!(rx->sta->sta.uapsd_queues & BIT(ac)))
  32109. + return RX_CONTINUE;
  32110. +
  32111. + /* if we are in a service period, do nothing */
  32112. + if (test_sta_flag(rx->sta, WLAN_STA_SP))
  32113. + return RX_CONTINUE;
  32114. +
  32115. + if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER))
  32116. + ieee80211_sta_ps_deliver_uapsd(rx->sta);
  32117. + else
  32118. + set_sta_flag(rx->sta, WLAN_STA_UAPSD);
  32119. + }
  32120. +
  32121. + return RX_CONTINUE;
  32122. +}
  32123. +
  32124. +static ieee80211_rx_result debug_noinline
  32125. +ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
  32126. +{
  32127. + struct sta_info *sta = rx->sta;
  32128. + struct sk_buff *skb = rx->skb;
  32129. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  32130. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
  32131. + int i;
  32132. +
  32133. + if (!sta)
  32134. + return RX_CONTINUE;
  32135. +
  32136. + /*
  32137. + * Update last_rx only for IBSS packets which are for the current
  32138. + * BSSID and for station already AUTHORIZED to avoid keeping the
  32139. + * current IBSS network alive in cases where other STAs start
  32140. + * using different BSSID. This will also give the station another
  32141. + * chance to restart the authentication/authorization in case
  32142. + * something went wrong the first time.
  32143. + */
  32144. + if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
  32145. + u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
  32146. + NL80211_IFTYPE_ADHOC);
  32147. + if (ether_addr_equal(bssid, rx->sdata->u.ibss.bssid) &&
  32148. + test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
  32149. + sta->last_rx = jiffies;
  32150. + if (ieee80211_is_data(hdr->frame_control) &&
  32151. + !is_multicast_ether_addr(hdr->addr1)) {
  32152. + sta->last_rx_rate_idx = status->rate_idx;
  32153. + sta->last_rx_rate_flag = status->flag;
  32154. + sta->last_rx_rate_vht_flag = status->vht_flag;
  32155. + sta->last_rx_rate_vht_nss = status->vht_nss;
  32156. + }
  32157. + }
  32158. + } else if (!is_multicast_ether_addr(hdr->addr1)) {
  32159. + /*
32160. + * Mesh beacons will update last_rx if they are found to
  32161. + * match the current local configuration when processed.
  32162. + */
  32163. + sta->last_rx = jiffies;
  32164. + if (ieee80211_is_data(hdr->frame_control)) {
  32165. + sta->last_rx_rate_idx = status->rate_idx;
  32166. + sta->last_rx_rate_flag = status->flag;
  32167. + sta->last_rx_rate_vht_flag = status->vht_flag;
  32168. + sta->last_rx_rate_vht_nss = status->vht_nss;
  32169. + }
  32170. + }
  32171. +
  32172. + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
  32173. + return RX_CONTINUE;
  32174. +
  32175. + if (rx->sdata->vif.type == NL80211_IFTYPE_STATION)
  32176. + ieee80211_sta_rx_notify(rx->sdata, hdr);
  32177. +
  32178. + sta->rx_fragments++;
  32179. + sta->rx_bytes += rx->skb->len;
  32180. + if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
  32181. + sta->last_signal = status->signal;
  32182. + ewma_add(&sta->avg_signal, -status->signal);
  32183. + }
  32184. +
  32185. + if (status->chains) {
  32186. + sta->chains = status->chains;
  32187. + for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) {
  32188. + int signal = status->chain_signal[i];
  32189. +
  32190. + if (!(status->chains & BIT(i)))
  32191. + continue;
  32192. +
  32193. + sta->chain_signal_last[i] = signal;
  32194. + ewma_add(&sta->chain_signal_avg[i], -signal);
  32195. + }
  32196. + }
  32197. +
  32198. + /*
  32199. + * Change STA power saving mode only at the end of a frame
  32200. + * exchange sequence.
  32201. + */
  32202. + if (!(sta->local->hw.flags & IEEE80211_HW_AP_LINK_PS) &&
  32203. + !ieee80211_has_morefrags(hdr->frame_control) &&
  32204. + !(status->rx_flags & IEEE80211_RX_DEFERRED_RELEASE) &&
  32205. + (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
  32206. + rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
  32207. + /* PM bit is only checked in frames where it isn't reserved,
  32208. + * in AP mode it's reserved in non-bufferable management frames
  32209. + * (cf. IEEE 802.11-2012 8.2.4.1.7 Power Management field)
  32210. + */
  32211. + (!ieee80211_is_mgmt(hdr->frame_control) ||
  32212. + ieee80211_is_bufferable_mmpdu(hdr->frame_control))) {
  32213. + if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
  32214. + if (!ieee80211_has_pm(hdr->frame_control))
  32215. + sta_ps_end(sta);
  32216. + } else {
  32217. + if (ieee80211_has_pm(hdr->frame_control))
  32218. + sta_ps_start(sta);
  32219. + }
  32220. + }
  32221. +
  32222. + /* mesh power save support */
  32223. + if (ieee80211_vif_is_mesh(&rx->sdata->vif))
  32224. + ieee80211_mps_rx_h_sta_process(sta, hdr);
  32225. +
  32226. + /*
  32227. + * Drop (qos-)data::nullfunc frames silently, since they
  32228. + * are used only to control station power saving mode.
  32229. + */
  32230. + if (ieee80211_is_nullfunc(hdr->frame_control) ||
  32231. + ieee80211_is_qos_nullfunc(hdr->frame_control)) {
  32232. + I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc);
  32233. +
  32234. + /*
  32235. + * If we receive a 4-addr nullfunc frame from a STA
32236. + * that was not moved to a 4-addr STA vlan yet, send
32237. + * the event to userspace; for older hostapd, also drop
32238. + * the frame to the monitor interface.
  32239. + */
  32240. + if (ieee80211_has_a4(hdr->frame_control) &&
  32241. + (rx->sdata->vif.type == NL80211_IFTYPE_AP ||
  32242. + (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
  32243. + !rx->sdata->u.vlan.sta))) {
  32244. + if (!test_and_set_sta_flag(sta, WLAN_STA_4ADDR_EVENT))
  32245. + cfg80211_rx_unexpected_4addr_frame(
  32246. + rx->sdata->dev, sta->sta.addr,
  32247. + GFP_ATOMIC);
  32248. + return RX_DROP_MONITOR;
  32249. + }
  32250. + /*
  32251. + * Update counter and free packet here to avoid
32252. + * counting this as a dropped packet.
  32253. + */
  32254. + sta->rx_packets++;
  32255. + dev_kfree_skb(rx->skb);
  32256. + return RX_QUEUED;
  32257. + }
  32258. +
  32259. + return RX_CONTINUE;
  32260. +} /* ieee80211_rx_h_sta_process */
  32261. +
  32262. +static ieee80211_rx_result debug_noinline
  32263. +ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
  32264. +{
  32265. + struct sk_buff *skb = rx->skb;
  32266. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  32267. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
  32268. + int keyidx;
  32269. + int hdrlen;
  32270. + ieee80211_rx_result result = RX_DROP_UNUSABLE;
  32271. + struct ieee80211_key *sta_ptk = NULL;
  32272. + int mmie_keyidx = -1;
  32273. + __le16 fc;
  32274. + const struct ieee80211_cipher_scheme *cs = NULL;
  32275. +
  32276. + /*
  32277. + * Key selection 101
  32278. + *
  32279. + * There are four types of keys:
  32280. + * - GTK (group keys)
  32281. + * - IGTK (group keys for management frames)
  32282. + * - PTK (pairwise keys)
  32283. + * - STK (station-to-station pairwise keys)
  32284. + *
  32285. + * When selecting a key, we have to distinguish between multicast
  32286. + * (including broadcast) and unicast frames, the latter can only
  32287. + * use PTKs and STKs while the former always use GTKs and IGTKs.
  32288. + * Unless, of course, actual WEP keys ("pre-RSNA") are used, then
  32289. + * unicast frames can also use key indices like GTKs. Hence, if we
  32290. + * don't have a PTK/STK we check the key index for a WEP key.
  32291. + *
  32292. + * Note that in a regular BSS, multicast frames are sent by the
32293. + * AP only; associated stations unicast the frame to the AP first,
  32294. + * which then multicasts it on their behalf.
  32295. + *
  32296. + * There is also a slight problem in IBSS mode: GTKs are negotiated
32297. + * with each station, which is something we don't currently handle.
  32298. + * The spec seems to expect that one negotiates the same key with
  32299. + * every station but there's no such requirement; VLANs could be
  32300. + * possible.
  32301. + */
  32302. +
  32303. + /*
  32304. + * No point in finding a key and decrypting if the frame is neither
  32305. + * addressed to us nor a multicast frame.
  32306. + */
  32307. + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
  32308. + return RX_CONTINUE;
  32309. +
  32310. + /* start without a key */
  32311. + rx->key = NULL;
  32312. + fc = hdr->frame_control;
  32313. +
  32314. + if (rx->sta) {
  32315. + int keyid = rx->sta->ptk_idx;
  32316. +
  32317. + if (ieee80211_has_protected(fc) && rx->sta->cipher_scheme) {
  32318. + cs = rx->sta->cipher_scheme;
  32319. + keyid = iwl80211_get_cs_keyid(cs, rx->skb);
  32320. + if (unlikely(keyid < 0))
  32321. + return RX_DROP_UNUSABLE;
  32322. + }
  32323. + sta_ptk = rcu_dereference(rx->sta->ptk[keyid]);
  32324. + }
  32325. +
  32326. + if (!ieee80211_has_protected(fc))
  32327. + mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb);
  32328. +
  32329. + if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) {
  32330. + rx->key = sta_ptk;
  32331. + if ((status->flag & RX_FLAG_DECRYPTED) &&
  32332. + (status->flag & RX_FLAG_IV_STRIPPED))
  32333. + return RX_CONTINUE;
  32334. + /* Skip decryption if the frame is not protected. */
  32335. + if (!ieee80211_has_protected(fc))
  32336. + return RX_CONTINUE;
  32337. + } else if (mmie_keyidx >= 0) {
  32338. + /* Broadcast/multicast robust management frame / BIP */
  32339. + if ((status->flag & RX_FLAG_DECRYPTED) &&
  32340. + (status->flag & RX_FLAG_IV_STRIPPED))
  32341. + return RX_CONTINUE;
  32342. +
  32343. + if (mmie_keyidx < NUM_DEFAULT_KEYS ||
  32344. + mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
  32345. + return RX_DROP_MONITOR; /* unexpected BIP keyidx */
  32346. + if (rx->sta)
  32347. + rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]);
  32348. + if (!rx->key)
  32349. + rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]);
  32350. + } else if (!ieee80211_has_protected(fc)) {
  32351. + /*
  32352. + * The frame was not protected, so skip decryption. However, we
  32353. + * need to set rx->key if there is a key that could have been
  32354. + * used so that the frame may be dropped if encryption would
  32355. + * have been expected.
  32356. + */
  32357. + struct ieee80211_key *key = NULL;
  32358. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  32359. + int i;
  32360. +
  32361. + if (ieee80211_is_mgmt(fc) &&
  32362. + is_multicast_ether_addr(hdr->addr1) &&
  32363. + (key = rcu_dereference(rx->sdata->default_mgmt_key)))
  32364. + rx->key = key;
  32365. + else {
  32366. + if (rx->sta) {
  32367. + for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
  32368. + key = rcu_dereference(rx->sta->gtk[i]);
  32369. + if (key)
  32370. + break;
  32371. + }
  32372. + }
  32373. + if (!key) {
  32374. + for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
  32375. + key = rcu_dereference(sdata->keys[i]);
  32376. + if (key)
  32377. + break;
  32378. + }
  32379. + }
  32380. + if (key)
  32381. + rx->key = key;
  32382. + }
  32383. + return RX_CONTINUE;
  32384. + } else {
  32385. + u8 keyid;
  32386. +
  32387. + /*
  32388. + * The device doesn't give us the IV so we won't be
  32389. + * able to look up the key. That's ok though, we
  32390. + * don't need to decrypt the frame, we just won't
  32391. + * be able to keep statistics accurate.
  32392. + * Except for key threshold notifications, should
  32393. + * we somehow allow the driver to tell us which key
  32394. + * the hardware used if this flag is set?
  32395. + */
  32396. + if ((status->flag & RX_FLAG_DECRYPTED) &&
  32397. + (status->flag & RX_FLAG_IV_STRIPPED))
  32398. + return RX_CONTINUE;
  32399. +
  32400. + hdrlen = ieee80211_hdrlen(fc);
  32401. +
  32402. + if (cs) {
  32403. + keyidx = iwl80211_get_cs_keyid(cs, rx->skb);
  32404. +
  32405. + if (unlikely(keyidx < 0))
  32406. + return RX_DROP_UNUSABLE;
  32407. + } else {
  32408. + if (rx->skb->len < 8 + hdrlen)
  32409. + return RX_DROP_UNUSABLE; /* TODO: count this? */
  32410. + /*
  32411. + * no need to call ieee80211_wep_get_keyidx,
  32412. + * it verifies a bunch of things we've done already
  32413. + */
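+ /* the Key ID octet follows the 3-byte IV; bits 6-7 hold the index */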
  32414. + skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1);
  32415. + keyidx = keyid >> 6;
  32416. + }
  32417. +
  32418. + /* check per-station GTK first, if multicast packet */
  32419. + if (is_multicast_ether_addr(hdr->addr1) && rx->sta)
  32420. + rx->key = rcu_dereference(rx->sta->gtk[keyidx]);
  32421. +
  32422. + /* if not found, try default key */
  32423. + if (!rx->key) {
  32424. + rx->key = rcu_dereference(rx->sdata->keys[keyidx]);
  32425. +
  32426. + /*
  32427. + * RSNA-protected unicast frames should always be
  32428. + * sent with pairwise or station-to-station keys,
  32429. + * but for WEP we allow using a key index as well.
  32430. + */
  32431. + if (rx->key &&
  32432. + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 &&
  32433. + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 &&
  32434. + !is_multicast_ether_addr(hdr->addr1))
  32435. + rx->key = NULL;
  32436. + }
  32437. + }
  32438. +
  32439. + if (rx->key) {
  32440. + if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
  32441. + return RX_DROP_MONITOR;
  32442. +
  32443. + rx->key->tx_rx_count++;
  32444. + /* TODO: add threshold stuff again */
  32445. + } else {
  32446. + return RX_DROP_MONITOR;
  32447. + }
  32448. +
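+ /*
+ * ciphers mac80211 does not implement itself (driver cipher schemes)
+ * fall through to the default case and must already have been
+ * decrypted by the hardware
+ */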
  32449. + switch (rx->key->conf.cipher) {
  32450. + case WLAN_CIPHER_SUITE_WEP40:
  32451. + case WLAN_CIPHER_SUITE_WEP104:
  32452. + result = ieee80211_crypto_wep_decrypt(rx);
  32453. + break;
  32454. + case WLAN_CIPHER_SUITE_TKIP:
  32455. + result = ieee80211_crypto_tkip_decrypt(rx);
  32456. + break;
  32457. + case WLAN_CIPHER_SUITE_CCMP:
  32458. + result = ieee80211_crypto_ccmp_decrypt(rx);
  32459. + break;
  32460. + case WLAN_CIPHER_SUITE_AES_CMAC:
  32461. + result = ieee80211_crypto_aes_cmac_decrypt(rx);
  32462. + break;
  32463. + default:
  32464. + result = ieee80211_crypto_hw_decrypt(rx);
  32465. + }
  32466. +
  32467. + /* the hdr variable is invalid after the decrypt handlers */
  32468. +
  32469. + /* either the frame has been decrypted or will be dropped */
  32470. + status->flag |= RX_FLAG_DECRYPTED;
  32471. +
  32472. + return result;
  32473. +}
  32474. +
  32475. +static inline struct ieee80211_fragment_entry *
  32476. +ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
  32477. + unsigned int frag, unsigned int seq, int rx_queue,
  32478. + struct sk_buff **skb)
  32479. +{
  32480. + struct ieee80211_fragment_entry *entry;
  32481. +
  32482. + entry = &sdata->fragments[sdata->fragment_next++];
  32483. + if (sdata->fragment_next >= IEEE80211_FRAGMENT_MAX)
  32484. + sdata->fragment_next = 0;
  32485. +
  32486. + if (!skb_queue_empty(&entry->skb_list))
  32487. + __skb_queue_purge(&entry->skb_list);
  32488. +
  32489. + __skb_queue_tail(&entry->skb_list, *skb); /* no need for locking */
  32490. + *skb = NULL;
  32491. + entry->first_frag_time = jiffies;
  32492. + entry->seq = seq;
  32493. + entry->rx_queue = rx_queue;
  32494. + entry->last_frag = frag;
  32495. + entry->ccmp = 0;
  32496. + entry->extra_len = 0;
  32497. +
  32498. + return entry;
  32499. +}
  32500. +
  32501. +static inline struct ieee80211_fragment_entry *
  32502. +ieee80211_reassemble_find(struct ieee80211_sub_if_data *sdata,
  32503. + unsigned int frag, unsigned int seq,
  32504. + int rx_queue, struct ieee80211_hdr *hdr)
  32505. +{
  32506. + struct ieee80211_fragment_entry *entry;
  32507. + int i, idx;
  32508. +
  32509. + idx = sdata->fragment_next;
  32510. + for (i = 0; i < IEEE80211_FRAGMENT_MAX; i++) {
  32511. + struct ieee80211_hdr *f_hdr;
  32512. +
  32513. + idx--;
  32514. + if (idx < 0)
  32515. + idx = IEEE80211_FRAGMENT_MAX - 1;
  32516. +
  32517. + entry = &sdata->fragments[idx];
  32518. + if (skb_queue_empty(&entry->skb_list) || entry->seq != seq ||
  32519. + entry->rx_queue != rx_queue ||
  32520. + entry->last_frag + 1 != frag)
  32521. + continue;
  32522. +
  32523. + f_hdr = (struct ieee80211_hdr *)entry->skb_list.next->data;
  32524. +
  32525. + /*
  32526. + * Check ftype and addresses are equal, else check next fragment
  32527. + */
  32528. + if (((hdr->frame_control ^ f_hdr->frame_control) &
  32529. + cpu_to_le16(IEEE80211_FCTL_FTYPE)) ||
  32530. + !ether_addr_equal(hdr->addr1, f_hdr->addr1) ||
  32531. + !ether_addr_equal(hdr->addr2, f_hdr->addr2))
  32532. + continue;
  32533. +
  32534. + if (time_after(jiffies, entry->first_frag_time + 2 * HZ)) {
  32535. + __skb_queue_purge(&entry->skb_list);
  32536. + continue;
  32537. + }
  32538. + return entry;
  32539. + }
  32540. +
  32541. + return NULL;
  32542. +}
  32543. +
  32544. +static ieee80211_rx_result debug_noinline
  32545. +ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
  32546. +{
  32547. + struct ieee80211_hdr *hdr;
  32548. + u16 sc;
  32549. + __le16 fc;
  32550. + unsigned int frag, seq;
  32551. + struct ieee80211_fragment_entry *entry;
  32552. + struct sk_buff *skb;
  32553. + struct ieee80211_rx_status *status;
  32554. +
  32555. + hdr = (struct ieee80211_hdr *)rx->skb->data;
  32556. + fc = hdr->frame_control;
  32557. +
  32558. + if (ieee80211_is_ctl(fc))
  32559. + return RX_CONTINUE;
  32560. +
  32561. + sc = le16_to_cpu(hdr->seq_ctrl);
  32562. + frag = sc & IEEE80211_SCTL_FRAG;
  32563. +
  32564. + if (is_multicast_ether_addr(hdr->addr1)) {
  32565. + rx->local->dot11MulticastReceivedFrameCount++;
  32566. + goto out_no_led;
  32567. + }
  32568. +
  32569. + if (likely(!ieee80211_has_morefrags(fc) && frag == 0))
  32570. + goto out;
  32571. +
  32572. + I802_DEBUG_INC(rx->local->rx_handlers_fragments);
  32573. +
  32574. + if (skb_linearize(rx->skb))
  32575. + return RX_DROP_UNUSABLE;
  32576. +
  32577. + /*
  32578. + * skb_linearize() might change the skb->data and
  32579. + * previously cached variables (in this case, hdr) need to
  32580. + * be refreshed with the new data.
  32581. + */
  32582. + hdr = (struct ieee80211_hdr *)rx->skb->data;
  32583. + seq = (sc & IEEE80211_SCTL_SEQ) >> 4;
  32584. +
  32585. + if (frag == 0) {
  32586. + /* This is the first fragment of a new frame. */
  32587. + entry = ieee80211_reassemble_add(rx->sdata, frag, seq,
  32588. + rx->seqno_idx, &(rx->skb));
  32589. + if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP &&
  32590. + ieee80211_has_protected(fc)) {
  32591. + int queue = rx->security_idx;
  32592. + /* Store CCMP PN so that we can verify that the next
  32593. + * fragment has a sequential PN value. */
  32594. + entry->ccmp = 1;
  32595. + memcpy(entry->last_pn,
  32596. + rx->key->u.ccmp.rx_pn[queue],
  32597. + IEEE80211_CCMP_PN_LEN);
  32598. + }
  32599. + return RX_QUEUED;
  32600. + }
  32601. +
  32602. + /* This is a fragment for a frame that should already be pending in
  32603. + * fragment cache. Add this fragment to the end of the pending entry.
  32604. + */
  32605. + entry = ieee80211_reassemble_find(rx->sdata, frag, seq,
  32606. + rx->seqno_idx, hdr);
  32607. + if (!entry) {
  32608. + I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
  32609. + return RX_DROP_MONITOR;
  32610. + }
  32611. +
  32612. + /* Verify that MPDUs within one MSDU have sequential PN values.
  32613. + * (IEEE 802.11i, 8.3.3.4.5) */
  32614. + if (entry->ccmp) {
  32615. + int i;
  32616. + u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
  32617. + int queue;
  32618. + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP)
  32619. + return RX_DROP_UNUSABLE;
  32620. + memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
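+ /* increment the stored PN; pn[0] is the most significant octet */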
  32621. + for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
  32622. + pn[i]++;
  32623. + if (pn[i])
  32624. + break;
  32625. + }
  32626. + queue = rx->security_idx;
  32627. + rpn = rx->key->u.ccmp.rx_pn[queue];
  32628. + if (memcmp(pn, rpn, IEEE80211_CCMP_PN_LEN))
  32629. + return RX_DROP_UNUSABLE;
  32630. + memcpy(entry->last_pn, pn, IEEE80211_CCMP_PN_LEN);
  32631. + }
  32632. +
  32633. + skb_pull(rx->skb, ieee80211_hdrlen(fc));
  32634. + __skb_queue_tail(&entry->skb_list, rx->skb);
  32635. + entry->last_frag = frag;
  32636. + entry->extra_len += rx->skb->len;
  32637. + if (ieee80211_has_morefrags(fc)) {
  32638. + rx->skb = NULL;
  32639. + return RX_QUEUED;
  32640. + }
  32641. +
  32642. + rx->skb = __skb_dequeue(&entry->skb_list);
  32643. + if (skb_tailroom(rx->skb) < entry->extra_len) {
  32644. + I802_DEBUG_INC(rx->local->rx_expand_skb_head2);
  32645. + if (unlikely(pskb_expand_head(rx->skb, 0, entry->extra_len,
  32646. + GFP_ATOMIC))) {
  32647. + I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
  32648. + __skb_queue_purge(&entry->skb_list);
  32649. + return RX_DROP_UNUSABLE;
  32650. + }
  32651. + }
  32652. + while ((skb = __skb_dequeue(&entry->skb_list))) {
  32653. + memcpy(skb_put(rx->skb, skb->len), skb->data, skb->len);
  32654. + dev_kfree_skb(skb);
  32655. + }
  32656. +
  32657. + /* Complete frame has been reassembled - process it now */
  32658. + status = IEEE80211_SKB_RXCB(rx->skb);
  32659. + status->rx_flags |= IEEE80211_RX_FRAGMENTED;
  32660. +
  32661. + out:
  32662. + ieee80211_led_rx(rx->local);
  32663. + out_no_led:
  32664. + if (rx->sta)
  32665. + rx->sta->rx_packets++;
  32666. + return RX_CONTINUE;
  32667. +}
  32668. +
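+ /*
+ * 802.1X controlled port: non-EAPOL data frames are only accepted
+ * from stations marked AUTHORIZED (EAPOL frames are exempted by the
+ * caller, ieee80211_frame_allowed())
+ */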
  32669. +static int ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx)
  32670. +{
  32671. + if (unlikely(!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_AUTHORIZED)))
  32672. + return -EACCES;
  32673. +
  32674. + return 0;
  32675. +}
  32676. +
  32677. +static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc)
  32678. +{
  32679. + struct sk_buff *skb = rx->skb;
  32680. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  32681. +
  32682. + /*
  32683. + * Pass through unencrypted frames if the hardware has
  32684. + * decrypted them already.
  32685. + */
  32686. + if (status->flag & RX_FLAG_DECRYPTED)
  32687. + return 0;
  32688. +
  32689. + /* Drop unencrypted frames if key is set. */
  32690. + if (unlikely(!ieee80211_has_protected(fc) &&
  32691. + !ieee80211_is_nullfunc(fc) &&
  32692. + ieee80211_is_data(fc) &&
  32693. + (rx->key || rx->sdata->drop_unencrypted)))
  32694. + return -EACCES;
  32695. +
  32696. + return 0;
  32697. +}
  32698. +
  32699. +static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
  32700. +{
  32701. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
  32702. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  32703. + __le16 fc = hdr->frame_control;
  32704. +
  32705. + /*
  32706. + * Pass through unencrypted frames if the hardware has
  32707. + * decrypted them already.
  32708. + */
  32709. + if (status->flag & RX_FLAG_DECRYPTED)
  32710. + return 0;
  32711. +
  32712. + if (rx->sta && test_sta_flag(rx->sta, WLAN_STA_MFP)) {
  32713. + if (unlikely(!ieee80211_has_protected(fc) &&
  32714. + ieee80211_is_unicast_robust_mgmt_frame(rx->skb) &&
  32715. + rx->key)) {
  32716. + if (ieee80211_is_deauth(fc) ||
  32717. + ieee80211_is_disassoc(fc))
  32718. + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
  32719. + rx->skb->data,
  32720. + rx->skb->len);
  32721. + return -EACCES;
  32722. + }
  32723. + /* BIP does not use Protected field, so need to check MMIE */
  32724. + if (unlikely(ieee80211_is_multicast_robust_mgmt_frame(rx->skb) &&
  32725. + ieee80211_get_mmie_keyidx(rx->skb) < 0)) {
  32726. + if (ieee80211_is_deauth(fc) ||
  32727. + ieee80211_is_disassoc(fc))
  32728. + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
  32729. + rx->skb->data,
  32730. + rx->skb->len);
  32731. + return -EACCES;
  32732. + }
  32733. + /*
  32734. + * When using MFP, Action frames are not allowed prior to
  32735. + * having configured keys.
  32736. + */
  32737. + if (unlikely(ieee80211_is_action(fc) && !rx->key &&
  32738. + ieee80211_is_robust_mgmt_frame(rx->skb)))
  32739. + return -EACCES;
  32740. + }
  32741. +
  32742. + return 0;
  32743. +}
  32744. +
  32745. +static int
  32746. +__ieee80211_data_to_8023(struct ieee80211_rx_data *rx, bool *port_control)
  32747. +{
  32748. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  32749. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
  32750. + bool check_port_control = false;
  32751. + struct ethhdr *ehdr;
  32752. + int ret;
  32753. +
  32754. + *port_control = false;
  32755. + if (ieee80211_has_a4(hdr->frame_control) &&
  32756. + sdata->vif.type == NL80211_IFTYPE_AP_VLAN && !sdata->u.vlan.sta)
  32757. + return -1;
  32758. +
  32759. + if (sdata->vif.type == NL80211_IFTYPE_STATION &&
  32760. + !!sdata->u.mgd.use_4addr != !!ieee80211_has_a4(hdr->frame_control)) {
  32761. +
  32762. + if (!sdata->u.mgd.use_4addr)
  32763. + return -1;
  32764. + else
  32765. + check_port_control = true;
  32766. + }
  32767. +
  32768. + if (is_multicast_ether_addr(hdr->addr1) &&
  32769. + sdata->vif.type == NL80211_IFTYPE_AP_VLAN && sdata->u.vlan.sta)
  32770. + return -1;
  32771. +
  32772. + ret = ieee80211_data_to_8023(rx->skb, sdata->vif.addr, sdata->vif.type);
  32773. + if (ret < 0)
  32774. + return ret;
  32775. +
  32776. + ehdr = (struct ethhdr *) rx->skb->data;
  32777. + if (ehdr->h_proto == rx->sdata->control_port_protocol)
  32778. + *port_control = true;
  32779. + else if (check_port_control)
  32780. + return -1;
  32781. +
  32782. + return 0;
  32783. +}
  32784. +
  32785. +/*
  32786. + * requires that rx->skb is a frame with ethernet header
  32787. + */
  32788. +static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
  32789. +{
  32790. + static const u8 pae_group_addr[ETH_ALEN] __aligned(2)
  32791. + = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 };
  32792. + struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
  32793. +
  32794. + /*
  32795. + * Allow EAPOL frames to us/the PAE group address regardless
  32796. + * of whether the frame was encrypted or not.
  32797. + */
  32798. + if (ehdr->h_proto == rx->sdata->control_port_protocol &&
  32799. + (ether_addr_equal(ehdr->h_dest, rx->sdata->vif.addr) ||
  32800. + ether_addr_equal(ehdr->h_dest, pae_group_addr)))
  32801. + return true;
  32802. +
  32803. + if (ieee80211_802_1x_port_control(rx) ||
  32804. + ieee80211_drop_unencrypted(rx, fc))
  32805. + return false;
  32806. +
  32807. + return true;
  32808. +}
  32809. +
  32810. +/*
  32811. + * requires that rx->skb is a frame with ethernet header
  32812. + */
  32813. +static void
  32814. +ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
  32815. +{
  32816. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  32817. + struct net_device *dev = sdata->dev;
  32818. + struct sk_buff *skb, *xmit_skb;
  32819. + struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data;
  32820. + struct sta_info *dsta;
  32821. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  32822. +
  32823. + skb = rx->skb;
  32824. + xmit_skb = NULL;
  32825. +
  32826. + if ((sdata->vif.type == NL80211_IFTYPE_AP ||
  32827. + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
  32828. + !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
  32829. + (status->rx_flags & IEEE80211_RX_RA_MATCH) &&
  32830. + (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) {
  32831. + if (is_multicast_ether_addr(ehdr->h_dest)) {
  32832. + /*
  32833. + * send multicast frames both to higher layers in
  32834. + * local net stack and back to the wireless medium
  32835. + */
  32836. + xmit_skb = skb_copy(skb, GFP_ATOMIC);
  32837. + if (!xmit_skb)
  32838. + net_info_ratelimited("%s: failed to clone multicast frame\n",
  32839. + dev->name);
  32840. + } else {
  32841. + dsta = sta_info_get(sdata, skb->data);
  32842. + if (dsta) {
  32843. + /*
  32844. + * The destination station is associated to
  32845. + * this AP (in this VLAN), so send the frame
  32846. + * directly to it and do not pass it to local
  32847. + * net stack.
  32848. + */
  32849. + xmit_skb = skb;
  32850. + skb = NULL;
  32851. + }
  32852. + }
  32853. + }
  32854. +
  32855. +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
  32856. + if (skb) {
  32857. + /* 'align' will only take the values 0 or 2 here since all
  32858. + * frames are required to be aligned to 2-byte boundaries
  32859. + * when being passed to mac80211; the code here works just
  32860. + * as well if that isn't true, but mac80211 assumes it can
  32861. + * access fields as 2-byte aligned (e.g. for ether_addr_equal)
  32862. + */
  32863. + int align;
  32864. +
  32865. + align = (unsigned long)(skb->data + sizeof(struct ethhdr)) & 3;
  32866. + if (align) {
  32867. + if (WARN_ON(skb_headroom(skb) < 3)) {
  32868. + dev_kfree_skb(skb);
  32869. + skb = NULL;
  32870. + } else {
  32871. + u8 *data = skb->data;
  32872. + size_t len = skb_headlen(skb);
  32873. + skb->data -= align;
  32874. + memmove(skb->data, data, len);
  32875. + skb_set_tail_pointer(skb, len);
  32876. + }
  32877. + }
  32878. + }
  32879. +#endif
  32880. +
  32881. + if (skb) {
  32882. + /* deliver to local stack */
  32883. + skb->protocol = eth_type_trans(skb, dev);
  32884. + memset(skb->cb, 0, sizeof(skb->cb));
  32885. + if (rx->local->napi)
  32886. + napi_gro_receive(rx->local->napi, skb);
  32887. + else
  32888. + netif_receive_skb(skb);
  32889. + }
  32890. +
  32891. + if (xmit_skb) {
  32892. + /*
  32893. + * Send to wireless media and increase priority by 256 to
  32894. + * keep the received priority instead of reclassifying
  32895. + * the frame (see cfg80211_classify8021d).
  32896. + */
  32897. + xmit_skb->priority += 256;
  32898. + xmit_skb->protocol = htons(ETH_P_802_3);
  32899. + skb_reset_network_header(xmit_skb);
  32900. + skb_reset_mac_header(xmit_skb);
  32901. + dev_queue_xmit(xmit_skb);
  32902. + }
  32903. +}
  32904. +
  32905. +static ieee80211_rx_result debug_noinline
  32906. +ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
  32907. +{
  32908. + struct net_device *dev = rx->sdata->dev;
  32909. + struct sk_buff *skb = rx->skb;
  32910. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
  32911. + __le16 fc = hdr->frame_control;
  32912. + struct sk_buff_head frame_list;
  32913. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  32914. +
  32915. + if (unlikely(!ieee80211_is_data(fc)))
  32916. + return RX_CONTINUE;
  32917. +
  32918. + if (unlikely(!ieee80211_is_data_present(fc)))
  32919. + return RX_DROP_MONITOR;
  32920. +
  32921. + if (!(status->rx_flags & IEEE80211_RX_AMSDU))
  32922. + return RX_CONTINUE;
  32923. +
  32924. + if (ieee80211_has_a4(hdr->frame_control) &&
  32925. + rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
  32926. + !rx->sdata->u.vlan.sta)
  32927. + return RX_DROP_UNUSABLE;
  32928. +
  32929. + if (is_multicast_ether_addr(hdr->addr1) &&
  32930. + ((rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
  32931. + rx->sdata->u.vlan.sta) ||
  32932. + (rx->sdata->vif.type == NL80211_IFTYPE_STATION &&
  32933. + rx->sdata->u.mgd.use_4addr)))
  32934. + return RX_DROP_UNUSABLE;
  32935. +
  32936. + skb->dev = dev;
  32937. + __skb_queue_head_init(&frame_list);
  32938. +
  32939. + if (skb_linearize(skb))
  32940. + return RX_DROP_UNUSABLE;
  32941. +
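+ /*
+ * split the A-MSDU into individual 802.3 frames; each subframe is
+ * then run through the normal frame_allowed/deliver path below
+ */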
  32942. + ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
  32943. + rx->sdata->vif.type,
  32944. + rx->local->hw.extra_tx_headroom, true);
  32945. +
  32946. + while (!skb_queue_empty(&frame_list)) {
  32947. + rx->skb = __skb_dequeue(&frame_list);
  32948. +
  32949. + if (!ieee80211_frame_allowed(rx, fc)) {
  32950. + dev_kfree_skb(rx->skb);
  32951. + continue;
  32952. + }
  32953. + dev->stats.rx_packets++;
  32954. + dev->stats.rx_bytes += rx->skb->len;
  32955. +
  32956. + ieee80211_deliver_skb(rx);
  32957. + }
  32958. +
  32959. + return RX_QUEUED;
  32960. +}
  32961. +
  32962. +#ifdef CONFIG_MAC80211_MESH
  32963. +static ieee80211_rx_result
  32964. +ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
  32965. +{
  32966. + struct ieee80211_hdr *fwd_hdr, *hdr;
  32967. + struct ieee80211_tx_info *info;
  32968. + struct ieee80211s_hdr *mesh_hdr;
  32969. + struct sk_buff *skb = rx->skb, *fwd_skb;
  32970. + struct ieee80211_local *local = rx->local;
  32971. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  32972. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  32973. + struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
  32974. + u16 q, hdrlen;
  32975. +
  32976. + hdr = (struct ieee80211_hdr *) skb->data;
  32977. + hdrlen = ieee80211_hdrlen(hdr->frame_control);
  32978. +
  32979. + /* make sure fixed part of mesh header is there, also checks skb len */
  32980. + if (!pskb_may_pull(rx->skb, hdrlen + 6))
  32981. + return RX_DROP_MONITOR;
  32982. +
  32983. + mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
  32984. +
  32985. + /* make sure full mesh header is there, also checks skb len */
  32986. + if (!pskb_may_pull(rx->skb,
  32987. + hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr)))
  32988. + return RX_DROP_MONITOR;
  32989. +
  32990. + /* reload pointers */
  32991. + hdr = (struct ieee80211_hdr *) skb->data;
  32992. + mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen);
  32993. +
  32994. + if (ieee80211_drop_unencrypted(rx, hdr->frame_control))
  32995. + return RX_DROP_MONITOR;
  32996. +
  32997. + /* frame is in RMC, don't forward */
  32998. + if (ieee80211_is_data(hdr->frame_control) &&
  32999. + is_multicast_ether_addr(hdr->addr1) &&
  33000. + mesh_rmc_check(rx->sdata, hdr->addr3, mesh_hdr))
  33001. + return RX_DROP_MONITOR;
  33002. +
  33003. + if (!ieee80211_is_data(hdr->frame_control) ||
  33004. + !(status->rx_flags & IEEE80211_RX_RA_MATCH))
  33005. + return RX_CONTINUE;
  33006. +
  33007. + if (!mesh_hdr->ttl)
  33008. + return RX_DROP_MONITOR;
  33009. +
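+ /*
+ * Address Extension present: learn or refresh the proxy (MPP) path
+ * for the external address carried in the mesh header
+ */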
  33010. + if (mesh_hdr->flags & MESH_FLAGS_AE) {
  33011. + struct mesh_path *mppath;
  33012. + char *proxied_addr;
  33013. + char *mpp_addr;
  33014. +
  33015. + if (is_multicast_ether_addr(hdr->addr1)) {
  33016. + mpp_addr = hdr->addr3;
  33017. + proxied_addr = mesh_hdr->eaddr1;
  33018. + } else if (mesh_hdr->flags & MESH_FLAGS_AE_A5_A6) {
  33019. + /* has_a4 already checked in ieee80211_rx_mesh_check */
  33020. + mpp_addr = hdr->addr4;
  33021. + proxied_addr = mesh_hdr->eaddr2;
  33022. + } else {
  33023. + return RX_DROP_MONITOR;
  33024. + }
  33025. +
  33026. + rcu_read_lock();
  33027. + mppath = mpp_path_lookup(sdata, proxied_addr);
  33028. + if (!mppath) {
  33029. + mpp_path_add(sdata, proxied_addr, mpp_addr);
  33030. + } else {
  33031. + spin_lock_bh(&mppath->state_lock);
  33032. + if (!ether_addr_equal(mppath->mpp, mpp_addr))
  33033. + memcpy(mppath->mpp, mpp_addr, ETH_ALEN);
  33034. + spin_unlock_bh(&mppath->state_lock);
  33035. + }
  33036. + rcu_read_unlock();
  33037. + }
  33038. +
  33039. + /* Frame has reached destination. Don't forward */
  33040. + if (!is_multicast_ether_addr(hdr->addr1) &&
  33041. + ether_addr_equal(sdata->vif.addr, hdr->addr3))
  33042. + return RX_CONTINUE;
  33043. +
  33044. + q = ieee80211_select_queue_80211(sdata, skb, hdr);
  33045. + if (ieee80211_queue_stopped(&local->hw, q)) {
  33046. + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion);
  33047. + return RX_DROP_MONITOR;
  33048. + }
  33049. + skb_set_queue_mapping(skb, q);
  33050. +
  33051. + if (!--mesh_hdr->ttl) {
  33052. + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl);
  33053. + goto out;
  33054. + }
  33055. +
  33056. + if (!ifmsh->mshcfg.dot11MeshForwarding)
  33057. + goto out;
  33058. +
  33059. + fwd_skb = skb_copy(skb, GFP_ATOMIC);
  33060. + if (!fwd_skb) {
  33061. + net_info_ratelimited("%s: failed to clone mesh frame\n",
  33062. + sdata->name);
  33063. + goto out;
  33064. + }
  33065. +
  33066. + fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data;
  33067. + fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY);
  33068. + info = IEEE80211_SKB_CB(fwd_skb);
  33069. + memset(info, 0, sizeof(*info));
  33070. + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
  33071. + info->control.vif = &rx->sdata->vif;
  33072. + info->control.jiffies = jiffies;
  33073. + if (is_multicast_ether_addr(fwd_hdr->addr1)) {
  33074. + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_mcast);
  33075. + memcpy(fwd_hdr->addr2, sdata->vif.addr, ETH_ALEN);
  33076. + /* update power mode indication when forwarding */
  33077. + ieee80211_mps_set_frame_flags(sdata, NULL, fwd_hdr);
  33078. + } else if (!mesh_nexthop_lookup(sdata, fwd_skb)) {
  33079. + /* mesh power mode flags updated in mesh_nexthop_lookup */
  33080. + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_unicast);
  33081. + } else {
  33082. + /* unable to resolve next hop */
  33083. + mesh_path_error_tx(sdata, ifmsh->mshcfg.element_ttl,
  33084. + fwd_hdr->addr3, 0,
  33085. + WLAN_REASON_MESH_PATH_NOFORWARD,
  33086. + fwd_hdr->addr2);
  33087. + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_no_route);
  33088. + kfree_skb(fwd_skb);
  33089. + return RX_DROP_MONITOR;
  33090. + }
  33091. +
  33092. + IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames);
  33093. + ieee80211_add_pending_skb(local, fwd_skb);
  33094. + out:
  33095. + if (is_multicast_ether_addr(hdr->addr1) ||
  33096. + sdata->dev->flags & IFF_PROMISC)
  33097. + return RX_CONTINUE;
  33098. + else
  33099. + return RX_DROP_MONITOR;
  33100. +}
  33101. +#endif
  33102. +
  33103. +static ieee80211_rx_result debug_noinline
  33104. +ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
  33105. +{
  33106. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  33107. + struct ieee80211_local *local = rx->local;
  33108. + struct net_device *dev = sdata->dev;
  33109. + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
  33110. + __le16 fc = hdr->frame_control;
  33111. + bool port_control;
  33112. + int err;
  33113. +
  33114. + if (unlikely(!ieee80211_is_data(hdr->frame_control)))
  33115. + return RX_CONTINUE;
  33116. +
  33117. + if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
  33118. + return RX_DROP_MONITOR;
  33119. +
  33120. + /*
  33121. + * Send unexpected-4addr-frame event to hostapd. For older versions,
  33122. + * also drop the frame to cooked monitor interfaces.
  33123. + */
  33124. + if (ieee80211_has_a4(hdr->frame_control) &&
  33125. + sdata->vif.type == NL80211_IFTYPE_AP) {
  33126. + if (rx->sta &&
  33127. + !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT))
  33128. + cfg80211_rx_unexpected_4addr_frame(
  33129. + rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC);
  33130. + return RX_DROP_MONITOR;
  33131. + }
  33132. +
  33133. + err = __ieee80211_data_to_8023(rx, &port_control);
  33134. + if (unlikely(err))
  33135. + return RX_DROP_UNUSABLE;
  33136. +
  33137. + if (!ieee80211_frame_allowed(rx, fc))
  33138. + return RX_DROP_MONITOR;
  33139. +
  33140. + if (rx->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
  33141. + unlikely(port_control) && sdata->bss) {
  33142. + sdata = container_of(sdata->bss, struct ieee80211_sub_if_data,
  33143. + u.ap);
  33144. + dev = sdata->dev;
  33145. + rx->sdata = sdata;
  33146. + }
  33147. +
  33148. + rx->skb->dev = dev;
  33149. +
  33150. + dev->stats.rx_packets++;
  33151. + dev->stats.rx_bytes += rx->skb->len;
  33152. +
  33153. + if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 &&
  33154. + !is_multicast_ether_addr(
  33155. + ((struct ethhdr *)rx->skb->data)->h_dest) &&
  33156. + (!local->scanning &&
  33157. + !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))) {
  33158. + mod_timer(&local->dynamic_ps_timer, jiffies +
  33159. + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout));
  33160. + }
  33161. +
  33162. + ieee80211_deliver_skb(rx);
  33163. +
  33164. + return RX_QUEUED;
  33165. +}
  33166. +
  33167. +static ieee80211_rx_result debug_noinline
  33168. +ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
  33169. +{
  33170. + struct sk_buff *skb = rx->skb;
  33171. + struct ieee80211_bar *bar = (struct ieee80211_bar *)skb->data;
  33172. + struct tid_ampdu_rx *tid_agg_rx;
  33173. + u16 start_seq_num;
  33174. + u16 tid;
  33175. +
  33176. + if (likely(!ieee80211_is_ctl(bar->frame_control)))
  33177. + return RX_CONTINUE;
  33178. +
  33179. + if (ieee80211_is_back_req(bar->frame_control)) {
  33180. + struct {
  33181. + __le16 control, start_seq_num;
  33182. + } __packed bar_data;
  33183. +
  33184. + if (!rx->sta)
  33185. + return RX_DROP_MONITOR;
  33186. +
  33187. + if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control),
  33188. + &bar_data, sizeof(bar_data)))
  33189. + return RX_DROP_MONITOR;
  33190. +
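+ /* the TID is carried in the top four bits of the BAR control field */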
  33191. + tid = le16_to_cpu(bar_data.control) >> 12;
  33192. +
  33193. + tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
  33194. + if (!tid_agg_rx)
  33195. + return RX_DROP_MONITOR;
  33196. +
  33197. + start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
  33198. +
  33199. + /* reset session timer */
  33200. + if (tid_agg_rx->timeout)
  33201. + mod_timer(&tid_agg_rx->session_timer,
  33202. + TU_TO_EXP_TIME(tid_agg_rx->timeout));
  33203. +
  33204. + spin_lock(&tid_agg_rx->reorder_lock);
  33205. + /* release stored frames up to start of BAR */
  33206. + ieee80211_release_reorder_frames(rx->sdata, tid_agg_rx,
  33207. + start_seq_num, frames);
  33208. + spin_unlock(&tid_agg_rx->reorder_lock);
  33209. +
  33210. + kfree_skb(skb);
  33211. + return RX_QUEUED;
  33212. + }
  33213. +
  33214. + /*
  33215. + * After this point, we only want management frames,
  33216. + * so we can drop all remaining control frames to
  33217. + * cooked monitor interfaces.
  33218. + */
  33219. + return RX_DROP_MONITOR;
  33220. +}
  33221. +
  33222. +static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
  33223. + struct ieee80211_mgmt *mgmt,
  33224. + size_t len)
  33225. +{
  33226. + struct ieee80211_local *local = sdata->local;
  33227. + struct sk_buff *skb;
  33228. + struct ieee80211_mgmt *resp;
  33229. +
  33230. + if (!ether_addr_equal(mgmt->da, sdata->vif.addr)) {
33231. + /* Not addressed to our own unicast address */
  33232. + return;
  33233. + }
  33234. +
  33235. + if (!ether_addr_equal(mgmt->sa, sdata->u.mgd.bssid) ||
  33236. + !ether_addr_equal(mgmt->bssid, sdata->u.mgd.bssid)) {
  33237. + /* Not from the current AP or not associated yet. */
  33238. + return;
  33239. + }
  33240. +
  33241. + if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) {
  33242. + /* Too short SA Query request frame */
  33243. + return;
  33244. + }
  33245. +
  33246. + skb = dev_alloc_skb(sizeof(*resp) + local->hw.extra_tx_headroom);
  33247. + if (skb == NULL)
  33248. + return;
  33249. +
  33250. + skb_reserve(skb, local->hw.extra_tx_headroom);
  33251. + resp = (struct ieee80211_mgmt *) skb_put(skb, 24);
  33252. + memset(resp, 0, 24);
  33253. + memcpy(resp->da, mgmt->sa, ETH_ALEN);
  33254. + memcpy(resp->sa, sdata->vif.addr, ETH_ALEN);
  33255. + memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN);
  33256. + resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
  33257. + IEEE80211_STYPE_ACTION);
  33258. + skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
  33259. + resp->u.action.category = WLAN_CATEGORY_SA_QUERY;
  33260. + resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE;
  33261. + memcpy(resp->u.action.u.sa_query.trans_id,
  33262. + mgmt->u.action.u.sa_query.trans_id,
  33263. + WLAN_SA_QUERY_TR_ID_LEN);
  33264. +
  33265. + ieee80211_tx_skb(sdata, skb);
  33266. +}
  33267. +
  33268. +static ieee80211_rx_result debug_noinline
  33269. +ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
  33270. +{
  33271. + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
  33272. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  33273. +
  33274. + /*
  33275. + * From here on, look only at management frames.
  33276. + * Data and control frames are already handled,
  33277. + * and unknown (reserved) frames are useless.
  33278. + */
  33279. + if (rx->skb->len < 24)
  33280. + return RX_DROP_MONITOR;
  33281. +
  33282. + if (!ieee80211_is_mgmt(mgmt->frame_control))
  33283. + return RX_DROP_MONITOR;
  33284. +
  33285. + if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
  33286. + ieee80211_is_beacon(mgmt->frame_control) &&
  33287. + !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) {
  33288. + int sig = 0;
  33289. +
  33290. + if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
  33291. + sig = status->signal;
  33292. +
  33293. + cfg80211_report_obss_beacon(rx->local->hw.wiphy,
  33294. + rx->skb->data, rx->skb->len,
  33295. + status->freq, sig);
  33296. + rx->flags |= IEEE80211_RX_BEACON_REPORTED;
  33297. + }
  33298. +
  33299. + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
  33300. + return RX_DROP_MONITOR;
  33301. +
  33302. + if (ieee80211_drop_unencrypted_mgmt(rx))
  33303. + return RX_DROP_UNUSABLE;
  33304. +
  33305. + return RX_CONTINUE;
  33306. +}
  33307. +
  33308. +static ieee80211_rx_result debug_noinline
  33309. +ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
  33310. +{
  33311. + struct ieee80211_local *local = rx->local;
  33312. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  33313. + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
  33314. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  33315. + int len = rx->skb->len;
  33316. +
  33317. + if (!ieee80211_is_action(mgmt->frame_control))
  33318. + return RX_CONTINUE;
  33319. +
  33320. + /* drop too small frames */
  33321. + if (len < IEEE80211_MIN_ACTION_SIZE)
  33322. + return RX_DROP_UNUSABLE;
  33323. +
  33324. + if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC &&
  33325. + mgmt->u.action.category != WLAN_CATEGORY_SELF_PROTECTED &&
  33326. + mgmt->u.action.category != WLAN_CATEGORY_SPECTRUM_MGMT)
  33327. + return RX_DROP_UNUSABLE;
  33328. +
  33329. + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH))
  33330. + return RX_DROP_UNUSABLE;
  33331. +
  33332. + switch (mgmt->u.action.category) {
  33333. + case WLAN_CATEGORY_HT:
  33334. + /* reject HT action frames from stations not supporting HT */
  33335. + if (!rx->sta->sta.ht_cap.ht_supported)
  33336. + goto invalid;
  33337. +
  33338. + if (sdata->vif.type != NL80211_IFTYPE_STATION &&
  33339. + sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
  33340. + sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
  33341. + sdata->vif.type != NL80211_IFTYPE_AP &&
  33342. + sdata->vif.type != NL80211_IFTYPE_ADHOC)
  33343. + break;
  33344. +
  33345. + /* verify action & smps_control/chanwidth are present */
  33346. + if (len < IEEE80211_MIN_ACTION_SIZE + 2)
  33347. + goto invalid;
  33348. +
  33349. + switch (mgmt->u.action.u.ht_smps.action) {
  33350. + case WLAN_HT_ACTION_SMPS: {
  33351. + struct ieee80211_supported_band *sband;
  33352. + enum ieee80211_smps_mode smps_mode;
  33353. +
  33354. + /* convert to HT capability */
  33355. + switch (mgmt->u.action.u.ht_smps.smps_control) {
  33356. + case WLAN_HT_SMPS_CONTROL_DISABLED:
  33357. + smps_mode = IEEE80211_SMPS_OFF;
  33358. + break;
  33359. + case WLAN_HT_SMPS_CONTROL_STATIC:
  33360. + smps_mode = IEEE80211_SMPS_STATIC;
  33361. + break;
  33362. + case WLAN_HT_SMPS_CONTROL_DYNAMIC:
  33363. + smps_mode = IEEE80211_SMPS_DYNAMIC;
  33364. + break;
  33365. + default:
  33366. + goto invalid;
  33367. + }
  33368. +
  33369. + /* if no change do nothing */
  33370. + if (rx->sta->sta.smps_mode == smps_mode)
  33371. + goto handled;
  33372. + rx->sta->sta.smps_mode = smps_mode;
  33373. +
  33374. + sband = rx->local->hw.wiphy->bands[status->band];
  33375. +
  33376. + rate_control_rate_update(local, sband, rx->sta,
  33377. + IEEE80211_RC_SMPS_CHANGED);
  33378. + goto handled;
  33379. + }
  33380. + case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: {
  33381. + struct ieee80211_supported_band *sband;
  33382. + u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth;
  33383. + enum ieee80211_sta_rx_bandwidth new_bw;
  33384. +
  33385. + /* If it doesn't support 40 MHz it can't change ... */
  33386. + if (!(rx->sta->sta.ht_cap.cap &
  33387. + IEEE80211_HT_CAP_SUP_WIDTH_20_40))
  33388. + goto handled;
  33389. +
  33390. + if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ)
  33391. + new_bw = IEEE80211_STA_RX_BW_20;
  33392. + else
  33393. + new_bw = ieee80211_sta_cur_vht_bw(rx->sta);
  33394. +
  33395. + if (rx->sta->sta.bandwidth == new_bw)
  33396. + goto handled;
  33397. +
  33398. + sband = rx->local->hw.wiphy->bands[status->band];
  33399. +
  33400. + rate_control_rate_update(local, sband, rx->sta,
  33401. + IEEE80211_RC_BW_CHANGED);
  33402. + goto handled;
  33403. + }
  33404. + default:
  33405. + goto invalid;
  33406. + }
  33407. +
  33408. + break;
  33409. + case WLAN_CATEGORY_PUBLIC:
  33410. + if (len < IEEE80211_MIN_ACTION_SIZE + 1)
  33411. + goto invalid;
  33412. + if (sdata->vif.type != NL80211_IFTYPE_STATION)
  33413. + break;
  33414. + if (!rx->sta)
  33415. + break;
  33416. + if (!ether_addr_equal(mgmt->bssid, sdata->u.mgd.bssid))
  33417. + break;
  33418. + if (mgmt->u.action.u.ext_chan_switch.action_code !=
  33419. + WLAN_PUB_ACTION_EXT_CHANSW_ANN)
  33420. + break;
  33421. + if (len < offsetof(struct ieee80211_mgmt,
  33422. + u.action.u.ext_chan_switch.variable))
  33423. + goto invalid;
  33424. + goto queue;
  33425. + case WLAN_CATEGORY_VHT:
  33426. + if (sdata->vif.type != NL80211_IFTYPE_STATION &&
  33427. + sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
  33428. + sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
  33429. + sdata->vif.type != NL80211_IFTYPE_AP &&
  33430. + sdata->vif.type != NL80211_IFTYPE_ADHOC)
  33431. + break;
  33432. +
  33433. + /* verify action code is present */
  33434. + if (len < IEEE80211_MIN_ACTION_SIZE + 1)
  33435. + goto invalid;
  33436. +
  33437. + switch (mgmt->u.action.u.vht_opmode_notif.action_code) {
  33438. + case WLAN_VHT_ACTION_OPMODE_NOTIF: {
  33439. + u8 opmode;
  33440. +
  33441. + /* verify opmode is present */
  33442. + if (len < IEEE80211_MIN_ACTION_SIZE + 2)
  33443. + goto invalid;
  33444. +
  33445. + opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
  33446. +
  33447. + ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
  33448. + opmode, status->band,
  33449. + false);
  33450. + goto handled;
  33451. + }
  33452. + default:
  33453. + break;
  33454. + }
  33455. + break;
  33456. + case WLAN_CATEGORY_BACK:
  33457. + if (sdata->vif.type != NL80211_IFTYPE_STATION &&
  33458. + sdata->vif.type != NL80211_IFTYPE_MESH_POINT &&
  33459. + sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
  33460. + sdata->vif.type != NL80211_IFTYPE_AP &&
  33461. + sdata->vif.type != NL80211_IFTYPE_ADHOC)
  33462. + break;
  33463. +
  33464. + /* verify action_code is present */
  33465. + if (len < IEEE80211_MIN_ACTION_SIZE + 1)
  33466. + break;
  33467. +
  33468. + switch (mgmt->u.action.u.addba_req.action_code) {
  33469. + case WLAN_ACTION_ADDBA_REQ:
  33470. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33471. + sizeof(mgmt->u.action.u.addba_req)))
  33472. + goto invalid;
  33473. + break;
  33474. + case WLAN_ACTION_ADDBA_RESP:
  33475. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33476. + sizeof(mgmt->u.action.u.addba_resp)))
  33477. + goto invalid;
  33478. + break;
  33479. + case WLAN_ACTION_DELBA:
  33480. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33481. + sizeof(mgmt->u.action.u.delba)))
  33482. + goto invalid;
  33483. + break;
  33484. + default:
  33485. + goto invalid;
  33486. + }
  33487. +
  33488. + goto queue;
  33489. + case WLAN_CATEGORY_SPECTRUM_MGMT:
  33490. + /* verify action_code is present */
  33491. + if (len < IEEE80211_MIN_ACTION_SIZE + 1)
  33492. + break;
  33493. +
  33494. + switch (mgmt->u.action.u.measurement.action_code) {
  33495. + case WLAN_ACTION_SPCT_MSR_REQ:
  33496. + if (status->band != IEEE80211_BAND_5GHZ)
  33497. + break;
  33498. +
  33499. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33500. + sizeof(mgmt->u.action.u.measurement)))
  33501. + break;
  33502. +
  33503. + if (sdata->vif.type != NL80211_IFTYPE_STATION)
  33504. + break;
  33505. +
  33506. + ieee80211_process_measurement_req(sdata, mgmt, len);
  33507. + goto handled;
  33508. + case WLAN_ACTION_SPCT_CHL_SWITCH: {
  33509. + u8 *bssid;
  33510. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33511. + sizeof(mgmt->u.action.u.chan_switch)))
  33512. + break;
  33513. +
  33514. + if (sdata->vif.type != NL80211_IFTYPE_STATION &&
  33515. + sdata->vif.type != NL80211_IFTYPE_ADHOC &&
  33516. + sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
  33517. + break;
  33518. +
  33519. + if (sdata->vif.type == NL80211_IFTYPE_STATION)
  33520. + bssid = sdata->u.mgd.bssid;
  33521. + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
  33522. + bssid = sdata->u.ibss.bssid;
  33523. + else if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
  33524. + bssid = mgmt->sa;
  33525. + else
  33526. + break;
  33527. +
  33528. + if (!ether_addr_equal(mgmt->bssid, bssid))
  33529. + break;
  33530. +
  33531. + goto queue;
  33532. + }
  33533. + }
  33534. + break;
  33535. + case WLAN_CATEGORY_SA_QUERY:
  33536. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33537. + sizeof(mgmt->u.action.u.sa_query)))
  33538. + break;
  33539. +
  33540. + switch (mgmt->u.action.u.sa_query.action) {
  33541. + case WLAN_ACTION_SA_QUERY_REQUEST:
  33542. + if (sdata->vif.type != NL80211_IFTYPE_STATION)
  33543. + break;
  33544. + ieee80211_process_sa_query_req(sdata, mgmt, len);
  33545. + goto handled;
  33546. + }
  33547. + break;
  33548. + case WLAN_CATEGORY_SELF_PROTECTED:
  33549. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33550. + sizeof(mgmt->u.action.u.self_prot.action_code)))
  33551. + break;
  33552. +
  33553. + switch (mgmt->u.action.u.self_prot.action_code) {
  33554. + case WLAN_SP_MESH_PEERING_OPEN:
  33555. + case WLAN_SP_MESH_PEERING_CLOSE:
  33556. + case WLAN_SP_MESH_PEERING_CONFIRM:
  33557. + if (!ieee80211_vif_is_mesh(&sdata->vif))
  33558. + goto invalid;
  33559. + if (sdata->u.mesh.user_mpm)
  33560. + /* userspace handles this frame */
  33561. + break;
  33562. + goto queue;
  33563. + case WLAN_SP_MGK_INFORM:
  33564. + case WLAN_SP_MGK_ACK:
  33565. + if (!ieee80211_vif_is_mesh(&sdata->vif))
  33566. + goto invalid;
  33567. + break;
  33568. + }
  33569. + break;
  33570. + case WLAN_CATEGORY_MESH_ACTION:
  33571. + if (len < (IEEE80211_MIN_ACTION_SIZE +
  33572. + sizeof(mgmt->u.action.u.mesh_action.action_code)))
  33573. + break;
  33574. +
  33575. + if (!ieee80211_vif_is_mesh(&sdata->vif))
  33576. + break;
  33577. + if (mesh_action_is_path_sel(mgmt) &&
  33578. + !mesh_path_sel_is_hwmp(sdata))
  33579. + break;
  33580. + goto queue;
  33581. + }
  33582. +
  33583. + return RX_CONTINUE;
  33584. +
  33585. + invalid:
  33586. + status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM;
  33587. + /* will return in the next handlers */
  33588. + return RX_CONTINUE;
  33589. +
  33590. + handled:
  33591. + if (rx->sta)
  33592. + rx->sta->rx_packets++;
  33593. + dev_kfree_skb(rx->skb);
  33594. + return RX_QUEUED;
  33595. +
  33596. + queue:
  33597. + rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
  33598. + skb_queue_tail(&sdata->skb_queue, rx->skb);
  33599. + ieee80211_queue_work(&local->hw, &sdata->work);
  33600. + if (rx->sta)
  33601. + rx->sta->rx_packets++;
  33602. + return RX_QUEUED;
  33603. +}
  33604. +
  33605. +static ieee80211_rx_result debug_noinline
  33606. +ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
  33607. +{
  33608. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  33609. + int sig = 0;
  33610. +
  33611. + /* skip known-bad action frames and return them in the next handler */
  33612. + if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM)
  33613. + return RX_CONTINUE;
  33614. +
  33615. + /*
  33616. + * Getting here means the kernel doesn't know how to handle
  33617. + * it, but maybe userspace does ... include returned frames
  33618. + * so userspace can register for those to know whether ones
  33619. + * it transmitted were processed or returned.
  33620. + */
  33621. +
  33622. + if (rx->local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
  33623. + sig = status->signal;
  33624. +
  33625. + if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig,
  33626. + rx->skb->data, rx->skb->len, 0)) {
  33627. + if (rx->sta)
  33628. + rx->sta->rx_packets++;
  33629. + dev_kfree_skb(rx->skb);
  33630. + return RX_QUEUED;
  33631. + }
  33632. +
  33633. + return RX_CONTINUE;
  33634. +}
  33635. +
  33636. +static ieee80211_rx_result debug_noinline
  33637. +ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
  33638. +{
  33639. + struct ieee80211_local *local = rx->local;
  33640. + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
  33641. + struct sk_buff *nskb;
  33642. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  33643. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
  33644. +
  33645. + if (!ieee80211_is_action(mgmt->frame_control))
  33646. + return RX_CONTINUE;
  33647. +
  33648. + /*
  33649. + * For AP mode, hostapd is responsible for handling any action
  33650. + * frames that we didn't handle, including returning unknown
  33651. + * ones. For all other modes we will return them to the sender,
  33652. + * setting the 0x80 bit in the action category, as required by
  33653. + * 802.11-2012 9.24.4.
  33654. + * Newer versions of hostapd shall also use the management frame
  33655. + * registration mechanisms, but older ones still use cooked
  33656. + * monitor interfaces so push all frames there.
  33657. + */
  33658. + if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
  33659. + (sdata->vif.type == NL80211_IFTYPE_AP ||
  33660. + sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
  33661. + return RX_DROP_MONITOR;
  33662. +
  33663. + if (is_multicast_ether_addr(mgmt->da))
  33664. + return RX_DROP_MONITOR;
  33665. +
  33666. + /* do not return rejected action frames */
  33667. + if (mgmt->u.action.category & 0x80)
  33668. + return RX_DROP_UNUSABLE;
  33669. +
  33670. + nskb = skb_copy_expand(rx->skb, local->hw.extra_tx_headroom, 0,
  33671. + GFP_ATOMIC);
  33672. + if (nskb) {
  33673. + struct ieee80211_mgmt *nmgmt = (void *)nskb->data;
  33674. +
  33675. + nmgmt->u.action.category |= 0x80;
  33676. + memcpy(nmgmt->da, nmgmt->sa, ETH_ALEN);
  33677. + memcpy(nmgmt->sa, rx->sdata->vif.addr, ETH_ALEN);
  33678. +
  33679. + memset(nskb->cb, 0, sizeof(nskb->cb));
  33680. +
  33681. + if (rx->sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) {
  33682. + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(nskb);
  33683. +
  33684. + info->flags = IEEE80211_TX_CTL_TX_OFFCHAN |
  33685. + IEEE80211_TX_INTFL_OFFCHAN_TX_OK |
  33686. + IEEE80211_TX_CTL_NO_CCK_RATE;
  33687. + if (local->hw.flags & IEEE80211_HW_QUEUE_CONTROL)
  33688. + info->hw_queue =
  33689. + local->hw.offchannel_tx_hw_queue;
  33690. + }
  33691. +
  33692. + __ieee80211_tx_skb_tid_band(rx->sdata, nskb, 7,
  33693. + status->band);
  33694. + }
  33695. + dev_kfree_skb(rx->skb);
  33696. + return RX_QUEUED;
  33697. +}
  33698. +
  33699. +static ieee80211_rx_result debug_noinline
  33700. +ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
  33701. +{
  33702. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  33703. + struct ieee80211_mgmt *mgmt = (void *)rx->skb->data;
  33704. + __le16 stype;
  33705. +
  33706. + stype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_STYPE);
  33707. +
  33708. + if (!ieee80211_vif_is_mesh(&sdata->vif) &&
  33709. + sdata->vif.type != NL80211_IFTYPE_ADHOC &&
  33710. + sdata->vif.type != NL80211_IFTYPE_STATION)
  33711. + return RX_DROP_MONITOR;
  33712. +
  33713. + switch (stype) {
  33714. + case cpu_to_le16(IEEE80211_STYPE_AUTH):
  33715. + case cpu_to_le16(IEEE80211_STYPE_BEACON):
  33716. + case cpu_to_le16(IEEE80211_STYPE_PROBE_RESP):
  33717. + /* process for all: mesh, mlme, ibss */
  33718. + break;
  33719. + case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP):
  33720. + case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP):
  33721. + case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
  33722. + case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
  33723. + if (is_multicast_ether_addr(mgmt->da) &&
  33724. + !is_broadcast_ether_addr(mgmt->da))
  33725. + return RX_DROP_MONITOR;
  33726. +
  33727. + /* process only for station */
  33728. + if (sdata->vif.type != NL80211_IFTYPE_STATION)
  33729. + return RX_DROP_MONITOR;
  33730. + break;
  33731. + case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ):
  33732. + /* process only for ibss and mesh */
  33733. + if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
  33734. + sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
  33735. + return RX_DROP_MONITOR;
  33736. + break;
  33737. + default:
  33738. + return RX_DROP_MONITOR;
  33739. + }
  33740. +
  33741. + /* queue up frame and kick off work to process it */
  33742. + rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME;
  33743. + skb_queue_tail(&sdata->skb_queue, rx->skb);
  33744. + ieee80211_queue_work(&rx->local->hw, &sdata->work);
  33745. + if (rx->sta)
  33746. + rx->sta->rx_packets++;
  33747. +
  33748. + return RX_QUEUED;
  33749. +}
  33750. +
  33751. +/* TODO: use IEEE80211_RX_FRAGMENTED */
  33752. +static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
  33753. + struct ieee80211_rate *rate)
  33754. +{
  33755. + struct ieee80211_sub_if_data *sdata;
  33756. + struct ieee80211_local *local = rx->local;
  33757. + struct sk_buff *skb = rx->skb, *skb2;
  33758. + struct net_device *prev_dev = NULL;
  33759. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  33760. + int needed_headroom;
  33761. +
  33762. + /*
  33763. + * If cooked monitor has been processed already, then
  33764. + * don't do it again. If not, set the flag.
  33765. + */
  33766. + if (rx->flags & IEEE80211_RX_CMNTR)
  33767. + goto out_free_skb;
  33768. + rx->flags |= IEEE80211_RX_CMNTR;
  33769. +
  33770. + /* If there are no cooked monitor interfaces, just free the SKB */
  33771. + if (!local->cooked_mntrs)
  33772. + goto out_free_skb;
  33773. +
  33774. + /* room for the radiotap header based on driver features */
  33775. + needed_headroom = ieee80211_rx_radiotap_space(local, status);
  33776. +
  33777. + if (skb_headroom(skb) < needed_headroom &&
  33778. + pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC))
  33779. + goto out_free_skb;
  33780. +
  33781. + /* prepend radiotap information */
  33782. + ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom,
  33783. + false);
  33784. +
  33785. + skb_set_mac_header(skb, 0);
  33786. + skb->ip_summed = CHECKSUM_UNNECESSARY;
  33787. + skb->pkt_type = PACKET_OTHERHOST;
  33788. + skb->protocol = htons(ETH_P_802_2);
  33789. +
  33790. + list_for_each_entry_rcu(sdata, &local->interfaces, list) {
  33791. + if (!ieee80211_sdata_running(sdata))
  33792. + continue;
  33793. +
  33794. + if (sdata->vif.type != NL80211_IFTYPE_MONITOR ||
  33795. + !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES))
  33796. + continue;
  33797. +
  33798. + if (prev_dev) {
  33799. + skb2 = skb_clone(skb, GFP_ATOMIC);
  33800. + if (skb2) {
  33801. + skb2->dev = prev_dev;
  33802. + netif_receive_skb(skb2);
  33803. + }
  33804. + }
  33805. +
  33806. + prev_dev = sdata->dev;
  33807. + sdata->dev->stats.rx_packets++;
  33808. + sdata->dev->stats.rx_bytes += skb->len;
  33809. + }
  33810. +
  33811. + if (prev_dev) {
  33812. + skb->dev = prev_dev;
  33813. + netif_receive_skb(skb);
  33814. + return;
  33815. + }
  33816. +
  33817. + out_free_skb:
  33818. + dev_kfree_skb(skb);
  33819. +}
  33820. +
  33821. +static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
  33822. + ieee80211_rx_result res)
  33823. +{
  33824. + switch (res) {
  33825. + case RX_DROP_MONITOR:
  33826. + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
  33827. + if (rx->sta)
  33828. + rx->sta->rx_dropped++;
  33829. + /* fall through */
  33830. + case RX_CONTINUE: {
  33831. + struct ieee80211_rate *rate = NULL;
  33832. + struct ieee80211_supported_band *sband;
  33833. + struct ieee80211_rx_status *status;
  33834. +
  33835. + status = IEEE80211_SKB_RXCB((rx->skb));
  33836. +
  33837. + sband = rx->local->hw.wiphy->bands[status->band];
  33838. + if (!(status->flag & RX_FLAG_HT) &&
  33839. + !(status->flag & RX_FLAG_VHT))
  33840. + rate = &sband->bitrates[status->rate_idx];
  33841. +
  33842. + ieee80211_rx_cooked_monitor(rx, rate);
  33843. + break;
  33844. + }
  33845. + case RX_DROP_UNUSABLE:
  33846. + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
  33847. + if (rx->sta)
  33848. + rx->sta->rx_dropped++;
  33849. + dev_kfree_skb(rx->skb);
  33850. + break;
  33851. + case RX_QUEUED:
  33852. + I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
  33853. + break;
  33854. + }
  33855. +}
  33856. +
  33857. +static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
  33858. + struct sk_buff_head *frames)
  33859. +{
  33860. + ieee80211_rx_result res = RX_DROP_MONITOR;
  33861. + struct sk_buff *skb;
  33862. +
  33863. +#define CALL_RXH(rxh) \
  33864. + do { \
  33865. + res = rxh(rx); \
  33866. + if (res != RX_CONTINUE) \
  33867. + goto rxh_next; \
  33868. + } while (0);
  33869. +
  33870. + spin_lock_bh(&rx->local->rx_path_lock);
  33871. +
  33872. + while ((skb = __skb_dequeue(frames))) {
  33873. + /*
  33874. + * all the other fields are valid across frames
  33875. + * that belong to an aMPDU since they are on the
  33876. + * same TID from the same station
  33877. + */
  33878. + rx->skb = skb;
  33879. +
  33880. + CALL_RXH(ieee80211_rx_h_check_more_data)
  33881. + CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll)
  33882. + CALL_RXH(ieee80211_rx_h_sta_process)
  33883. + CALL_RXH(ieee80211_rx_h_decrypt)
  33884. + CALL_RXH(ieee80211_rx_h_defragment)
  33885. + CALL_RXH(ieee80211_rx_h_michael_mic_verify)
  33886. + /* must be after MMIC verify so header is counted in MPDU mic */
  33887. +#ifdef CONFIG_MAC80211_MESH
  33888. + if (ieee80211_vif_is_mesh(&rx->sdata->vif))
  33889. + CALL_RXH(ieee80211_rx_h_mesh_fwding);
  33890. +#endif
  33891. + CALL_RXH(ieee80211_rx_h_amsdu)
  33892. + CALL_RXH(ieee80211_rx_h_data)
  33893. +
  33894. + /* special treatment -- needs the queue */
  33895. + res = ieee80211_rx_h_ctrl(rx, frames);
  33896. + if (res != RX_CONTINUE)
  33897. + goto rxh_next;
  33898. +
  33899. + CALL_RXH(ieee80211_rx_h_mgmt_check)
  33900. + CALL_RXH(ieee80211_rx_h_action)
  33901. + CALL_RXH(ieee80211_rx_h_userspace_mgmt)
  33902. + CALL_RXH(ieee80211_rx_h_action_return)
  33903. + CALL_RXH(ieee80211_rx_h_mgmt)
  33904. +
  33905. + rxh_next:
  33906. + ieee80211_rx_handlers_result(rx, res);
  33907. +
  33908. +#undef CALL_RXH
  33909. + }
  33910. +
  33911. + spin_unlock_bh(&rx->local->rx_path_lock);
  33912. +}
  33913. +
  33914. +static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
  33915. +{
  33916. + struct sk_buff_head reorder_release;
  33917. + ieee80211_rx_result res = RX_DROP_MONITOR;
  33918. +
  33919. + __skb_queue_head_init(&reorder_release);
  33920. +
  33921. +#define CALL_RXH(rxh) \
  33922. + do { \
  33923. + res = rxh(rx); \
  33924. + if (res != RX_CONTINUE) \
  33925. + goto rxh_next; \
  33926. + } while (0);
  33927. +
  33928. + CALL_RXH(ieee80211_rx_h_check)
  33929. +
  33930. + ieee80211_rx_reorder_ampdu(rx, &reorder_release);
  33931. +
  33932. + ieee80211_rx_handlers(rx, &reorder_release);
  33933. + return;
  33934. +
  33935. + rxh_next:
  33936. + ieee80211_rx_handlers_result(rx, res);
  33937. +
  33938. +#undef CALL_RXH
  33939. +}
  33940. +
  33941. +/*
  33942. + * This function makes calls into the RX path, therefore
  33943. + * it has to be invoked under RCU read lock.
  33944. + */
  33945. +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
  33946. +{
  33947. + struct sk_buff_head frames;
  33948. + struct ieee80211_rx_data rx = {
  33949. + .sta = sta,
  33950. + .sdata = sta->sdata,
  33951. + .local = sta->local,
  33952. + /* This is OK -- must be QoS data frame */
  33953. + .security_idx = tid,
  33954. + .seqno_idx = tid,
  33955. + .flags = 0,
  33956. + };
  33957. + struct tid_ampdu_rx *tid_agg_rx;
  33958. +
  33959. + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
  33960. + if (!tid_agg_rx)
  33961. + return;
  33962. +
  33963. + __skb_queue_head_init(&frames);
  33964. +
  33965. + spin_lock(&tid_agg_rx->reorder_lock);
  33966. + ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
  33967. + spin_unlock(&tid_agg_rx->reorder_lock);
  33968. +
  33969. + ieee80211_rx_handlers(&rx, &frames);
  33970. +}
  33971. +
  33972. +/* main receive path */
  33973. +
  33974. +static bool prepare_for_handlers(struct ieee80211_rx_data *rx,
  33975. + struct ieee80211_hdr *hdr)
  33976. +{
  33977. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  33978. + struct sk_buff *skb = rx->skb;
  33979. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  33980. + u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type);
  33981. + int multicast = is_multicast_ether_addr(hdr->addr1);
  33982. +
  33983. + switch (sdata->vif.type) {
  33984. + case NL80211_IFTYPE_STATION:
  33985. + if (!bssid && !sdata->u.mgd.use_4addr)
  33986. + return false;
  33987. + if (!multicast &&
  33988. + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
  33989. + if (!(sdata->dev->flags & IFF_PROMISC) ||
  33990. + sdata->u.mgd.use_4addr)
  33991. + return false;
  33992. + status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
  33993. + }
  33994. + break;
  33995. + case NL80211_IFTYPE_ADHOC:
  33996. + if (!bssid)
  33997. + return false;
  33998. + if (ether_addr_equal(sdata->vif.addr, hdr->addr2) ||
  33999. + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2))
  34000. + return false;
  34001. + if (ieee80211_is_beacon(hdr->frame_control)) {
  34002. + return true;
  34003. + } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
  34004. + return false;
  34005. + } else if (!multicast &&
  34006. + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
  34007. + if (!(sdata->dev->flags & IFF_PROMISC))
  34008. + return false;
  34009. + status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
  34010. + } else if (!rx->sta) {
  34011. + int rate_idx;
  34012. + if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT))
  34013. + rate_idx = 0; /* TODO: HT/VHT rates */
  34014. + else
  34015. + rate_idx = status->rate_idx;
  34016. + ieee80211_ibss_rx_no_sta(sdata, bssid, hdr->addr2,
  34017. + BIT(rate_idx));
  34018. + }
  34019. + break;
  34020. + case NL80211_IFTYPE_MESH_POINT:
  34021. + if (!multicast &&
  34022. + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) {
  34023. + if (!(sdata->dev->flags & IFF_PROMISC))
  34024. + return false;
  34025. +
  34026. + status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
  34027. + }
  34028. + break;
  34029. + case NL80211_IFTYPE_AP_VLAN:
  34030. + case NL80211_IFTYPE_AP:
  34031. + if (!bssid) {
  34032. + if (!ether_addr_equal(sdata->vif.addr, hdr->addr1))
  34033. + return false;
  34034. + } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) {
  34035. + /*
  34036. + * Accept public action frames even when the
  34037. + * BSSID doesn't match, this is used for P2P
  34038. + * and location updates. Note that mac80211
  34039. + * itself never looks at these frames.
  34040. + */
  34041. + if (!multicast &&
  34042. + !ether_addr_equal(sdata->vif.addr, hdr->addr1))
  34043. + return false;
  34044. + if (ieee80211_is_public_action(hdr, skb->len))
  34045. + return true;
  34046. + if (!ieee80211_is_beacon(hdr->frame_control))
  34047. + return false;
  34048. + status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
  34049. + } else if (!ieee80211_has_tods(hdr->frame_control)) {
  34050. + /* ignore data frames to TDLS-peers */
  34051. + if (ieee80211_is_data(hdr->frame_control))
  34052. + return false;
  34053. + /* ignore action frames to TDLS-peers */
  34054. + if (ieee80211_is_action(hdr->frame_control) &&
  34055. + !ether_addr_equal(bssid, hdr->addr1))
  34056. + return false;
  34057. + }
  34058. + break;
  34059. + case NL80211_IFTYPE_WDS:
  34060. + if (bssid || !ieee80211_is_data(hdr->frame_control))
  34061. + return false;
  34062. + if (!ether_addr_equal(sdata->u.wds.remote_addr, hdr->addr2))
  34063. + return false;
  34064. + break;
  34065. + case NL80211_IFTYPE_P2P_DEVICE:
  34066. + if (!ieee80211_is_public_action(hdr, skb->len) &&
  34067. + !ieee80211_is_probe_req(hdr->frame_control) &&
  34068. + !ieee80211_is_probe_resp(hdr->frame_control) &&
  34069. + !ieee80211_is_beacon(hdr->frame_control))
  34070. + return false;
  34071. + if (!ether_addr_equal(sdata->vif.addr, hdr->addr1) &&
  34072. + !multicast)
  34073. + status->rx_flags &= ~IEEE80211_RX_RA_MATCH;
  34074. + break;
  34075. + default:
  34076. + /* should never get here */
  34077. + WARN_ON_ONCE(1);
  34078. + break;
  34079. + }
  34080. +
  34081. + return true;
  34082. +}
  34083. +
  34084. +/*
  34085. + * This function returns whether or not the SKB
  34086. + * was destined for RX processing or not, which,
  34087. + * if consume is true, is equivalent to whether
  34088. + * or not the skb was consumed.
  34089. + */
  34090. +static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
  34091. + struct sk_buff *skb, bool consume)
  34092. +{
  34093. + struct ieee80211_local *local = rx->local;
  34094. + struct ieee80211_sub_if_data *sdata = rx->sdata;
  34095. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  34096. + struct ieee80211_hdr *hdr = (void *)skb->data;
  34097. +
  34098. + rx->skb = skb;
  34099. + status->rx_flags |= IEEE80211_RX_RA_MATCH;
  34100. +
  34101. + if (!prepare_for_handlers(rx, hdr))
  34102. + return false;
  34103. +
  34104. + if (!consume) {
  34105. + skb = skb_copy(skb, GFP_ATOMIC);
  34106. + if (!skb) {
  34107. + if (net_ratelimit())
  34108. + wiphy_debug(local->hw.wiphy,
  34109. + "failed to copy skb for %s\n",
  34110. + sdata->name);
  34111. + return true;
  34112. + }
  34113. +
  34114. + rx->skb = skb;
  34115. + }
  34116. +
  34117. + ieee80211_invoke_rx_handlers(rx);
  34118. + return true;
  34119. +}
  34120. +
  34121. +/*
34122. + * This is the actual Rx frames handler. As it belongs to the Rx path it must
34123. + * be called with rcu_read_lock protection.
  34124. + */
  34125. +static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
  34126. + struct sk_buff *skb)
  34127. +{
  34128. + struct ieee80211_local *local = hw_to_local(hw);
  34129. + struct ieee80211_sub_if_data *sdata;
  34130. + struct ieee80211_hdr *hdr;
  34131. + __le16 fc;
  34132. + struct ieee80211_rx_data rx;
  34133. + struct ieee80211_sub_if_data *prev;
  34134. + struct sta_info *sta, *tmp, *prev_sta;
  34135. + int err = 0;
  34136. +
  34137. + fc = ((struct ieee80211_hdr *)skb->data)->frame_control;
  34138. + memset(&rx, 0, sizeof(rx));
  34139. + rx.skb = skb;
  34140. + rx.local = local;
  34141. +
  34142. + if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc))
  34143. + local->dot11ReceivedFragmentCount++;
  34144. +
  34145. + if (ieee80211_is_mgmt(fc)) {
  34146. + /* drop frame if too short for header */
  34147. + if (skb->len < ieee80211_hdrlen(fc))
  34148. + err = -ENOBUFS;
  34149. + else
  34150. + err = skb_linearize(skb);
  34151. + } else {
  34152. + err = !pskb_may_pull(skb, ieee80211_hdrlen(fc));
  34153. + }
  34154. +
  34155. + if (err) {
  34156. + dev_kfree_skb(skb);
  34157. + return;
  34158. + }
  34159. +
  34160. + hdr = (struct ieee80211_hdr *)skb->data;
  34161. + ieee80211_parse_qos(&rx);
  34162. + ieee80211_verify_alignment(&rx);
  34163. +
  34164. + if (unlikely(ieee80211_is_probe_resp(hdr->frame_control) ||
  34165. + ieee80211_is_beacon(hdr->frame_control)))
  34166. + ieee80211_scan_rx(local, skb);
  34167. +
  34168. + if (ieee80211_is_data(fc)) {
  34169. + prev_sta = NULL;
  34170. +
  34171. + for_each_sta_info(local, hdr->addr2, sta, tmp) {
  34172. + if (!prev_sta) {
  34173. + prev_sta = sta;
  34174. + continue;
  34175. + }
  34176. +
  34177. + rx.sta = prev_sta;
  34178. + rx.sdata = prev_sta->sdata;
  34179. + ieee80211_prepare_and_rx_handle(&rx, skb, false);
  34180. +
  34181. + prev_sta = sta;
  34182. + }
  34183. +
  34184. + if (prev_sta) {
  34185. + rx.sta = prev_sta;
  34186. + rx.sdata = prev_sta->sdata;
  34187. +
  34188. + if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
  34189. + return;
  34190. + goto out;
  34191. + }
  34192. + }
  34193. +
  34194. + prev = NULL;
  34195. +
  34196. + list_for_each_entry_rcu(sdata, &local->interfaces, list) {
  34197. + if (!ieee80211_sdata_running(sdata))
  34198. + continue;
  34199. +
  34200. + if (sdata->vif.type == NL80211_IFTYPE_MONITOR ||
  34201. + sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
  34202. + continue;
  34203. +
  34204. + /*
  34205. + * frame is destined for this interface, but if it's
  34206. + * not also for the previous one we handle that after
  34207. + * the loop to avoid copying the SKB once too much
  34208. + */
  34209. +
  34210. + if (!prev) {
  34211. + prev = sdata;
  34212. + continue;
  34213. + }
  34214. +
  34215. + rx.sta = sta_info_get_bss(prev, hdr->addr2);
  34216. + rx.sdata = prev;
  34217. + ieee80211_prepare_and_rx_handle(&rx, skb, false);
  34218. +
  34219. + prev = sdata;
  34220. + }
  34221. +
  34222. + if (prev) {
  34223. + rx.sta = sta_info_get_bss(prev, hdr->addr2);
  34224. + rx.sdata = prev;
  34225. +
  34226. + if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
  34227. + return;
  34228. + }
  34229. +
  34230. + out:
  34231. + dev_kfree_skb(skb);
  34232. +}
  34233. +
  34234. +/*
  34235. + * This is the receive path handler. It is called by a low level driver when an
  34236. + * 802.11 MPDU is received from the hardware.
  34237. + */
  34238. +void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb)
  34239. +{
  34240. + struct ieee80211_local *local = hw_to_local(hw);
  34241. + struct ieee80211_rate *rate = NULL;
  34242. + struct ieee80211_supported_band *sband;
  34243. + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
  34244. +
  34245. + WARN_ON_ONCE(softirq_count() == 0);
  34246. +
  34247. + if (WARN_ON(status->band >= IEEE80211_NUM_BANDS))
  34248. + goto drop;
  34249. +
  34250. + sband = local->hw.wiphy->bands[status->band];
  34251. + if (WARN_ON(!sband))
  34252. + goto drop;
  34253. +
  34254. + /*
  34255. + * If we're suspending, it is possible although not too likely
  34256. + * that we'd be receiving frames after having already partially
  34257. + * quiesced the stack. We can't process such frames then since
  34258. + * that might, for example, cause stations to be added or other
  34259. + * driver callbacks be invoked.
  34260. + */
  34261. + if (unlikely(local->quiescing || local->suspended))
  34262. + goto drop;
  34263. +
  34264. + /* We might be during a HW reconfig, prevent Rx for the same reason */
  34265. + if (unlikely(local->in_reconfig))
  34266. + goto drop;
  34267. +
  34268. + /*
  34269. + * The same happens when we're not even started,
  34270. + * but that's worth a warning.
  34271. + */
  34272. + if (WARN_ON(!local->started))
  34273. + goto drop;
  34274. +
  34275. + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) {
  34276. + /*
  34277. + * Validate the rate, unless a PLCP error means that
  34278. + * we probably can't have a valid rate here anyway.
  34279. + */
  34280. +
  34281. + if (status->flag & RX_FLAG_HT) {
  34282. + /*
  34283. + * rate_idx is MCS index, which can be [0-76]
  34284. + * as documented on:
  34285. + *
  34286. + * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n
  34287. + *
  34288. + * Anything else would be some sort of driver or
  34289. + * hardware error. The driver should catch hardware
  34290. + * errors.
  34291. + */
  34292. + if (WARN(status->rate_idx > 76,
  34293. + "Rate marked as an HT rate but passed "
  34294. + "status->rate_idx is not "
  34295. + "an MCS index [0-76]: %d (0x%02x)\n",
  34296. + status->rate_idx,
  34297. + status->rate_idx))
  34298. + goto drop;
  34299. + } else if (status->flag & RX_FLAG_VHT) {
  34300. + if (WARN_ONCE(status->rate_idx > 9 ||
  34301. + !status->vht_nss ||
  34302. + status->vht_nss > 8,
  34303. + "Rate marked as a VHT rate but data is invalid: MCS: %d, NSS: %d\n",
  34304. + status->rate_idx, status->vht_nss))
  34305. + goto drop;
  34306. + } else {
  34307. + if (WARN_ON(status->rate_idx >= sband->n_bitrates))
  34308. + goto drop;
  34309. + rate = &sband->bitrates[status->rate_idx];
  34310. + }
  34311. + }
  34312. +
  34313. + status->rx_flags = 0;
  34314. +
  34315. + /*
  34316. + * key references and virtual interfaces are protected using RCU
  34317. + * and this requires that we are in a read-side RCU section during
  34318. + * receive processing
  34319. + */
  34320. + rcu_read_lock();
  34321. +
  34322. + /*
  34323. + * Frames with failed FCS/PLCP checksum are not returned,
  34324. + * all other frames are returned without radiotap header
  34325. + * if it was previously present.
  34326. + * Also, frames with less than 16 bytes are dropped.
  34327. + */
  34328. + skb = ieee80211_rx_monitor(local, skb, rate);
  34329. + if (!skb) {
  34330. + rcu_read_unlock();
  34331. + return;
  34332. + }
  34333. +
  34334. + ieee80211_tpt_led_trig_rx(local,
  34335. + ((struct ieee80211_hdr *)skb->data)->frame_control,
  34336. + skb->len);
  34337. + __ieee80211_rx_handle_packet(hw, skb);
  34338. +
  34339. + rcu_read_unlock();
  34340. +
  34341. + return;
  34342. + drop:
  34343. + kfree_skb(skb);
  34344. +}
  34345. +EXPORT_SYMBOL(ieee80211_rx);
  34346. +
  34347. +/* This is a version of the rx handler that can be called from hard irq
  34348. + * context. Post the skb on the queue and schedule the tasklet */
  34349. +void ieee80211_rx_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb)
  34350. +{
  34351. + struct ieee80211_local *local = hw_to_local(hw);
  34352. +
  34353. + BUILD_BUG_ON(sizeof(struct ieee80211_rx_status) > sizeof(skb->cb));
  34354. +
  34355. + skb->pkt_type = IEEE80211_RX_MSG;
  34356. + skb_queue_tail(&local->skb_queue, skb);
  34357. + tasklet_schedule(&local->tasklet);
  34358. +}
  34359. +EXPORT_SYMBOL(ieee80211_rx_irqsafe);
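The rx.c code above chains per-frame handlers through the CALL_RXH macro: each handler returns RX_CONTINUE to pass the frame on, or RX_QUEUED / RX_DROP_* to end processing. A minimal, self-contained sketch of that dispatch pattern (plain userspace C; frame, check_len, deliver and run_handlers are invented names, not mac80211 symbols):

#include <stdio.h>

enum rx_result { RX_CONTINUE, RX_QUEUED, RX_DROPPED };

struct frame { int len; };

/* drop obviously short frames, otherwise keep going */
static enum rx_result check_len(struct frame *f)
{
	return f->len < 16 ? RX_DROPPED : RX_CONTINUE;
}

/* final handler: consume the frame */
static enum rx_result deliver(struct frame *f)
{
	printf("delivered %d bytes\n", f->len);
	return RX_QUEUED;
}

static enum rx_result run_handlers(struct frame *f)
{
	enum rx_result res = RX_CONTINUE;

/* same shape as CALL_RXH above: stop at the first handler
 * that does not return RX_CONTINUE */
#define CALL_RXH(h)			\
	do {				\
		res = h(f);		\
		if (res != RX_CONTINUE)	\
			return res;	\
	} while (0)

	CALL_RXH(check_len);
	CALL_RXH(deliver);
#undef CALL_RXH

	return res;
}

int main(void)
{
	struct frame f = { .len = 64 };

	return run_handlers(&f) == RX_QUEUED ? 0 : 1;
}

In the kernel version the same result value is then fed to ieee80211_rx_handlers_result(), which decides between dropping, queueing, or handing the frame to cooked monitor interfaces.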
  34360. diff -Nur linux-3.18.12.orig/net/netfilter/core.c linux-3.18.12/net/netfilter/core.c
  34361. --- linux-3.18.12.orig/net/netfilter/core.c 2015-04-20 14:48:02.000000000 -0500
  34362. +++ linux-3.18.12/net/netfilter/core.c 2015-04-26 13:32:22.471684003 -0500
  34363. @@ -21,11 +21,17 @@
  34364. #include <linux/proc_fs.h>
  34365. #include <linux/mutex.h>
  34366. #include <linux/slab.h>
  34367. +#include <linux/locallock.h>
  34368. #include <net/net_namespace.h>
  34369. #include <net/sock.h>
  34370. #include "nf_internals.h"
  34371. +#ifdef CONFIG_PREEMPT_RT_BASE
  34372. +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
  34373. +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
  34374. +#endif
  34375. +
  34376. static DEFINE_MUTEX(afinfo_mutex);
  34377. const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
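The netfilter hunk above introduces xt_write_lock as a local lock when CONFIG_PREEMPT_RT_BASE is set: per-CPU state that a stock kernel protects by briefly disabling interrupts or preemption instead gets a per-CPU lock that RT can turn into a sleeping lock, so the critical section stays preemptible. A hedged sketch of the usual usage pattern, assuming the local_lock()/local_unlock() helpers this patch adds in <linux/locallock.h>; my_lock and my_counters are invented names, not netfilter structures:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(my_lock);
static DEFINE_PER_CPU(unsigned long, my_counters);

static void my_count_packet(void)
{
	/* serializes against other users of the same per-CPU data;
	 * on RT this is a per-CPU sleeping lock rather than a
	 * preempt/irq-disabled section */
	local_lock(my_lock);
	this_cpu_inc(my_counters);
	local_unlock(my_lock);
}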
  34378. diff -Nur linux-3.18.12.orig/net/packet/af_packet.c linux-3.18.12/net/packet/af_packet.c
  34379. --- linux-3.18.12.orig/net/packet/af_packet.c 2015-04-20 14:48:02.000000000 -0500
  34380. +++ linux-3.18.12/net/packet/af_packet.c 2015-04-26 13:32:22.471684003 -0500
  34381. @@ -63,6 +63,7 @@
  34382. #include <linux/if_packet.h>
  34383. #include <linux/wireless.h>
  34384. #include <linux/kernel.h>
  34385. +#include <linux/delay.h>
  34386. #include <linux/kmod.h>
  34387. #include <linux/slab.h>
  34388. #include <linux/vmalloc.h>
  34389. @@ -692,7 +693,7 @@
  34390. if (BLOCK_NUM_PKTS(pbd)) {
  34391. while (atomic_read(&pkc->blk_fill_in_prog)) {
  34392. /* Waiting for skb_copy_bits to finish... */
  34393. - cpu_relax();
  34394. + cpu_chill();
  34395. }
  34396. }
  34397. @@ -943,7 +944,7 @@
  34398. if (!(status & TP_STATUS_BLK_TMO)) {
  34399. while (atomic_read(&pkc->blk_fill_in_prog)) {
  34400. /* Waiting for skb_copy_bits to finish... */
  34401. - cpu_relax();
  34402. + cpu_chill();
  34403. }
  34404. }
  34405. prb_close_block(pkc, pbd, po, status);
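This af_packet hunk (and the rds hunk that follows) swaps cpu_relax() for cpu_chill() in loops that wait for another CPU to finish: on RT, a high-priority task spinning on such a condition can keep the task it is waiting for from ever running. A hedged sketch of the wait pattern, assuming the cpu_chill() helper this patch declares in <linux/delay.h>; the 'done' flag is invented for the example:

#include <linux/delay.h>
#include <linux/atomic.h>

static atomic_t done;

static void wait_for_other_cpu(void)
{
	while (!atomic_read(&done)) {
		/* cpu_relax() would just spin; cpu_chill() sleeps briefly
		 * so the task being waited for can make progress on RT */
		cpu_chill();
	}
}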
  34406. diff -Nur linux-3.18.12.orig/net/rds/ib_rdma.c linux-3.18.12/net/rds/ib_rdma.c
  34407. --- linux-3.18.12.orig/net/rds/ib_rdma.c 2015-04-20 14:48:02.000000000 -0500
  34408. +++ linux-3.18.12/net/rds/ib_rdma.c 2015-04-26 13:32:22.471684003 -0500
  34409. @@ -34,6 +34,7 @@
  34410. #include <linux/slab.h>
  34411. #include <linux/rculist.h>
  34412. #include <linux/llist.h>
  34413. +#include <linux/delay.h>
  34414. #include "rds.h"
  34415. #include "ib.h"
  34416. @@ -286,7 +287,7 @@
  34417. for_each_online_cpu(cpu) {
  34418. flag = &per_cpu(clean_list_grace, cpu);
  34419. while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
  34420. - cpu_relax();
  34421. + cpu_chill();
  34422. }
  34423. }
  34424. diff -Nur linux-3.18.12.orig/net/sched/sch_generic.c linux-3.18.12/net/sched/sch_generic.c
  34425. --- linux-3.18.12.orig/net/sched/sch_generic.c 2015-04-20 14:48:02.000000000 -0500
  34426. +++ linux-3.18.12/net/sched/sch_generic.c 2015-04-26 13:32:22.471684003 -0500
  34427. @@ -894,7 +894,7 @@
  34428. /* Wait for outstanding qdisc_run calls. */
  34429. list_for_each_entry(dev, head, close_list)
  34430. while (some_qdisc_is_busy(dev))
  34431. - yield();
  34432. + msleep(1);
  34433. }
  34434. void dev_deactivate(struct net_device *dev)
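The sch_generic change above is the same idea applied to yield(): a SCHED_FIFO task that yields back to itself never lets the busy qdisc drain, so the wait becomes a real sleep. A minimal sketch of the resulting loop; qdisc_still_busy() is an invented stand-in for some_qdisc_is_busy():

#include <linux/types.h>
#include <linux/delay.h>

/* placeholder predicate for the sketch */
static bool qdisc_still_busy(void)
{
	return false;
}

static void wait_until_idle(void)
{
	while (qdisc_still_busy())
		msleep(1);	/* sleeps, so lower-priority work can run */
}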
  34435. diff -Nur linux-3.18.12.orig/net/sunrpc/svc_xprt.c linux-3.18.12/net/sunrpc/svc_xprt.c
  34436. --- linux-3.18.12.orig/net/sunrpc/svc_xprt.c 2015-04-20 14:48:02.000000000 -0500
  34437. +++ linux-3.18.12/net/sunrpc/svc_xprt.c 2015-04-26 13:32:22.475684003 -0500
  34438. @@ -357,7 +357,7 @@
  34439. return;
  34440. }
  34441. - cpu = get_cpu();
  34442. + cpu = get_cpu_light();
  34443. pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
  34444. spin_lock_bh(&pool->sp_lock);
  34445. @@ -390,7 +390,7 @@
  34446. }
  34447. spin_unlock_bh(&pool->sp_lock);
  34448. - put_cpu();
  34449. + put_cpu_light();
  34450. }
  34451. /*
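In the sunrpc hunk above, get_cpu() would disable preemption across a section that also takes pool->sp_lock, which RT converts into a sleeping lock; get_cpu_light(), added elsewhere in this patch, only pins the task to its current CPU while leaving it preemptible. A hedged sketch of that pattern; struct pool and pick_pool() are invented stand-ins for svc_pool_for_cpu():

#include <linux/smp.h>
#include <linux/threads.h>

struct pool { int id; };
static struct pool pools[NR_CPUS];

static struct pool *pick_pool(void)
{
	/* like get_cpu(), but on RT it only disables migration,
	 * so the code below may still be preempted or sleep */
	int cpu = get_cpu_light();
	struct pool *p = &pools[cpu];

	put_cpu_light();
	return p;
}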
  34452. diff -Nur linux-3.18.12.orig/scripts/mkcompile_h linux-3.18.12/scripts/mkcompile_h
  34453. --- linux-3.18.12.orig/scripts/mkcompile_h 2015-04-20 14:48:02.000000000 -0500
  34454. +++ linux-3.18.12/scripts/mkcompile_h 2015-04-26 13:32:22.475684003 -0500
  34455. @@ -4,7 +4,8 @@
  34456. ARCH=$2
  34457. SMP=$3
  34458. PREEMPT=$4
  34459. -CC=$5
  34460. +RT=$5
  34461. +CC=$6
  34462. vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
  34463. @@ -57,6 +58,7 @@
  34464. CONFIG_FLAGS=""
  34465. if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
  34466. if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
  34467. +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
  34468. UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
  34469. # Truncate to maximum length
  34470. diff -Nur linux-3.18.12.orig/sound/core/pcm_native.c linux-3.18.12/sound/core/pcm_native.c
  34471. --- linux-3.18.12.orig/sound/core/pcm_native.c 2015-04-20 14:48:02.000000000 -0500
  34472. +++ linux-3.18.12/sound/core/pcm_native.c 2015-04-26 13:32:22.475684003 -0500
  34473. @@ -104,7 +104,7 @@
  34474. void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
  34475. {
  34476. if (!substream->pcm->nonatomic)
  34477. - local_irq_disable();
  34478. + local_irq_disable_nort();
  34479. snd_pcm_stream_lock(substream);
  34480. }
  34481. EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
  34482. @@ -113,7 +113,7 @@
  34483. {
  34484. snd_pcm_stream_unlock(substream);
  34485. if (!substream->pcm->nonatomic)
  34486. - local_irq_enable();
  34487. + local_irq_enable_nort();
  34488. }
  34489. EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
  34490. @@ -121,7 +121,7 @@
  34491. {
  34492. unsigned long flags = 0;
  34493. if (!substream->pcm->nonatomic)
  34494. - local_irq_save(flags);
  34495. + local_irq_save_nort(flags);
  34496. snd_pcm_stream_lock(substream);
  34497. return flags;
  34498. }
  34499. @@ -132,7 +132,7 @@
  34500. {
  34501. snd_pcm_stream_unlock(substream);
  34502. if (!substream->pcm->nonatomic)
  34503. - local_irq_restore(flags);
  34504. + local_irq_restore_nort(flags);
  34505. }
  34506. EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
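The pcm_native hunk above uses the *_nort variants added by this patch: interrupts are really disabled only on non-RT kernels, because on RT the stream lock taken right afterwards is a sleeping lock and must be acquired with hard IRQs enabled. A hedged sketch of the pattern; my_lock and the critical section are invented for the example:

#include <linux/spinlock.h>
#include <linux/interrupt.h>

static DEFINE_SPINLOCK(my_lock);

static void my_critical_section(void)
{
	unsigned long flags = 0;

	/* effectively a no-op on RT, a real local_irq_save() otherwise */
	local_irq_save_nort(flags);
	spin_lock(&my_lock);
	/* ... touch data shared with an interrupt handler ... */
	spin_unlock(&my_lock);
	local_irq_restore_nort(flags);
}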
  34507. diff -Nur linux-3.18.12.orig/virt/kvm/async_pf.c linux-3.18.12/virt/kvm/async_pf.c
  34508. --- linux-3.18.12.orig/virt/kvm/async_pf.c 2015-04-20 14:48:02.000000000 -0500
  34509. +++ linux-3.18.12/virt/kvm/async_pf.c 2015-04-26 13:32:22.475684003 -0500
  34510. @@ -94,8 +94,8 @@
  34511. trace_kvm_async_pf_completed(addr, gva);
  34512. - if (waitqueue_active(&vcpu->wq))
  34513. - wake_up_interruptible(&vcpu->wq);
  34514. + if (swaitqueue_active(&vcpu->wq))
  34515. + swait_wake_interruptible(&vcpu->wq);
  34516. mmput(mm);
  34517. kvm_put_kvm(vcpu->kvm);
  34518. diff -Nur linux-3.18.12.orig/virt/kvm/kvm_main.c linux-3.18.12/virt/kvm/kvm_main.c
  34519. --- linux-3.18.12.orig/virt/kvm/kvm_main.c 2015-04-20 14:48:02.000000000 -0500
  34520. +++ linux-3.18.12/virt/kvm/kvm_main.c 2015-04-26 13:32:22.475684003 -0500
  34521. @@ -221,7 +221,7 @@
  34522. vcpu->kvm = kvm;
  34523. vcpu->vcpu_id = id;
  34524. vcpu->pid = NULL;
  34525. - init_waitqueue_head(&vcpu->wq);
  34526. + init_swait_head(&vcpu->wq);
  34527. kvm_async_pf_vcpu_init(vcpu);
  34528. page = alloc_page(GFP_KERNEL | __GFP_ZERO);
  34529. @@ -1740,10 +1740,10 @@
  34530. */
  34531. void kvm_vcpu_block(struct kvm_vcpu *vcpu)
  34532. {
  34533. - DEFINE_WAIT(wait);
  34534. + DEFINE_SWAITER(wait);
  34535. for (;;) {
  34536. - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  34537. + swait_prepare(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
  34538. if (kvm_arch_vcpu_runnable(vcpu)) {
  34539. kvm_make_request(KVM_REQ_UNHALT, vcpu);
  34540. @@ -1757,7 +1757,7 @@
  34541. schedule();
  34542. }
  34543. - finish_wait(&vcpu->wq, &wait);
  34544. + swait_finish(&vcpu->wq, &wait);
  34545. }
  34546. EXPORT_SYMBOL_GPL(kvm_vcpu_block);
  34547. @@ -1769,11 +1769,11 @@
  34548. {
  34549. int me;
  34550. int cpu = vcpu->cpu;
  34551. - wait_queue_head_t *wqp;
  34552. + struct swait_head *wqp;
  34553. wqp = kvm_arch_vcpu_wq(vcpu);
  34554. - if (waitqueue_active(wqp)) {
  34555. - wake_up_interruptible(wqp);
  34556. + if (swaitqueue_active(wqp)) {
  34557. + swait_wake_interruptible(wqp);
  34558. ++vcpu->stat.halt_wakeup;
  34559. }
  34560. @@ -1878,7 +1878,7 @@
  34561. continue;
  34562. if (vcpu == me)
  34563. continue;
  34564. - if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
  34565. + if (swaitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
  34566. continue;
  34567. if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
  34568. continue;
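The kvm hunks above move vcpu->wq from a regular waitqueue to the "simple waitqueue" (swait) implementation this patch adds: vCPU wakeups can come from contexts where the ordinary waitqueue lock, which RT turns into a sleeping lock, must not be taken, and the simple waitqueue is meant to be safe to wake from there. A hedged sketch mirroring the wait/wake pattern in the hunks, assuming the swait API from the header this patch adds (wait-simple.h); my_wq and my_event are invented names:

#include <linux/wait-simple.h>
#include <linux/sched.h>

static struct swait_head my_wq;
static bool my_event;

static void my_init(void)
{
	init_swait_head(&my_wq);
}

static void my_wait_for_event(void)
{
	DEFINE_SWAITER(wait);

	for (;;) {
		swait_prepare(&my_wq, &wait, TASK_INTERRUPTIBLE);
		if (my_event)
			break;
		schedule();
	}
	swait_finish(&my_wq, &wait);
}

static void my_post_event(void)
{
	my_event = true;
	if (swaitqueue_active(&my_wq))
		swait_wake_interruptible(&my_wq);
}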